In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
file_path = '../data/AirQualityUCI.xlsx'  # Ensure the file path is correct
data = pd.read_excel(file_path)

# Replace missing values (-200) with NaN
data.replace(-200, np.nan, inplace=True)

# Drop 'Date' and 'Time' columns, as they are not numeric
data = data.drop(columns=['Date', 'Time'])

# Fill missing values with the mean of the respective columns
data.fillna(data.mean(), inplace=True)

# Normalize the numerical features using MinMaxScaler
numeric_columns = data.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()

# Apply the scaler to the numeric columns
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Verify the data after normalization
print(data.head())



     CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)   NOx(GT)  \
0  0.211864     0.511849  0.120981  0.184498       0.361737  0.111036   
1  0.161017     0.463196  0.088832  0.145428       0.312167  0.068382   
2  0.177966     0.542011  0.068528  0.139148       0.303701  0.087339   
3  0.177966     0.522980  0.061760  0.142780       0.308617  0.115098   
4  0.127119     0.448833  0.037225  0.100156       0.247030  0.087339   

   PT08.S3(NOx)   NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)         T        RH  \
0      0.311024  0.328694      0.513040     0.454654  0.333333  0.499057   
1      0.360796  0.266509      0.453125     0.326382  0.326882  0.484287   
2      0.346500  0.331655      0.451214     0.370588  0.296774  0.563168   
3      0.326168  0.355345      0.464366     0.426741  0.277419  0.638906   
4      0.374034  0.337578      0.422212     0.386228  0.280645  0.633564   

         AH  
0  0.280046  
1  0.264279  
2  0.276374  
3  0.294198  
4  0.295215  


In [2]:
from sklearn.model_selection import train_test_split

# Define the target (dependent) variable and the features (independent variables)
X = data.drop(columns=['CO(GT)'])  # Independent variables (features)
y = data['CO(GT)']  # Dependent variable (target)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting datasets
print(f'Training set size: {X_train.shape}, Test set size: {X_test.shape}')


Training set size: (7485, 12), Test set size: (1872, 12)


In [3]:
# Print the column names to verify if 'CO(GT)' exists
print(data.columns)


Index(['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'T', 'RH', 'AH'],
      dtype='object')


In [4]:
# Check the number of rows in the training data
print(f"Training data shape: {X_train.shape}")

# Check if the time_idx is continuous and has enough data points
print(f"Min time_idx: {X_train['time_idx'].min()}, Max time_idx: {X_train['time_idx'].max()}")


Training data shape: (7485, 12)


KeyError: 'time_idx'

In [5]:
# Mevcut grup bilgilerini inceleyin
print(X_train['time_idx'].unique())

# Yeterli veri olup olmadığını kontrol edin
print(X_train.groupby('time_idx').size())


KeyError: 'time_idx'

In [6]:
print(data.columns)


Index(['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'T', 'RH', 'AH'],
      dtype='object')


In [7]:

backup_data = pd.read_excel('../data/AirQualityUCI.xlsx')

# Yedek veri setinden 'Date' ve 'Time' sütunlarını alın
date_time_columns = backup_data[['Date', 'Time']]

# Mevcut veri setimize 'Date' ve 'Time' sütunlarını geri ekleyelim
data['Date'] = date_time_columns['Date']
data['Time'] = date_time_columns['Time']

# 'Date' ve 'Time' sütunlarını string formatında birleştirip datetime formatına çevirin
data['datetime'] = pd.to_datetime(data['Date'].astype(str) + ' ' + data['Time'].astype(str), format='%Y-%m-%d %H:%M:%S')

# 'time_idx' sütununu oluşturma
data['time_idx'] = data['datetime'].astype('category').cat.codes

# Oluşturulan 'time_idx' sütununu ve veri setini kontrol edelim
print(data[['Date', 'Time', 'time_idx']].head())
print(data.head())



        Date      Time  time_idx
0 2004-03-10  18:00:00         0
1 2004-03-10  19:00:00         1
2 2004-03-10  20:00:00         2
3 2004-03-10  21:00:00         3
4 2004-03-10  22:00:00         4
     CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)   NOx(GT)  \
0  0.211864     0.511849  0.120981  0.184498       0.361737  0.111036   
1  0.161017     0.463196  0.088832  0.145428       0.312167  0.068382   
2  0.177966     0.542011  0.068528  0.139148       0.303701  0.087339   
3  0.177966     0.522980  0.061760  0.142780       0.308617  0.115098   
4  0.127119     0.448833  0.037225  0.100156       0.247030  0.087339   

   PT08.S3(NOx)   NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)         T        RH  \
0      0.311024  0.328694      0.513040     0.454654  0.333333  0.499057   
1      0.360796  0.266509      0.453125     0.326382  0.326882  0.484287   
2      0.346500  0.331655      0.451214     0.370588  0.296774  0.563168   
3      0.326168  0.355345      0.464366     0.426741  0.277

In [8]:
# Gereksiz sütunları çıkaralım
data = data.drop(columns=['Date', 'Time', 'datetime'], errors='ignore')

# Güncellenmiş veri setini kontrol edelim
print(data.head())


     CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)   NOx(GT)  \
0  0.211864     0.511849  0.120981  0.184498       0.361737  0.111036   
1  0.161017     0.463196  0.088832  0.145428       0.312167  0.068382   
2  0.177966     0.542011  0.068528  0.139148       0.303701  0.087339   
3  0.177966     0.522980  0.061760  0.142780       0.308617  0.115098   
4  0.127119     0.448833  0.037225  0.100156       0.247030  0.087339   

   PT08.S3(NOx)   NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)         T        RH  \
0      0.311024  0.328694      0.513040     0.454654  0.333333  0.499057   
1      0.360796  0.266509      0.453125     0.326382  0.326882  0.484287   
2      0.346500  0.331655      0.451214     0.370588  0.296774  0.563168   
3      0.326168  0.355345      0.464366     0.426741  0.277419  0.638906   
4      0.374034  0.337578      0.422212     0.386228  0.280645  0.633564   

         AH  time_idx  
0  0.280046         0  
1  0.264279         1  
2  0.276374         2  
3  0.294

In [9]:
# Sütun isimlerinde nokta (.) karakterini alt çizgi (_) ile değiştirelim
data.columns = data.columns.str.replace('.', '_')

# Sütun isimlerinin güncellendiğini kontrol edelim
print(data.columns)

Index(['CO(GT)', 'PT08_S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08_S2(NMHC)',
       'NOx(GT)', 'PT08_S3(NOx)', 'NO2(GT)', 'PT08_S4(NO2)', 'PT08_S5(O3)',
       'T', 'RH', 'AH', 'time_idx'],
      dtype='object')


In [12]:
import torch
from torch import nn
from torch.optim import Adam
from pytorch_lightning import LightningModule, Trainer
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np

# Örnek veriler
x = np.random.rand(1000, 10).astype(np.float32)
y = np.random.rand(1000, 1).astype(np.float32)
dataset = TensorDataset(torch.tensor(x), torch.tensor(y))

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)

# Model tanımı
class MyModel(LightningModule):
    def __init__(self):
        super(MyModel, self).__init__()
        self.layer_1 = nn.Linear(10, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_3 = nn.Linear(64, 1)
        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        x = torch.relu(self.layer_1(x))
        x = torch.relu(self.layer_2(x))
        x = self.layer_3(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        self.log("val_loss", loss)

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.001)

# Modeli oluştur
model = MyModel()

# Trainer tanımı ve eğitme işlemi
trainer = Trainer(
    max_epochs=30,
    accelerator="gpu" if torch.cuda.is_available() else "cpu"
)

# Modeli eğit
trainer.fit(model, train_dataloader, val_dataloader)



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | layer_1 | Linear  | 704    | train
1 | layer_2 | Linear  | 4.2 K  | train
2 | layer_3 | Linear  | 65     | train
3 | loss_fn | MSELoss | 0      | train
--------------------------------------------
4.9 K     Trainable params
0         Non-trainable params
4.9 K     Total params
0.020     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\egeme\PycharmProjects\pythonProject1\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
C:\Users\egeme\PycharmProjects\pythonProject1\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
C:\Users\egeme\PycharmProjects\pythonProject1\.venv\Lib\site-packages\pytorch_lightning\loops\fit_loop.py:298: The number of training batches (25) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
