# Example for the Beijing Air Quality 2.5 Dataset

https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data

In [1]:
import torch

import pandas as pd

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

In [2]:
dataset_link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00381/PRSA_data_2010.1.1-2014.12.31.csv'
df = pd.read_csv(dataset_link)

print(len(df), len(df.dropna()))

df_temp = df.copy()
df_temp = df_temp.dropna()
df_temp = df_temp.reset_index()


# df['Datetime'] = pd.to_datetime(df[['day', 'month', 'year']]) + pd.to_timedelta(df['hour'], 'h')

df_temp = df_temp.drop(['index', 'No', 'day', 'month', 'year', 'hour'], axis=1)
df_temp['cbwd'] = df_temp['cbwd'].astype('category').cat.codes

# for feature_name in df_temp.columns:
#    max_value = df_temp[feature_name].max()
#    min_value = df_temp[feature_name].min()
    
#    df_temp[feature_name] = (df_temp[feature_name] - min_value) / (max_value - min_value)

43824 41757


In [3]:
df_temp.iloc[:10]

Unnamed: 0,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,129.0,-16,-4.0,1020.0,2,1.79,0,0
1,148.0,-15,-4.0,1020.0,2,2.68,0,0
2,159.0,-11,-5.0,1021.0,2,3.57,0,0
3,181.0,-7,-5.0,1022.0,2,5.36,1,0
4,138.0,-7,-5.0,1022.0,2,6.25,2,0
5,109.0,-7,-6.0,1022.0,2,7.14,3,0
6,105.0,-7,-6.0,1023.0,2,8.93,4,0
7,124.0,-7,-5.0,1024.0,2,10.72,0,0
8,120.0,-8,-6.0,1024.0,2,12.51,0,0
9,132.0,-7,-5.0,1025.0,2,14.3,0,0


In [4]:
train_size = int(len(df_temp) * 0.8)

train = df_temp.iloc[:train_size]
test = df_temp.iloc[train_size + 1:]

X_train = train.drop(['pm2.5'], axis=1)
y_train = train['pm2.5']

X_test = test.drop(['pm2.5'], axis=1)
y_test = test['pm2.5']

print(len(train), len(test))
print(len(X_train), len(y_train), len(X_test), len(y_test))

33405 8351
33405 33405 8351 8351


In [5]:
def create_sequences(X, y, n_steps=24):
    dataset = []
    
    for i in range(len(X) - (n_steps + 2)):
        sample = [
            X.iloc[i:i + n_steps].values,
            y.iloc[i + n_steps + 1]
        ]
        
        dataset.append(sample)
        
    return dataset

In [6]:
class StepDataset(Dataset):

    def __init__(self, X, y, n_steps=24):
        self.X = X
        self.y = y
        
        self.internal_dataset = create_sequences(X, y, n_steps=24)
    
    def __len__(self):
        return len(self.internal_dataset)
    
    def __getitem__(self, idx):
        inputs = self.internal_dataset[idx][0]
        label = self.internal_dataset[idx][1]
        
        return inputs, label

In [7]:
dataset_train = StepDataset(X_train, y_train)
dataset_test = StepDataset(X_test, y_test)

dataloader_train = DataLoader(dataset_train, batch_size=12, shuffle=False)
dataloader_test = DataLoader(dataset_test, batch_size=12, shuffle=False)

In [8]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        
        self.conv1d = nn.Sequential(
            nn.Conv1d(7, 32, kernel_size=3),
            nn.ReLU(inplace=True)
        )
        self.conv2d = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=3),
            nn.ReLU(inplace=True)
        )
        
        self.fc1 = nn.Sequential(
            nn.Linear(20 * 64, 50),
            nn.Dropout(0.3),
            nn.ReLU(inplace=True)
        )
        self.fc2 = nn.Linear(50, 1)
        
    def forward(self, x):
        x = self.conv1d(x)
        x = self.conv2d(x)
        batch_size = x.shape[0]
        x = x.view(batch_size, -1)
        x = self.fc1(x)
        x = self.fc2(x)
        
        return x

In [9]:
def trainer(model, dataloader_train, criterion):
    running_loss = 0

    model.train()

    for idx, (inputs, labels) in enumerate(dataloader_train):
        inputs = inputs.float().permute(0, 2, 1).to(device)
        labels = labels.float().to(device)

        optimizer.zero_grad()
        preds = model(inputs)
        loss = criterion(preds.reshape(-1), labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    train_loss = running_loss / len(dataloader_train)
    
    return train_loss

def validator(model, dataloader_test, criterion):
    running_loss = 0

    model.eval()

    for idx, (inputs, labels) in enumerate(dataloader_test):
        inputs = inputs.float().permute(0, 2, 1).to(device)
        labels = labels.float().to(device)

        preds = model(inputs)
        loss = criterion(preds.reshape(-1), labels)
        
        running_loss += loss.item()

    train_loss = running_loss / len(dataloader_train)
    
    return train_loss

In [10]:
class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps
        
    def forward(self, yhat, y):
        loss = torch.sqrt(self.mse(yhat, y) + self.eps)
        return loss

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleCNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss = RMSELoss()

In [12]:
epochs = 1000

for epoch in range(epochs):
    train_loss = trainer(model, dataloader_train, loss)
    if epoch % 10 == 0:
        print('Validation', validator(model, dataloader_test, loss))
    print('Train', train_loss)

Validation 17.20979735870142
Train 72.1130561766909
Train 69.7252706415271
Train 67.74193376932617
Train 66.14478687396759
Train 65.23106870421567
Train 64.41461335070093
Train 63.49323950004098
Train 62.4588500299872
Train 60.82255411542293
Train 59.935821759537944
Validation 23.175282975652078
Train 59.37668974394425
Train 58.68498785233515
Train 58.496706160343784
Train 57.997851409644724
Train 57.63929932439696
Train 57.56376549967924
Train 56.88487082861209
Train 56.65983217324737
Train 56.54654724284267
Train 56.241874590612504
Validation 19.869856063193506
Train 55.86395411230865
Train 55.55405891378692
Train 55.43719615051851
Train 55.2940942348978
Train 55.09492959006067
Train 54.82911273697826
Train 54.549565406603236
Train 54.6026450374159
Train 54.46109511704859
Train 54.049689936003695
Validation 18.01911223594616
Train 54.03979208082025
Train 53.906990641231246
Train 53.595165890039596
Train 53.801030064640415
Train 53.59783404255668
Train 53.463068065927835
Train 53.2225

KeyboardInterrupt: 