In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
#Solution
import pandas as pd
import numpy as np

In [5]:
file_url = 'https://code.datasciencedojo.com/datasciencedojo/datasets/raw/master/Beijing%20PM2.5/PRSA_data_2010.1.1-2014.12.31.csv'

In [6]:
df = pd.read_csv(file_url)

In [7]:
df.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [8]:
df.shape

(43824, 13)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43824 entries, 0 to 43823
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No      43824 non-null  int64  
 1   year    43824 non-null  int64  
 2   month   43824 non-null  int64  
 3   day     43824 non-null  int64  
 4   hour    43824 non-null  int64  
 5   pm2.5   41757 non-null  float64
 6   DEWP    43824 non-null  int64  
 7   TEMP    43824 non-null  float64
 8   PRES    43824 non-null  float64
 9   cbwd    43824 non-null  object 
 10  Iws     43824 non-null  float64
 11  Is      43824 non-null  int64  
 12  Ir      43824 non-null  int64  
dtypes: float64(4), int64(8), object(1)
memory usage: 4.3+ MB


In [10]:
df.describe()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir
count,43824.0,43824.0,43824.0,43824.0,43824.0,41757.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0
mean,21912.5,2012.0,6.523549,15.72782,11.5,98.613215,1.817246,12.448521,1016.447654,23.88914,0.052734,0.194916
std,12651.043435,1.413842,3.448572,8.799425,6.922266,92.050387,14.43344,12.198613,10.268698,50.010635,0.760375,1.415867
min,1.0,2010.0,1.0,1.0,0.0,0.0,-40.0,-19.0,991.0,0.45,0.0,0.0
25%,10956.75,2011.0,4.0,8.0,5.75,29.0,-10.0,2.0,1008.0,1.79,0.0,0.0
50%,21912.5,2012.0,7.0,16.0,11.5,72.0,2.0,14.0,1016.0,5.37,0.0,0.0
75%,32868.25,2013.0,10.0,23.0,17.25,137.0,15.0,23.0,1025.0,21.91,0.0,0.0
max,43824.0,2014.0,12.0,31.0,23.0,994.0,28.0,42.0,1046.0,585.6,27.0,36.0


In [11]:
df.to_csv('../data/raw/pollution.csv', index=False)

In [12]:
df_cleaned = df.copy()

In [13]:
df_cleaned.drop('No', axis=1, inplace=True)

In [14]:
df_cleaned.dropna(inplace=True)

In [15]:
# Solution
df_cleaned.reset_index(drop=True, inplace=True)

In [16]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [17]:
num_cols = ['year', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir']

In [18]:
sc = StandardScaler()

In [19]:
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [20]:
cat_cols = ['month', 'day', 'hour', 'cbwd']

In [21]:
ohe = OneHotEncoder(sparse=False)

In [22]:
X_cat = pd.DataFrame(ohe.fit_transform(df_cleaned[cat_cols]))

In [23]:
X_cat.columns = ohe.get_feature_names(cat_cols)

In [24]:
df_cleaned.drop(cat_cols, axis=1, inplace=True)

In [25]:
X = pd.concat([df_cleaned, X_cat], axis=1)

In [26]:
from sklearn.model_selection import train_test_split

In [26]:
def split_sets_by_time(df, target_col, test_ratio=0.2):
    """Split sets by indexes for an ordered dataframe

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe
    target_col : str
        Name of the target column
    test_ratio : float
        Ratio used for the validation and testing sets (default: 0.2)

    Returns
    -------
    Numpy Array
        Features for the training set
    Numpy Array
        Target for the training set
    Numpy Array
        Features for the validation set
    Numpy Array
        Target for the validation set
    Numpy Array
        Features for the testing set
    Numpy Array
        Target for the testing set
    """
    
    df_copy = df.copy()
    target = df_copy.pop(target_col)
    cutoff = int(len(target) / 5)
    
    X_train, y_train = subset_x_y(target=target, features=df_copy, start_index=0, end_index=-cutoff*2)
    X_val, y_val     = subset_x_y(target=target, features=df_copy, start_index=-cutoff*2, end_index=-cutoff)
    X_test, y_test   = subset_x_y(target=target, features=df_copy, start_index=-cutoff, end_index=len(target))

    return X_train, y_train, X_val, y_val, X_test, y_test

In [27]:
def save_sets(X_train=None, y_train=None, X_val=None, y_val=None, X_test=None, y_test=None, path='../data/processed/'):
    """Save the different sets locally

    Parameters
    ----------
    X_train: Numpy Array
        Features for the training set
    y_train: Numpy Array
        Target for the training set
    X_val: Numpy Array
        Features for the validation set
    y_val: Numpy Array
        Target for the validation set
    X_test: Numpy Array
        Features for the testing set
    y_test: Numpy Array
        Target for the testing set
    path : str
        Path to the folder where the sets will be saved (default: '../data/processed/')

    Returns
    -------
    """
    import numpy as np

    if X_train is not None:
      np.save(f'{path}X_train', X_train)
    if X_val is not None:
      np.save(f'{path}X_val',   X_val)
    if X_test is not None:
      np.save(f'{path}X_test',  X_test)
    if y_train is not None:
      np.save(f'{path}y_train', y_train)
    if y_val is not None:
      np.save(f'{path}y_val',   y_val)
    if y_test is not None:
      np.save(f'{path}y_test',  y_test)

In [40]:
### use functions
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_by_time(X, target_col='pm2.5', test_ratio=0.2)

NameError: name 'subset_x_y' is not defined

In [31]:
y_train.shape

(29229,)

In [28]:
train_data_target = X.pop('pm2.5')

train_ratio = 0.70
validation_ratio = 0.15
test_ratio = 0.15

# train is now 75% of the entire data set
# the _junk suffix means that we drop that variable completely
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), train_data_target.to_numpy(), test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

KeyError: 'pm2.5'

In [29]:
import pandas as pd
import numpy as np

class NullModel:
    """
    Class used as baseline model for both regression and classification
    ...

    Attributes
    ----------
    target_type : str
        Type of ML problem (default regression)
    y : Numpy Array-like
        Target variable
    pred_value : Float
        Value to be used for prediction
    preds : Numpy Array
        Predicted array

    Methods
    -------
    fit(y)
        Store the input target variable and calculate the predicted value to be used based on the problem type
    predict(y)
        Generate the predictions
    fit_predict(y)
        Perform a fit followed by predict
    """
        
    
    def __init__(self, target_type: str = "regression"):
        self.target_type = target_type
        self.y = None
        self.pred_value = None
        self.preds = None
        
    def fit(self, y):
        self.y = y
        if self.target_type == "regression":
            self.pred_value = y.mean()
        else:
            from scipy.stats import mode
            self.pred_value = mode(y)[0][0]
    
    def predict(self, y):
        self.preds = np.full((len(y), 1), self.pred_value)
        return self.preds
    
    def fit_predict(self, y):
        self.fit(y)
        return self.predict(self.y)

In [30]:
def print_class_perf(y_preds, y_actuals, set_name=None, average='binary'):
    """Print the Accuracy and F1 score for the provided data

    Parameters
    ----------
    y_preds : Numpy Array
        Predicted target
    y_actuals : Numpy Array
        Actual target
    set_name : str
        Name of the set to be printed
    average : str
        Parameter  for F1-score averaging
    Returns
    -------
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score

    print(f"Accuracy {set_name}: {accuracy_score(y_actuals, y_preds)}")
    print(f"F1 {set_name}: {f1_score(y_actuals, y_preds, average=average)}")

In [31]:
def print_reg_perf(y_preds, y_actuals, set_name=None):
    """Print the RMSE and MAE for the provided data

    Parameters
    ----------
    y_preds : Numpy Array
        Predicted target
    y_actuals : Numpy Array
        Actual target
    set_name : str
        Name of the set to be printed

    Returns
    -------
    """
    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import mean_absolute_error as mae
    
    print(f"RMSE {set_name}: {mse(y_actuals, y_preds, squared=False)}")
    print(f"MAE {set_name}: {mae(y_actuals, y_preds)}")

In [41]:
baseline_model = NullModel()
y_base = baseline_model.fit_predict(y_train)

In [42]:
print_reg_perf(y_base, y_train, set_name='Training')

RMSE Training: 91.97430455719193
MAE Training: 68.83353279913013


In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [33]:
class PytorchRegression(nn.Module):
    def __init__(self, num_features):
        super(PytorchRegression, self).__init__()

        self.layer_1 = nn.Linear(num_features, 128)
        self.layer_out = nn.Linear(128, 1)

    def forward(self, x):
        x = F.dropout(F.relu(self.layer_1(x)))
        x = self.layer_out(x)
        return (x)

In [53]:
model = PytorchRegression(X_train.shape[1])

In [61]:
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu') # don't have GPU 
    return device

In [62]:
device = get_device()
model.to(device)

  return torch._C._cuda_getDeviceCount() > 0


PytorchRegression(
  (layer_1): Linear(in_features=78, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=1, bias=True)
)

In [65]:
### awkward fix
device = torch.device('cpu')

In [54]:
print(model)

PytorchRegression(
  (layer_1): Linear(in_features=78, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=1, bias=True)
)


In [46]:
from torch.utils.data import Dataset, DataLoader

In [47]:
class PytorchDataset(Dataset):
    """
    Pytorch dataset
    ...

    Attributes
    ----------
    X_tensor : Pytorch tensor
        Features tensor
    y_tensor : Pytorch tensor
        Target tensor

    Methods
    -------
    __getitem__(index)
        Return features and target for a given index
    __len__
        Return the number of observations
    to_tensor(data)
        Convert Pandas Series to Pytorch tensor
    """
        
    def __init__(self, X, y):
        self.X_tensor = self.to_tensor(X)
        self.y_tensor = self.to_tensor(y)
    
    def __getitem__(self, index):
        return self.X_tensor[index], self.y_tensor[index]
        
    def __len__ (self):
        return len(self.X_tensor)
    
    def to_tensor(self, data):
        return torch.Tensor(np.array(data))

In [48]:
train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [49]:
from torch.utils.data import DataLoader

In [50]:
### Train Model

criterion = nn.MSELoss()

In [55]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [56]:
def train_regression(train_data, model, criterion, optimizer, batch_size, device, scheduler=None, collate_fn=None):
    """Train a Pytorch regresssion model

    Parameters
    ----------
    train_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    optimizer: torch.optim
        Optimizer
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    scheduler : torch.optim.lr_scheduler
        Pytorch Scheduler used for updating learning rate
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        RMSE Score
    """
    
    # Set model to training mode
    model.train()
    train_loss = 0

    # Create data loader
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Reset gradients
        optimizer.zero_grad()
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Make predictions
        output = model(feature)
        
        # Calculate loss for given batch
        loss = criterion(output, target_class)
        
        # Calculate global loss
        train_loss += loss.item()
        
        # Calculate gradients
        loss.backward()
        
        # Update Weights
        optimizer.step()
        
    # Adjust the learning rate
    if scheduler:
        scheduler.step()

    return train_loss / len(train_data), np.sqrt(train_loss / len(train_data))

In [57]:
def test_regression(test_data, model, criterion, batch_size, device, collate_fn=None):
    """Calculate performance of a Pytorch regresssion model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        RMSE Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0

    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            
            # Make predictions
            output = model(feature)
            
            # Calculate loss for given batch
            loss = criterion(output, target_class)
            
            # Calculate global loss
            test_loss += loss.item()
            
    return test_loss / len(test_data), np.sqrt(test_loss / len(test_data))

In [58]:
N_EPOCHS = 5
BATCH_SIZE = 32

In [66]:
for epoch in range(N_EPOCHS):
    train_loss, train_rmse = train_regression(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_rmse = test_regression(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\tLoss: {train_loss:.4f}\t|\tRMSE: {train_rmse:.1f}')
    print(f'\t(valid)\tLoss: {valid_loss:.4f}\t|\tRMSE: {valid_rmse:.1f}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0
	(train)	Loss: 335.1671	|	RMSE: 18.3
	(valid)	Loss: 272.4057	|	RMSE: 16.5


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1
	(train)	Loss: 267.6104	|	RMSE: 16.4
	(valid)	Loss: 272.4510	|	RMSE: 16.5
Epoch: 2
	(train)	Loss: 267.4702	|	RMSE: 16.4
	(valid)	Loss: 271.8549	|	RMSE: 16.5
Epoch: 3
	(train)	Loss: 267.3733	|	RMSE: 16.4
	(valid)	Loss: 272.1152	|	RMSE: 16.5
Epoch: 4
	(train)	Loss: 267.3358	|	RMSE: 16.4
	(valid)	Loss: 271.8675	|	RMSE: 16.5


In [67]:
torch.save(model, "../models/pytorch_reg_pm2_5.pt")

In [68]:
test_loss, test_rmse = test_regression(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tRMSE: {test_rmse:.1f}')

	Loss: 265.7728	|	RMSE: 16.3
