# Oversampling

This notebook has the code for oversampling the data with SMOTE. Nothing from this notebook made it to the final modelling.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline

from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import cross_validate, StratifiedShuffleSplit, GridSearchCV

from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score

from imblearn.over_sampling import SMOTENC

## Load the data

In [3]:
features = pd.read_csv("./../Training/X_train.csv")
labels = pd.read_csv("./../Training/y_train.csv")

In [4]:
print(f"The shape of training feature matrix is {features.shape}")
print(f"The shape of training labels is {labels.shape}")

The shape of training feature matrix is (33050, 44)
The shape of training labels is (33050, 2)


In [5]:
#Sanity check to see if the rows in X and y corresponds to the same entities
assert (features["Unique_ID"] == labels["Unique_ID"]).all()

## Train Test split

We need to set aside a portion of our training data to perform evaluations for selecting the best model. It is best to do it in the outset itself to avoid any data leakage or bias

In [6]:
y = labels["Dependent_Variable"]
X = features.drop("Unique_ID", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y) # Random state is set to ensure reproducibility

In [7]:
print(f"The shape of X_train is {X_train.shape}")
print(f"The shape of y_train is {y_train.shape}")
print()
print(f"The shape of X_test is {X_test.shape}")
print(f"The shape of y_test is {y_test.shape}")

The shape of X_train is (26440, 43)
The shape of y_train is (26440,)

The shape of X_test is (6610, 43)
The shape of y_test is (6610,)


## Helper functions for training and evaluation

In [8]:
def train_model(model, X_train, y_train, cv=5):
    '''Trains the  model
    Args:
        model: Instance of the model to train
        X_train: Training data feature matrix
        y_train: Training data label vector
        cv: Cross validation Scheme
        
    Returns:
        trained model
        cv_results
    '''

    
    cv_results = cross_validate(model, X_train, y_train,
                                cv=cv,
                                scoring='roc_auc', 
                                return_train_score=True)

    mean_train_score = cv_results["train_score"].mean()
    mean_val_score = cv_results["test_score"].mean()
    
    std_train_score = cv_results["train_score"].std()
    std_val_score = cv_results["test_score"].std()

    print(f"Cross validated training results for the model")
    print(f"Train score: {mean_train_score} +/- {std_train_score}" )
    print(f"Validation score: {mean_val_score} +/- {std_val_score}" )

    trained_model = model.fit(X_train, y_train)

    return trained_model, cv_results

def evaluate_model(model, X_test, y_test):

    '''Evaluates the  model
    Args:
        fitted_model: Instance of the model to train
        X_test: Test data feature matrix
        y_test: Test data label vector
        
    Returns:
        metrics: A dictionary containing auc_roc score, accuracy, fpr and tpr
    '''
    y_pred = model.predict(X_test)
    y_scores = model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_scores)
    acc = accuracy_score(y_test, y_pred)
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    print("AUC-ROC score on test set: ", auc)
    print("Accuracy score on test set: ", acc)
    metrics = {"auc_roc": auc, "accuracy": acc, "fpr": fpr, "tpr": tpr}
    
    return metrics

def tune_model(model, param_grid, cv=5):
    '''Do hyper parameter tuning using GridSearch strategy
    
        Args:
            model: Model to be tuned
            param_grid: dict of parameters
            X_train: Feature matrix
            y_train: Label matrix
        
        Returns: 
            best parameters
            best estimator
        '''

    search = GridSearchCV(model, param_grid = param_grid,
                        cv = cv,
                        scoring='roc_auc',
                        return_train_score=True)
    
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    cv_results = search.cv_results_
    
    print("Best parameters: ", search.best_params_)
    
    print("-------------------Best model performance --------------------------")
    
    mean_train_score = search.cv_results_['mean_train_score'][search.best_index_]
    mean_val_score = search.cv_results_['mean_test_score'][search.best_index_]
    std_train_score = search.cv_results_['std_train_score'][search.best_index_]
    std_val_score = search.cv_results_['std_test_score'][search.best_index_]

    print(f"Score of the model on the train set:\n"
         f"{mean_train_score:.3f} +/- {std_train_score:.6f}")

    print(f"Score of the model on the validation set:\n"
        f"{mean_val_score:.3f} +/- {std_val_score:.6f}")
    
    
    
    return best_model, cv_results

def compare_models(model_list, preprocess_pipe, cv):
    '''Compare a list of models defined in model_list and return the auc, fpr and tpr
        Args:
            model_list: List of models to compare. Must be a list of dictionaries with keys;
                            name - name of the model
                            model - an instance of the estimator,
                            param_grid - hyper parameter grid for hp tuning with gridsearch

            preprocess_pipe: Pipeline object for preprocessing
            cv: cross validation scheme

        Returns:
            logged_metrics: A dictionary with model names as keys
    '''
    logged_metrics = {}
    for model in model_list:
        model_name = model["name"]
        print(f"Training {model_name} model...")
        model_pipe = Pipeline([('preprocess', preprocess_pipe),
                                ('model', model["model"])])
        
        print(f"Tuning hyper parameters...")
        
        param_grid = model["param_grid"]
        tuned_model, _ = tune_model(model_pipe, param_grid, cv=cv)
        metrics = evaluate_model(tuned_model, X_test, y_test)

        #store metrics for later use
        logged_metrics[model_name] = metrics

    return logged_metrics


In [9]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

## Preprocessing



In [29]:
# Categorical features
cat_features = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8']

# Numerical features -> N5 - N8 & N20, N27 removed because of high correlation. 
#                       N25 - N32 removed due to large number of missing values

num_features = ['N1', 'N2', 'N3', 'N4', 'N9', 'N10', 'N10.1', 'N11', 
                'N12', 'N14','N15','N16', 'N17', 'N18', 'N19', 'N20',
                 'N21', 'N22', 'N23', 'N24', 'N33', 'N34','N35']

Preprocessing is left the same as before

In [31]:
## Pipeline for categorical features

# select the categorical features from the input
select_cat_features = ColumnTransformer([('select_cat', 'passthrough', cat_features)])

#OneHot encoding
cat_transformers = Pipeline([('selector', select_cat_features),
                            ('onehot', OneHotEncoder(handle_unknown='infrequent_if_exist')),
                            ])

## Pipeline for numerical featurees

# select the numerical variables                            
select_num_features = ColumnTransformer([('select_num', 'passthrough', num_features)])

# Imputing for missing values, Scaling
num_transformers = Pipeline([('selector', select_num_features),
                            ('imputer', SimpleImputer()),
                            ('scaler', StandardScaler()),
                            ])

## combining both pipelines
preprocess_pipe = FeatureUnion([('cat', cat_transformers),
                                ('num', num_transformers),
                                ])

## Fine tuning Random Forest

In [33]:
rf_model = Pipeline([("preprocessing", preprocess_pipe),
                    ("model", RandomForestClassifier(random_state=42))])
trained_rf, _ = train_model(rf_model, X_train, y_train, cv=cv)
metrics = evaluate_model(trained_rf, X_test, y_test)

Cross validated training results for the model
Train score: 1.0 +/- 4.965068306494546e-17
Validation score: 0.7607794270530097 +/- 0.0035139542074323184
AUC-ROC score on test set:  0.7617593974432431
Accuracy score on test set:  0.7428139183055976


In [36]:
param_grid = {'model__min_samples_split':[4,8,10,12], 
            'model__n_estimators':[300, 350, 400, 500]}

best_model, cv_results = tune_model(rf_model, param_grid, cv=cv)


Best parameters:  {'model__min_samples_split': 8, 'model__n_estimators': 500}
-------------------Best model performance --------------------------
Score of the model on the train set:
1.000 +/- 0.000026
Score of the model on the validation set:
0.768 +/- 0.003582


## Oversampling

Eventhough the class imbalance is not severe, it might be worth while to try oversampling with SMOTE

Note: - Only the models with hyperparameters which were found to perform well earlier is used here. The reason why HP tuning was not done after Oversampling is due to the difficulty in incorporating SMOTE into model pipelines. If they are not integrated into pipelines, data leakage will occur while doing cross validation and therefore the results will be unrealiable.

In [38]:
features = cat_features + num_features

smote = SMOTENC(categorical_features=list(range(len(cat_features))))


X_train_selected = X_train[features]
X_test_selected = X_test[features]

imputer = SimpleImputer()

X_train_imputed = imputer.fit_transform(X_train_selected)
X_test_imputed = imputer.transform(X_test_selected)

X_train_os, y_train_os = smote.fit_resample(X_train_imputed, y_train)

In [48]:
X_train_os.shape

(36550, 31)

In [44]:
## Pipeline for categorical features

indices = list(range(len(features)))
# select the categorical features from the input
select_cat_features = ColumnTransformer([('select_cat', 'passthrough', indices[:len(cat_features)])])

#OneHot encoding
cat_transformers = Pipeline([('selector', select_cat_features),
                            ('onehot', OneHotEncoder(handle_unknown='infrequent_if_exist')),
                            ])

## Pipeline for numerical featurees

# select the numerical variables                            
select_num_features = ColumnTransformer([('select_num', 'passthrough', indices[len(cat_features):])])

# Imputing for Scaling
num_transformers = Pipeline([('selector', select_num_features),
                            ('scaler', StandardScaler()),
                            ])

## combining both pipelines
preprocess_pipe = FeatureUnion([('cat', cat_transformers),
                                ('num', num_transformers),
                                ])

### Random Forest

In [50]:
rf_model = Pipeline([("preprocessing", preprocess_pipe),
                    ("model", RandomForestClassifier(n_estimators = 500,
                                                    min_samples_split = 8,
                                                    random_state=42))])

trained_rf = rf_model.fit(X_train_os, y_train_os)
metrics = evaluate_model(trained_rf, X_test_imputed, y_test)

AUC-ROC score on test set:  0.7578638780465548
Accuracy score on test set:  0.7295007564296521


### XGBoost

In [52]:
xgb_model = Pipeline([("preprocessing", preprocess_pipe),
                    ("model", XGBClassifier(n_estimators = 100,
                                            max_depth = 5,
                                            random_state=42))])

trained_xgb = xgb_model.fit(X_train_os, y_train_os)
metrics = evaluate_model(trained_xgb, X_test_imputed, y_test)

AUC-ROC score on test set:  0.7637562706902888
Accuracy score on test set:  0.7413010590015129


Oversampling hasn't improved our models. Therefore we will stick with our old models