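# Random Oversampling Method

This notebook applies random oversampling (imbalanced-learn's `RandomOverSampler`) to balance the training data, then tunes an MLP classifier on the balanced set and evaluates it on the validation set.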

In [1]:

from google.colab import files


# Prompt for files via Colab's upload helper; the recorded output below shows
# train.csv and val.csv being saved, which the next cell reads by relative path.
uploaded = files.upload()

Saving train.csv to train.csv
Saving val.csv to val.csv


In [1]:
import pandas as pd

def _split_features_label(frame, label='Diabetes_binary'):
    """Return ``(features, target)`` where features are all columns except ``label``."""
    return frame.drop(columns=label), frame[label]


# Training split.
df_train = pd.read_csv('train.csv')
X_train, y_train = _split_features_label(df_train)

# Validation split, separated into features and target the same way.
val_df = pd.read_csv('val.csv')
X_val, y_val = _split_features_label(val_df)

# Relative class frequencies of the training labels before any resampling.
class_shares = y_train.value_counts(normalize=True)
print("Pre-sampling class imbalance:")
print(class_shares)

Pre-sampling class imbalance:
0.0    0.860664
1.0    0.139336
Name: Diabetes_binary, dtype: float64


In [3]:
from imblearn.over_sampling import RandomOverSampler

random_over = RandomOverSampler(random_state=92)
X_random_over, y_random_over= random_over.fit_resample(X_train, y_train)

print("Class distribution after Random under sampling:")
print(y_random_over.value_counts(normalize=True))

%store X_random_over 
%store y_random_over

Class distribution after Random under sampling:
0.0    0.5
1.0    0.5
Name: Diabetes_binary, dtype: float64
Stored 'X_random_over' (DataFrame)
Stored 'y_random_over' (Series)


# MLP #
## Hyperparameter Optimization ##

In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [5]:
# Seed the network so weight initialisation and batch sampling are repeatable;
# unseeded, each run of the search may pick a different "best" configuration.
mlp_gs = MLPClassifier(max_iter=100, random_state=92)

# Candidate single-hidden-layer widths and activation functions (5 x 3 = 15 combinations).
parameter_space = {
    'hidden_layer_sizes': [(10,), (20,), (50,), (100,), (150,)],
    'activation': ['logistic', 'tanh', 'relu'],
}

# Exhaustive 5-fold cross-validated search over the grid, using all available cores.
# NOTE(review): the folds are cut from data that was oversampled beforehand, so copies
# of the same minority row can land in both the fitting and the scoring fold; the CV
# scores are likely optimistic. Consider resampling inside each fold instead
# (e.g. an imblearn Pipeline) - confirm before relying on the CV ranking.
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5)
clf.fit(X_random_over, y_random_over)  # balanced training features and their labels

In [6]:
# Report the hyperparameter combination the grid search ranked best.
best_params = clf.best_params_
print("Best parameters found:\n", best_params)

Best parameters found:
 {'activation': 'relu', 'hidden_layer_sizes': (150,)}


In [7]:
# Predict on the validation split (which was not resampled) with the search's refit best model.
y_pred = clf.predict(X_val)
val_report = classification_report(y_val, y_pred)

# Start the report on its own line so its header row stays aligned with the columns below it.
print(f"results on validation set:\n{val_report}")

results on validation set:               precision    recall  f1-score   support

         0.0       0.95      0.70      0.80     43667
         1.0       0.29      0.77      0.42      7069

    accuracy                           0.71     50736
   macro avg       0.62      0.73      0.61     50736
weighted avg       0.86      0.71      0.75     50736

