In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data Description

In [None]:
# Load the raw dataset and preview its first rows.
df = pd.read_csv("assign.csv")
df.head()

In [None]:
# Preview the last rows of the raw dataset.
df.tail()

In [None]:
# Column dtypes and non-null counts of the raw dataset.
df.info()

In [None]:
# Summary statistics of the raw dataset.
df.describe()

# Data visualization

In [None]:
# Plot the correlation matrix of the numeric columns.
# The frame also contains string-valued categorical columns (e.g. x2..x8);
# DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them, so restrict the input to numeric dtypes explicitly.
plt.figure(figsize=(10,10))
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='coolwarm', annot=True)

In [None]:
# Point-biserial correlation between the target y and the feature x9.
# NOTE(review): assumes y is a binary numeric column - confirm.
import scipy.stats as stats
stats.pointbiserialr(df["y"], df["x9"])

In [None]:
# Pairwise scatter/distribution grid of the columns, coloured by the target y.
sns.pairplot(data=df, palette='coolwarm', hue='y')

In [None]:
# Distribution of x9 within each x7 category, split by the target y.
sns.boxplot(data=df, x="x7", y="x9", hue="y", palette="coolwarm")

In [None]:
# Joint distribution of x1 and x9, coloured by the target y.
sns.jointplot(data=df, x='x1', y='x9', hue='y', kind='scatter')

In [None]:
# Joint distribution of x10 and x17, coloured by the target y.
sns.jointplot(data=df, x='x10', y='x17', hue='y', kind='scatter')

In [None]:
# Distribution of x13 per x2 category, split by the target y.
plt.figure(figsize=(15, 10))
sns.violinplot(data=df, x='x2', y='x13', hue='y', palette='rainbow')

In [None]:
# Distribution of x16 per x2 category, split by the target y.
plt.figure(figsize=(15, 10))
sns.violinplot(data=df, x='x2', y='x16', hue='y', palette='rainbow')

In [None]:
# Distribution of x17 per x2 category, split by the target y.
plt.figure(figsize=(15, 10))
sns.violinplot(data=df, x='x2', y='x17', hue='y', palette='rainbow')

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot.
df.hist(figsize=(20, 15), bins=50)

In [None]:
# Histogram of x15 using 30 bins.
sns.displot(df['x15'], bins=30)

In [None]:
# Row counts per x3 category, broken down by x8.
sns.countplot(x='x3', hue='x8', data=df)

In [None]:
# Row counts per x2 category, broken down by x4 (wide figure for the many bars).
plt.figure(figsize=(20, 10))
sns.countplot(x='x2', hue='x4', data=df)
plt.show()

In [None]:
# Row counts per x3 category, broken down by x4.
sns.countplot(x='x3', hue='x4', data=df)

# Data Preprocessing
Next we handle missing values. First we need to identify how missing values are represented in the dataset.

In [None]:
# Category frequencies of x2, used to spot labels that stand for missing data.
print(df['x2'].value_counts())

In [None]:
# Category frequencies of x3, used to spot labels that stand for missing data.
print(df['x3'].value_counts())

In [None]:
# Category frequencies of x4, used to spot labels that stand for missing data.
print(df['x4'].value_counts())

In [None]:
# Category frequencies of x5, used to spot labels that stand for missing data.
print(df['x5'].value_counts())

In [None]:
# Category frequencies of x6, used to spot labels that stand for missing data.
print(df['x6'].value_counts())

In [None]:
# Category frequencies of x7, used to spot labels that stand for missing data.
print(df['x7'].value_counts())

In [None]:
# Category frequencies of x8, used to spot labels that stand for missing data.
print(df['x8'].value_counts())

We can now conclude that missing values appear as NaN and as the label "unknown" in the categorical columns. In addition, the numerical column x14 contains NaN values. We therefore tell pandas to treat these values as null. *Note: pandas already treats NaN as null by default; it is listed explicitly for clarity.*

In [None]:
# Re-read the CSV so that the "unknown" placeholder is parsed as missing.
# "NaN" is already recognised by pandas by default; it is listed for explicitness.
# (The assignment below was previously duplicated on two consecutive lines.)
na_values = ['NaN', "unknown"]
data = pd.read_csv("assign.csv", na_values=na_values)

In [None]:
# Number of missing values per column (isna is the canonical name of isnull).
data.isna().sum()

The bar chart below shows the number of null values in each column of the dataset.

In [None]:
# Bar chart of the per-column missing-value counts.
data.isna().sum().plot.bar(figsize=(10, 5))

We will handle the null values and uninformative columns as follows:
- Fill x14 with its median value
- Fill the categorical columns [x2, x3, x4, x5, x6, x8] with their most frequent value
- Drop x11

In [None]:
# Impute missing x14 values with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form emits a FutureWarning on pandas 2.x and
# does not update `data` once Copy-on-Write is active (pandas 3).
median = data['x14'].median()
data['x14'] = data['x14'].fillna(median)

In [None]:
# Most frequent value(s) of each categorical column that contains nulls
# (row 0 holds the first mode of every column).
data.mode().loc[:,['x2', 'x3', 'x4', 'x5', 'x6', 'x8']]

In [None]:
# Fill the nulls of each categorical column with that column's most frequent value.
# The mode is taken from the data rather than hard-coded, so the fill value
# always matches an existing category label exactly. The result is assigned
# back because chained fillna(inplace=True) is deprecated and does not update
# `data` under pandas Copy-on-Write.
for col in ['x2', 'x3', 'x4', 'x5', 'x6', 'x8']:
    modes = data[col].mode()
    if not modes.empty:  # an all-null column has no mode to fill with
        data[col] = data[col].fillna(modes.iloc[0])

In [None]:
# Frequency of each distinct x11 value.
sns.countplot(data=data, x="x11")

In [None]:
# Proportion of (non-null) x11 values that are not 999.
# The previous expression divided the non-999 count by the 999 count, which is
# an odds ratio rather than the "percent of data" these notes describe; the
# mean of the boolean mask is the actual share.
# Almost every row holds 999 (only a few percent differ), so this column
# carries little information for modeling.
data['x11'].dropna().ne(999).mean()

In [None]:
# Drop the near-constant x11 column. Rebinding `data` with errors="ignore"
# keeps the cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
data = data.drop(columns="x11", errors="ignore")

In [None]:
# Frequency of each distinct x12 value.
sns.countplot(data=data, x="x12")

In [None]:
# Proportion of (non-null) x12 values that are not 0.
# The previous expression divided the non-zero count by the zero count (an odds
# ratio), which overstates the share of non-zero rows; the mean of the boolean
# mask is the actual proportion.
# Most rows are 0, so this column may still have some value for modeling;
# dropping it is tried in the fine-tuning section to see whether the model improves.
data['x12'].dropna().ne(0).mean()

Convert categorical data to numerical data using one hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [None]:
# Separate the target from the features and hold out 20% of the rows for testing.
y = data['y']
X = data.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Partition the training features into a categorical and a numerical frame.
X_train_cat = X_train[['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']]
X_train_num = X_train.drop(columns=X_train_cat.columns)

In [None]:
# Baseline preprocessing: standardise numeric columns, one-hot encode categorical ones.
scalar = StandardScaler()
encode = OneHotEncoder()

full_pipeline = ColumnTransformer(transformers=[
    ("scaler", scalar, X_train_num.columns),
    ("encode", encode, X_train_cat.columns),
])
# Fit on the training split only, then apply the fitted transform to the test split.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)

In [None]:
# With the features prepared, inspect the class balance of the target:
# the classes are unbalanced (false far outnumbers true), so the training
# data is resampled in the following cells.
y.value_counts().plot(kind='pie')

### Undersampling
This resampling technique is temporary; more resampling techniques are examined in the fine-tuning section.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Balance the training set by randomly undersampling (sampling_strategy=1
# requests a 1:1 class ratio).
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)

# Confirm the resulting class balance visually.
ax = y_train_resampled.value_counts().plot(kind='pie')
ax.set_title("Random Undersampling")


# Model training and validation

### Model 1 - SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Baseline linear classifier (default hinge loss) trained on the undersampled training set.
sgd_clf = SGDClassifier(random_state=42, tol=1e-3, max_iter=5000)
sgd_clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predict on the prepared (not resampled) test set.
predictions = sgd_clf.predict(X_test_preapred)

In [None]:
# Evaluation against y_test as split, i.e. with its original class imbalance.
from sklearn.metrics import classification_report, confusion_matrix

print("Unresempled test data prediction")
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))


In [None]:
# Evaluate on a class-balanced (randomly undersampled) copy of the test set.
X_test_resampled, y_test_resampled = rus.fit_resample(X_test_preapred, y_test)
sgd_clf_resampled = SGDClassifier(max_iter=5000, tol=1e-3, random_state=42)
sgd_clf_resampled.fit(X_train_resampled, y_train_resampled)
# Predict with the model fitted in this cell; previously the earlier `sgd_clf`
# was used here, which left `sgd_clf_resampled` trained but never evaluated.
predictions = sgd_clf_resampled.predict(X_test_resampled)
print("Resempled test data prediction")
print(classification_report(y_test_resampled, predictions))
print(confusion_matrix(y_test_resampled, predictions))

In [None]:
# Cross-validated hyper-parameter search for the SGD classifier.
# Undersampling sits inside the imblearn pipeline so that it is applied to the
# training folds only. (The cross-validation score was observed to be modest.)
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
sgd_clf = SGDClassifier(max_iter=5000, tol=1e-3, random_state=42)
# NOTE(review): X_train_prepared was already standardised by the
# ColumnTransformer, so the 'scaler' step looks redundant; StandardScaler also
# refuses to centre sparse input - confirm X_train_prepared is dense here.
pipeline = imbpipeline(steps = [['randomUnderSample', RandomUnderSampler(random_state=11)],
                                ['scaler', StandardScaler()],
                                ['classifier', sgd_clf]])
stratified_kfold = StratifiedKFold(n_splits=3,
                                       shuffle=True,
                                       random_state=11)
params = {
    "classifier__loss" : ["hinge", "log_loss", "squared_hinge", "modified_huber", "perceptron"],
    "classifier__alpha" : [0.0001, 0.001, 0.01, 0.1],
    # "No penalty" is spelled None: current scikit-learn rejects the string
    # "none" for SGDClassifier, which made every such candidate fail to fit.
    "classifier__penalty" : ["l2", "l1", "elasticnet", None],
}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=params,
                           scoring='f1',
                           cv=stratified_kfold,
                           n_jobs=-1)
grid_search.fit(X_train_prepared, y_train)
cv_score = grid_search.best_score_
# Score the refitted best pipeline on the untouched (imbalanced) test set.
test_score = grid_search.score(X_test_preapred, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print(f'Best parameters: {grid_search.best_params_}')

#### Model 1 fine tuning and testing

According to the visualizations above, x10 and x13 are important features that
clearly separate the true and false classes.

As mentioned in the data preprocessing section, we can fine-tune the model by removing x12.

Removing x12 and applying polynomial features to x10 and x13 clearly improves precision, and the f1-score after fine-tuning improves as well.

In [None]:
# Fine-tuned feature set: drop x12 and the categorical columns x5-x8, and add
# degree-2 polynomial terms for x10, x13 and x16.
poly = PolynomialFeatures(degree=2)
scalar = StandardScaler()
encode = OneHotEncoder()
# Fine tune the data before feed to model
X_train_num_copy = X_train_num.drop("x12", axis=1)
X_train_cat_copy = X_train_cat.drop(["x5", "x6", "x7", "x8"], axis=1)  # Fine tune

# The transformer must receive the *reduced* column lists. It was previously
# given X_train_num.columns / X_train_cat.columns, so none of the drops above
# reached the model. Columns not listed here are discarded by
# ColumnTransformer's default remainder="drop".
full_pipeline = ColumnTransformer([
        # polynomial transformation
        ("poly", poly, ['x10', 'x13', 'x16']), # Fine tune
        ("scaler", scalar, X_train_num_copy.columns),
        ("encode", encode, X_train_cat_copy.columns),
    ])
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)

# Now we need to resample the data to balance the data
rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)
# A balanced copy of the test set, used only for the second report below.
X_test_resampled, y_test_resampled = rus.fit_resample(X_test_preapred, y_test)

# Train the model
sgd_clf = SGDClassifier(max_iter=5000, tol=1e-3, random_state=42)
sgd_clf.fit(X_train_resampled, y_train_resampled)

# Test the model on unresempled data
predictions = sgd_clf.predict(X_test_preapred)
print("Unresempled test data prediction")
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

# Test the model on resempled data
predictions_resempled = sgd_clf.predict(X_test_resampled)
print("Resempled test data prediction")
print(classification_report(y_test_resampled, predictions_resempled))
print(confusion_matrix(y_test_resampled, predictions_resempled))

In [None]:
# Perform cross validation again with the fine-tuned column transform.
poly = PolynomialFeatures(degree=2)
scalar = StandardScaler()
encode = OneHotEncoder()
# Fine tune the data before feed to cross validation model
X_train_num_copy = X_train_num.drop("x12", axis=1)
X_train_cat_copy = X_train_cat.drop(["x5", "x6", "x7", "x8"], axis=1)  # Fine tune
# Pass the reduced column lists: the transformer previously received the
# original X_train_num / X_train_cat columns, so the drops above had no effect.
column_transformer_pipeline = ColumnTransformer([
        # polynomial transformation
        ("poly", poly, ['x10', 'x13', 'x16']), # Fine tune
        ("scaler", scalar, X_train_num_copy.columns),
        ("encode", encode, X_train_cat_copy.columns),
    ])
X_train_prepared = column_transformer_pipeline.fit_transform(X_train)
X_test_preapred = column_transformer_pipeline.transform(X_test)

# Undersampling lives inside the pipeline so it only touches the training folds.
pipeline = imbpipeline(steps = [['randomUnderSample', RandomUnderSampler(random_state=11)],
                                ['classifier', sgd_clf]])
stratified_kfold = StratifiedKFold(n_splits=3,
                                       shuffle=True,
                                       random_state=11)
params = {
    "classifier__loss" : ["hinge", "log_loss", "squared_hinge", "modified_huber", "perceptron"],
    "classifier__alpha" : [0.0001, 0.001, 0.01, 0.1],
    # "No penalty" is spelled None: current scikit-learn rejects the string
    # "none" for SGDClassifier.
    "classifier__penalty" : ["l2", "l1", "elasticnet", None],
}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=params,
                           scoring='f1',
                           cv=stratified_kfold,
                           n_jobs=-1)
grid_search.fit(X_train_prepared, y_train)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test_preapred, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print(f'Best parameters: {grid_search.best_params_}')

##### Attempt oversampling method

In [None]:
# Same fine-tuned feature set as the undersampling experiment, but the classes
# are balanced by SMOTE oversampling instead.
from imblearn.over_sampling import SMOTE
poly = PolynomialFeatures(degree=2)
scalar = StandardScaler()
encode = OneHotEncoder()
# Fine tune the data before feed to model
X_train_num_copy = X_train_num.drop("x12", axis=1)
X_train_cat_copy = X_train_cat.drop(["x5", "x6", "x7", "x8"], axis=1)  # Fine tune
# Pass the reduced column lists: the transformer previously received the
# original X_train_num / X_train_cat columns, so the drops above had no effect.
full_pipeline = ColumnTransformer([
        # polynomial transformation
        ("poly", poly, ['x10', 'x13', 'x16']), # Fine tune
        ("scaler", scalar, X_train_num_copy.columns),
        ("encode", encode, X_train_cat_copy.columns),
    ])
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)

# Now we need to resample the data to balance the data
smote = SMOTE(random_state=42, sampling_strategy=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_prepared, y_train)
# NOTE(review): oversampling the test split adds synthetic rows to it, so the
# "resampled" report below is not a measurement on real held-out data.
X_test_resampled, y_test_resampled = smote.fit_resample(X_test_preapred, y_test)

# Train the model
sgd_clf = SGDClassifier(max_iter=5000, tol=1e-3, random_state=42)
sgd_clf.fit(X_train_resampled, y_train_resampled)

# Test the model on unresempled data
predictions = sgd_clf.predict(X_test_preapred)
print("Unresempled test data prediction")
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

# Test the model on resempled data
predictions_resempled = sgd_clf.predict(X_test_resampled)
print("Resempled test data prediction")
print(classification_report(y_test_resampled, predictions_resempled))
print(confusion_matrix(y_test_resampled, predictions_resempled))

In [None]:
# Hyper-parameter search with SMOTE applied inside the pipeline, so that
# synthetic samples are generated from the training folds only.
pipeline = imbpipeline(steps = [['SMOTE', SMOTE(random_state=11)],
                                ['classifier', sgd_clf]])
stratified_kfold = StratifiedKFold(n_splits=3,
                                       shuffle=True,
                                       random_state=11)
params = {
    "classifier__loss" : ["hinge", "log_loss", "squared_hinge", "modified_huber", "perceptron"],
    "classifier__alpha" : [0.0001, 0.001, 0.01, 0.1],
    # "No penalty" is spelled None: current scikit-learn rejects the string
    # "none" for SGDClassifier.
    "classifier__penalty" : ["l2", "l1", "elasticnet", None],
}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=params,
                           scoring='f1',
                           cv=stratified_kfold,
                           n_jobs=-1)
grid_search.fit(X_train_prepared, y_train)
cv_score = grid_search.best_score_
# Score on the untouched test split, as the undersampling searches do. Scoring
# on the SMOTE-augmented test set (as before) mixes synthetic rows into the
# evaluation and makes the result incomparable with the other experiments.
test_score = grid_search.score(X_test_preapred, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
print(f'Best parameters: {grid_search.best_params_}')

In [None]:
# ROC / DET comparison of three SGDClassifier configurations:
# - SGDClassifier_beforefinetune: best parameters found before fine tuning
#   (column selection and modification)
# - SGDclassifier_v1: best parameters from the undersampling search
# - SGDclassifier_v2: best parameters from the oversampling search
# The author's conclusion: oversampling is better than undersampling in this
# case, since it covers most of the data.
# NOTE(review): all three are refit here on the X_train_prepared / y_train
# currently in scope (the fine-tuned features, no resampling) - confirm this is
# the intended comparison.
from sklearn.metrics import DetCurveDisplay, RocCurveDisplay
classifiers = {
    # penalty=None (not the string "none", which current scikit-learn rejects
    # with an error at fit time) disables regularisation.
    "SGDClassifier_beforefinetune": SGDClassifier(max_iter=5000, tol=1e-3, random_state=42, alpha=0.01, loss="log_loss", penalty=None),
    "SGDclassifier_v1":  SGDClassifier(max_iter=5000, tol=1e-3, random_state=42, alpha=0.001, loss="hinge", penalty="l2"),
    "SGDclassifier_v2": SGDClassifier(max_iter=5000, tol=1e-3, random_state=42, alpha=0.01, loss="modified_huber", penalty="elasticnet"),
}
fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))
for name, clf in classifiers.items():
    clf.fit(X_train_prepared, y_train)

    # Draw every classifier on the same pair of axes.
    RocCurveDisplay.from_estimator(clf, X_test_preapred, y_test, ax=ax_roc, name=name)
    DetCurveDisplay.from_estimator(clf, X_test_preapred, y_test, ax=ax_det, name=name)
ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
ax_det.set_title("Detection Error Tradeoff (DET) curves")

ax_roc.grid(linestyle="--")
ax_det.grid(linestyle="--")


### Model 2 - Logistic Regression

In [None]:
# Rebuild the train/test split and the baseline preprocessing for model 2.
y = data['y']
X = data.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Partition the training features into a categorical and a numerical frame.
X_train_cat = X_train[['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']]
X_train_num = X_train.drop(columns=X_train_cat.columns)
scalar = StandardScaler()
encode = OneHotEncoder()

full_pipeline = ColumnTransformer(transformers=[
    ("scaler", scalar, X_train_num.columns),
    ("encode", encode, X_train_cat.columns),
])
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)
# Balance the training classes 1:1 by random undersampling.
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression on the undersampled training set; the raised
# iteration cap gives the solver room to converge.
logreg = LogisticRegression(max_iter=5000)
logreg.fit(X=X_train_resampled, y=y_train_resampled)

In [None]:
# Predict on the prepared (not resampled) test set.
y_pred = logreg.predict(X_test_preapred)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / f1 of the baseline logistic regression.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline logistic regression on the test set.
print(confusion_matrix(y_test, y_pred))

#### Model 2 Fine Tune and Testing

In [None]:
# Use GridSearchCV to find the best hyper-parameters for logistic regression.
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
# "No penalty" is spelled None on scikit-learn >= 1.2; the string 'none' has
# been removed. Because error_score=0 is set below, candidates using the
# rejected spelling did not raise - they were silently scored as 0.
penalty = ['l2', None]
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers,penalty=penalty,C=c_values)
# 10-fold stratified CV repeated 3 times on the undersampled training set.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train_resampled, y_train_resampled)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Per-candidate results, kept for inspection.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

In [None]:
# Oversample the training set with SMOTE, then fit the tuned logistic regression.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_prepared, y_train)

logreg = LogisticRegression(penalty='l2', C=10, solver='newton-cg', max_iter=5000)
logreg.fit(X_res, y_res)

# Evaluate on the untouched test split.
y_pred = logreg.predict(X_test_preapred)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

### Model 3 - KNeighborsClassifier

In [None]:
# Rebuild the train/test split and the baseline preprocessing for model 3.
y = data['y']
X = data.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Partition the training features into a categorical and a numerical frame.
X_train_cat = X_train[['x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']]
X_train_num = X_train.drop(columns=X_train_cat.columns)
scalar = StandardScaler()
encode = OneHotEncoder()

full_pipeline = ColumnTransformer(transformers=[
    ("scaler", scalar, X_train_num.columns),
    ("encode", encode, X_train_cat.columns),
])
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)
# Balance the training classes 1:1 by random undersampling.
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline 3-nearest-neighbour classifier, fitted on the prepared training
# data before any resampling.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X=X_train_prepared, y=y_train)

In [None]:
# Predict on the prepared test set with the baseline KNN.
y_pred = neigh.predict(X_test_preapred)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / f1 of the baseline KNN.
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline KNN on the test set.
print(confusion_matrix(y_test, y_pred))

#### Model 3 Fine Tuning and Testing

In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

# Undersample the training set with Edited Nearest Neighbours (default settings).
enn = EditedNearestNeighbours()
X_train_resampled, y_train_resampled = enn.fit_resample(X_train_prepared, y_train)

# Show the class balance after resampling.
ax = y_train_resampled.value_counts().plot(kind='pie')
ax.set_title("undersampling")

In [None]:
# Same 3-NN configuration as the baseline, trained on the ENN-resampled data.
neigh2 = KNeighborsClassifier(n_neighbors=3).fit(X_train_resampled, y_train_resampled)
y_pred2 = neigh2.predict(X_test_preapred)

print("Before fine-tuning:")

# Baseline model trained without resampling (classification_report(real, predicted)).
print("Performance before resampling:")
print(classification_report(y_test, y_pred))

# Baseline model trained on the resampled data.
print("Performance after resampling:")
print(classification_report(y_test, y_pred2))

Fine-tuning: Finding the best hyperparameter for the model (with data before resampling)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned hyper-parameter.
leaf_size = list(range(1, 15))
n_neighbors = list(range(1, 6))
p = [1, 2]
hyperparameters = {"leaf_size": leaf_size, "n_neighbors": n_neighbors, "p": p}

# 10-fold grid search over a fresh KNN, fitted on the non-resampled training data.
neigh2 = KNeighborsClassifier()
clf = GridSearchCV(estimator=neigh2, param_grid=hyperparameters, cv=10)
best_model = clf.fit(X_train_prepared, y_train)

# Report the winning value of each hyper-parameter.
for label, key in (('Best leaf_size:', 'leaf_size'), ('Best p:', 'p'), ('Best n_neighbors:', 'n_neighbors')):
    print(label, best_model.best_estimator_.get_params()[key])

Fine-tuning: Finding the best hyperparameter for the model (with data after resampling)

In [None]:
# Candidate values for each tuned hyper-parameter.
leaf_size = list(range(1, 15))
n_neighbors = list(range(1, 6))
p = [1, 2]
hyperparameters = {"leaf_size": leaf_size, "n_neighbors": n_neighbors, "p": p}

# 10-fold grid search over a fresh KNN, fitted on the resampled training data.
neigh2 = KNeighborsClassifier()
clf = GridSearchCV(estimator=neigh2, param_grid=hyperparameters, cv=10)
best_model = clf.fit(X_train_resampled, y_train_resampled)

# Report the winning value of each hyper-parameter.
for label, key in (('Best leaf_size:', 'leaf_size'), ('Best p:', 'p'), ('Best n_neighbors:', 'n_neighbors')):
    print(label, best_model.best_estimator_.get_params()[key])

Checking the performance of Model 3 (after fine-tuning) by using the best hyperparameter found:<br>
A. Using best hyperparameter found with data before resampling

In [None]:
# Hyper-parameter set 1 (found on the non-resampled data), trained once on the
# original training data and once on the resampled training data.
best_neigh1_bef_res = KNeighborsClassifier(leaf_size=1, p=2, n_neighbors=5).fit(X_train_prepared, y_train)
best_neigh1_res = KNeighborsClassifier(leaf_size=1, p=2, n_neighbors=5).fit(X_train_resampled, y_train_resampled)

# Both models are evaluated on the same prepared test set.
best_neigh1_bef_res_pred = best_neigh1_bef_res.predict(X_test_preapred)
best_neigh1_res_pred = best_neigh1_res.predict(X_test_preapred)

print("After fine-tuning:")

# Fine-tuned model trained without resampling.
print("Performance before resampling:")
print(classification_report(y_test, best_neigh1_bef_res_pred))

# Fine-tuned model trained on the resampled data.
print("Performance after resampling:")
print(classification_report(y_test, best_neigh1_res_pred))

B. Using best hyperparameter found with data after resampling

In [None]:
# Hyper-parameter set 2 (found on the resampled data), trained once on the
# original training data and once on the resampled training data.
best_neigh2_bef_res = KNeighborsClassifier(leaf_size=1, p=2, n_neighbors=1).fit(X_train_prepared, y_train)
best_neigh2_res = KNeighborsClassifier(leaf_size=1, p=2, n_neighbors=1).fit(X_train_resampled, y_train_resampled)

# Both models are evaluated on the same prepared test set.
best_neigh2_bef_res_pred = best_neigh2_bef_res.predict(X_test_preapred)
best_neigh2_res_pred = best_neigh2_res.predict(X_test_preapred)

print("After fine-tuning:")

# Fine-tuned model trained without resampling.
print("Performance before resampling:")
print(classification_report(y_test, best_neigh2_bef_res_pred))

# Fine-tuned model trained on the resampled data.
print("Performance after resampling:")
print(classification_report(y_test, best_neigh2_res_pred))

roc_auc_score with the default and the best hyperparameter sets 1 & 2, each with original and resampled data:

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of every KNN variant on the test set. Each score is computed once
# and reused for the difference (previously every score was computed twice),
# and the printed label now spells the metric name correctly (roc_auc_score).
# Note: the scores are computed from hard class predictions rather than
# probabilities, so each value equals the balanced accuracy of that model.
auc_base = roc_auc_score(y_test, y_pred)
auc_base_res = roc_auc_score(y_test, y_pred2)
print("Before fine-tuning:")
print('roc_auc_score before resampling =', auc_base)
print('roc_auc_score after resampling =', auc_base_res)
print('Difference =', auc_base_res - auc_base)

auc_tuned1 = roc_auc_score(y_test, best_neigh1_bef_res_pred)
auc_tuned1_res = roc_auc_score(y_test, best_neigh1_res_pred)
print('')
print("After fine-tuning with hyperparameters found using non-resampled data:")
print('roc_auc_score before resampling (fine-tuned) =', auc_tuned1)
print('roc_auc_score after resampling (fine-tuned) =', auc_tuned1_res)
print('Difference =', auc_tuned1_res - auc_tuned1)

auc_tuned2 = roc_auc_score(y_test, best_neigh2_bef_res_pred)
auc_tuned2_res = roc_auc_score(y_test, best_neigh2_res_pred)
print('')
print("After fine-tuning with hyperparameters found using resampled (undersampled) data:")
print('roc_auc_score before resampling (fine-tuned) =', auc_tuned2)
print('roc_auc_score after resampling (fine-tuned) =', auc_tuned2_res)
print('Difference =', auc_tuned2_res - auc_tuned2)

**Result**
1. Before fine-tuning the model:<br>
    (a) The roc_auc_score using non-resampled data is higher than 2(a) but lower than 3(a).<br>
    (b) The roc_auc_score using resampled data is higher than 3(b) but lower than 2(b).<br>
    (c) The improvement in roc_auc_score is slightly higher than the improvement in 3(c).<br>
    <br>
2. After fine-tuning the model with hyperparameters found using non-resampled data:<br>
    (a) The roc_auc_score using non-resampled data is the lowest.<br>
    (b) The roc_auc_score using resampled data is the highest.<br>
    (c) The improvement in roc_auc_score is the largest.<br>
    <br>
3. After fine-tuning the model with hyperparameters found using resampled (undersampled) data:<br>
    (a) The roc_auc_score using non-resampled data is the highest.<br>
    (b) The roc_auc_score using resampled data is the lowest.<br>
    (c) The improvement in roc_auc_score is the smallest.

##### Performance Comparison between Models

In [None]:
# Rebuild the split and the baseline preprocessing for the model comparison,
# so the comparison does not depend on state left behind by the model 3 cells.
X = data.drop('y', axis=1)
y = data['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split train data to categorical and numerical
X_train_num = X_train.drop(['x2', 'x3', 'x4', 'x5', 'x6', 'x7' ,'x8'], axis=1)
X_train_cat = X_train[['x2', 'x3', 'x4', 'x5', 'x6', 'x7' ,'x8']]
scalar = StandardScaler()
encode = OneHotEncoder()

# Standardise numeric columns and one-hot encode categorical columns.
full_pipeline = ColumnTransformer([
        ("scaler", scalar, X_train_num.columns),
        ("encode", encode, X_train_cat.columns),
    ])
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_preapred = full_pipeline.transform(X_test)
# Balanced (randomly undersampled) copy of the training set.
rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train_prepared, y_train)

In [None]:
# Compare the three model families on ROC and DET curves, each configured with
# hyper-parameters hard-coded from the earlier tuning cells.
# Every classifier is refit here on X_train_prepared / y_train and evaluated on
# X_test_preapred / y_test.
from sklearn.metrics import DetCurveDisplay, RocCurveDisplay
classifiers = {
    "SGDclassifier": SGDClassifier(max_iter=5000, tol=1e-3, random_state=42, alpha=0.01, loss="modified_huber", penalty="elasticnet"),
    "LogisticRegression": LogisticRegression(C=10, solver='newton-cg', penalty='l2', max_iter=5000),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=1, p=2, leaf_size=1),
}
fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))
for name, clf in classifiers.items():
    clf.fit(X_train_prepared, y_train)

    # Draw every classifier on the same pair of axes.
    RocCurveDisplay.from_estimator(clf, X_test_preapred, y_test, ax=ax_roc, name=name)
    DetCurveDisplay.from_estimator(clf, X_test_preapred, y_test, ax=ax_det, name=name)
ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
ax_det.set_title("Detection Error Tradeoff (DET) curves")

ax_roc.grid(linestyle="--")
ax_det.grid(linestyle="--")