In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# Load the pre-processed training set. The path is relative to the notebook's
# working directory. NOTE(review): "filled" presumably means missing values
# were already imputed upstream -- confirm against the preprocessing step.
train_csv_path = '../data/processed_train_filled.csv'
df = pd.read_csv(train_csv_path)

In [11]:
# Group the duration by duration <= 1000 and duration > 1000
# Group the campaign by campaign <= 10 and campaign > 10
# Group the pdays by pdays > 900 and pdays <= 900
# Group the age by <35, 35-55, 56-80, >=81

In [12]:
def categorize_age(age):
    """Map an age to an ordinal age-group code.

    Buckets (an age is assigned by its whole-year part):
        0 -> younger than 35
        1 -> 35 to 55
        2 -> 56 to 80
        3 -> 81 and older

    Half-open thresholds are used so that fractional ages (e.g. values
    produced by imputation) stay in the bucket of their whole-year part:
    55.5 is group 1 and 80.5 is group 2. With closed integer ranges such
    values matched no branch and were mis-filed into group 3.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    int
        Group code in {0, 1, 2, 3}. NaN compares false against every
        threshold and therefore falls through to 3.
    """
    if age < 35:
        return 0
    elif age < 56:
        return 1
    elif age < 81:
        return 2
    else:
        return 3

In [13]:
# Turn every raw column into a numeric feature and assemble the design matrix.

# Ordinal age bucket (0-3).
age_gp = df['age'].apply(categorize_age)

# One-hot encode the nominal columns; each produces one float column per level.
(job_gp, education_gp, marital_gp, contact_gp,
 month_gp, day_of_week_gp, poutcome_gp) = (
    pd.get_dummies(df[col], dtype=float)
    for col in ('job', 'education', 'marital', 'contact',
                'month', 'day_of_week', 'poutcome')
)

# Binary yes/no columns -> 1/0, including the target `subscribe`.
# Any value other than 'yes'/'no' is not in the mapping and becomes NaN.
default_gp, housing_gp, loan_gp, subscribe_gp = (
    df[col].map({'yes': 1, 'no': 0})
    for col in ('default', 'housing', 'loan', 'subscribe')
)

# Threshold indicators (1 when the condition holds, otherwise 0).
duration_gp = (df['duration'] <= 1000).astype('int64')
campaign_gp = (df['campaign'] <= 10).astype('int64')
# NOTE(review): pdays > 900 presumably flags a "never contacted" sentinel -- confirm.
pdays_gp = (df['pdays'] > 900).astype('int64')

# `previous` is used as-is.
previous_gp = df['previous']

# Column-wise concatenation; the target is deliberately left out.
feature_blocks = [
    age_gp, job_gp, education_gp, marital_gp,
    default_gp, housing_gp, loan_gp, contact_gp,
    month_gp, day_of_week_gp,
    duration_gp, campaign_gp, pdays_gp, previous_gp, poutcome_gp,
]
training_df = pd.concat(feature_blocks, axis=1)

In [14]:
# Feature matrix and binary target (1 = 'yes', 0 = 'no') used for modelling.
X, y = training_df, subscribe_gp

In [15]:
# Sweep the neighbour count k = 1..19 for a SMOTE + distance-weighted KNN
# pipeline and print the mean macro-F1 over 10 cross-validation folds.
for k in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
    steps = [('smote', SMOTE(random_state=99)), ('knn', knn)]
    pipeline = Pipeline(steps)
    fold_scores = cross_val_score(pipeline, X, y, cv=10, scoring='f1_macro')
    scores = fold_scores.mean()
    print("When k=%d, F1 Score=%.5f" % (k, scores))

When k=1, F1 Score=0.58652
When k=2, F1 Score=0.59006
When k=3, F1 Score=0.59056
When k=4, F1 Score=0.59343
When k=5, F1 Score=0.58921
When k=6, F1 Score=0.59370
When k=7, F1 Score=0.58975
When k=8, F1 Score=0.59502
When k=9, F1 Score=0.58885
When k=10, F1 Score=0.59326
When k=11, F1 Score=0.58581
When k=12, F1 Score=0.58837
When k=13, F1 Score=0.58767
When k=14, F1 Score=0.58891
When k=15, F1 Score=0.58517
When k=16, F1 Score=0.58671
When k=17, F1 Score=0.58859
When k=18, F1 Score=0.58844
When k=19, F1 Score=0.58785


In [16]:
# Compare the selected KNN (k=8, distance-weighted) without and with SMOTE
# oversampling, evaluating both with the same 10-fold cross-validation.


def report_cv_performance(estimator, X, y, cv=10):
    """Print cross-validated accuracy, macro-F1 and the confusion matrix.

    Accuracy and macro-F1 are computed in a single cross-validation pass
    (mean over folds) instead of one full pass per metric. The confusion
    matrix is built from the out-of-fold predictions.

    Parameters
    ----------
    estimator : classifier or pipeline
        Unfitted or fitted estimator; it is only used as a template for CV.
    X, y : feature matrix and target.
    cv : int, default 10
        Number of folds.

    Returns
    -------
    array-like
        Out-of-fold predictions from ``cross_val_predict``.
    """
    cv_results = cross_validate(estimator, X, y, cv=cv,
                                scoring=('accuracy', 'f1_macro'))
    y_pred = cross_val_predict(estimator, X, y, cv=cv)
    print("Accuracy: %.5f" % (cv_results['test_accuracy'].mean()))
    print("F1 Score: %.5f" % (cv_results['test_f1_macro'].mean()))
    print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
    return y_pred


# Plain KNN baseline.
model_best = KNeighborsClassifier(n_neighbors=8, weights='distance')
# Fit on the full data so the model object is usable afterwards; the
# cross-validation helpers work on clones, so this does not affect the scores.
model_best.fit(X, y)

# Same classifier preceded by SMOTE oversampling.
steps = [('smote', SMOTE(random_state=99)), ('knn', KNeighborsClassifier(n_neighbors=8, weights='distance'))]
pipeline = Pipeline(steps)
pipeline.fit(X, y)

# Report the baseline first, then the SMOTE pipeline.
y_pred_knn = report_cv_performance(model_best, X, y)
y_pred_smote = report_cv_performance(pipeline, X, y)

Accuracy: 0.86511
F1 Score: 0.55130
Confusion Matrix:
 [[19140   408]
 [ 2627   325]]
Accuracy: 0.74591
F1 Score: 0.59502
Confusion Matrix:
 [[15258  4290]
 [ 1427  1525]]
