In [55]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
import seaborn as sns

In [56]:
df=pd.read_csv('../data/processed_train_filled.csv')

In [57]:
# Group the duration by duration <= 1000 and duration > 1000
# Group the campaign by campaign <= 10 and campaign > 10
# Group the pdays by pdays > 900 and pdays <= 900
# Group the age by <35, 35-55, 56-80, >81

In [58]:
def categorize_age(age):
    if age < 35:
        return 0
    elif 35 <= age <= 55:
        return 1
    elif 56 <= age <= 80:
        return 2
    else:
        return 3

In [59]:
age_gp = df['age'].apply(categorize_age)
job_gp = pd.get_dummies(df['job'], dtype=float)
education_gp = pd.get_dummies(df['education'], dtype=float)
marital_gp = pd.get_dummies(df['marital'], dtype=float)
default_gp = df['default'].map({'yes':1, 'no':0})
housing_gp = df['housing'].map({'yes':1, 'no':0})
loan_gp = df['loan'].map({'yes':1, 'no':0})
contact_gp = pd.get_dummies(df['contact'], dtype=float)
month_gp = pd.get_dummies(df['month'], dtype=float)
day_of_week_gp = pd.get_dummies(df['day_of_week'], dtype=float)
duration_gp = df['duration'].apply(lambda x: 1 if x <= 1000 else 0)
campaign_gp = df['campaign'].apply(lambda x: 1 if x <= 10 else 0)
pdays_gp = df['pdays'].apply(lambda x: 1 if x > 900 else 0)
previous_gp = df['previous']
poutcome_gp = pd.get_dummies(df['poutcome'], dtype=float)
subscribe_gp = df['subscribe'].map({'yes':1, 'no':0})

training_df = pd.concat([age_gp, job_gp, education_gp, marital_gp, default_gp, housing_gp, loan_gp, contact_gp, 
                         month_gp, day_of_week_gp, duration_gp, campaign_gp, pdays_gp, previous_gp, poutcome_gp
                         ], axis=1)

['age', 'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree', 'divorced', 'married', 'single', 'default', 'housing', 'loan', 'cellular', 'telephone', 'apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep', 'fri', 'mon', 'thu', 'tue', 'wed', 'duration', 'campaign', 'pdays', 'previous', 'failure', 'nonexistent', 'success']


In [60]:
X = training_df
y = subscribe_gp

       age  admin.  blue-collar  entrepreneur  housemaid  management  retired  \
0        1     1.0          0.0           0.0        0.0         0.0      0.0   
1        1     0.0          0.0           0.0        0.0         0.0      0.0   
2        1     0.0          1.0           0.0        0.0         0.0      0.0   
3        0     0.0          0.0           1.0        0.0         0.0      0.0   
4        1     1.0          0.0           0.0        0.0         0.0      0.0   
...    ...     ...          ...           ...        ...         ...      ...   
22495    0     1.0          0.0           0.0        0.0         0.0      0.0   
22496    0     1.0          0.0           0.0        0.0         0.0      0.0   
22497    0     1.0          0.0           0.0        0.0         0.0      0.0   
22498    2     0.0          0.0           0.0        0.0         0.0      1.0   
22499    1     0.0          1.0           0.0        0.0         0.0      0.0   

       self-employed  servi

In [64]:
for k in range(1, 20):
    steps = [('smote', SMOTE(random_state=99)), ('knn', KNeighborsClassifier(n_neighbors=k, weights='distance'))]
    pipeline = Pipeline(steps)
    scores = cross_val_score(pipeline, X, y, cv=10, scoring='f1_macro').mean()
    print("When k=%d, F1 Score=%.5f" % (k, scores))

When k=1, F1 Score=0.58810
When k=2, F1 Score=0.59069
When k=3, F1 Score=0.58945
When k=4, F1 Score=0.59118
When k=5, F1 Score=0.58720
When k=6, F1 Score=0.59147
When k=7, F1 Score=0.59164
When k=8, F1 Score=0.59571
When k=9, F1 Score=0.59253
When k=10, F1 Score=0.59295
When k=11, F1 Score=0.58808
When k=12, F1 Score=0.58873
When k=13, F1 Score=0.58816
When k=14, F1 Score=0.58923
When k=15, F1 Score=0.58664
When k=16, F1 Score=0.58580
When k=17, F1 Score=0.58668
When k=18, F1 Score=0.58745
When k=19, F1 Score=0.58812


In [66]:
model_best = KNeighborsClassifier(n_neighbors=8,weights='distance')
model_best.fit(X,y)
y_pred_knn = cross_val_predict(model_best,X,y,cv=10)

steps = [('smote', SMOTE(random_state=99)), ('knn', KNeighborsClassifier(n_neighbors=8, weights='distance'))]
pipeline = Pipeline(steps)
pipeline.fit(X, y)
y_pred_smote = cross_val_predict(pipeline, X, y, cv=10)

# show a classification report demonstrating the accuracy of the classifier for each digit
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# evaluate result
print("Accuracy:",cross_val_score(model_best,X,y,cv=10).mean())
print("F1 Score=%.5f", cross_val_score(model_best, X, y, cv=10, scoring='f1_macro').mean())
print("Confusion Matrix:\n", confusion_matrix(y_pred_knn, y))
print("Accuracy:",cross_val_score(pipeline,X,y,cv=10).mean())
print("F1 Score=%.5f", cross_val_score(pipeline, X, y, cv=10, scoring='f1_macro').mean())
print("Confusion Matrix:\n", confusion_matrix(y_pred_smote, y))

Accuracy: 0.8657333333333334
F1 Score=%.5f 0.5530078325109871
Confusion Matrix:
 [[19149  2622]
 [  399   330]]
Accuracy: 0.7467555555555556
F1 Score=%.5f 0.5957077439900955
Confusion Matrix:
 [[15277  1427]
 [ 4271  1525]]
