In [None]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
import matplotlib.pylab as plt
import seaborn as sns

In [None]:
df=pd.read_csv('../data/processed_train_filled.csv')

In [None]:
# filter subscribed
# df_prev_success = df_no_unknown[df_no_unknown['poutcome'] == 'success']
# df_prev_success = df[df['subscribe'] == 'yes']

In [None]:
sns.displot(df['duration'])

In [None]:
sns.displot(df['campaign'])

In [None]:
sns.displot(df['pdays'])

In [None]:
sns.displot(df['previous'])

In [None]:
# Group the duration by duration <= 1000 and duration > 1000
# Group the campaign by campaign <= 10 and campaign > 10
# Group the pdays by pdays > 900 and pdays <= 900

In [None]:
cust_features_transform = ['job', 'education', 'marital', 'contact']
cust_features_map = ['default', 'housing', 'loan', 'subscribe']

In [None]:
df['duration_gp'] = df['duration'].apply(lambda x: 1 if x <= 1000 else 0)
df['campaign_gp'] = df['campaign'].apply(lambda x: 1 if x <= 10 else 0)
df['pdays_gp'] = df['pdays'].apply(lambda x: 1 if x > 900 else 0)
for feature in cust_features_transform:
    encoder = OneHotEncoder(sparse_output=False)
    feature_df = pd.DataFrame(encoder.fit_transform(df[[feature]]),
                              columns=encoder.get_feature_names_out([feature])).reset_index(drop=True)
    df = pd.concat([df, feature_df], axis=1)
for feature in cust_features_map:
    feature_df = df[feature].map({'yes':1, 'no':0})
    df = pd.concat([df, feature_df], axis=1)

In [None]:
X = df.iloc[:,22:-1]
y = df.iloc[:,-1]
print(X)
print(y)

In [None]:
for k in range(31, 60):
    # model = KNeighborsClassifier(n_neighbors=k, weights='distance')
    # model.fit(X, y)
    # accuracy = cross_val_score(model,X,y,cv=10).mean()
    steps = [('smote', SMOTE(random_state=99)), ('knn', KNeighborsClassifier(n_neighbors=k, weights='distance'))]
    pipeline = Pipeline(steps)
    scores = cross_val_score(pipeline, X, y, cv=10, scoring='f1_macro').mean()
    print("When k=%d, accuracy=%.5f" % (k, scores))

In [98]:
model_best = KNeighborsClassifier(n_neighbors=26,weights='distance')
model_best.fit(X,y)
y_pred_knn = cross_val_predict(model_best,X,y,cv=10)

steps = [('smote', SMOTE(random_state=99)), ('knn', KNeighborsClassifier(n_neighbors=39, weights='distance'))]
pipeline = Pipeline(steps)
pipeline.fit(X, y)
y_pred_smote = cross_val_predict(pipeline, X, y, cv=10)

# show a classification report demonstrating the accuracy of the classifier for each digit
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# evaluate result
print("Accuracy:",cross_val_score(model_best,X,y,cv=10).mean())
print("Confusion Matrix:\n", confusion_matrix(y_pred_knn, y))
print("Accuracy:",cross_val_score(pipeline,X,y,cv=10).mean())
print("Confusion Matrix:\n", confusion_matrix(y_pred_smote, y))

Accuracy: 0.8630222222222222
Confusion Matrix:
 [[19249  2783]
 [  299   169]]
Accuracy: 0.7999555555555555
Confusion Matrix:
 [[17005  1958]
 [ 2543   994]]


In [None]:
inertia = []
for nc in range(1,11):
    kmeans = KMeans(n_clusters=nc, random_state=0).fit(one_hot_df)
    inertia.append(kmeans.inertia_)
interias = pd.DataFrame({'n_clusters': range(1, 11), 'inertia': inertia})

plot = interias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of Squared Distances')
plot.legend().set_visible(False)
plt.show()

In [None]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(one_hot_df)
memb = pd.DataFrame(kmeans.labels_, one_hot_df.index, columns=['Cluster'])
utilities_memb = pd.concat([one_hot_df, memb], axis=1)
utilities_memb

In [None]:
# show cluster centroids
centroids = pd.DataFrame(kmeans.cluster_centers_,
                         columns=one_hot_df.columns)
centroids