In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm

## Load data

In [None]:
# Load the raw dataset; the file is semicolon-delimited, hence sep=";".
df = pd.read_csv("bank-additional/bank-additional-full.csv", sep=";")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

## Transform data

In [None]:
# Drop column "duration"

columns_to_drop = ["duration"]
df = df.drop(columns_to_drop, axis = 1)

# Handle N/A values
# Missing answers are stored as the literal string "unknown": turn them into
# real NaNs first. (np.nan is used because the np.NaN alias no longer exists
# in NumPy 2.0 and raises AttributeError there.)

for column in ["job", "marital", "education", "housing", "loan", "default"]:
    df.loc[df[column] == "unknown", column] = np.nan

# Impute every missing value with the most frequent value of its own column.
df = df.apply(lambda x: x.fillna(x.value_counts().index[0]))
#df.dropna(inplace = True)

# Add dummies for categorical fields
# The yes/no fields (target "y" included) become 0/1 integers ...

for column in ["default", "housing", "loan", "y"]:
    df[column] = 1 * (df[column] == "yes")

# ... and the remaining string columns are one-hot encoded.
df = pd.get_dummies(df)

In [None]:
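# Center and normalize (currently disabled). NOTE: "duration" is dropped in the
# "Transform data" cell, so remove it from the list below before re-enabling.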

#for column in ["age", "pdays", "previous", "emp.var.rate", "cons.price.idx","cons.conf.idx", "euribor3m", "nr.employed", "duration"]:
#    c = df[column]
#    df[column] =  (c - np.mean(c)) / np.sqrt((np.var(c)))

In [None]:
# Inspect the frame after imputation and dummy encoding.
df.head()

In [None]:
# List the resulting column names (the generated dummy columns included).
df.columns

## Data exploration

In [None]:
# Size of the dataset and of each class of the 0/1 target "y".
target = df["y"]
n = target.shape[0]
n_s = target.sum()
n_ns = (1 - target).sum()

for template, count in [
    ("Total number of clients: {}", n),
    ("Subscription: {}", n_s),
    ("No subscription: {}", n_ns),
]:
    print(template.format(count))

In [None]:
column = "age"
colors = ["red", "blue"]

# Overlay the distribution of `column` for each class of the target.
# The loop variable is deliberately not named `y`: the notebook stores the
# target array in a global `y`, and re-running this cell would overwrite it.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True, stat="density") once the seaborn version allows.
for label in [0, 1]:
    ind = np.array(df["y"] == label)
    sns.distplot(df.loc[ind,column], color = colors[label], kde = True, hist=True)

plt.title('Distribution of "{0}" (red: not subscribed | blue: subscribed)'.format(column))
plt.show()

## Dimension reduction

In [None]:
# Feature matrix: every column except the target "y", one row per client.
feature_names = [name for name in df.columns if name != "y"]
X = np.array([df[name] for name in feature_names]).transpose()

In [None]:
# Target vector as a standalone NumPy array (copied out of the frame).
y = df["y"].values.copy()

In [None]:
from sklearn.decomposition import PCA, KernelPCA

# Number of principal components to keep.
n_components = 5
# Alternative kept for reference: polynomial kernel PCA instead of linear PCA.
#pca = KernelPCA(n_components = n_components, kernel="poly", gamma=0.1, degree = 2) 
pca = PCA(n_components = n_components)

In [None]:
# Fit the PCA on the feature matrix and plot the variance carried by each
# component, numbered from 1.
pca.fit(X)
explained_v = pca.explained_variance_
component_rank = np.arange(n_components) + 1

plt.plot(component_rank, explained_v)
plt.title("Explained variance")
plt.show()

In [None]:
# Project the features onto the principal components.
# The input is rebuilt from `df` instead of being read from `X`, so re-running
# this cell never applies PCA on top of an already-reduced matrix.
X = pca.fit_transform(np.array([df[c] for c in df.columns if (c != "y")]).T)

In [None]:
colors = ["green", "red"]

# Scatter the first two columns of X, one colour per target class
# (class 0 drawn first, class 1 on top).
for rating, color in enumerate(colors):
    class_mask = (y == rating)
    plt.scatter(X[class_mask, 0], X[class_mask, 1], color=color)

plt.show()

## Test models

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def scores_clf(X, y, clf, n_splits = 5):
    """Evaluate a classifier with shuffled K-fold cross-validation.

    For every fold the classifier is fitted on the training part, its accuracy
    on the training and on the held-out part is printed, and the held-out
    accuracy is recorded.

    Parameters
    ----------
    X : array-like, one row per sample
        NumPy arrays (or anything supporting ``X[index_array]``) and pandas
        objects are accepted.
    y : array-like
        Target labels aligned with the rows of ``X``.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the model of the last fold.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Mean held-out accuracy over the folds.
    """

    def take_rows(data, index):
        # pandas objects need positional .iloc: plain [] on a DataFrame would
        # select columns, not rows.
        return data.iloc[index] if hasattr(data, "iloc") else data[index]

    # Fixed seed, so every classifier is scored on the same folds.
    kf = KFold(n_splits = n_splits, shuffle = True, random_state = 42)

    # K-Fold cross-validation

    a = []

    for i, (train_index, test_index) in enumerate(kf.split(X), start = 1):

        # Split dataset
        X_train, X_test = take_rows(X, train_index), take_rows(X, test_index)
        y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

        # Train
        clf.fit(X_train, y_train)

        # Predict
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        # Compute score
        a_train = accuracy_score(y_train, y_pred_train)
        a_test = accuracy_score(y_test, y_pred_test)

        print("=> Fold {0}\nTraining accuracy: {1}\nTest accuracy: {2}".format(i, a_train, a_test))

        a.append(a_test)

    a_global = np.mean(a)

    print("*"*50 + "\nGlobal accuracy: {0}\n".format(a_global) + "*"*50)

    return a_global

#### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with inverse regularisation strength C = 1,
# scored with 5-fold cross-validation.
clf = LogisticRegression(C=1.0)

scores_clf(X, y, clf, n_splits=5)

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid over penalty type, regularisation strength and intercept.
parameters = {
    "penalty" : ["l1", "l2"],
    "C" : [0.1, 0.25, 0.5, 0.75, 1., 1.5, 2., 10.],
    "fit_intercept" : [True, False],
    # The grid mixes "l1" and "l2": pin a solver that supports both.
    # scikit-learn's default solver (lbfgs since 0.22) rejects penalty="l1",
    # which makes half of the candidates fail to fit.
    "solver" : ["liblinear"]
}

grid_cv = GridSearchCV(estimator = clf, param_grid = parameters)

grid_cv.fit(X,y).best_estimator_

#### SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with library defaults, 5-fold cross-validation.
clf = SVC()

scores_clf(X, y, clf, n_splits=5)

#### K-Neighbors 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours voting over the 25 closest samples; the mean
# cross-validated accuracy is kept in `a`.
clf = KNeighborsClassifier(n_neighbors=25)

a = scores_clf(X, y, clf, n_splits=5)

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid over the neighbourhood size (5, 10, ..., 25) and the vote weighting.
parameters = {
    "n_neighbors": np.arange(5, 30, 5),
    "weights": ["uniform", "distance"],
}

grid_cv = GridSearchCV(estimator=clf, param_grid=parameters)
grid_cv.fit(X, y)

grid_cv.best_estimator_

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with library defaults. random_state is fixed (same seed as the
# K-fold splitter) so the reported scores are reproducible between runs.
clf = DecisionTreeClassifier(random_state = 42)

scores_clf(X, y, clf)

#### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 25 gini trees. random_state is fixed (same seed as the
# K-fold splitter) so the reported scores are reproducible between runs.
clf = RandomForestClassifier(n_estimators = 25, criterion = "gini", random_state = 42)

scores_clf(X, y, clf, n_splits=5)

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid over the forest size and the split criterion.
parameters = {
    "n_estimators": [25, 50, 100],
    "criterion": ["gini", "entropy"],
}

grid_cv = GridSearchCV(estimator=clf, param_grid=parameters)
grid_cv.fit(X, y)

grid_cv.best_estimator_

#### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers of 100 tanh units. random_state is fixed so that weight
# initialisation, and therefore the reported scores, are reproducible.
clf = MLPClassifier(hidden_layer_sizes=(100,100), activation = "tanh", random_state = 42)

In [None]:
# 5-fold cross-validation of the network; the mean test accuracy is kept in `a`.
a = scores_clf(X, y, clf, n_splits=5)

## Best estimator

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Final model: random forest of 100 entropy trees. random_state is fixed so
# the final accuracy is reproducible between runs.
clf = RandomForestClassifier(n_estimators = 100, criterion = "entropy", random_state = 42)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for the final evaluation. random_state is fixed
# so the same split is drawn on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Fit the final model on the training part of the hold-out split.
clf.fit(X_train, y_train)

In [None]:
# Predict the labels of the held-out samples.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out samples classified correctly.
# NOTE(review): if the two classes are imbalanced, accuracy alone can look good
# for a model that mostly predicts the majority class — consider also
# reporting precision/recall.
accuracy_score(y_test, y_pred)