In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import graphviz
import csv
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# set up your dataset folder.
dataset_folder = './data/bank/'

## data preprocessing

In [3]:
if not os.path.exists(dataset_folder):
    print('your folder is not exists')
else:
    with open(dataset_folder + 'bank-full.csv','r',encoding = 'utf-8-sig') as origin_data:
        with open(dataset_folder + 'bank-full-fixed.csv','w',newline='') as output_data:
            spamwriter = csv.writer(output_data)
            spamwriter.writerows([origin_data.readline().rstrip('\n').split(';')])
            for data in origin_data.readlines():
                spamwriter.writerows([data.rstrip('\n').split(';')])

### data encoder

In [4]:
if os.path.exists(dataset_folder+'label_mapping.txt'):
    os.remove(dataset_folder+'label_mapping.txt')
encoder_data = pd.DataFrame()
dataset = pd.read_csv(dataset_folder + 'bank-full-fixed.csv')
for label in dataset.columns:
    if type(dataset[label][1]) != str:
        encoder_data[label] = dataset[label]
    else:
        le = preprocessing.LabelEncoder()
        le.fit(dataset[label])
        encoder_data[label] = le.transform(dataset[label])
        with open(dataset_folder + 'label_mapping.txt','a',encoding='utf-8-sig') as label_mapping:
            label_mapping.write(label+' : '+' , '.join(le.classes_)+'\n')
encoder_data.to_csv(dataset_folder + 'bank-full-fixed-encoder.csv',index=False)

## Decision Trees

In [24]:
dataset = pd.read_csv(dataset_folder + 'bank-full-fixed-encoder.csv')
X = np.array(dataset.iloc[:,:-1])
Y = np.array(dataset['"y"'])
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 20)
# end_of_train = int(dataset.shape[0]*0.8)
# X_train = np.array(dataset.iloc[:end_of_train,:-1])
# X_test = np.array(dataset.iloc[end_of_train:,:-1])
# Y_train = np.array(dataset.iloc[:end_of_train,-1])
# Y_test = np.array(dataset.iloc[end_of_train:,-1])

In [25]:
clf = DecisionTreeClassifier()
# clf = clf.fit(X, Y)
clf.fit(X_train,Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [29]:
number_of_correct = 0
for index,testing_data in enumerate(X_test):
    if clf.predict(testing_data.reshape(1,-1)) == Y_test[index]:
        number_of_correct +=1
print('accuracy ' + str(number_of_correct/len(X_test)))
print('recall ' + str(metrics.recall_score(Y_test,clf.predict(X_test))))
print('precision ' + str(metrics.precision_score(Y_test,clf.predict(X_test))))
print('f1_score ' + str(metrics.f1_score(Y_test,clf.predict(X_test))))

accuracy 0.8763684617936526
recall 0.460362941738
precision 0.465700483092
f1_score 0.463016330451


In [7]:
clf.predict([[58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3]])

array([0], dtype=int64)

In [8]:
clf.predict([[59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3]])

array([1], dtype=int64)

## ANN

In [30]:
dataset = pd.read_csv(dataset_folder + 'bank-full-fixed-encoder.csv')
X = np.array(dataset.iloc[:,:-1])
Y = np.array(dataset['"y"'])

In [31]:
NN = MLPClassifier()
NN.fit(X,Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [32]:
NN.predict([[59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3]])

array([0], dtype=int64)

In [33]:
NN.predict([[5,7,1,1,0,2476,1,0,2,5,8,222579,1,-1,0,3]])

array([0], dtype=int64)

In [34]:
NN.predict([[58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3]])

array([0], dtype=int64)

In [35]:
number_of_correct = 0
for index,testing_data in enumerate(X_test):
    if NN.predict(testing_data.reshape(1,-1)) == Y_test[index]:
        number_of_correct +=1
print('accuracy ' + str(number_of_correct/len(X_test)))
print('recall ' + str(metrics.recall_score(Y_test,NN.predict(X_test))))
print('precision ' + str(metrics.precision_score(Y_test,NN.predict(X_test))))
print('f1_score ' + str(metrics.f1_score(Y_test,NN.predict(X_test))))

accuracy 0.8849939179475838
recall 0.0105062082139
precision 0.733333333333
f1_score 0.0207156308851
