In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pre
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training data.
train = pd.read_csv('train.csv')
# 'duration' is dropped because if duration = 0, deposit always = no. We wouldn't have this
# variable in a real dataset, as advised by kaggle/UCI.
train = train.drop(columns = 'duration')

# Features are every remaining column except the 'deposit' label.
train_x = train[[i for i in train.columns if i != 'deposit']]

# Encode the label as 1/0. `.map` produces a numeric result directly instead of relying on
# `replace` down-casting an object column (that down-casting is deprecated in recent pandas).
train_y = train['deposit'].map({'yes': 1, 'no': 0})
if train_y.isna().any():
    # Anything that is not exactly 'yes'/'no' (including missing labels) would otherwise
    # reach the classifier as a mixed-type target.
    raise ValueError("train.csv: 'deposit' contains values other than 'yes'/'no'")
train_y = train_y.astype('int64')

In [None]:
# Preview the first rows of the (not yet encoded) feature table.
train_x.head()

In [None]:
# Distribution of client ages.
# `sns.distplot` is deprecated in seaborn; `histplot` with a density-normalised histogram
# and a KDE overlay is the documented equivalent.
plt.figure(num = "Age Plot", figsize =  (10,4))
sns.histplot(train_x['age'], kde = True, stat = "density", kde_kws = dict(cut = 3))
plt.title("Age distribution of clients")
plt.show()
# most clients are mid/late thirties

In [None]:
# Number of clients per job category.
# Labels and bar heights both come from the same value_counts() result so every bar is
# paired with its own category. (unique() is in order of appearance while value_counts()
# is sorted by frequency, so mixing the two puts the wrong label under each bar.)
job_counts = train_x['job'].value_counts()
plt.figure(num = 'job plot', figsize = (14,4))
sns.barplot(x = job_counts.index, y = job_counts.values, palette = "pastel")
plt.xlabel('job')
plt.ylabel('number of clients')
plt.show()
# NOTE(review): the earlier reading of this chart ("the 'office professional' jobs make up
# the vast majority of clients, followed by retirees") was taken from a plot whose labels
# did not line up with the counts - re-check it against this corrected chart.

In [None]:
# Count of clients in each marital status.
train_x['marital'].value_counts()
# more people are married than single and divorced combined

In [None]:
# One-hot encode the categorical feature columns; numeric columns pass through unchanged.
# Any data scored later must end up with exactly these columns.
train_x = pd.get_dummies(data = train_x)

In [None]:
# Random forest with 100 trees; the fixed seed keeps the fitted model repeatable between runs.
forest_params = {'n_estimators': 100, 'random_state': 42}
clf = RandomForestClassifier(**forest_params)

In [None]:
clf.fit(train_x, train_y) # trains the random forest (an ensemble of decision trees) on the encoded training features

### model details

In [None]:
# Pair every encoded feature with its importance from the fitted forest,
# ordered from most to least important.
importances = (
    pd.DataFrame({'feature': train_x.columns, 'importance': clf.feature_importances_})
    .sort_values(by = 'importance', ascending = False)
)

In [None]:
# Bar chart of the feature importances, most important first.
plt.figure('Importances', figsize = (16,4))
sns.barplot(x = importances['feature'], y = importances['importance'])
# Rotate the labels after plotting so the setting is applied to the ticks the bar plot
# actually created, rather than to the empty axes that existed beforehand.
plt.xticks(rotation = 90)
plt.title('Random forest feature importances');
# the most important features are the client's balance, their age, and seemingly the day of the month they were called

### model prediction

In [None]:
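# if this were productionised, saving the fitted model once and re-loading it is all you'd need to re-use it each time
# (only unpickle files you created yourself - loading an untrusted pickle can execute arbitrary code)
# pickle.dump(clf,open('random_forests.pkl',mode = "wb"))
# clf = pickle.load(open('random_forests.pkl',mode = "rb"))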

In [None]:
# Load the held-out data and prepare it the same way as the training data.
test = pd.read_csv('test.csv')
test = test.drop(columns = 'duration')
# Select the same raw feature columns that were used for training.
test_x = test[[i for i in train.columns if i != 'deposit']]

# Encode the label as a 1/0 Series, matching the shape and encoding of train_y.
test_y = test['deposit'].map({'yes': 1, 'no': 0})
if test_y.isna().any():
    raise ValueError("test.csv: 'deposit' contains values other than 'yes'/'no'")
test_y = test_y.astype('int64')

# One-hot encode, then align to the columns the model was fitted on: a category that is
# absent from the test file becomes an all-zero column, a category never seen in training
# is dropped, and the column order matches train_x. Without this, predict() fails (or
# silently misreads features) whenever the two files do not contain identical categories.
test_x = pd.get_dummies(test_x).reindex(columns = train_x.columns, fill_value = 0)

In [None]:
# Predicted class (1 = deposit, 0 = no deposit) for every row of the encoded test features.
predictions = clf.predict(X = test_x)

In [None]:
# Mean accuracy on the training data - an optimistic figure, since the model was fitted on these rows.
clf.score(X = train_x, y = train_y)

In [None]:
# Mean accuracy on the held-out test data - the figure to compare against the training score.
clf.score(X = test_x, y = test_y)