In [11]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import metrics

# --- Load and encode the bank dataset ---------------------------------------
# Read the raw CSV and discard the 'id' column so it never ends up among the
# model features.
df = pd.read_csv("data/bankdata.csv")
df = df.drop(columns=['id'])

# Columns stored with object dtype (strings) have to be converted to integer
# codes before the Naive Bayes estimators can consume them.
categorical = [var for var in df.columns if df[var].dtype == 'O']

# Fit one LabelEncoder per column and keep it, so every integer code
# (including the encoded 'pep' target) can be mapped back to its original
# label via encoders[col].inverse_transform(...). Refitting a single shared
# encoder would overwrite its classes_ each time and keep only the last
# column's mapping.
encoders = {}
for var in categorical:
    encoders[var] = LabelEncoder()
    df[var] = encoders[var].fit_transform(df[var])

# Backward-compatible name: this cell previously left `le` fitted on the
# 'region' column, so keep that binding for any code that still refers to it.
le = encoders['region']

# 'pep' is the label to predict; every remaining column is a feature.
target = df['pep']
features = df.drop(columns=['pep'])

# --- Train/test split and Gaussian Naive Bayes baseline ----------------------
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

# Check the split: a bare `x_train.shape, x_test.shape` expression in the
# middle of a cell is silently discarded, so verify the shapes explicitly.
assert len(x_train) + len(x_test) == len(features)
assert x_train.shape[1] == x_test.shape[1]

# Instantiate the GaussianNB model and fit it on the training split.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# Accuracy on the held-out test rows.
y_pred = gnb.predict(x_test)
print('Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

# Accuracy on the rows the model was fitted on; comparing it with the score
# above gives a quick over-/under-fitting check.
y_pred_train = gnb.predict(x_train)
print('Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))

print('================================')

# --- GaussianNB: 10-fold cross-validation and hold-out check -----------------
# Estimate generalisation accuracy with 10-fold cross-validation on the
# training split only, so the test rows stay unseen.
scores = cross_val_score(gnb, x_train, y_train, cv=10, scoring='accuracy')

# Mean accuracy over the 10 folds. Labelled as a cross-validation score so it
# is not confused with the hold-out accuracy printed earlier for this model.
print('Cross-validation accuracy score: {0:0.4f}'.format(scores.mean()))

# Fit on the complete training split, then score on the held-out test rows.
# This figure is TEST-set accuracy (it is computed from x_test / y_test), so it
# must not be reported under a "Training-set" label.
gnb.fit(x_train, y_train)
test_predict = gnb.predict(x_test)
print('Test-set accuracy score: {0:0.4f}'.format(accuracy_score(y_test, test_predict)))

print('================================')

# --- BernoulliNB: 10-fold cross-validation and hold-out check ----------------
# binarize=0.0 thresholds every feature: values greater than 0 become 1 and
# everything else becomes 0 before the model sees them.
# NOTE(review): columns whose values are all positive (age and income look that
# way in the x_train preview) collapse to a constant under this threshold, and
# multi-level codes such as region keep only "code 0 vs. the rest" -- confirm
# this is intended.
bnb = BernoulliNB(binarize=0.0)

# Mean accuracy over 10 cross-validation folds of the training split.
scores = cross_val_score(bnb, x_train, y_train, cv=10, scoring='accuracy')
print('Cross-validation accuracy score: {0:0.4f}'.format(scores.mean()))

# Fit on the complete training split, then score on the held-out test rows.
# The value printed is TEST-set accuracy (x_test / y_test), so it is labelled
# as such rather than "Training-set".
bnb.fit(x_train, y_train)
test_predict = bnb.predict(x_test)
print('Test-set accuracy score: {0:0.4f}'.format(accuracy_score(y_test, test_predict)))

print('================================')

# --- MultinomialNB: 10-fold cross-validation and hold-out check --------------
# NOTE(review): MultinomialNB is documented as suited to discrete, count-like
# features; here it also receives the continuous 'income' column unscaled --
# confirm this is intended.
mnb = MultinomialNB()

# Mean accuracy over 10 cross-validation folds of the training split.
scores = cross_val_score(mnb, x_train, y_train, cv=10, scoring='accuracy')
print('Cross-validation accuracy score: {0:0.4f}'.format(scores.mean()))

# Fit on the complete training split, then score on the held-out test rows.
# The value printed is TEST-set accuracy (x_test / y_test), so it is labelled
# as such rather than "Training-set".
mnb.fit(x_train, y_train)
test_predict = mnb.predict(x_test)
print('Test-set accuracy score: {0:0.4f}'.format(accuracy_score(y_test, test_predict)))

Model accuracy score: 0.5917
Training-set accuracy score: 0.6583
Model accuracy score: 0.6438
Training-set accuracy score: 0.5917
Model accuracy score: 0.5729
Training-set accuracy score: 0.5250
Model accuracy score: 0.5979
Training-set accuracy score: 0.5417


In [12]:
# Display the training feature matrix returned by train_test_split
# (a bare last expression renders the DataFrame as the cell output).
x_train

Unnamed: 0,age,sex,region,income,married,children,car,save_act,current_act,mortgage
118,39,1,3,17270.1,0,0,1,0,0,0
362,56,1,3,29625.1,1,2,0,0,1,0
55,52,0,1,47835.8,0,3,0,1,0,0
90,37,1,3,24814.5,1,1,1,0,1,1
390,56,0,0,44288.3,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...
277,25,1,3,22366.1,1,1,1,1,1,0
9,54,1,3,24212.1,1,2,1,1,1,0
359,52,0,0,22792.3,1,1,1,1,0,0
192,64,0,1,49024.9,1,3,0,1,1,0


In [13]:
# List every column's dtype after label encoding; the encoded columns now
# show up with integer dtypes instead of object.
df.dtypes

age              int64
sex              int32
region           int32
income         float64
married          int32
children         int64
car              int32
save_act         int32
current_act      int32
mortgage         int32
pep              int32
dtype: object

In [14]:
# Same dtype listing as the previous cell (In [13]); df has not changed in
# between, so the output is identical.
df.dtypes

age              int64
sex              int32
region           int32
income         float64
married          int32
children         int64
car              int32
save_act         int32
current_act      int32
mortgage         int32
pep              int32
dtype: object