In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
warnings.filterwarnings('ignore')

In [110]:
train_data = pd.read_csv('Train_v2.csv')
test_data = pd.read_csv('Test_v2.csv')

In [111]:
train_data_original = train_data.copy()
test_data_original = test_data.copy()

In [112]:
train_data.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [113]:
train_data.shape

(23524, 13)

In [114]:
test_data.shape

(10086, 12)

In [115]:
test_data.head()

Unnamed: 0,country,year,uniqueid,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_6056,Urban,Yes,3,30,Male,Head of Household,Married/Living together,Secondary education,Formally employed Government
1,Kenya,2018,uniqueid_6060,Urban,Yes,7,51,Male,Head of Household,Married/Living together,Vocational/Specialised training,Formally employed Private
2,Kenya,2018,uniqueid_6065,Rural,No,3,77,Female,Parent,Married/Living together,No formal education,Remittance Dependent
3,Kenya,2018,uniqueid_6072,Rural,No,6,39,Female,Head of Household,Married/Living together,Primary education,Remittance Dependent
4,Kenya,2018,uniqueid_6073,Urban,No,3,16,Male,Child,Single/Never Married,Secondary education,Remittance Dependent


In [116]:
'''convert the target variable’s categories into 0 and 1 so that we can find its correlation with numerical variables. 
Few models like logistic regression takes only numeric values as input. 
We will replace N with 0 and Y with 1.'''
train_data['bank_account'].replace('No', 0,inplace=True) 
train_data['bank_account'].replace('Yes', 1,inplace=True)

In [117]:
train_data.bank_account.value_counts()

0    20212
1     3312
Name: bank_account, dtype: int64

In [118]:
X=train_data.drop(['year', 'uniqueid', 'bank_account'],axis=1)
y=train_data.bank_account
test_data=test_data.drop(['year', 'uniqueid'],axis=1)


In [119]:
X=pd.get_dummies(X)
train_data = pd.get_dummies(train_data)
test_data=pd.get_dummies(test_data)

In [120]:
X.shape, test_data.shape

((23524, 39), (10086, 39))

In [121]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split, RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, gradient_boosting
import yellowbrick
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16466, 39), (7058, 39), (16466,), (7058,))

In [123]:
import xgboost
from xgboost import XGBClassifier

In [124]:
xgb_model = XGBClassifier(scale_pos_weight=6.15)
xgb_model.fit(X_train, y_train)
y_predicted = xgb_model.predict(X_test)


print('Training accuracy: {}'.format(xgb_model.score(X_train, y_train)))
print('Testing accuracy: {}'.format(xgb_model.score(X_test, y_test)))

Training accuracy: 0.7857403133730111
Testing accuracy: 0.7840748087276849


In [125]:
class_weight = int(y_train.value_counts()[0]/y_train.value_counts()[1])
print(class_weight)

6


In [126]:
print(classification_report(y_test, y_predicted))
pd.crosstab(y_test, y_predicted)

              precision    recall  f1-score   support

           0       0.96      0.78      0.86      6047
           1       0.38      0.79      0.51      1011

    accuracy                           0.78      7058
   macro avg       0.67      0.78      0.69      7058
weighted avg       0.87      0.78      0.81      7058



col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4740,1307
1,217,794


In [128]:
smote = SMOTE()

In [129]:
X_train_smote, y_train_smote = smote.fit_sample(X_train.astype('float'), y_train)

In [130]:
from collections import Counter

print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_smote))

Before SMOTE: Counter({0: 14165, 1: 2301})
After SMOTE: Counter({1: 14165, 0: 14165})


In [131]:
smote_xgb_model = XGBClassifier().fit(X_train_smote, y_train_smote)
y_predicted_smoted = xgb_model.predict(X_test)

In [132]:
accuracy_score(y_test, y_predicted_smoted)


0.7840748087276849

In [133]:
pd.crosstab(y_test, y_predicted_smoted)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4740,1307
1,217,794


In [134]:
accuracy_score(y_test, y_predicted_smoted)


0.7840748087276849

In [135]:
from sklearn.pipeline import make_pipeline

In [151]:
pipe3 = make_pipeline( 
    SelectKBest(k=10), 
    LogisticRegression()
)
pipe3.fit(X_train_smote, y_train_smote)
y_pred3 = pipe3.predict(X_test)

In [152]:
print(pipe3.score(X_train_smote,y_train_smote))
print(pipe3.score(X_test,y_test))

0.7613836921990822
0.8015018418815528


In [153]:
pd.crosstab(y_test, y_pred3)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5001,1046
1,355,656


In [154]:
pipe4 = make_pipeline( 
    SelectKBest(k=10), 
    LinearDiscriminantAnalysis()
)
pipe4.fit(X_train_smote, y_train_smote)
y_pred4 = pipe4.predict(X_test)

In [157]:
pipe5 = make_pipeline( 
    SelectKBest(k=10), 
    RandomForestClassifier()
)
pipe5.fit(X_train_smote, y_train_smote)
y_pred5 = pipe5.predict(X_test)

In [158]:
print(pipe5.score(X_train_smote,y_train_smote))
print(pipe5.score(X_test,y_test))

0.8406636074832333
0.8624256163219042


In [159]:
pd.crosstab(y_test, y_pred5)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5647,400
1,571,440


In [155]:
print(pipe4.score(X_train_smote,y_train_smote))
print(pipe4.score(X_test,y_test))

0.7510412989763502
0.7764239161235478


In [156]:
pd.crosstab(y_test, y_pred4)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4776,1271
1,307,704


In [136]:
pipe1 = make_pipeline( 
    SelectKBest(k=10), 
    KNeighborsClassifier()
)
pipe1.fit(X_train_smote, y_train_smote)
y_pred = pipe1.predict(X_test)

In [137]:
pipe1.score(X_train_smote,y_train_smote)

0.8315213554535827

In [138]:
pipe1.score(X_test,y_test)

0.869226409747804

In [139]:
pd.crosstab(y_test, y_pred)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5842,205
1,718,293


In [147]:
pipe2 = make_pipeline( 
    SelectKBest(k=10), 
    XGBClassifier()
)
pipe2.fit(X_train_smote, y_train_smote)
y_pred2 = pipe2.predict(X_test)

In [148]:
pipe2.score(X_train_smote,y_train_smote)

0.8381927285563008

In [149]:
pipe2.score(X_test,y_test)

0.8515160102011902

In [150]:
pd.crosstab(y_test, y_pred2)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5488,559
1,489,522


In [144]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [145]:
BRF = BalancedRandomForestClassifier(n_estimators=500, random_state=6)
BRF.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=500, random_state=6)

In [146]:
y_BRF_pred = BRF.predict(X_test)


print('Training accuracy: {}'.format(BRF.score(X_train, y_train)))
print('Testing accuracy: {}'.format(BRF.score(X_test, y_test)))

print(classification_report(y_test, y_BRF_pred))
pd.crosstab(y_test, y_BRF_pred)

Training accuracy: 0.8180493137373983
Testing accuracy: 0.7564465854349675
              precision    recall  f1-score   support

           0       0.96      0.75      0.84      6047
           1       0.35      0.79      0.48      1011

    accuracy                           0.76      7058
   macro avg       0.65      0.77      0.66      7058
weighted avg       0.87      0.76      0.79      7058



col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4542,1505
1,214,797


### stratified K fold cross validation

In [197]:
accuracy = []

skf = StratifiedKFold(n_splits=10, random_state=None)
skf.split(X, y)

for train_index, test_index in skf.split(X,y):
    print('Train:', train_index, 'validation:', test_index)
    X1_train, X1_test = X.iloc[train_index], X.iloc[test_index]
    y1_train, y1_test = y.iloc[train_index], y.iloc[test_index]
    xgb_classifier = XGBClassifier()
    xgb_classifier.fit(X1_train, y1_train)
    skf_y_predicdected = xgb_classifier.predict(X1_test)
    score = accuracy_score(skf_y_predicdected, y1_test)
    accuracy.append(score)
print(accuracy)
    
    
    
    
    

Train: [ 1340  1342  1344 ... 23521 23522 23523] validation: [   0    1    2 ... 2715 2716 2717]
Train: [    0     1     2 ... 23521 23522 23523] validation: [1340 1342 1344 ... 5413 5414 5416]
Train: [    0     1     2 ... 23521 23522 23523] validation: [2616 2620 2630 ... 7787 7789 7790]
Train: [    0     1     2 ... 23521 23522 23523] validation: [ 3920  3925  3927 ... 10085 10086 10087]
Train: [    0     1     2 ... 23521 23522 23523] validation: [ 5214  5215  5220 ... 12367 12368 12370]
Train: [    0     1     2 ... 23521 23522 23523] validation: [ 7167  7169  7191 ... 14649 14650 14651]
Train: [    0     1     2 ... 23521 23522 23523] validation: [10006 10011 10013 ... 16861 16862 16863]
Train: [    0     1     2 ... 23521 23522 23523] validation: [12932 12943 12948 ... 19091 19093 19094]
Train: [    0     1     2 ... 23521 23522 23523] validation: [16200 16226 16235 ... 21319 21320 21321]
Train: [    0     1     2 ... 21319 21320 21321] validation: [19690 19691 19694 ... 23521 2

In [198]:
np.mean(accuracy)

0.880251527937992

In [199]:
pd.crosstab(y1_test, skf_y_predicdected)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1973,48
1,205,126


In [None]:
np.mean(accuracy)

In [170]:
accuracy = []

skf = StratifiedKFold(n_splits=10, random_state=None)
skf.split(X, y)

for train_index, test_index in skf.split(X,y):
    print('Train:', train_index, 'validation:', test_index)
    X1_train, X1_test = X.iloc[train_index], X.iloc[test_index]
    y1_train, y1_test = y.iloc[train_index], y.iloc[test_index]
    KNN_classifier = KNeighborsClassifier()
    KNN_classifier.fit(X1_train, y1_train)
    skf_y_predicdected1 = KNN_classifier.predict(X1_test)
    score1 = accuracy_score(skf_y_predicdected1, y1_test)
    accuracy.append(score1)
print(accuracy)
    

Train: [ 1340  1342  1344 ... 23521 23522 23523] validation: [   0    1    2 ... 2715 2716 2717]
Train: [    0     1     2 ... 23521 23522 23523] validation: [1340 1342 1344 ... 5413 5414 5416]
Train: [    0     1     2 ... 23521 23522 23523] validation: [2616 2620 2630 ... 7787 7789 7790]
Train: [    0     1     2 ... 23521 23522 23523] validation: [ 3920  3925  3927 ... 10085 10086 10087]
Train: [    0     1     2 ... 23521 23522 23523] validation: [ 5214  5215  5220 ... 12367 12368 12370]
Train: [    0     1     2 ... 23521 23522 23523] validation: [ 7167  7169  7191 ... 14649 14650 14651]
Train: [    0     1     2 ... 23521 23522 23523] validation: [10006 10011 10013 ... 16861 16862 16863]
Train: [    0     1     2 ... 23521 23522 23523] validation: [12932 12943 12948 ... 19091 19093 19094]
Train: [    0     1     2 ... 23521 23522 23523] validation: [16200 16226 16235 ... 21319 21320 21321]
Train: [    0     1     2 ... 21319 21320 21321] validation: [19690 19691 19694 ... 23521 2

In [171]:
pd.crosstab(y1_test, skf_y_predicdected1)

col_0,0,1
bank_account,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1892,129
1,228,103


In [172]:
np.mean(accuracy)

0.8581895922414866

In [200]:
bank_account = xgb_classifier.predict(test_data)

In [201]:
test_data_original['bank_account']=bank_account
test_data_original.to_csv('predicion1.csv')