In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [43]:
df = pd.read_csv('Financial_inclusion_dataset.csv')

In [44]:
df.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [14]:
df = df.filter(['location_type', 'bank_account', 'cellphone_access', 'age_of_respondent', 'gender_of_respondent', 'marital_status', 'education_level', 'job_type'], axis=1)

In [15]:
df

Unnamed: 0,location_type,bank_account,cellphone_access,age_of_respondent,gender_of_respondent,marital_status,education_level,job_type
0,Rural,Yes,Yes,24,Female,Married/Living together,Secondary education,Self employed
1,Rural,No,No,70,Female,Widowed,No formal education,Government Dependent
2,Urban,Yes,Yes,26,Male,Single/Never Married,Vocational/Specialised training,Self employed
3,Rural,No,Yes,34,Female,Married/Living together,Primary education,Formally employed Private
4,Urban,No,No,26,Male,Single/Never Married,Primary education,Informally employed
...,...,...,...,...,...,...,...,...
23519,Rural,No,Yes,48,Female,Divorced/Seperated,No formal education,Other Income
23520,Rural,No,Yes,27,Female,Single/Never Married,Secondary education,Other Income
23521,Rural,No,Yes,27,Female,Widowed,Primary education,Other Income
23522,Urban,No,Yes,30,Female,Divorced/Seperated,Secondary education,Self employed


In [20]:
cols = []

for col in df.columns:
    if df[col].dtype == 'object':
        cols.append(col)

In [21]:
cols

['location_type',
 'bank_account',
 'cellphone_access',
 'gender_of_respondent',
 'marital_status',
 'education_level',
 'job_type']

In [23]:
le = LabelEncoder()

In [26]:
for col in cols:
    df[col] = le.fit_transform(df[col])
    print(le.classes_)

[0 1]
[0 1]
[0 1]
[0 1]
[0 1 2 3 4]
[0 1 2 3 4 5]
[0 1 2 3 4 5 6 7 8 9]


In [28]:
count = Counter(df['bank_account'])
count

Counter({0: 20212, 1: 3312})

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [30]:
X = df.drop(['bank_account'], axis=1)
y = df['bank_account']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [32]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [33]:
model.score(X_test, y_test)

0.8537725823591924

In [34]:
import joblib

In [35]:
joblib.dump(model, 'fin_inc.joblib')

['fin_inc.joblib']

In [36]:
joblib.dump(le, 'fin_inc_le.joblib')

['fin_inc_le.joblib']

In [37]:
df.head()

Unnamed: 0,location_type,bank_account,cellphone_access,age_of_respondent,gender_of_respondent,marital_status,education_level,job_type
0,0,1,1,24,0,2,3,9
1,0,0,0,70,0,4,0,4
2,1,1,1,26,1,3,5,9
3,0,0,1,34,0,2,2,3
4,1,0,0,26,1,3,2,5


In [47]:
df['job_type'].unique()

array(['Self employed', 'Government Dependent',
       'Formally employed Private', 'Informally employed',
       'Formally employed Government', 'Farming and Fishing',
       'Remittance Dependent', 'Other Income',
       'Dont Know/Refuse to answer', 'No Income'], dtype=object)

In [None]:
df