## LOAD DATA
Our first step will be to load, analyze and prepare our data for machine learning classification. Let's start by importing all the libraries we will need for this project in one step.

In [7]:
##LIBRARIES##
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import random

#ML
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

#SPATIAL
import geoviews as gv
import geoviews.feature as gf
import xarray as xr
from cartopy import crs
from geoviews import dim, opts
gv.extension('bokeh')
import geopandas as gpd
import geoviews.tile_sources as gts


##FUNCTIONS##
def data_pipeline(data, output, feature=None):
    """
    Preprocessing pipeline part 1: Transform full data frame

    Steps, in order:
      1. If 'cost_per' occurs in ``output``, call ``cost_per_metric``;
         otherwise keep only the rows where ``data[output] > 0``.
      2. ``drop_columns`` (threshold=.5).
      3. Drop every row that still contains a missing value.
      4. ``create_other_buckets`` (threshold=.1).
      5. ``one_hot_encode``.
      6. ``log_transform`` on ``feature``.

    The helpers ``cost_per_metric``, ``drop_columns``, ``create_other_buckets``,
    ``one_hot_encode`` and ``log_transform`` are not defined in this cell and
    must already be in scope when this function is called.

    Arguments:
        data: Pandas dataframe
        output: output column (dependent variable)
        feature: second argument handed to ``log_transform``; when omitted
            it falls back to ``output``.
            NOTE(review): the previous version referenced an undefined name
            ``feature`` here (NameError at call time). Defaulting to
            ``output`` is an assumption - confirm which column was meant.
    Returns: Modified dataframe
    """
    if feature is None:
        feature = output

    if 'cost_per' in output:
        data = cost_per_metric(data, output)
    else:
        data = data[data[output] > 0]

    data = drop_columns(data, output, threshold=.5)
    data = data.dropna(axis='index')
    data = create_other_buckets(data, threshold=.1)
    data = one_hot_encode(data)
    data = log_transform(data, feature)
    return data

def split_pipeline(data, output):
    """
    Preprocessing pipeline part 2: Split data into variables
    Arguments: Pandas dataframe, output column (dependent variable)
    Returns: List of scaled and unscaled dependent and independent variables,
             in the order [X_train, y_train, X_valid, y_valid,
             X_train_scaled, y_train_scaled, X_valid_scaled]
    """
    # Separate the independent variables from the dependent variable
    X = data.drop([output], axis=1)
    y = data[output]

    # Hold out 20% of the rows for validation; the seed is fixed
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=.2, random_state=1)

    # scale() also returns the scaled full X and y, which are not needed here
    _, _, X_train_scaled, y_train_scaled, X_valid_scaled = scale(
        X, y, X_train, y_train, X_valid)

    return [X_train, y_train, X_valid, y_valid,
            X_train_scaled, y_train_scaled, X_valid_scaled]



def scale(X, y, X_train, y_train, X_valid):
    """Scale dependent and independent variables

    Arguments:
        X, y: full independent / dependent variables
        X_train, y_train: training split
        X_valid: validation independent variables
    Returns: List [X_scaled, y_scaled, X_train_scaled, y_train_scaled,
             X_valid_scaled]. ``X_scaled``/``y_scaled`` and
             ``y_train_scaled`` are numpy arrays; ``X_train_scaled`` and
             ``X_valid_scaled`` are dataframes with the columns of ``X``
             and a fresh default index.

    ``X_scaled`` and ``y_scaled`` are standardised with statistics of the
    full data, as before. The train/validation outputs are standardised
    with statistics of the training split only, so that no information
    about the validation rows leaks into the data a model is trained on.
    """
    def _features(frame):
        # 2-D float array of the frame's values
        return frame.values.astype(float)

    def _target(series):
        # StandardScaler expects 2-D input, hence the single column
        return series.values.astype(float).reshape(-1, 1)

    # Full-data scaling (unchanged behaviour)
    X_scaled = StandardScaler().fit_transform(_features(X))
    y_scaled = StandardScaler().fit_transform(_target(y)).flatten()

    # Split scaling: fit on the training rows only
    X_scaler = StandardScaler().fit(_features(X_train))
    y_scaler = StandardScaler().fit(_target(y_train))

    X_train_scaled = pd.DataFrame(
        data=X_scaler.transform(_features(X_train)), columns=X.columns)
    y_train_scaled = y_scaler.transform(_target(y_train)).flatten()

    X_valid_scaled = pd.DataFrame(
        data=X_scaler.transform(_features(X_valid)), columns=X.columns)

    return [X_scaled, y_scaled, X_train_scaled, y_train_scaled,
            X_valid_scaled]


##SETUP##
sns.set()
# Seed Python's and NumPy's global random number generators. scikit-learn
# estimators created without an explicit random_state draw from NumPy's
# global generator, so seeding only `random` does not make them repeatable.
random.seed(1234)
np.random.seed(1234)
# Reset matplotlib to its defaults, then set the default figure size
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (12, 10)})
# Show every column and the full content of each cell when displaying frames
pd.set_option("display.max_columns",None)
pd.set_option("display.max_colwidth", None)
# Always print numpy arrays in summarised form
np.set_printoptions(threshold=0)
## Get rid of warning messages
warnings.filterwarnings('ignore')
## Remove scientific notation and display numbers with 2 decimal places instead
pd.options.display.float_format = '{:,.2f}'.format
## Update default background style of plots (currently disabled)
# sns.set_style(style='darkgrid')

#########################################################################################

##PATHS##
# NOTE(review): absolute, machine-specific Windows path; the notebook only
# runs elsewhere after this is changed. The string is prepended to the file
# names below.
pth=r'C:\Users\SALDRU1\Dropbox\PC\Desktop\ml_lres\lab_wb_fin_incl\\'

##CANVAS VARIABLES##
# File names of the training and test data, relative to `pth`
train = "train_clean.csv"
test = "test.csv"


Let's load the financial inclusion data.


In [13]:
# Read the cleaned training data from disk
df_train = pd.read_csv(f'{pth}{train}')

# Quick overview: (rows, columns), then the number of columns per dtype
print(df_train.shape)
print(df_train.dtypes.value_counts())

# NOTE(review): the preview contains 'Unnamed: 0.1' and 'Unnamed: 0' columns
# whose values match the row number - presumably index columns written by
# an earlier to_csv call. Confirm, and consider dropping them before modelling.
df_train.head()

(7094, 63)
int64      58
float64     5
dtype: int64


Unnamed: 0.1,Unnamed: 0,edu_cat,earn_salary/wage,earn_sell_produce,earn_provide_service,earn_casual_work,earn_rely_others,earn_nothing,literacy_kiswhahili_lvl,literacy_english_lvl,mobile_money,mobile_money_classification,age_cat,dist_cat,access_cat,lpop_12_i,lgdp_17_i,female,own_other_land,own_mobile_phone,sent_money,received_money,married,divorced,widowed,own_land,hh_own_land,rent_land,no_land,grow_crops/produce,region_Arusha,region_Dar es Salaam,region_Dodoma,region_Geita,region_Iringa,region_Kagera,region_Kaskazini Pemba,region_Kaskazini Unguja,region_Katavi,region_Kigoma,region_Kilimanjaro,region_Kusini Pemba,region_Kusini Unguja,region_Lindi,region_Manyara,region_Mara,region_Mbeya,region_Mjini Magharibi,region_Morogoro,region_Mtwara,region_Muchiga,region_Mwanza,region_Njombe,region_Pwani,region_Rukwa,region_Ruvuma,region_Shinyanga,region_Simiyu,region_Singida,region_Songwe,region_Tabora,region_Tanga,mobile_money_purchase_freq
0,0,1,0,0,0,0,1,0,4,4,0,0,9.0,4.0,2.0,14.57,14.09,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
1,1,3,1,0,0,0,0,0,1,4,1,3,2.0,3.0,4.0,13.29,14.69,1,1,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2,2,6,0,0,0,0,1,0,1,1,1,2,0.0,1.0,5.0,14.61,14.63,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5
3,3,3,0,0,0,1,0,0,1,4,1,3,4.0,0.0,2.0,14.34,14.87,1,1,1,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
4,4,1,0,1,0,0,0,0,1,4,1,3,2.0,6.0,0.0,13.82,14.6,1,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,5


# MODEL PREDICTION

Finally, we can proceed to build and train multiple models to ultimately predict the outcome (value of the dependent variable), i.e. whether someone opens a mobile money account. We will try four different supervised learning techniques — logistic regression, decision trees, random forest (of decision trees) and support vector machines — and will implement those with the respective classes provided by the Scikit-learn library, which was already used to scale and split the data during pre-processing.

While more advanced models like artificial neural networks could provide more accurate predictors, they are less intuitive and interpretable for our purposes.

#### Split data

First, check whether the outcome variable is balanced, to make sure that no additional weighting/balancing methods are necessary.

In [9]:
# Share (in %) of each class of the multi-class target
class_counts = df_train['mobile_money_classification'].value_counts()
class_counts / len(df_train) * 100
# every category holds more than 10% of the rows, so the classes are fairly distributed

3.00   44.07
1.00   25.06
0.00   19.54
2.00   11.33
Name: mobile_money_classification, dtype: float64

Before we build up to a multi-class prediction model, we will test our model against the binary  'mobile money' classification that records 1 for "use mobile money" and 0 for "doesn't use mobile money". 

*Note: possibly biased features: 'Q16', which captures the frequency of mobile money use, and 'mobile_money_classification', which is constructed from the 'mobile money' variable.*

In [10]:
# Share (in %) of each value of the binary target
mobile_money_counts = df_train['mobile_money'].value_counts()
mobile_money_counts / len(df_train) * 100


1.00   55.40
0.00   44.60
Name: mobile_money, dtype: float64

Our target is balanced enough, so we don’t need to apply any balancing methods before we split our data.

In [None]:
# Remove the multi-class target so it cannot be used as a feature for the
# binary 'mobile_money' model. Rebinding (instead of inplace=True) together
# with errors='ignore' lets this cell be re-run without a KeyError.
df_train = df_train.drop(columns=['mobile_money_classification'], errors='ignore')

Now, we can split our train data into train and validation data, where validation data will be used for fine tuning the model which leaves our test data unseen.

In [11]:

# Split into train/validation sets (unscaled and scaled) for the binary target
(X_train, y_train,
 X_valid, y_valid,
 X_train_scaled, y_train_scaled,
 X_valid_scaled) = split_pipeline(df_train, "mobile_money")

# check balance of outcomes after splits
train_balance = y_train.value_counts() / len(y_train)
valid_balance = y_valid.value_counts() / len(y_valid)
print(f'Train outcome balance: \n{train_balance}\n')
print(f'Validation outcome balance: \n{valid_balance}\n')

Train outcome balance: 
1.00   0.55
0.00   0.45
Name: mobile_money, dtype: float64

Validation outcome balance: 
1.00   0.57
0.00   0.43
Name: mobile_money, dtype: float64



#### Baseline model selection

I’ll create a pipeline and run a vanilla version of several different types of models to see how each performs with our data. The pipeline will scale the data and train each of the models in a loop. Then we will check the overall accuracy of each model on the validation data set to compare them and decide which one can provide the best results for our data with the least tuning.

In [14]:
# Candidate classifiers, each created with its default settings
classifiers = [
    KNeighborsClassifier(), SVC(), LinearSVC(), NuSVC(),
    DecisionTreeClassifier(), RandomForestClassifier(),
    AdaBoostClassifier(), GradientBoostingClassifier(),
    XGBClassifier(),
]

# For every candidate: scale + fit on the training split, then report the
# score on the validation split
separator = '----------------'
for classifier in classifiers:
    steps = [('scaler', StandardScaler()), ('classifier', classifier)]
    pipe = Pipeline(steps=steps)
    pipe.fit(X_train, y_train)
    print(classifier)
    valid_score = pipe.score(X_valid, y_valid)
    print(f'model score: {valid_score}')
    print(separator)

KNeighborsClassifier()
model score: 0.897815362931642
----------------
SVC()
model score: 0.9936575052854123
----------------
LinearSVC()
model score: 1.0
----------------
NuSVC()
model score: 0.9725158562367865
----------------
DecisionTreeClassifier()
model score: 1.0
----------------
RandomForestClassifier()
model score: 1.0
----------------
AdaBoostClassifier()
model score: 1.0
----------------
GradientBoostingClassifier()
model score: 1.0
----------------
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, 