# Costa Rica Poverty Level Classiciation 

In [0]:
pip install catboost dask-ml -q

Note: you may need to restart the kernel to use updated packages.


# Imports and DataFrame Creation

In [0]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
import joblib
from dask.distributed import Client, progress
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette='Accent')
import plotly.express as px

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import confusion_matrix, classification_report, f1_score, balanced_accuracy_score, roc_curve, precision_recall_curve, roc_auc_score, auc

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import label_binarize

import warnings
warnings.filterwarnings('ignore')

import timeit
from datetime import datetime

import itertools
from itertools import cycle

In [0]:
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:52425  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


In [0]:
df = dd.read_csv('/Users/jtchristyliner/Downloads/train 2.csv')

#https://www.kaggle.com/c/costa-rican-household-poverty-prediction

# Data Cleaning

## Reverse One Hot Encoding to Ordinal Features

In [0]:
#Categories to reverse
water_df = df.loc[:,['abastaguano', 'abastaguafuera', 'abastaguadentro']]
wall_df = df.loc[:,['epared1', 'epared2', 'epared3']]
roof_df = df.loc[:,['etecho1', 'etecho2', 'etecho3']]
floor_df = df.loc[:,['eviv1', 'eviv2', 'eviv3']]
education_df = df.loc[:, ['instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']]
home_ownership_df = df.loc[:, ['tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5']]

In [0]:
#Reverse and combine back into original dataframe
for column in education_df:
    education_df[column] = education_df[column].replace(1, education_df.columns.get_loc(column))
    
df['education'] = education_df.sum(axis = 1) 

for column in water_df:
    water_df[column] = water_df[column].replace(1, water_df.columns.get_loc(column))
    
df['water'] = water_df.sum(axis = 1) 

for column in wall_df:
    wall_df[column] = wall_df[column].replace(1, wall_df.columns.get_loc(column))
    
df['wall'] = wall_df.sum(axis = 1) 

for column in roof_df:
    roof_df[column] = roof_df[column].replace(1, roof_df.columns.get_loc(column))
    
df['roof'] = roof_df.sum(axis = 1) 

for column in floor_df:
    floor_df[column] = floor_df[column].replace(1, floor_df.columns.get_loc(column))
    
df['floor'] = floor_df.sum(axis = 1) 

for column in home_ownership_df:
    home_ownership_df[column] = home_ownership_df[column].replace(1, home_ownership_df.columns.get_loc(column))
    
df['home_ownership'] = home_ownership_df.sum(axis = 1) 


In [0]:
#drop columns that were combined from one hot encoding reversal and rename columns

df = df.drop(columns = ['abastaguano', 'abastaguafuera', 'abastaguadentro', 'epared1', 
                              'epared2', 'epared3', 'etecho1', 'etecho2', 'etecho3', 'eviv1', 
                              'eviv2', 'eviv3', 'instlevel1', 'instlevel2', 'instlevel3',
                              'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
                              'instlevel9', 'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 
                              'tipovivi5'])


## Handle Object Dtypes

In [0]:
df.columns[df.dtypes==object]

Index(['Id', 'idhogar', 'dependency', 'edjefe', 'edjefa'], dtype='object')

## Dependency

In [0]:
df['dependency'].unique()

Dask Series Structure:
npartitions=1
    object
       ...
Name: dependency, dtype: object
Dask Name: unique-agg, 104 tasks

In [0]:
df['dependency'] = df['dependency'].replace('yes', 1)
df['dependency'] = df['dependency'].replace('no', 0)
df['dependency'] = df['dependency'].astype('float64')

## Years of Education for Male and Female Head of Household edjefe / edjefa

In [0]:
df['edjefe'].unique().compute()

0      10
1      12
2      no
3      11
4       9
5      15
6       4
7       6
8       8
9      17
10      7
11     16
12     14
13      5
14     21
15      2
16     19
17    yes
18      3
19     18
20     13
21     20
Name: edjefe, dtype: object

In [0]:
df['edjefa'].unique().compute()

0      no
1      11
2       4
3      10
4       9
5      15
6       7
7      14
8      13
9       8
10     17
11      6
12      5
13      3
14     16
15     19
16    yes
17     21
18     12
19      2
20     20
21     18
Name: edjefa, dtype: object

In [0]:
df['edjefe'] = df['edjefe'].replace('yes', 1)
df['edjefe'] = df['edjefe'].replace('no', 0)
df['edjefe'] = df['edjefe'].astype('int64')

df['edjefa'] = df['edjefa'].replace('yes', 1)
df['edjefa'] = df['edjefa'].replace('no', 0)
df['edjefa'] = df['edjefa'].astype('int64')

## Initial Check For Nulls


### Rent 'v2a1'

In [0]:
print('Rent Value Counts: ')
print(df['v2a1'].value_counts().compute())
print('\n--------------\n')
print('Rent Null Values: ')
print(df['v2a1'].isnull().sum().compute())

Rent Value Counts: 
150000.0     233
100000.0     181
200000.0     159
120000.0     149
50000.0      118
80000.0      104
90000.0       81
70000.0       79
180000.0      77
300000.0      76
250000.0      75
40000.0       72
60000.0       57
350000.0      53
130000.0      53
110000.0      53
30000.0       48
160000.0      46
170000.0      41
140000.0      39
220000.0      30
0.0           29
570540.0      25
45000.0       25
175000.0      23
400000.0      22
20000.0       22
285270.0      21
25000.0       21
125000.0      20
            ... 
2353477.0      2
78000.0        2
219087.0       2
525000.0       2
470000.0       2
420000.0       2
325000.0       2
294000.0       2
288750.0       2
278000.0       2
275000.0       2
83333.0        2
155000.0       2
191500.0       2
176000.0       2
93000.0        2
159751.0       2
100297.0       2
102000.0       2
108000.0       1
542013.0       1
510000.0       1
119813.0       1
118097.0       1
234000.0       1
125518.0       1
25310.0    

In [0]:
df['v2a1'] = df['v2a1'].fillna(value = 0)

### Years Behind in School

In [0]:
print('Years Behind in School Value Counts: ')
print(df['rez_esc'].value_counts().compute())
print('\n--------------\n')
print('Years Behind in School Null Values: ')
print(df['rez_esc'].isnull().sum().compute())

Years Behind in School Value Counts: 
0.0    1211
1.0     227
2.0      98
3.0      55
4.0      29
5.0       9
Name: rez_esc, dtype: int64

--------------

Years Behind in School Null Values: 
7928


In [0]:
years_behind_school_age_df = df.loc[df['rez_esc'].isnull(), ['rez_esc', 'age']]
years_list = years_behind_school_age_df['age'].unique()

years_list.compute()

0     43
1     67
2     92
3     37
4     38
5     30
6     28
7     18
8     34
9     79
10    39
11    19
12    70
13    50
14    22
15    26
16    69
17    66
18    41
19    20
20    40
21    44
22    62
23    33
24    35
25    56
26    52
27    36
28    24
29    76
      ..
57    72
58     6
59     5
60    58
61    27
62     3
63     2
64    61
65    25
66     0
67    23
68    54
69    32
70    65
71    77
72    81
73    88
74    64
75    87
76    82
77    95
78    80
79    85
80    83
81    84
82    90
83    86
84    91
85    93
86    10
Name: age, Length: 87, dtype: int64

In [0]:
age_10_yrs_behind = df.loc[df['age']==10, ['age', 'rez_esc']]
age_10_yrs_behind.loc[age_10_yrs_behind['rez_esc'].isnull()].compute()

Unnamed: 0,age,rez_esc
2514,10,


Since all of the null values for years behind in school (except 1) fall on people before or after school ages--the nulls will be filled in with zeros. 

In [0]:
df['rez_esc'] = df['rez_esc'].fillna(value = 0)

### Household Tablets 'v18q1'

In [0]:
df['v18q1'].value_counts().compute()

1.0    1586
2.0     444
3.0     129
4.0      37
5.0      13
6.0       6
Name: v18q1, dtype: int64

Since there are no zeros on the number of tablets owned, I will fill in the null values with zero since that's what they seem to indicate.


In [0]:
df['v18q1'] = df['v18q1'].fillna(0)

### Drop the remaining nulls since there are only 5 rows with null values.

In [0]:
df = df.dropna()

# Initial Feature Selection

Drop features due to multicollinearity or low impact on model. Drop ID numbers. 

In [0]:
df1 = df.copy()
df1 = df1.drop(columns = ['v18q', 'edjefe', 'Id', 'idhogar', 'hacdor', 'hogar_total'])

## Check for Multicollinearity 

In [0]:
abs(df1.corr()['Target']).compute().sort_values(ascending = True)

planpri            0.000551
paredother         0.000615
coopele            0.002398
estadocivil6       0.003306
estadocivil7       0.004391
hogar_mayor        0.004988
parentesco7        0.006253
parentesco8        0.008580
public             0.008945
parentesco10       0.011012
parentesco9        0.012362
sanitario6         0.012937
parentesco12       0.013759
parentesco5        0.016572
lugar2             0.018411
parentesco11       0.019038
techoentrepiso     0.019643
parentesco4        0.020849
pisoother          0.021233
elimbasu6          0.024521
techozinc          0.027592
paredfibras        0.027719
estadocivil2       0.030001
techootro          0.032454
noelec             0.033962
techocane          0.035316
r4m2               0.036567
edjefa             0.037415
parentesco1        0.037697
male               0.038595
                     ...   
paredmad           0.168426
lugar1             0.173109
r4m3               0.173588
computer           0.183107
dependency         0

# Model Creation, Fine Tuning, and Metrics

## Train Test Split

In [0]:
X = df1.drop(columns = ['Target']) 
y = df1['Target']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Random Forest

### Random Forest with Best Hyper Parameters

In [0]:
#create model
rfc = RandomForestClassifier(max_features = 'auto', random_state = 21, n_estimators = 2000, max_depth = 80, min_samples_split=2, min_samples_leaf=1, bootstrap = False)

with joblib.parallel_backend('dask'):
    rfc.fit(X_train, y_train)

In [0]:
#model results
preds_train = rfc.predict(X_train.values.compute())
preds_test = rfc.predict(X_test.values.compute())

### Random Forest Metrics

In [0]:
print('Train Accuracy: {}'.format(rfc.score(X_train, y_train)))
print('Test Accuracy: {}'.format(rfc.score(X_test, y_test)))
print('Train F1: {}'.format(f1_score(y_train, preds_train, average = 'micro')))
print('Test F1: {}'.format(f1_score(y_test, preds_test, average = 'micro')))

Train Accuracy: 1.0
Test Accuracy: 0.9399255715045188
Train F1: 1.0
Test F1: 0.9399255715045188


### Classification Report

In [0]:
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

           1       0.97      0.86      0.91       155
           2       0.94      0.89      0.91       321
           3       0.95      0.81      0.87       249
           4       0.94      0.99      0.96      1156

    accuracy                           0.94      1881
   macro avg       0.95      0.89      0.91      1881
weighted avg       0.94      0.94      0.94      1881



### Confusion Matrix

In [0]:
confusion_matrix(y_test, preds_test)

array([[ 133,    8,    0,   14],
       [   3,  285,    6,   27],
       [   1,    9,  201,   38],
       [   0,    2,    5, 1149]])

## XG Boost

### XG Boost with Best Hyperparameters

In [0]:
#create model
xgb1 = XGBClassifier(max_depth=9, learning_rate=.3, n_estimators=1000)

with joblib.parallel_backend('dask'):
    xgb1.fit(X_train, y_train)

In [0]:
#make predictions
xgb1_y_pred_train = xgb1.predict(X_train)

xgb1_y_pred_test = xgb1.predict(X_test)


print('Train Accuracy: {}'.format(xgb1.score(X_train, y_train)))
print('Test Accuracy: {}'.format(xgb1.score(X_test, y_test)))
print('Test F1: {}'.format(f1_score(y_test, xgb1_y_pred_test, average = 'micro')))
print('Train F1: {}'.format(f1_score(y_train, xgb1_y_pred_train, average = 'micro')))

Train Accuracy: 1.0
Test Accuracy: 0.9473684210526315
Test F1: 0.9473684210526315
Train F1: 1.0


### Classification Report

In [0]:
print(classification_report(y_test, xgb1_y_pred_test))

              precision    recall  f1-score   support

           1       0.96      0.88      0.92       155
           2       0.94      0.89      0.92       321
           3       0.93      0.86      0.89       249
           4       0.95      0.99      0.97      1156

    accuracy                           0.95      1881
   macro avg       0.95      0.91      0.92      1881
weighted avg       0.95      0.95      0.95      1881



### Confusion Matrix

In [0]:
confusion_matrix(y_test, xgb1_y_pred_test)

array([[ 137,    7,    1,   10],
       [   4,  286,    8,   23],
       [   1,    9,  213,   26],
       [   0,    2,    8, 1146]])