# Predicting poverty/wealth of Costa Rican households

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(112)

# Directory prefixed to the CSV file names when loading the data.
# Use "../input/" instead when running the kernel on Kaggle.
path = "data/"

In [2]:
# Helper class so that the same aggregation code is reused for the train and test sets

class Household():
    """Aggregate individual-level survey rows into one feature row per household.

    Rows are grouped by the ``idhogar`` column. Features are computed per group
    and stored in ``self.household``, a DataFrame indexed by ``idhogar``.
    The same class is used for the train and the test set.
    """

    def __init__(self, individuals):
        """Prepare the individual rows and group them by household.

        individuals: DataFrame with one row per person. It is copied, so the
            caller's frame is left untouched by the clean-up done here.
        """
        # Work on a private copy so the fixes below do not leak to the caller.
        self.individuals = individuals.copy()

        # Rows with tipovivi1 == 1 get v2a1 set to 0; missing tipovivi1 becomes 0.
        self.individuals.loc[(self.individuals['tipovivi1'] == 1), 'v2a1'] = 0
        self.individuals['tipovivi1'] = self.individuals['tipovivi1'].fillna(0)

        self.grouped_individuals = self.individuals.groupby('idhogar')
        self.household = pd.DataFrame()

    def X(self):
        """Build the household features and return them without the label.

        Returns a DataFrame indexed by ``idhogar``. The ``Target`` column is
        left out when ``y()`` has already added it to ``self.household``.
        """
        self.__add_base_features__()
        self.__add_house_features__()
        # Select on this instance's own columns rather than on a notebook-level
        # variable, so the method works for any Household (train or test).
        return self.household.loc[:, self.household.columns != 'Target']

    def y(self):
        """Return the household label: the rounded per-household mean of ``Target``.

        The label is also stored in ``self.household['Target']``.
        """
        self.household['Target'] = self.grouped_individuals.Target.mean().round().astype(int)
        return self.household['Target']

    def __add_base_features__(self):
        """Fill ``self.household`` with the base feature columns."""
        # Columns that are simply averaged within each household.
        for column in ('SQBedjefe', 'SQBdependency', 'overcrowding',
                       'qmobilephone', 'rooms', 'SQBhogar_nin'):
            self.household[column] = self.grouped_individuals[column].mean()

        self.household['G_people_count'] = self.grouped_individuals.age.count()
        self.household['G_percentage_under_15'] = self.G_percentage_under(15)

        self.household['fem_perc'] = self.fem_perc()
        self.household['schooling_avg'] = self.schooling_avg()
        self.household['edjefa'] = self.edjefa()
        self.household['elec'] = self.elec()

    def _weighted_group_means(self, weights):
        """Return the sum over ``weights`` of weight * per-household column mean.

        weights: dict mapping a column name to its numeric weight.
        """
        total = 0
        for column, weight in weights.items():
            total = total + self.grouped_individuals[column].mean() * weight
        return total

    @staticmethod
    def _education_to_int(value):
        """Convert one edjefa/edjefe value to int.

        'yes' maps to 1, 'no' and missing values map to 0, anything else is
        passed to ``int``.
        """
        if pd.isna(value):
            return 0
        if value == 'yes':
            return 1
        if value == 'no':
            return 0
        return int(value)

    def it_equipement(self):
        """Sum of the household means of refrig, computer and television,
        plus 1 when the household mean of v18q1 is positive."""
        return (
            self.grouped_individuals.refrig.mean() +
            self.grouped_individuals.computer.mean() +
            (self.grouped_individuals.v18q1.mean() > 0) +
            self.grouped_individuals.television.mean()
        )

    def no_facilities(self):
        """Combine five per-household checks: sanitario1, noelec, pisonotiene
        and abastaguano means equal to 1, and cielorazo mean equal to 0."""
        return (
            (self.grouped_individuals.sanitario1.mean() == 1) +
            (self.grouped_individuals.noelec.mean() == 1) +
            (self.grouped_individuals.pisonotiene.mean() == 1) +
            (self.grouped_individuals.abastaguano.mean() == 1) +
            (self.grouped_individuals.cielorazo.mean() == 0)
        )

    def G_percentage_under(self, age):
        """Fraction of each household's members whose age is at most ``age``.

        Note the comparison is inclusive (``<=``). Households without any such
        member get 0.0. The denominator is the number of non-null ages.
        """
        young = self.individuals[self.individuals.age <= age]
        young_count = young.groupby('idhogar').Id.count()
        # Computed from the groups directly so the method does not depend on
        # 'G_people_count' having been added to self.household beforehand.
        people_count = self.grouped_individuals.age.count()
        # Households absent from young_count yield NaN after alignment.
        return (young_count / people_count).fillna(0.0)

    def edjefa(self):
        """Per-household maximum of ``edjefa``, converted to int."""
        return self.grouped_individuals.edjefa.max().apply(self._education_to_int)

    def edjefe(self):
        """Per-household maximum of ``edjefe``, converted to int."""
        return self.grouped_individuals.edjefe.max().apply(self._education_to_int)

    def schooling_avg(self):
        """Sum of ``escolari`` in the household divided by the mean of ``hogar_total``."""
        schooling_sum = self.grouped_individuals.escolari.sum()
        people = self.grouped_individuals.hogar_total.mean()
        return schooling_sum / people

    def size_pp(self):
        """Household mean of ``hhsize`` divided by the mean of ``hogar_total``."""
        size = self.grouped_individuals.hhsize.mean()
        people = self.grouped_individuals.hogar_total.mean()
        return size / people

    def fem_perc(self):
        """Household mean of ``r4m3`` divided by the mean of ``hogar_total``."""
        female = self.grouped_individuals.r4m3.mean()
        people = self.grouped_individuals.hogar_total.mean()
        return female / people

    def rent(self):
        """Weighted score from the tipovivi3 (x1), tipovivi5 (x2) and tipovivi1 (x4) means."""
        return self._weighted_group_means({'tipovivi3': 1, 'tipovivi5': 2, 'tipovivi1': 4})

    def rent_pc(self):
        """Household mean of ``v2a1`` divided by the household mean of ``tamviv``."""
        return self.grouped_individuals.v2a1.mean() / self.grouped_individuals.tamviv.mean()

    def elec(self):
        """Weighted score from the noelec (x0), coopele (x1), public (x2) and planpri (x4) means."""
        return self._weighted_group_means({'noelec': 0, 'coopele': 1, 'public': 2, 'planpri': 4})

    def floor(self):
        """Weighted score from the eviv1 (x0), eviv2 (x1) and eviv3 (x4) means."""
        return self._weighted_group_means({'eviv1': 0, 'eviv2': 1, 'eviv3': 4})

    def roof(self):
        """Weighted score from the etecho1 (x0), etecho2 (x1) and etecho3 (x4) means."""
        return self._weighted_group_means({'etecho1': 0, 'etecho2': 1, 'etecho3': 4})

    def wall(self):
        """Weighted score from the epared1 (x0), epared2 (x1) and epared3 (x4) means."""
        return self._weighted_group_means({'epared1': 0, 'epared2': 1, 'epared3': 4})

    def house_state(self):
        """Sum of the floor, roof and wall scores."""
        return self.floor() + self.roof() + self.wall()

    def __add_house_features__(self):
        """Placeholder for house-related features; currently adds nothing."""
        return None
        # TODO

# First feature engineering
Based on my feature-selection notebook (1), I am selecting the important features that are already aggregated at the household level. Improvements are definitely possible.

(1) https://www.kaggle.com/gobert/data-selection-with-randomforest

In [3]:
# Load the individual-level training rows and aggregate them per household;
# X holds the household features and y the household labels.
individuals = pd.read_csv(path + 'train.csv')
train = Household(individuals)
X, y = train.X(), train.y()

# Train model & validate locally 
Models are scored with the macro F1-score.

In [4]:
from sklearn.model_selection import train_test_split

# Household labels, read back from the aggregated training frame.
y = train.household.Target

# Hold out 20% of the households, with a fixed seed for a repeatable split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=112,
)

In [5]:
# NOTE(review): RandomForestClassifier is imported but not used in the cells shown.
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with default hyper-parameters on the
# household-level training data (X, y), not on the X_train/y_train split.
clf = KNeighborsClassifier()
clf.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [6]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Hyper-parameter values of the KNN classifier explored exhaustively.
random_grid = [
    {
        'n_neighbors': [1, 4, 5, 10],
        'weights': ['distance'],
        'leaf_size': [3, 6, 12, 24, 48],
        'p': [1, 2, 4, 8],
    },
]

# 5-fold cross-validated grid search scored with the macro F1-score.
rf_random = GridSearchCV(
    estimator=clf,
    param_grid=random_grid,
    scoring='f1_macro',
    cv=5,
)
rf_random.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'p': [1, 2, 4, 8], 'leaf_size': [3, 6, 12, 24, 48], 'weights': ['distance'], 'n_neighbors': [1, 4, 5, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1_macro', verbose=0)

In [7]:
# Keep the estimator built from the best parameter combination found by the grid search.
best_clf = rf_random.best_estimator_

In [8]:
# Print mean and standard deviation of the test score (as percentages)
# next to the parameters of every combination tried by the grid search.
cvres = rf_random.cv_results_
score_rows = zip(
    cvres['mean_test_score'],
    cvres['std_test_score'],
    cvres['params'],
)
for mean_score, std_score, params in score_rows:
    print(mean_score * 100, std_score * 100, params)

34.952208895356534 2.80931531712833 {'p': 1, 'n_neighbors': 1, 'weights': 'distance', 'leaf_size': 3}
35.38522320585799 2.5406990396474054 {'p': 2, 'n_neighbors': 1, 'weights': 'distance', 'leaf_size': 3}
35.06046727975563 2.943768838336873 {'p': 4, 'n_neighbors': 1, 'weights': 'distance', 'leaf_size': 3}
35.133866461069964 2.754089449036469 {'p': 8, 'n_neighbors': 1, 'weights': 'distance', 'leaf_size': 3}
36.698872347813314 2.794472566215003 {'p': 1, 'n_neighbors': 4, 'weights': 'distance', 'leaf_size': 3}
35.39000520938879 1.5027051939710334 {'p': 2, 'n_neighbors': 4, 'weights': 'distance', 'leaf_size': 3}
35.02062512417795 1.610591763079998 {'p': 4, 'n_neighbors': 4, 'weights': 'distance', 'leaf_size': 3}
35.39549829981924 1.4873276253581669 {'p': 8, 'n_neighbors': 4, 'weights': 'distance', 'leaf_size': 3}
36.050864500295155 2.5044356219569495 {'p': 1, 'n_neighbors': 5, 'weights': 'distance', 'leaf_size': 3}
34.955909850916086 2.388355677004233 {'p': 2, 'n_neighbors': 5, 'weights': 

In [9]:
# Best cross-validated score of the grid search (scoring='f1_macro'), as a percentage.
print(rf_random.best_score_ * 100)

36.98897351482336


In [12]:
# Show the hyper-parameters of the selected estimator.
best_clf.get_params()

{'algorithm': 'auto',
 'leaf_size': 24,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 4,
 'p': 1,
 'weights': 'distance'}

# Predict on test set & export

In [13]:
# Rebuild the household-level training data and fit the selected estimator on all of it.
X, y = train.X(), train.y()
best_clf.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=24, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=1,
           weights='distance')

In [14]:
# Build test dataset on Household level
df_test = pd.read_csv(path + 'test.csv')
test = Household(df_test)
# Bind the test features to their own name only. Reusing `X` here would replace
# the training features, so re-running an earlier fitting cell afterwards would
# silently pair test features with training labels.
X_test = test.X()

In [15]:
# Predict with the tuned estimator (`best_clf`) rather than the default `clf`.
# Dropping an existing `Target` column first keeps this cell re-runnable:
# the prediction itself must only ever see the feature columns.
X_test['Target'] = best_clf.predict(X_test.drop(columns='Target', errors='ignore'))

Now we need to copy the household-level predictions back to the individual level:

In [16]:
def target(idhogar):
    """Return the predicted Target stored in X_test for household `idhogar`."""
    return X_test.loc[idhogar, 'Target']


# Start from an empty Target column, then fill every individual row with the
# prediction of the household it belongs to.
df_test['Target'] = None
df_test['Target'] = df_test.idhogar.map(target)

In [17]:
# Write the Id and Target columns of every individual row to the submission
# file, without the DataFrame index.
df_test[['Id', 'Target']].to_csv("sample_submission.csv", index=False)