# Predicting poverty/wealthness of costa rican household

In [258]:
import pandas as pd
import numpy as np

np.random.seed(112) # to get stable & reproductible results

path = "data/"

In [259]:
class BaseFeature():
    def __init__(self, individuals):
        self.individuals = individuals

        self.grouped_individuals = individuals.groupby('idhogar')
        self.household = pd.DataFrame()

    def X(self):
        self.household['qmobilephone'] = self.grouped_individuals.qmobilephone.mean()
        return self.household.loc[:, train.household.columns != 'Target']

    def y(self):
        self.household['Target'] = self.grouped_individuals.Target.mean().round().astype(int)
        return self.household['Target']


In [260]:
from sklearn.base import BaseEstimator, TransformerMixin


class FeaturesEnginor(BaseEstimator, TransformerMixin):
    def __init__(self, individuals, add_edjefe, add_dependency, add_overcrowding, add_qmobilephone,
                 add_rooms, add_SQBhogar_nin, add_G_people_count, add_G_percentage_under_15, add_fem_perc,
                 add_schooling_avg, add_edjefa, add_elec, add_floor, add_no_facilities, add_it_equipement,
                 add_elec_missing, add_rent_pc, add_rent, add_house_state, add_cielorazo, add_size_pp, add_roof):
        self.individuals = individuals
        
        self.grouped_individuals = individuals.groupby('idhogar')
        self.household = pd.DataFrame()

        # need to build other features
        self.household['G_people_count'] = self.grouped_individuals.age.count()
        self.household['elec'] = self.elec()
        
        self.add_edjefe = add_edjefe
        self.add_dependency = add_dependency
        self.add_overcrowding = add_overcrowding
        self.add_qmobilephone = add_qmobilephone
        self.add_rooms = add_rooms
        self.add_SQBhogar_nin = add_SQBhogar_nin
        self.add_G_people_count = add_G_people_count
        self.add_G_percentage_under_15 = add_G_people_count
        self.add_fem_perc = add_fem_perc
        self.add_schooling_avg = add_schooling_avg
        self.add_edjefa = add_edjefa
        self.add_elec = add_elec
        self.add_floor = add_floor
        self.add_no_facilities = add_no_facilities
        self.add_it_equipement = add_it_equipement
        self.add_elec_missing = add_elec_missing
        self.add_rent_pc = add_rent_pc
        self.add_rent = add_rent
        self.add_house_state = add_house_state
        self.add_cielorazo = add_cielorazo
        self.add_size_pp = add_size_pp
        self.add_roof = add_roof

    def fit(self, X, y):
        return self

    def transform(self, X):
        if self.add_edjefe:
            X['edjefe'] = self.edjefe()
        if self.add_dependency:
            X['dependency'] = self.dependency()
        if self.add_overcrowding:
            X['overcrowding'] = self.grouped_individuals.overcrowding.mean()
        if self.add_qmobilephone:
            X['qmobilephone'] = self.grouped_individuals.qmobilephone.mean()
        if self.add_rooms:
            X['rooms'] = self.grouped_individuals.rooms.mean()
        if self.add_SQBhogar_nin:
            X['SQBhogar_nin'] = self.grouped_individuals.SQBhogar_nin.mean()
        if self.add_G_people_count:
            X['G_people_count'] = self.grouped_individuals.age.count()
        if self.add_G_percentage_under_15:
            X['G_percentage_under_15'] = self.G_percentage_under(15)
        if self.add_fem_perc:
            X['fem_perc'] = self.fem_perc()
        if self.add_schooling_avg:
            X['schooling_avg'] = self.schooling_avg()
        if self.add_edjefa:
            X['edjefa'] = self.edjefa()
        if self.add_elec:
            X['elec'] = self.elec()
        if self.add_floor:
            X['floor'] = self.floor()
        if self.add_no_facilities:
            X['no_facilities'] = self.no_facilities()
        if self.add_it_equipement:
            X['it_equipement'] = self.it_equipement()
        if self.add_elec_missing:
            X['elec-missing'] = self.household['elec'].isnull()
        if self.add_rent_pc:
            X['rent_pc'] = self.rent_pc()
        if self.add_rent:
            X['rent'] = self.rent()
        if self.add_house_state:
            self.household['house_state'] = self.house_state()
        if self.add_cielorazo:
            self.household['cielorazo'] = self.grouped_individuals.cielorazo.mean()
        if self.add_size_pp:
            self.household['size_pp'] = self.size_pp()
        if self.add_roof:
            self.household['roof'] = self.roof()
        return X
    
    def edjefe(self):
        edjefe = self.grouped_individuals.edjefe.max().replace({'no': 0, 'yes': 1, None: 0., np.NaN: 0.0})
        return edjefe.apply(int)
    
    def dependency(self):
        dependency = self.grouped_individuals.dependency.max().replace({'no': 0, 'yes': 1, None: 0., np.NaN: 0.0})
        return dependency.apply(float)
    
    def G_percentage_under(self, age):
        return (self.individuals[self.individuals.age <= age].groupby('idhogar').Id.count()/ self.household['G_people_count']).replace({None: 0., np.NaN: 0.0})

    def fem_perc(self):
        female = self.grouped_individuals.r4m3.mean()
        people = self.grouped_individuals.hogar_total.mean()

        return female / people
    
    def schooling_avg(self):
        schooling_sum = self.grouped_individuals.escolari.sum()
        people = self.grouped_individuals.hogar_total.mean()
        schooling_avg = schooling_sum / people
        return schooling_avg
    
    def edjefa(self):
        edjefa = self.grouped_individuals.edjefa.max().replace({'no': 0, 'yes': 1, None: 0., np.NaN: 0.0})
        return edjefa.apply(int)

    def elec(self):
        elec = (
            self.grouped_individuals.noelec.mean() * 0 +
            self.grouped_individuals.coopele.mean() * 1 +
            self.grouped_individuals.public.mean() * 2 +
            self.grouped_individuals.planpri.mean() * 4
        )
        return elec
    
    def floor(self):
        floor = (
            self.grouped_individuals.eviv1.mean() * 0 +
            self.grouped_individuals.eviv2.mean() * 1 +
            self.grouped_individuals.eviv3.mean() * 4
        )
        return floor ** 1
    
    def roof(self):
        roof = (
            self.grouped_individuals.etecho1.mean() * 0 +
            self.grouped_individuals.etecho2.mean() * 1 +
            self.grouped_individuals.etecho3.mean() * 4
        )
        return roof ** 1
    
    def wall(self):
        wall = (
            self.grouped_individuals.epared1.mean() * 0 +
            self.grouped_individuals.epared2.mean() * 1 +
            self.grouped_individuals.epared3.mean() * 4
        )
        return wall ** 1
    
    def no_facilities(self):
        return (
            (self.grouped_individuals.sanitario1.mean() == 1) +
            (self.grouped_individuals.noelec.mean() == 1) +
            (self.grouped_individuals.pisonotiene.mean() == 1) +
            (self.grouped_individuals.abastaguano.mean() == 1) +
            (self.grouped_individuals.cielorazo.mean() == 0)
        )

    def it_equipement(self):
        return (
            self.grouped_individuals.refrig.mean() +
            self.grouped_individuals.computer.mean() +
            (self.grouped_individuals.v18q1.mean() > 0) +
            self.grouped_individuals.television.mean()
        )
    
    def rent(self):
        rent = (
            # self.grouped_individuals.tipovivi5.mean() * 1 +
            # self.grouped_individuals.tipovivi4.mean() * 2 +
            self.grouped_individuals.tipovivi3.mean() * 1 +
            self.grouped_individuals.tipovivi5.mean() * 2 +
            self.grouped_individuals.tipovivi1.mean() * 4
        )
        return rent

    def rent_pc(self):
        return self.grouped_individuals.v2a1.mean() / self.grouped_individuals.tamviv.mean()

    def house_state(self):
        return self.floor() + self.roof() + self.wall()
    
    def size_pp(self):
        size = self.grouped_individuals.hhsize.mean()
        people = self.grouped_individuals.hogar_total.mean()

        return size / people

# Feature engineering

In [261]:
# Build dataset on Household level.
individuals = pd.read_csv(path + 'train.csv')
train = BaseFeature(individuals)
X_base = train.X()
y = train.y()

# Train model & validate locally 
On macro F1-score

In [262]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()

In [263]:
from sklearn.pipeline import Pipeline

add_edjefe=True
add_dependency=True
add_overcrowding= True
add_qmobilephone=True
add_rooms=True
add_SQBhogar_nin=True
add_G_people_count=True
add_G_percentage_under_15=True
add_fem_perc=True
add_schooling_avg=True
add_edjefa=True
add_elec=True
add_floor=True
add_no_facilities=True
add_it_equipement=True
add_elec_missing=True
add_rent_pc=False
add_rent=True
add_house_state=True
add_cielorazo=True
add_size_pp=True
add_roof=True


pipeline = Pipeline([
        ('features', FeaturesEnginor(
            individuals, add_edjefe=add_edjefe, add_dependency=add_dependency, add_overcrowding=add_overcrowding,
            add_qmobilephone=add_qmobilephone, add_rooms=add_rooms, add_SQBhogar_nin=add_SQBhogar_nin,
            add_G_people_count=add_G_people_count, add_G_percentage_under_15=add_G_percentage_under_15,
            add_fem_perc=add_fem_perc, add_schooling_avg=add_schooling_avg, add_edjefa=edjefa, add_elec=add_elec,
            add_floor=add_floor, add_no_facilities=add_no_facilities, add_it_equipement=add_it_equipement,
            add_elec_missing=add_elec_missing, add_rent_pc=add_rent_pc, add_rent=add_rent,
            add_house_state=add_house_state, add_cielorazo=add_cielorazo, add_size_pp=add_size_pp, add_roof=add_roof)
        ),
        ('clf', clf)
    ]
)

In [264]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

random_grid = [{
    'features__add_edjefe': [True],
    'features__add_dependency': [True],
    'features__add_overcrowding': [True],
    'features__add_qmobilephone': [True],
    'features__add_rooms': [True],
    'features__add_SQBhogar_nin': [True],
    'features__add_G_people_count': [True],
    'features__add_G_percentage_under_15': [True],
    'features__add_fem_perc': [True],
    'features__add_schooling_avg': [True],
    'features__add_edjefa': [True],
    'features__add_elec': [True],
    'features__add_floor': [False],
    'features__add_no_facilities': [False],
    'features__add_it_equipement': [False],
    'features__add_elec_missing': [False],
    'features__add_rent_pc': [False], # debug for true. missing values
    'features__add_rent': [False],
    'features__add_house_state': [False],
    'features__add_cielorazo': [False],
    'features__add_size_pp': [False],
    'features__add_roof': [False],
    
    'clf__n_neighbors': [4],
    'clf__weights': ['distance'],
    'clf__leaf_size': [24],
    'clf__p': [1]
              }
    ]

rf_random = GridSearchCV(estimator = pipeline, param_grid = random_grid, scoring = 'f1_macro', cv = 5)
rf_random.fit(X_base, y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeaturesEnginor(add_G_people_count=True, add_G_percentage_under_15=True,
        add_SQBhogar_nin=True, add_cielorazo=True, add_dependency=True,
        add_edjefa=True, add_edjefe=True, add_elec=True,
        add_elec_missing=True, add_fem_perc=True, add_floor=True,
        add_...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'features__add_G_people_count': [True], 'clf__leaf_size': [24], 'clf__n_neighbors': [4], 'features__add_it_equipement': [False], 'features__add_qmobilephone': [True], 'features__add_floor': [False], 'features__add_elec': [True], 'features__add_G_percentage_under_15': [True], 'features__...rue], 'clf__weights': ['distance'], 'features__add_edjefa': [True], 'features__add_edjefe': [True]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [265]:
best_clf = rf_random.best_estimator_

In [266]:
cvres = rf_random.cv_results_
for mean_score, std_score, params in zip(cvres['mean_test_score'], cvres['std_test_score'], cvres['params']):
    print(mean_score * 100, std_score * 100, params)

35.180121773607475 3.070341254304128 {'features__add_G_people_count': True, 'clf__leaf_size': 24, 'clf__n_neighbors': 4, 'features__add_overcrowding': True, 'features__add_fem_perc': True, 'features__add_floor': False, 'features__add_elec': True, 'features__add_G_percentage_under_15': True, 'features__add_no_facilities': False, 'features__add_rent_pc': False, 'features__add_schooling_avg': True, 'clf__p': 1, 'features__add_SQBhogar_nin': True, 'features__add_it_equipement': False, 'features__add_dependency': True, 'features__add_roof': False, 'features__add_cielorazo': False, 'features__add_size_pp': False, 'features__add_house_state': False, 'features__add_elec_missing': False, 'features__add_rent': False, 'features__add_rooms': True, 'features__add_qmobilephone': True, 'features__add_edjefe': True, 'features__add_edjefa': True, 'clf__weights': 'distance'}


In [267]:
print(rf_random.best_score_ * 100)

35.180121773607475


The initial idea was to try all the combinations of the 22 features: 2^22 = 4 millions possibilities. This is not doable in a reasonable amount of time. 

Possible improve