In [60]:
import pandas as pd

In [61]:
# This will be an helper method to reuse the code between train and test set

class HouseholdDataFrame():
    def __init__(self, individuals): 
        self.individuals = individuals
        self.grouped_individuals = individuals.groupby('idhogar')
        self.household = pd.DataFrame()
        
    def X(self):
        self.household['SQBedjefe'] = self.grouped_individuals.SQBedjefe.mean()
        self.household['SQBdependency'] = self.grouped_individuals.SQBdependency.mean()
        self.household['overcrowding'] = self.grouped_individuals.overcrowding.mean()
        self.household['qmobilephone'] = self.grouped_individuals.qmobilephone.mean()
        self.household['rooms'] = self.grouped_individuals.rooms.mean()
        self.household['SQBhogar_nin'] = self.grouped_individuals.SQBhogar_nin.mean()
        self.household['G_people_count'] = self.grouped_individuals.age.count()
        self.household['G_percentage_under_5'] = self.G_percentage_under(5)
        self.household['G_percentage_under_9'] = self.G_percentage_under(9)
        self.household['G_percentage_under_12'] = self.G_percentage_under(12)
        self.household['G_percentage_under_15'] = self.G_percentage_under(15)
        self.household['G_percentage_under_17'] = self.G_percentage_under(17)
        # self.household['G_percentage_male'] = self.G_percentage_male().replace({None: 0., np.NaN: 0.0})
            
        return self.household.loc[:, self.household.columns != 'Target']
        
    def G_percentage_under(self, age):
        return (self.individuals[self.individuals.age <= age].groupby('idhogar').Id.count()/ self.household['G_people_count']).replace({None: 0., np.NaN: 0.0})
    
    # def G_percentage_male(self):
    #    return self.individuals[self.individuals.age <= 12].groupby('idhogar').Id.count()/ self.household['G_people_count']

    
    def y(self):
        self.household['Target'] = self.grouped_individuals.Target.mean().round().astype(int)
        return self.household.Target
    

In [62]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self, remove_SQBedjefe, remove_SQBdependency, remove_overcrowding,
                 remove_qmobilephone, remove_rooms, remove_SQBhogar_nin, remove_G_people_count,
                 remove_G_percentage_under_5, remove_G_percentage_under_9, remove_G_percentage_under_12,
                 remove_G_percentage_under_15, remove_G_percentage_under_17):
        self.remove_SQBedjefe = remove_SQBedjefe 
        self.remove_SQBdependency = remove_SQBdependency
        self.remove_overcrowding = remove_overcrowding
        self.remove_qmobilephone = remove_qmobilephone
        self.remove_rooms = remove_rooms
        self.remove_SQBhogar_nin = remove_SQBhogar_nin
        self.remove_G_people_count = remove_G_people_count
        self.remove_G_percentage_under_5 = remove_G_percentage_under_5
        self.remove_G_percentage_under_9 = remove_G_percentage_under_9
        self.remove_G_percentage_under_12 = remove_G_percentage_under_12
        self.remove_G_percentage_under_15 = remove_G_percentage_under_15
        self.remove_G_percentage_under_17 = remove_G_percentage_under_17

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.__selected_features__]
    
    def __selected_features__(self, X):
        all_features = X.columns.tolist()
        if self.remove_SQBedjefe:
            all_features.remove('SQBedjefe')
        if self.remove_SQBdependency:
            all_features.remove('SQBdependency')
        if self.remove_overcrowding:
            all_features.remove('overcrowding')
        if self.remove_qmobilephone:
            all_features.remove('qmobilephone')
        if self.remove_rooms:
            all_features.remove('rooms')
        if self.remove_SQBhogar_nin:
            all_features.remove('SQBhogar_nin')
        if self.remove_G_people_count:
            all_features.remove('G_people_count')
        if self.remove_G_percentage_under_5:
            all_features.remove('G_percentage_under_5')
        if self.remove_G_percentage_under_9:
            all_features.remove('G_percentage_under_9')
        if self.remove_G_percentage_under_12:
            all_features.remove('G_percentage_under_12')
        if self.remove_G_percentage_under_15:
            all_features.remove('G_percentage_under_15')
        if self.remove_G_percentage_under_17:
            all_features.remove('G_percentage_under_17')
            

        return all_features

# First feature engineering
Out of my notebook, feature-selection I will take the important features, that are already aggregated on a Household level.

In [63]:
individuals_df = pd.read_csv("data/train.csv")
households_df = HouseholdDataFrame(individuals_df)
X = households_df.X()
y = households_df.y()

# Pipeline & cross validation

In [72]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

remove_SQBedjefe = False
remove_SQBdependency = True
remove_overcrowding = True
remove_qmobilephone = True
remove_rooms = True
remove_SQBhogar_nin = True
remove_G_people_count = True
remove_G_percentage_under_5 = False
remove_G_percentage_under_9 = False
remove_G_percentage_under_12 = False
remove_G_percentage_under_15 = False
remove_G_percentage_under_17 = False

feature_pipeline = Pipeline([
        ('features', FeatureSelection(remove_SQBedjefe, remove_SQBdependency, remove_overcrowding,
                                     remove_qmobilephone, remove_rooms, remove_SQBhogar_nin, remove_G_people_count,
                                     remove_G_percentage_under_5, remove_G_percentage_under_9,
                                     remove_G_percentage_under_12, remove_G_percentage_under_15,
                                     remove_G_percentage_under_17)),
        ('clf', RandomForestClassifier()),
    ])

params_grid = [
    {
        'features__remove_SQBedjefe': [False],
        # 'features__remove_SQBdependency': [False],
        'features__remove_overcrowding': [False],
        'features__remove_qmobilephone': [False],
        'features__remove_rooms': [False],
        'features__remove_SQBhogar_nin': [False],
        'features__remove_G_people_count': [True, False],
        'features__remove_G_percentage_under_5': [True, False],
        'features__remove_G_percentage_under_9': [True, False],
        'features__remove_G_percentage_under_12': [True, False],
        'features__remove_G_percentage_under_15' : [True, False],
        'features__remove_G_percentage_under_17': [True, False],
    }
]

grid_search = GridSearchCV(feature_pipeline, params_grid, cv=3, scoring='f1_macro')
grid_search.fit(X, y)
None

In [73]:
cvres = grid_search.cv_results_
for mean_score, std_score, params in zip(cvres['mean_test_score'], cvres['std_test_score'], cvres['params']):
    print(mean_score * 100, std_score * 100, params)

32.74686264889951 1.2777070496652694 {'features__remove_SQBhogar_nin': False, 'features__remove_rooms': False, 'features__remove_G_people_count': True, 'features__remove_qmobilephone': False, 'features__remove_overcrowding': False, 'features__remove_G_percentage_under_9': True, 'features__remove_G_percentage_under_17': True, 'features__remove_G_percentage_under_12': True, 'features__remove_G_percentage_under_15': True, 'features__remove_SQBedjefe': False, 'features__remove_G_percentage_under_5': True}
32.23046773197876 1.6619861379687604 {'features__remove_SQBhogar_nin': False, 'features__remove_rooms': False, 'features__remove_G_people_count': True, 'features__remove_qmobilephone': False, 'features__remove_overcrowding': False, 'features__remove_G_percentage_under_9': False, 'features__remove_G_percentage_under_17': True, 'features__remove_G_percentage_under_12': True, 'features__remove_G_percentage_under_15': True, 'features__remove_SQBedjefe': False, 'features__remove_G_percentage_u

In [74]:
grid_search.best_params_

{'features__remove_G_people_count': False,
 'features__remove_G_percentage_under_12': False,
 'features__remove_G_percentage_under_15': True,
 'features__remove_G_percentage_under_17': False,
 'features__remove_G_percentage_under_5': False,
 'features__remove_G_percentage_under_9': True,
 'features__remove_SQBedjefe': False,
 'features__remove_SQBhogar_nin': False,
 'features__remove_overcrowding': False,
 'features__remove_qmobilephone': False,
 'features__remove_rooms': False}

In [75]:
grid_search.best_score_

0.3527079588308707