In [66]:
import pandas as pd

In [67]:
# This will be an helper method to reuse the code between train and test set

class HouseholdDataFrame():
    def __init__(self, individuals): 
        self.individuals = individuals
        self.grouped_individuals = individuals.groupby('idhogar')
        self.household = pd.DataFrame()
        
    def X(self):
        self.household['SQBedjefe'] = self.grouped_individuals.SQBedjefe.mean()
        self.household['SQBdependency'] = self.grouped_individuals.SQBdependency.mean()
        self.household['overcrowding'] = self.grouped_individuals.overcrowding.mean()
        self.household['qmobilephone'] = self.grouped_individuals.qmobilephone.mean()
        self.household['rooms'] = self.grouped_individuals.rooms.mean()
        self.household['SQBhogar_nin'] = self.grouped_individuals.SQBhogar_nin.mean()
        self.household['G_people_count'] = self.grouped_individuals.age.count()
      
        return self.household.loc[:, self.household.columns != 'Target']
    
    def y(self):
        self.household['Target'] = self.grouped_individuals.Target.mean().round().astype(int)
        return self.household.Target
    

In [68]:
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self, remove_SQBedjefe, remove_SQBdependency, remove_overcrowding,
                 remove_qmobilephone, remove_rooms, remove_SQBhogar_nin, remove_G_people_count):
        self.remove_SQBedjefe = remove_SQBedjefe 
        self.remove_SQBdependency = remove_SQBdependency
        self.remove_overcrowding = remove_overcrowding
        self.remove_qmobilephone = remove_qmobilephone
        self.remove_rooms = remove_rooms
        self.remove_SQBhogar_nin = remove_SQBhogar_nin
        self.remove_G_people_count = remove_G_people_count

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.__selected_features__]
    
    def __selected_features__(self, X):
        all_features = X.columns.tolist()
        if self.remove_SQBedjefe:
            all_features.remove('SQBedjefe')
        if self.remove_SQBdependency:
            all_features.remove('SQBdependency')
        if self.remove_overcrowding:
            all_features.remove('overcrowding')
        if self.remove_qmobilephone:
            all_features.remove('qmobilephone')
        if self.remove_rooms:
            all_features.remove('rooms')
        if self.remove_SQBhogar_nin:
            all_features.remove('SQBhogar_nin')
        if self.remove_G_people_count:
            all_features.remove('G_people_count')
        return all_features

# First feature engineering
Out of my notebook, feature-selection I will take the important features, that are already aggregated on a Household level.

In [69]:
individuals_df = pd.read_csv("data/train.csv")
households_df = HouseholdDataFrame(individuals_df)
X = households_df.X()
y = households_df.y()

# Pipeline & cross validation

In [71]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

remove_SQBedjefe = False
remove_SQBedjefe = False
remove_SQBdependency = False
remove_overcrowding = False
remove_qmobilephone = False
remove_rooms = False
remove_SQBhogar_nin = False
remove_G_people_count = False

feature_pipeline = Pipeline([
        ('features', FeatureSelection(remove_SQBedjefe, remove_SQBdependency, remove_overcrowding,
                                     remove_qmobilephone, remove_rooms, remove_SQBhogar_nin, remove_G_people_count)),
        ('clf', RandomForestClassifier()),
    ])

params_grid = [
    {
        'features__remove_SQBedjefe': [False],
        'features__remove_SQBdependency': [True, False],
        'features__remove_overcrowding': [True, False],
        'features__remove_qmobilephone': [True, False],
        'features__remove_rooms': [True, False],
        'features__remove_SQBhogar_nin': [True, False],
        'features__remove_G_people_count': [True, False]
    }
]

grid_search = GridSearchCV(feature_pipeline, params_grid, cv=3, scoring='f1_macro')
grid_search.fit(X, y)
None

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [72]:
cvres = grid_search.cv_results_
for mean_score, std_score, params in zip(cvres['mean_test_score'], cvres['std_test_score'], cvres['params']):
    print(mean_score, std_score, params)

0.19775434547799361 0.00012857755807954682 {'features__remove_rooms': True, 'features__remove_G_people_count': True, 'features__remove_qmobilephone': True, 'features__remove_SQBhogar_nin': True, 'features__remove_SQBedjefe': False, 'features__remove_SQBdependency': True, 'features__remove_overcrowding': True}
0.22032234868717923 0.005559507665981944 {'features__remove_rooms': False, 'features__remove_G_people_count': True, 'features__remove_qmobilephone': True, 'features__remove_SQBhogar_nin': True, 'features__remove_SQBedjefe': False, 'features__remove_SQBdependency': True, 'features__remove_overcrowding': True}
0.21956112786657356 0.010434813806862196 {'features__remove_rooms': True, 'features__remove_G_people_count': True, 'features__remove_qmobilephone': False, 'features__remove_SQBhogar_nin': True, 'features__remove_SQBedjefe': False, 'features__remove_SQBdependency': True, 'features__remove_overcrowding': True}
0.2587385936469856 0.02043871262027074 {'features__remove_rooms': Fal

In [74]:
grid_search.best_params_

{'features__remove_G_people_count': True,
 'features__remove_SQBdependency': False,
 'features__remove_SQBedjefe': False,
 'features__remove_SQBhogar_nin': False,
 'features__remove_overcrowding': False,
 'features__remove_qmobilephone': False,
 'features__remove_rooms': False}