원본 커널 : https://www.kaggle.com/skooch/xgboost

In [1]:
path = 'C:/Users/user/Desktop/kaggle_data/04. costa-rican-household-poverty-prediction/'

In [2]:
import numpy as np # linear algebra
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight

import warnings
warnings.filterwarnings("ignore")

<br>

### Categorical mappping

In [4]:
from sklearn.preprocessing import LabelEncoder

def encode_data(df):
    df['idhogar'] = LabelEncoder().fit_transform(df['idhogar'])
    
def feature_importance(forest, X_train, display_results = True):
    ranked_list = []
    zero_features = []
    
    importances = forest.feature_importances_
    
    indices = np.argsort(importances)[::-1]
    
    if display_results:
        print('Feature ranking: ')
        
    for f in range(X_train.shape[1]):
        if display_results:
            print('{}. feature {} ({})'.format(f+1, indices[f], importances[indices[f]]) + ' - ' + X_train.columns[indices[f]])
            
        ranked_list.append(X_train.columns[indices[f]])
        
        if importances[indices[f]] == 0.0 :
            zero_features.append(X_train.columns[indices[f]])
            
    return ranked_list, zero_features

<br>

### Feature Engineering

In [5]:
def do_features(df):
    feats_div = [('children_fraction', 'r4t1', 'r4t3'), 
                 ('working_man_fraction', 'r4h2', 'r4t3'),
                 ('all_man_fraction', 'r4h3', 'r4t3'),
                 ('human_density', 'tamviv', 'rooms'),
                 ('human_bed_density', 'tamviv', 'bedrooms'),
                 ('rent_per_person', 'v2a1', 'r4t3'),
                 ('rent_per_room', 'v2a1', 'rooms'),
                 ('mobile_density', 'qmobilephone', 'r4t3'),
                 ('tablet_density', 'v18q1', 'r4t3'),
                 ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                 ('tablet_adult_density', 'v18q1', 'r4t2'),
                ]
    
    feats_sub = [('people_not_living', 'tamhog', 'tamviv'),
                 ('people_weird_stat', 'tamhog', 'r4t3')]

    for f_new, f1, f2 in feats_div:
        df['fe_' + f_new] = (df[f1] / df[f2]).astype(np.float32)       
    for f_new, f1, f2 in feats_sub:
        df['fe_' + f_new] = (df[f1] - df[f2]).astype(np.float32)
    
    
    aggs_num = {'age': ['min', 'max', 'mean'],
                'escolari': ['min', 'max', 'mean']
               }
    
    aggs_cat = {'dis': ['mean']}
    for s_ in ['estadocivil', 'parentesco', 'instlevel']:
        for f_ in [f_ for f_ in df.columns if f_.startswith(s_)]:
            aggs_cat[f_] = ['mean', 'count']

    
    for name_, df_ in [('18', df.query('age >= 18'))]:
        df_agg = df_.groupby('idhogar').agg({**aggs_num, **aggs_cat}).astype(np.float32)
        df_agg.columns = pd.Index(['agg' + name_ + '_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
        df = df.join(df_agg, how='left', on='idhogar')
        del df_agg

   
    df.drop(['Id'], axis=1, inplace=True)
    
    return df

In [22]:
# convert one hot encoded fields to label encoding
def convert_OHE2LE(df):
    tmp_df = df.copy(deep=True)
    for s_ in ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 
               'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 
               'instlevel', 'lugar', 'tipovivi',
               'manual_elec']:
        if 'manual_' not in s_:
            cols_s_ = [f_ for f_ in df.columns if f_.startswith(s_)]
        elif 'elec' in s_:
            cols_s_ = ['public', 'planpri', 'noelec', 'coopele']
        sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
        #deal with those OHE, where there is a sum over columns == 0
        if 0 in sum_ohe:
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(s_))
            # dummy colmn name to be added
            col_dummy = s_+'_dummy'
            # add the column to the dataframe
            tmp_df[col_dummy] = (tmp_df[cols_s_].sum(axis=1) == 0).astype(np.int8)
            # add the name to the list of columns to be label-encoded
            cols_s_.append(col_dummy)
            # proof-check, that now the category is complete
            sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
            if 0 in sum_ohe:
                 print("The category completion did not work")
        tmp_cat = tmp_df[cols_s_].idxmax(axis=1)
        tmp_df[s_ + '_LE'] = LabelEncoder().fit_transform(tmp_cat).astype(np.int16)
        if 'parentesco1' in cols_s_:
            cols_s_.remove('parentesco1')
        tmp_df.drop(cols_s_, axis=1, inplace=True)
    return tmp_df

<br>

### 데이터 정제

In [3]:
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')

test_ids = test.Id

In [6]:
def process_df(df_):
    encode_data(df_) # 인코딩 함수
    
    return do_features(df_) # FE 함수

train = process_df(train)
test = process_df(test)

In [7]:
train.head()

Unnamed: 0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,...,agg18_instlevel5_MEAN,agg18_instlevel5_COUNT,agg18_instlevel6_MEAN,agg18_instlevel6_COUNT,agg18_instlevel7_MEAN,agg18_instlevel7_COUNT,agg18_instlevel8_MEAN,agg18_instlevel8_COUNT,agg18_instlevel9_MEAN,agg18_instlevel9_COUNT
0,190000.0,0,3,0,1,1,0,,0,1,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,135000.0,0,4,0,1,1,1,1.0,0,1,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,,0,8,0,1,1,0,,0,0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
3,180000.0,0,5,0,1,1,1,1.0,0,2,...,1.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0
4,180000.0,0,5,0,1,1,1,1.0,0,2,...,1.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0


In [8]:
test.head()

Unnamed: 0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,...,agg18_instlevel5_MEAN,agg18_instlevel5_COUNT,agg18_instlevel6_MEAN,agg18_instlevel6_COUNT,agg18_instlevel7_MEAN,agg18_instlevel7_COUNT,agg18_instlevel8_MEAN,agg18_instlevel8_COUNT,agg18_instlevel9_MEAN,agg18_instlevel9_COUNT
0,,0,5,0,1,1,0,,1,1,...,0.0,2.0,0.0,2.0,0.0,2.0,0.5,2.0,0.5,2.0
1,,0,5,0,1,1,0,,1,1,...,0.0,2.0,0.0,2.0,0.0,2.0,0.5,2.0,0.5,2.0
2,,0,5,0,1,1,0,,1,1,...,0.0,2.0,0.0,2.0,0.0,2.0,0.5,2.0,0.5,2.0
3,,0,14,0,1,1,1,1.0,0,1,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
4,175000.0,0,4,0,1,1,1,1.0,0,0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [9]:
train.shape, test.shape

((9557, 218), (23856, 217))

<br>

#### 결측치 제거 및 문자형을 숫자형으로 바꾸기

In [10]:
# dependency는 모두 제곱값으로 바꾸기
train['dependency'] = np.sqrt(train['SQBdependency'])
test['dependency'] = np.sqrt(test['SQBdependency'])

# education == no 는 0으로 바꾸기
train.loc[train['edjefa'] == 'no', 'edjefa'] = 0
train.loc[train['edjefe'] == 'no', 'edjefe'] = 0
test.loc[test['edjefa'] == 'no', 'edjefa'] = 0
test.loc[test['edjefe'] == 'no', 'edjefe'] = 0



In [11]:
# 교육 = yes, 이고 가장일 경우에 escolari 로 만들기

train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'edjefa'] = train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'escolari']
train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'edjefe'] = train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'escolari']

test.loc[(test['edjefa'] == "yes") & (test['parentesco1'] == 1), "edjefa"] = test.loc[(test['edjefa'] == "yes") & (test['parentesco1'] == 1), "escolari"]
test.loc[(test['edjefe'] == "yes") & (test['parentesco1'] == 1), "edjefe"] = test.loc[(test['edjefe'] == "yes") & (test['parentesco1'] == 1), "escolari"]

In [13]:
# edjefa == yes를 - > 숫자 4로

train.loc[train['edjefa'] == 'yes', 'edjefa'] = 4
train.loc[train['edjefe'] == 'yes', 'edjefe'] = 4

test.loc[test['edjefa'] == 'yes', 'edjefa'] = 4
test.loc[test['edjefe'] == 'yes', 'edjefe'] = 4

In [14]:
train['edjefe'] = train['edjefe'].astype(int)
train['edjefa'] = train['edjefa'].astype(int)
test['edjefe'] = test['edjefe'].astype(int)
test['edjefa'] = test['edjefa'].astype(int)

In [16]:
train['edjef'] = np.max(train[['edjefa', 'edjefe']], axis = 1)
test['edjef'] = np.max(test[['edjefa', 'edjefe']], axis = 1)

In [17]:
# nan 값 대체

train['v2a1'] = train['v2a1'].fillna(0)
test['v2a1'] = test['v2a1'].fillna(0)

train['v18q1'] = train['v18q1'].fillna(0)
test['v18q1'] = test['v18q1'].fillna(0)

train['rez_esc'] = train['rez_esc'].fillna(0)
test['rez_esc'] = test['rez_esc'].fillna(0)

train.loc[train.meaneduc.isnull(), 'meaneduc'] = 0
train.loc[train.SQBmeaned.isnull(), 'SQBmeaned'] = 0

test.loc[test.meaneduc.isnull(), 'meaneduc'] = 0
test.loc[test.SQBmeaned.isnull(), 'SQBmeaned'] = 0

In [18]:
train.loc[(train.v14a == 1) & (train.sanitario1 == 1) & (train.abastaguano == 0), 'v14a'] = 0
train.loc[(train.v14a == 1) & (train.sanitario1 == 1) & (train.abastaguano == 0), 'sanitario1'] = 0

test.loc[(test.v14a == 1) & (test.sanitario1 == 1) & (test.abastaguano == 0), 'v14a'] = 0
test.loc[(test.v14a == 1) & (test.sanitario1 == 1) & (test.abastaguano == 0), 'sanitario1'] = 0


In [19]:
def train_test_apply_func(train_, test_, func_):
    
    test_['Target'] = 0
    xx = pd.concat([train_, test_])
    
    xx_func = func_(xx)
    train_ = xx_func.iloc[:train_.shape[0], :]
    test_ = xx_func.iloc[train_.shape[0]:, :].drop('Target', axis = 1)
    
    del xx, xx_func
    return train_, test_
    

In [23]:
train, test = train_test_apply_func(train, test, convert_OHE2LE)

The OHE in techo is incomplete. A new column will be added before label encoding
The OHE in instlevel is incomplete. A new column will be added before label encoding
The OHE in manual_elec is incomplete. A new column will be added before label encoding


<br>

### Geo aggregates

In [27]:
cols_2_ohe = ['eviv_LE', 'etecho_LE', 'epared_LE', 'elimbasu_LE', 
              'energcocinar_LE', 'sanitario_LE', 'manual_elec_LE',
              'pared_LE']
cols_nums = ['age', 'meaneduc', 'dependency', 
             'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total',
             'bedrooms', 'overcrowding']


def convert_geo2aggs(df_):
    tmp_df = pd.concat([df_[(['lugar_LE', 'idhogar'] + cols_nums)], pd.get_dummies(df_[cols_2_ohe], columns = cols_2_ohe)], axis =1)
    
    geo_agg = tmp_df.groupby(['lugar_LE', 'idhogar']).mean().groupby('lugar_LE').mean().astype(np.float32)
    geo_agg.columns = pd.Index(['geo_' + e for e in geo_agg.columns.tolist()])
    
    del tmp_df
    return df_.join(geo_agg, how = 'left', on = 'lugar_LE')

train, test = train_test_apply_func(train, test, convert_geo2aggs)

In [28]:
train['num_over_18'] = 0
train['num_over_18'] = train[train.age >= 18].groupby('idhogar').transform("count")
train['num_over_18'] = train.groupby("idhogar")["num_over_18"].transform("max")
train['num_over_18'] = train['num_over_18'].fillna(0)

test['num_over_18'] = 0
test['num_over_18'] = test[test.age >= 18].groupby('idhogar').transform("count")
test['num_over_18'] = test.groupby("idhogar")["num_over_18"].transform("max")
test['num_over_18'] = test['num_over_18'].fillna(0)

In [29]:
def extract_features(df):
    df['bedrooms_to_rooms'] = df['bedrooms']/df['rooms']
    df['rent_to_rooms'] = df['v2a1']/df['rooms']
    df['tamhog_to_rooms'] = df['tamhog']/df['rooms'] # tamhog - size of the household
    df['r4t3_to_tamhog'] = df['r4t3']/df['tamhog'] # r4t3 - Total persons in the household
    df['r4t3_to_rooms'] = df['r4t3']/df['rooms'] # r4t3 - Total persons in the household
    df['v2a1_to_r4t3'] = df['v2a1']/df['r4t3'] # rent to people in household
    df['v2a1_to_r4t3'] = df['v2a1']/(df['r4t3'] - df['r4t1']) # rent to people under age 12
    df['hhsize_to_rooms'] = df['hhsize']/df['rooms'] # rooms per person
    df['rent_to_hhsize'] = df['v2a1']/df['hhsize'] # rent to household size
    df['rent_to_over_18'] = df['v2a1']/df['num_over_18']
    # some households have no one over 18, use the total rent for those
    df.loc[df.num_over_18 == 0, "rent_to_over_18"] = df[df.num_over_18 == 0].v2a1

In [30]:
extract_features(train)
extract_features(test)

In [31]:
# drop duplicated columns

needless_cols = ['r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq',
                 'mobilephone', 'female']

instlevel_cols = [s for s in train.columns.tolist() if 'instlevel' in s]

needless_cols.extend(instlevel_cols)

train = train.drop(needless_cols, axis = 1)
test = test.drop(needless_cols, axis =1)

<br>

### Split the data



In [32]:
def split_data(train, y, sample_weight = None, households = None, test_percentage = 0.20, seed = None):
    train2 = train.copy()
    
    cv_hhs = np.random.choice(households, size = int(len(households) * test_percentage), replace = False)
    
    cv_idx = np.isin(households, cv_hhs)
    X_test = train2[cv_idx]
    y_test = y[cv_idx]
    
    X_train = train2[~cv_idx]
    y_train = y[~cv_idx]
    
    if sample_weight is not None:
        y_train_weights = sample_weight[~cv_idx]
        return X_train, y_train, X_test, y_test, y_train_weights
    
    return X_train, y_train, X_test, y_test

In [33]:
X = train.query('parentesco1==1')

y = X['Target'] - 1
X = X.drop(['Target'], axis = 1)

np.random.seed(seed = None)

train2 = X.copy()

train_hhs = train2.idhogar

households = train2.idhogar.unique()
cv_hhs = np.random.choice(households, size = int(len(households) * 0.15), replace = False)

cv_idx = np.isin(train2.idhogar, cv_hhs)

X_test = train2[cv_idx]
y_test = y[cv_idx]

X_train = train2[~cv_idx]
y_train = y[~cv_idx]


X_train = train2
y_train = y

train_households = X_train.idhogar

In [34]:
y_train_weights = class_weight.compute_sample_weight('balanced', y_train, indices = None)

In [35]:
extra_drop_features = [
 'agg18_estadocivil1_MEAN',
 'agg18_estadocivil6_COUNT',
 'agg18_estadocivil7_COUNT',
 'agg18_parentesco10_COUNT',
 'agg18_parentesco11_COUNT',
 'agg18_parentesco12_COUNT',
 'agg18_parentesco1_COUNT',
 'agg18_parentesco2_COUNT',
 'agg18_parentesco3_COUNT',
 'agg18_parentesco4_COUNT',
 'agg18_parentesco5_COUNT',
 'agg18_parentesco6_COUNT',
 'agg18_parentesco7_COUNT',
 'agg18_parentesco8_COUNT',
 'agg18_parentesco9_COUNT',
 'geo_elimbasu_LE_4',
 'geo_energcocinar_LE_1',
 'geo_energcocinar_LE_2',
 'geo_epared_LE_0',
 'geo_hogar_mayor',
 'geo_manual_elec_LE_2',
 'geo_pared_LE_3',
 'geo_pared_LE_4',
 'geo_pared_LE_5',
 'geo_pared_LE_6',
 'num_over_18',
 'parentesco_LE',
 'rez_esc']

In [36]:
xgb_drop_cols = extra_drop_features + ['idhogar', 'parentesco1']

<br>

### Fit a voting classifier
- Vote based on LGBM models with early stopping based on macro F1 and decaying learning rate.

In [37]:
opt_parameters = {'max_depth':35, 'eta':0.1, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 1, 'num_class': 4, 'gamma': 2.0, 'colsample_bylevel': 0.9, 'subsample': 0.84, 'colsample_bytree': 0.88, 'reg_lambda': 0.40 }
# 5
opt_parameters = {'max_depth':35, 'eta':0.15, 'silent':1, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.5, 'colsample_bylevel': 1, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35 }

In [38]:
def evaluate_macroF1_lgb(predictions, truth):
    pred_labels = predictions.argmax(axis = 1)
    truth = truth.get_label()
    f1 = f1_score(truth, pred_labels, average = 'macro')
    return ('macroF1', 1-f1)

fit_params = {'early_stopping_rounds': 500, 
             'eval_metric' : evaluate_macroF1_lgb,
             'eval_set': [(X_train, y_train), (X_test, y_test)],
             'verbose' : False}

In [39]:
def learning_rate_power_0997(current_iter):
    base_learning_rate = 0.1
    min_learning_rate = 0.02
    lr = base_learning_rate * np.power(.995, current_iter)
    return max(lr, min_learning_rate)

fit_params['verbose'] = 50

In [42]:
np.random.seed(100)

def _parallel_fit_estimator(estimator1, X, y ,sample_weight = None, threshold = True, **fit_params):
    estimator = clone(estimator1)
    
    if sample_weight is not None:
        X_train, y_train, X_test, y_test, y_train_weight = split_data(X, y, sample_weight, households = train_households)
    else:
        X_train, y_train, X_test, y_test = split_data(X, y, None, households = train_households)
        
    fit_params['eval_set'] = [(X_test, y_test)]
    
    if sample_weight is not None:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train)
        else:
            _ = estimator.fit(X_train, y_train, sample_weight = y_train_weight, **fit_params)
    else:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train)
        else:
            _ = estimator.fit(X_train, y_train, **fit_params)
    if not isinstance(estimator1, ExtraTreesClassifier) and not isinstance(estimator1, RandomForestClassifier) and not isinstance(estimator1, xgb.XGBClassifier):
        best_cv_round = np.argmax(estimator.evals_result_['validation_0']['mlogloss'])
        best_train = estimator.evals_result_['train']['macroF1'][best_cv_round]
    else:
        best_train = f1_score(y_train, estimator.predict(X_train), average = 'macro')
        best_cv = f1_score(y_test, estimator.predict(X_test), average = 'macro')
        print('Train F1: ', best_train)
        print('Test F1: ', best_cv)
        
    if threshold:
        if ((best_cv > 0.37) and (best_train > 0.75)) or ((best_cv > 0.44) and (best_train > 0.65)):
            return estimator
        
        else:
            print('Unacceptable!!!! Trying again ...')
            return _parallel_fit_estimator(estimator1, X, y, sample_weight = sample_weight, **fit_params)
        
    else:
        return estimator
    
    
        
    

In [45]:
class VotingClassifierLGBM(VotingClassifier):
    def fit(self, X, y, sample_weight = None, threshold = True, **fit_params):
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output', 'classification is not supported')
        if self.voting not in ('soft', 'hard'):
            raise ValueError('Voting must be "soft" or "hard"; got (voting = {})'.format( self.voting))
        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid "estimators" attribute, "estimators"'
                                ' should be a list of (string, estimator)'
                                'tuples')
        if (self.weights is not None and len(self.weights) != len(self.estimators)):
            raise ValueError('Number of classifiers and weights must be equal'
                            '; got {} weights, {} estimators'.format(len(self.weights), len(self.estimators)))
            
        names, clfs = zip(*self.estimators)
        self._validate_names(names)
        
        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is'
                            'required to be a classifier!')
            
        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []
        
        transformed_y = self.le_.transform(y)
        
        self.estimators_ = Parallel(n_jobs = self.n_jobs)(delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                                                                          sample_weight = sample_weight, threshold = threshold, **fit_params)
                                                          for clf in clfs if clf is not None)
        return self
        

In [46]:
clfs = []

for i in range(15):
    clf = xgb.XGBClassifier(random_state = 217+i, n_estimators = 300, learning_rate = 0.15, n_jobs = 4, **opt_parameters)
    
    clfs.append(('xgb{}'.format(i), clf))
    
vc = VotingClassifierLGBM(clfs, voting = 'soft')
del(clfs)

_ = vc.fit(X_train.drop(xgb_drop_cols, axis =1), y_train, sample_weight = y_train_weights, threshold = False, **fit_params)

clf_final = vc.estimators_[0]

[0]	validation_0-merror:0.43771	validation_0-macroF1:0.614644
Multiple eval metrics have been passed: 'validation_0-macroF1' will be used for early stopping.

Will train until validation_0-macroF1 hasn't improved in 500 rounds.
[50]	validation_0-merror:0.367003	validation_0-macroF1:0.567672
[100]	validation_0-merror:0.37037	validation_0-macroF1:0.578871
[150]	validation_0-merror:0.37037	validation_0-macroF1:0.576036
[200]	validation_0-merror:0.368687	validation_0-macroF1:0.572118
[250]	validation_0-merror:0.368687	validation_0-macroF1:0.579058
[299]	validation_0-merror:0.36532	validation_0-macroF1:0.57495
Train F1:  0.9052797369926351
Test F1:  0.43767296281004997
[0]	validation_0-merror:0.457912	validation_0-macroF1:0.636377
Multiple eval metrics have been passed: 'validation_0-macroF1' will be used for early stopping.

Will train until validation_0-macroF1 hasn't improved in 500 rounds.
[50]	validation_0-merror:0.392256	validation_0-macroF1:0.587039
[100]	validation_0-merror:0.388889

Will train until validation_0-macroF1 hasn't improved in 500 rounds.
[50]	validation_0-merror:0.392256	validation_0-macroF1:0.594613
[100]	validation_0-merror:0.393939	validation_0-macroF1:0.590772
[150]	validation_0-merror:0.397306	validation_0-macroF1:0.594742
[200]	validation_0-merror:0.388889	validation_0-macroF1:0.589454
[250]	validation_0-merror:0.387205	validation_0-macroF1:0.587828
[299]	validation_0-merror:0.385522	validation_0-macroF1:0.586648
Train F1:  0.8784555309619573
Test F1:  0.43760226783021694
[0]	validation_0-merror:0.488215	validation_0-macroF1:0.659458
Multiple eval metrics have been passed: 'validation_0-macroF1' will be used for early stopping.

Will train until validation_0-macroF1 hasn't improved in 500 rounds.
[50]	validation_0-merror:0.405724	validation_0-macroF1:0.599743
[100]	validation_0-merror:0.39899	validation_0-macroF1:0.59441
[150]	validation_0-merror:0.392256	validation_0-macroF1:0.589681
[200]	validation_0-merror:0.392256	validation_0-macroF1:0.592

In [47]:
# params 4- 400 early stop - 15 estimators - l1 used features - weighted

global_score = f1_score(y_test, clf_final.predict(X_test.drop(xgb_drop_cols, axis =1)), average = 'macro')

vc.voting = 'soft'
global_score_soft = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis = 1)), average = 'macro')

vc.voting = 'hard'
global_score_hard = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis =1)), average = 'macro')

print('Validation score of a single LGBM Classifier: {:.4f}'.format(global_score))

print("Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: {:.4f}".format(global_score_soft))

print('Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: {:.4f}'.format(global_score_hard))

Validation score of a single LGBM Classifier: 0.7938
Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.9071
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.9142


In [48]:
useless_features = []
drop_features = set()

counter = 0
for est in vc.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(xgb_drop_cols, axis = 1), display_results = False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
    counter += 1
    
drop_features

{'agg18_estadocivil4_COUNT',
 'agg18_estadocivil5_COUNT',
 'geo_energcocinar_LE_0',
 'geo_epared_LE_2',
 'geo_pared_LE_0'}

In [49]:
ranked_features = feature_importance(clf_final, X_train.drop(xgb_drop_cols, axis = 1))

Feature ranking: 
1. feature 14 (0.01970352604985237) - agg18_escolari_MAX
2. feature 125 (0.01817922107875347) - geo_sanitario_LE_2
3. feature 58 (0.018052292987704277) - fe_children_fraction
4. feature 32 (0.016881752759218216) - agg18_parentesco2_MEAN
5. feature 133 (0.01595587283372879) - geo_pared_LE_1
6. feature 49 (0.01521733496338129) - edjefe
7. feature 15 (0.013887185603380203) - agg18_escolari_MEAN
8. feature 1 (0.013547150418162346) - SQBdependency
9. feature 47 (0.012796520255506039) - edjef
10. feature 124 (0.011644174344837666) - geo_sanitario_LE_1
11. feature 112 (0.011553376913070679) - geo_etecho_LE_1
12. feature 116 (0.01144295372068882) - geo_elimbasu_LE_0
13. feature 90 (0.01123342290520668) - r4t1
14. feature 105 (0.010503770783543587) - geo_hogar_total
15. feature 52 (0.010420488193631172) - epared_LE
16. feature 59 (0.01026556920260191) - fe_human_bed_density
17. feature 19 (0.00999092310667038) - agg18_estadocivil2_MEAN
18. feature 25 (0.0099242078140378) - agg

<br>

### Random Forest

In [50]:
et_drop_cols = ['agg18_age_MAX', 'agg18_age_MEAN', 'agg18_age_MIN', 'agg18_dis_MEAN',
       'agg18_escolari_MAX', 'agg18_escolari_MEAN', 'agg18_escolari_MIN',
       'agg18_estadocivil1_COUNT', 'agg18_estadocivil1_MEAN',
       'agg18_estadocivil2_COUNT', 'agg18_estadocivil2_MEAN',
       'agg18_estadocivil3_COUNT', 'agg18_estadocivil3_MEAN',
       'agg18_estadocivil4_COUNT', 'agg18_estadocivil4_MEAN',
       'agg18_estadocivil5_COUNT', 'agg18_estadocivil5_MEAN',
       'agg18_estadocivil6_COUNT', 'agg18_estadocivil6_MEAN',
       'agg18_estadocivil7_COUNT', 'agg18_estadocivil7_MEAN',
       'agg18_parentesco10_COUNT', 'agg18_parentesco10_MEAN',
       'agg18_parentesco11_COUNT', 'agg18_parentesco11_MEAN',
       'agg18_parentesco12_COUNT', 'agg18_parentesco12_MEAN',
       'agg18_parentesco1_COUNT', 'agg18_parentesco1_MEAN',
       'agg18_parentesco2_COUNT', 'agg18_parentesco2_MEAN',
       'agg18_parentesco3_COUNT', 'agg18_parentesco3_MEAN',
       'agg18_parentesco4_COUNT', 'agg18_parentesco4_MEAN',
       'agg18_parentesco5_COUNT', 'agg18_parentesco5_MEAN',
       'agg18_parentesco6_COUNT', 'agg18_parentesco6_MEAN',
       'agg18_parentesco7_COUNT', 'agg18_parentesco7_MEAN',
       'agg18_parentesco8_COUNT', 'agg18_parentesco8_MEAN',
       'agg18_parentesco9_COUNT', 'agg18_parentesco9_MEAN']

In [51]:
et_drop_cols.extend(['idhogar', 'parentesco1', 'fe_rent_per_person', 'fe_rent_per_room', 'fe_tablet_adult_density', 'fe_tablet_density'])

In [53]:
# do the same thing for some extra trees classifiers
ets = []    
for i in range(10):
    rf = RandomForestClassifier(max_depth=None, random_state=217+i, n_jobs=4, n_estimators=700, min_impurity_decrease=1e-3, min_samples_leaf=2, verbose=0, class_weight="balanced")
    ets.append(('rf{}'.format(i), rf))   

vc2 = VotingClassifierLGBM(ets, voting='soft')    
_ = vc2.fit(X_train.drop(et_drop_cols, axis=1), y_train, threshold=False)  

Train F1:  0.8973744577845608
Test F1:  0.4119287284836981
Train F1:  0.8973405328848642
Test F1:  0.4044941482768548
Train F1:  0.8880139756454893
Test F1:  0.39881107834328067
Train F1:  0.891769612996069
Test F1:  0.43265498149713966
Train F1:  0.8921492656245431
Test F1:  0.452800720791523
Train F1:  0.8938989965907618
Test F1:  0.43800331626520606
Train F1:  0.8967698827248772
Test F1:  0.4109323467334529
Train F1:  0.8906362327579868
Test F1:  0.45892004679838216
Train F1:  0.9006816627520686
Test F1:  0.42879021233275155
Train F1:  0.8971414961089192
Test F1:  0.41035647077305115


In [54]:
# w/ threshold, extra drop cols

vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis = 1)), average ='macro')

vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis =1)), average = 'macro')

print('Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: {:.4f}'.format(global_rf_score_soft))
print('Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy : {:.4f}'.format(global_rf_score_hard))

Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.8596
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy : 0.8776


In [56]:
useless_features = []
drop_features = set()
counter = 0

for est in vc2.estimators_ : 
    ranked_features, unused_features = feature_importance(est, X_train.drop(et_drop_cols, axis =1), display_results = False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
        
    else:
        drop_features = drop_features.intersection(set(unused_features))
    counter += 1
    
drop_features

{'parentesco_LE', 'rez_esc'}

In [57]:
def combine_voters(data, weights = [0.5, 0.5]):
    vc.voting = 'soft'
    vc1_probs = vc.predict_proba(data.drop(xgb_drop_cols, axis =1))
    vc2.voting = 'soft'
    vc2_probs = vc2.predict_proba(data.drop(et_drop_cols, axis =1))
    
    final_vote = (vc1_probs * weights[0]) + (vc2_probs * weights[1])
    
    predictions = np.argmax(final_vote, axis = 1)
    
    return predictions

In [58]:
combo_preds = combine_voters(X_test, weights = [0.5, 0.5])
global_combo_score_soft = f1_score(y_test, combo_preds, average = 'macro')
global_combo_score_soft


0.8889950525664811

In [59]:
combo_preds = combine_voters(X_test, weights = [0.6, 0.4])
global_combo_score_soft = f1_score(y_test, combo_preds, average = 'macro')
global_combo_score_soft

0.8984463750262703

<br>

### Prepare submission

In [60]:
y_subm = pd.DataFrame()
y_subm['Id'] = test_ids

In [61]:
vc.voting = 'soft'
y_subm_lgb = y_subm.copy(deep = True)
y_subm_lgb['Target'] = vc.predict(test.drop(xgb_drop_cols, axis =1))+1

vc2.voting = 'soft'
y_subm_rf = y_subm.copy(deep=True)
y_subm_rf['Target'] = vc2.predict(test.drop(et_drop_cols, axis =1)) +1

y_subm_ens = y_subm.copy(deep = True)
y_subm_ens['Target'] = combine_voters(test) + 1

In [62]:
from datetime import datetime

now = datetime.now()

sub_file_lgb = 'submission_soft_XGB_{:.4f}_{}.csv'.format(global_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_rf = 'submission_soft_RF_{:.4f}_{}.csv'.format(global_rf_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_ens = 'submission_ens_{:.4f}_{}.csv'.format(global_combo_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))

# y_subm_lgb.to_csv(sub_file_lgb, index=False)
# y_subm_rf.to_csv(sub_file_rf, index=False)
# y_subm_ens.to_csv(sub_file_ens, index=False)