In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold


In [7]:
#Cargo los sets
X_train = pd.read_csv('data/X_train.csv', dtype={'id':'int',\
                                           'tipodepropiedad':'category',\
                                           'ciudad':'category', 'provincia':'category',\
                                           'antiguedad':'float', 'habitaciones':'float',\
                                           'garages':'float','banos':'float',\
                                           'metroscubiertos':'float', 'metrostotales':'float',\
                                           'idzona':'float',\
                                           'gimnasio':'bool', 'usosmultiples':'bool',\
                                           'piscina':'bool', 'escuelascercanas':'bool',\
                                           'centroscomercialescercanos':'bool'\
                                           }, index_col='id')
y_train = pd.read_csv('data/y_train.csv',header=None,squeeze=True,index_col=0)
X_test = pd.read_csv('data/X_test.csv', dtype={'id':'int',\
                                           'tipodepropiedad':'category',\
                                           'ciudad':'category', 'provincia':'category',\
                                           'antiguedad':'float', 'habitaciones':'float',\
                                           'garages':'float','banos':'float',\
                                           'metroscubiertos':'float', 'metrostotales':'float',\
                                           'idzona':'float',\
                                           'gimnasio':'bool', 'usosmultiples':'bool',\
                                           'piscina':'bool', 'escuelascercanas':'bool',\
                                           'centroscomercialescercanos':'bool'\
                                           }, index_col='id')
y_test = pd.read_csv('data/y_test.csv',header=None,squeeze=True,index_col=0)

In [8]:
X_train['log_precio'] = np.log1p(y_train)
X_train = X_train.iloc[0:100,]

In [9]:
categorical_features = ['tipodepropiedad','provincia','ciudad']

In [10]:
# This way we have randomness and are able to reproduce the behaviour within this cell.
np.random.seed(13)

def impact_coding(data, feature, target='log_precio'):
    '''
    In this implementation we get the values and the dictionary as two different steps.
    This is just because initially we were ignoring the dictionary as a result variable.
    
    In this implementation the KFolds use shuffling. If you want reproducibility the cv 
    could be moved to a parameter.
    '''
    n_folds = 20
    n_inner_folds = 10
    impact_coded = pd.Series()
    
    oof_default_mean = data[target].mean() # Gobal mean to use by default (you could further tune this)
    kf = KFold(n_splits=n_folds, shuffle=True)
    oof_mean_cv = pd.DataFrame()
    split = 0
    for infold, oof in kf.split(data[feature]):
            impact_coded_cv = pd.Series()
            kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)
            inner_split = 0
            inner_oof_mean_cv = pd.DataFrame()
            oof_default_inner_mean = data.iloc[infold][target].mean()
            for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):
                # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)
                oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()
                impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(
                            lambda x: oof_mean[x[feature]]
                                      if x[feature] in oof_mean.index
                                      else oof_default_inner_mean
                            , axis=1))

                # Also populate mapping (this has all group -> mean for all inner CV folds)
                inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')
                inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)
                inner_split += 1

            # Also populate mapping
            oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')
            oof_mean_cv.fillna(value=oof_default_mean, inplace=True)
            split += 1
            
            impact_coded = impact_coded.append(data.iloc[oof].apply(
                            lambda x: inner_oof_mean_cv.loc[x[feature]].mean()
                                      if x[feature] in inner_oof_mean_cv.index
                                      else oof_default_mean
                            , axis=1))

    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean

# Apply the encoding to training and test data, and preserve the mapping
impact_coding_map = {}
for f in categorical_features:
    print("Impact coding for {}".format(f))
    X_train["impact_encoded_{}".format(f)], impact_coding_mapping, default_coding = impact_coding(X_train, f)
    impact_coding_map[f] = (impact_coding_mapping, default_coding)
    mapping, default_mean = impact_coding_map[f]
    X_test["impact_encoded_{}".format(f)] = X_test.apply(lambda x: mapping[x[f]]
                                                                         if x[f] in mapping
                                                                         else default_mean
                                                               , axis=1)

Impact coding for tipodepropiedad
Impact coding for provincia
Impact coding for ciudad


In [11]:
X_train

Unnamed: 0_level_0,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,...,piscina,escuelascercanas,centroscomercialescercanos,año,mes,dia,log_precio,impact_encoded_tipodepropiedad,impact_encoded_provincia,impact_encoded_ciudad
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
120542,Casa,Mazatlán,Sinaloa,8.121536,3.000000,2.0,3.000000,130.000000,144.000000,346913.0,...,False,False,False,2016,12,15,14.483084,14.442381,14.486873,14.488351
11955,Casa,Xalapa,Veracruz,0.000000,3.000000,1.0,3.000000,170.000000,130.000000,106912.0,...,False,False,False,2013,3,19,14.430697,14.443098,14.291340,14.291340
38491,Apartamento,Cuauhtémoc,Distrito Federal,5.000000,2.000000,1.0,1.000000,64.000000,64.000000,23835.0,...,False,False,False,2014,10,6,14.200774,14.667600,14.790978,14.716514
235712,Local Comercial,Atizapán de Zaragoza,Edo. de México,8.121536,3.000000,0.0,1.000000,50.000000,176.812747,55785.0,...,False,False,False,2016,5,17,14.119313,14.380192,14.674919,14.519360
273251,Apartamento,La Magdalena Contreras,Distrito Federal,4.000000,2.000000,1.0,1.000000,50.000000,50.000000,24517.0,...,False,True,False,2016,12,28,13.997833,14.666299,14.791475,15.171152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18710,Apartamento,Cuauhtémoc,Distrito Federal,1.000000,1.000000,0.0,1.000000,51.000000,51.000000,274638.0,...,False,False,False,2015,4,28,14.430480,14.666454,14.790964,14.718624
213059,Terreno,Chihuahua,Chihuahua,8.121536,2.903351,0.0,2.133895,350.000000,350.000000,262786.0,...,False,True,True,2015,3,18,14.058458,14.208108,14.524737,14.539078
273441,Apartamento,Coyoacán,Distrito Federal,4.000000,3.000000,3.0,3.000000,145.000000,176.812747,23668.0,...,False,True,True,2014,5,6,15.201805,14.666152,14.789835,14.057979
267709,Casa,Atlatlahucan,Morelos,0.000000,3.000000,2.0,2.133895,300.000000,176.812747,66866.0,...,True,True,False,2012,10,2,14.841553,14.443320,14.292354,14.507099
