# Importación de librerías

In [None]:
#Importamos librerias y leemos los archivos de datos. 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import f1_score, make_scorer
import lightgbm as lgb
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_validate,KFold, RandomizedSearchCV

%matplotlib inline

plt.style.use('default') # use matplotlib's default style so plots look a bit nicer
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # set the seaborn grid style

pd.options.display.float_format = '{:20,.5f}'.format # suppress scientific notation in displayed outputs

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation and chained-assignment warnings raised by pandas/LightGBM.
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Explicit dtypes for the columns read so far (several features in the CSVs
# are not used yet and therefore get pandas' inferred dtype).
dtype_ = {
    'building_id': 'int32',
    'geo_level_1_id': 'int8',
    'geo_level_2_id': 'int16',
    'geo_level_3_id': 'int16',
    'count_floors_pre_eq': 'int8',
    'age': 'int16',
    'area_percentage': 'int8',
    'height_percentage': 'int8',
    # Categorical descriptors of the building.
    **dict.fromkeys(
        ('land_surface_condition',
         'foundation_type',
         'roof_type',
         'ground_floor_type',
         'other_floor_type'),
        'category'),
    # Superstructure flags, stored as int8.
    **dict.fromkeys(
        ('has_superstructure_adobe_mud',
         'has_superstructure_mud_mortar_stone',
         'has_superstructure_stone_flag',
         'has_superstructure_cement_mortar_stone',
         'has_superstructure_mud_mortar_brick',
         'has_superstructure_cement_mortar_brick',
         'has_superstructure_timber',
         'has_superstructure_bamboo',
         'has_superstructure_rc_non_engineered',
         'has_superstructure_rc_engineered',
         'has_superstructure_other'),
        'int8'),
}

# Training features and labels.
train_values = pd.read_csv('train_values.csv', dtype=dtype_)
train_labels = pd.read_csv(
    'train_labels.csv',
    dtype={'building_id': 'int32', 'damage_grade': 'int8'},
)

# Test features and the submission template (indexed by building_id).
test_values = pd.read_csv('test_values.csv', dtype=dtype_)
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

### F1 Scorer

In [None]:
# Scorer object wrapping micro-averaged F1, for use as a `scoring=` argument.
f1 = make_scorer(score_func=f1_score, average='micro')

# Feature Engineering

In [None]:
# Target encoding of the three geo levels. For each level we build a lookup
# array, indexed by geo id, that holds an aggregate of damage_grade over the
# training buildings sharing that id.
# NOTE(review): the lookups are computed from *all* training labels, so any
# cross-validation run on features derived from them sees label information
# from its own validation folds (those scores will be optimistic).
geo_level = train_values.loc[:, ['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']].merge(train_labels)


def _most_frequent(grades):
    """Return the most frequent value of a Series (the smallest one on ties)."""
    # Series.mode() returns the modes sorted ascending, so .iloc[0] is the
    # smallest mode. This replaces `stats.mode(x)[0][0]`, which fails on
    # SciPy versions where `mode` returns scalars instead of 1-element arrays.
    return grades.mode().iloc[0]


def _geo_lookup(labelled, col, aggfunc):
    """Build a float array mapping every id in ``0..max(col)`` to an aggregate.

    Parameters
    ----------
    labelled : DataFrame with the column ``col`` and a ``damage_grade`` column.
    col : name of the geo id column to group by.
    aggfunc : aggregation passed to ``groupby(col)['damage_grade'].agg``.

    Returns
    -------
    numpy.ndarray of length ``max(col) + 1``. Ids that never occur in
    ``labelled`` take the value of the closest smaller id that does occur;
    ids smaller than the first occurring id take that first id's value.
    """
    per_id = labelled.groupby(col)['damage_grade'].agg(aggfunc)
    ids = per_id.index.to_numpy().astype(np.int64)
    size = int(ids.max()) + 1

    values = np.zeros(size)
    values[ids] = per_id.to_numpy(dtype=float)

    # source[i] = id whose value position i should use. Ids present in the
    # data point at themselves; gaps are forward-filled with a running
    # maximum, and a leading gap falls back to the first id present (the
    # original loop used index -1 there, i.e. silently the *last* id).
    source = np.full(size, -1, dtype=np.int64)
    source[ids] = ids
    source = np.maximum.accumulate(source)
    source[source < 0] = ids.min()
    return values[source]


_geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# One lookup array per geo level (index 0 -> level 1, 1 -> level 2, 2 -> level 3).
gl_by_mean = [_geo_lookup(geo_level, col, 'mean') for col in _geo_cols]
gl_by_mode = [_geo_lookup(geo_level, col, _most_frequent) for col in _geo_cols]
# Element-wise sum of the mean and mode encodings for each level.
gl_combine = [gl_by_mean[i] + gl_by_mode[i] for i in range(3)]

# Integer codes for two of the categorical columns.
foundation_dict = {'i': 0, 'w': 1, 'u': 2, 'h': 3, 'r': 4}
# NOTE(review): 'q' and 'x' both map to 2 -- confirm this is intentional.
other_floor_dict = {'s': 0, 'j': 1, 'q': 2, 'x': 2}

def get_features(features):
    """Build the model's feature table from a raw ``*_values`` DataFrame.

    Parameters
    ----------
    features : DataFrame containing at least ``building_id``, the three
        ``geo_level_*_id`` columns, ``foundation_type``, ``age``,
        ``roof_type``, ``ground_floor_type``, ``other_floor_type``,
        ``count_floors_pre_eq``, ``height_percentage``, ``area_percentage``
        and any number of ``has_superstructure*`` columns.

    Returns
    -------
    A new DataFrame (the input is not modified) with those columns plus
    ``height_to_area_rt``, where:
      * each ``geo_level_*_id`` is replaced by its value in the module-level
        lookup arrays (``gl_by_mean`` for level 1, ``gl_combine`` for
        levels 2 and 3);
      * ``other_floor_type`` / ``foundation_type`` are replaced by the int8
        codes from ``other_floor_dict`` / ``foundation_dict``;
      * ``roof_type`` / ``ground_floor_type`` become booleans
        (``== 'x'`` and ``== 'v'`` respectively).

    Raises
    ------
    ValueError
        If a category is missing from the corresponding code dictionary
        (the unmapped value becomes NaN and cannot be cast to int8).
    """
    base_cols = ['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'foundation_type',
                 'age', 'roof_type', 'ground_floor_type', 'other_floor_type', 'count_floors_pre_eq',
                 'height_percentage', 'area_percentage']
    superstructure_cols = [col for col in features.columns if col.startswith('has_superstructure')]
    # Explicit copy: columns are added/overwritten below and the caller's
    # frame must stay untouched.
    short = features.loc[:, base_cols + superstructure_cols].copy()
    short['height_to_area_rt'] = short['height_percentage'] / short['area_percentage']

    def encode_geo(col, lookup):
        # Vectorised table lookup. Ids larger than the last training id are
        # clipped to it, which matches how gaps inside the table are filled
        # (an unseen id takes the value of the closest smaller seen id).
        ids = short[col].to_numpy().astype(np.int64)
        return lookup[np.minimum(ids, len(lookup) - 1)]

    # The encoding used for each geo level is chosen here.
    short['geo_level_1_id'] = encode_geo('geo_level_1_id', gl_by_mean[0])
    short['geo_level_2_id'] = encode_geo('geo_level_2_id', gl_combine[1])
    short['geo_level_3_id'] = encode_geo('geo_level_3_id', gl_combine[2])

    # These categoricals get hand-picked integer codes because they appeared
    # to weigh most on damage_grade in an earlier analysis (TP1).
    # Going through object dtype and casting to int8 makes the result a plain
    # numeric column regardless of pandas version (mapping a categorical
    # column directly can yield a categorical result).
    short['other_floor_type'] = short['other_floor_type'].astype(object).map(other_floor_dict).astype('int8')
    short['foundation_type'] = short['foundation_type'].astype(object).map(foundation_dict).astype('int8')
    short['roof_type'] = short['roof_type'] == 'x'
    short['ground_floor_type'] = short['ground_floor_type'] == 'v'
    return short

In [None]:

# Engineered training features joined with their labels.
short_train = get_features(train_values).merge(train_labels)

# Last column -> target; every column before it -> model input.
# NOTE(review): `building_id` is among the input columns -- confirm that
# feeding the identifier to the model is intended.
X = short_train.iloc[:, :-1]
y = short_train.iloc[:, -1]

## Tuning de Hiperparámetros

In [None]:
# Hyper-parameter grid for the LightGBM classifier.
# The number of boosting rounds is spelled `n_estimators`: `iterations` is
# not a LightGBM parameter (nor one of the aliases of `num_iterations`), so a
# grid over it would leave every candidate at the default number of trees.
parameters = {'max_depth'       : [10, 12],
              'n_estimators'    : [1000, 1500],
              'learning_rate'   : [0.09, 0.115, 0.12],
            }

In [None]:
%%time
clf=lgb.LGBMClassifier()

kf=KFold(n_splits=2,shuffle=True)

gds=GridSearchCV(clf, param_grid = parameters, cv = kf, scoring = f1)

gds.fit(X,y)

In [None]:
# Display the parameter combination with the best mean CV score.
gds.best_params_

{'iterations': 1000, 'learning_rate': 0.12, 'max_depth': 12}

In [None]:
# Final classifier. The number of boosting rounds is given through the
# scikit-learn-style `n_estimators` argument rather than the `num_iterations`
# alias, so the estimator does not carry two conflicting values for it
# (default n_estimators=100 next to num_iterations=1000).
model = lgb.LGBMClassifier(n_jobs=-1, max_depth=12, learning_rate=0.12, n_estimators=1000)

In [None]:
# Train the model on all rows of X / y (no rows are held out here).
model.fit(X,y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.12, max_depth=12,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_iterations=1000, num_leaves=31,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

## Verificamos el score

In [None]:
# Micro-averaged F1 on the same X / y the model was fitted on: this is a
# training score, not an estimate of performance on unseen data.
f1_score(y, model.predict(X), average='micro')

0.7948741562772207

In [None]:
# Predict the test set and give the result the index/columns of the
# submission template.
# NOTE(review): predictions are attached positionally, so this assumes the
# rows of test_values are in the same order as submission_format -- confirm.
submission = pd.DataFrame(
    model.predict(get_features(test_values)),
    index=submission_format.index,
    columns=submission_format.columns,
)
submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,3
745817,1
421793,3


In [None]:
# Write the predictions (with their index) to a CSV file in the working directory.
submission.to_csv('submissionLightGBM.csv')

Este modelo obtuvo un puntaje de 0.7387 en DrivenData.