In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

%matplotlib inline

plt.style.use('default') # make the matplotlib charts look a bit nicer
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # set the grid style used by seaborn

pd.options.display.float_format = '{:20,.5f}'.format # suppress scientific notation in displayed outputs

# NOTE(review): the filter below silences every warning for the rest of the session,
# deprecation warnings included — consider a narrower filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Explicit dtypes for the columns read from the *_values CSV files. Columns that are
# not listed here are left to pandas' type inference (several of those features are
# not used yet).
dtype_ = {'building_id': 'int32'}
dtype_.update(geo_level_1_id='int8', geo_level_2_id='int16', geo_level_3_id='int16')
dtype_.update(count_floors_pre_eq='int8', age='int16',
              area_percentage='int8', height_percentage='int8')
dtype_.update(dict.fromkeys(['land_surface_condition', 'foundation_type', 'roof_type',
                             'ground_floor_type', 'other_floor_type'], 'category'))
# every has_superstructure_* column is stored as int8
dtype_.update({'has_superstructure_' + material: 'int8'
               for material in ['adobe_mud', 'mud_mortar_stone', 'stone_flag',
                                'cement_mortar_stone', 'mud_mortar_brick',
                                'cement_mortar_brick', 'timber', 'bamboo',
                                'rc_non_engineered', 'rc_engineered', 'other']})

# Training features and labels, then the test features (same dtype map as train).
train_values = pd.read_csv('train_values.csv', dtype=dtype_)
train_labels = pd.read_csv('train_labels.csv',
                           dtype={'building_id': 'int32', 'damage_grade': 'int8'})

test_values = pd.read_csv('test_values.csv', dtype=dtype_)

# Acá hago mean encoding, mode (moda) encoding y un tercer encoding que es la suma de los dos.

In [None]:
# Target encodings of the three geo-level ids, learned from the training labels.
#   gl_by_mean[k][geo_id] -> mean damage_grade of the rows with geo_level_{k+1}_id == geo_id
#   gl_by_mode[k][geo_id] -> most frequent damage_grade of those rows (smallest grade on ties)
#   gl_combine[k][geo_id] -> sum of the two encodings above
# Each entry is a dense numpy array indexed directly by the geo id.
geo_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
geo_level = train_values.loc[:, ['building_id'] + geo_columns].merge(train_labels, on='building_id')


def _modal_grade(grades):
    """Return the most frequent value of the Series `grades` (the smallest one on ties)."""
    # Series.mode() returns the modes sorted ascending, so .iloc[0] is the smallest mode.
    # This replaces stats.mode(x)[0][0], which breaks on SciPy versions where mode()
    # returns scalars instead of length-1 arrays.
    return grades.mode().iloc[0]


def _dense_lookup(per_id, size, fallback):
    """Expand a per-id Series into a float array of length `size` indexed by id.

    Ids without training rows take the value of the closest smaller id that has one
    (forward fill). Ids that precede the first seen id take `fallback`; the previous
    implementation read `gl[-1]` there, i.e. it wrapped around to the last id.
    """
    dense = per_id.reindex(np.arange(size)).ffill().fillna(fallback)
    return dense.to_numpy(dtype=float)


overall_mean = geo_level['damage_grade'].mean()
overall_mode = _modal_grade(geo_level['damage_grade'])

gl_by_mean = []
gl_by_mode = []
for geo_column in geo_columns:
    # Size the lookup so that every id present in train OR test can be indexed;
    # only the ids are read from the test set, never labels.
    lookup_size = int(max(train_values[geo_column].max(), test_values[geo_column].max())) + 1
    grades_by_id = geo_level.groupby(geo_column)['damage_grade']
    gl_by_mean.append(_dense_lookup(grades_by_id.mean(), lookup_size, overall_mean))
    gl_by_mode.append(_dense_lookup(grades_by_id.agg(_modal_grade), lookup_size, overall_mode))

gl_combine = [mean_enc + mode_enc for mean_enc, mode_enc in zip(gl_by_mean, gl_by_mode)]

# Integer codes for two categorical columns, consumed by get_features.
foundation_dict={'i':0,'w':1,'u':2,'h':3,'r':4}
# NOTE(review): 'q' and 'x' share code 2 — confirm this is intentional.
other_floor_dict={'s':0,'j':1,'q':2,'x':2}

def _map_categories(column, mapping):
    """Translate every value of `column` through `mapping` and return an int64 Series.

    Raises KeyError when the column holds a value with no entry in `mapping`.
    """
    values = column.astype(object)  # plain objects: avoids a categorical result from .map
    unknown = set(values.unique()) - set(mapping)
    if unknown:
        raise KeyError(f"{column.name}: no encoding defined for {sorted(map(str, unknown))}")
    return values.map(mapping).astype('int64')


def get_features(features):
    """Build the model's feature frame from a raw values DataFrame.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain building_id, the three geo_level_*_id columns, foundation_type,
        age, roof_type, ground_floor_type, other_floor_type, count_floors_pre_eq,
        height_percentage and area_percentage. Every column whose name starts with
        'has_superstructure' is carried over as well. `features` is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns, where:
        * geo_level_1_id is replaced by gl_by_mean[0][id], and geo_level_2_id /
          geo_level_3_id by gl_combine[1][id] / gl_combine[2][id];
        * foundation_type / other_floor_type are replaced by their integer codes from
          foundation_dict / other_floor_dict;
        * roof_type becomes the boolean `roof_type == 'x'` and ground_floor_type the
          boolean `ground_floor_type == 'v'`;
        * height_to_area_rt = height_percentage / area_percentage is appended last.

    Raises
    ------
    KeyError
        If foundation_type or other_floor_type holds a value missing from its dict.
    IndexError
        If a geo id is outside the corresponding lookup array.

    Relies on the module-level gl_by_mean, gl_combine, foundation_dict and
    other_floor_dict.
    """
    superstructure_cols = [col for col in features.columns if col.startswith('has_superstructure')]
    base_cols = ['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
                 'foundation_type', 'age', 'roof_type', 'ground_floor_type',
                 'other_floor_type', 'count_floors_pre_eq', 'height_percentage',
                 'area_percentage']
    # NOTE(review): building_id is kept so labels can be merged on it afterwards; unless
    # the caller drops it, it also reaches the model as a feature.
    # Explicit copy: the assignments below must never write into (or warn about) `features`.
    short = features.loc[:, base_cols + superstructure_cols].copy()
    short['height_to_area_rt'] = short['height_percentage'] / short['area_percentage']

    # The encoding used for each geo level is chosen here (vectorised array lookup by id).
    short['geo_level_1_id'] = gl_by_mean[0][short['geo_level_1_id'].to_numpy()]
    short['geo_level_2_id'] = gl_combine[1][short['geo_level_2_id'].to_numpy()]
    short['geo_level_3_id'] = gl_combine[2][short['geo_level_3_id'].to_numpy()]

    # The categorical variables are encoded this way because, in the earlier analysis
    # (TP1), these values seemed to carry the most weight on damage_grade.
    short['other_floor_type'] = _map_categories(short['other_floor_type'], other_floor_dict)
    short['foundation_type'] = _map_categories(short['foundation_type'], foundation_dict)
    short['roof_type'] = short['roof_type'] == 'x'
    short['ground_floor_type'] = short['ground_floor_type'] == 'v'
    return short

In [None]:
# Encode the training rows and attach their labels. The label is taken to be the last
# column of the merged frame (assumes train_labels only adds damage_grade — confirm).
short_train = get_features(train_values).merge(train_labels)
label_position = short_train.shape[1] - 1
X = short_train.iloc[:, :label_position]
y = short_train.iloc[:, label_position]

In [None]:
# Hyper-parameter search for the XGBoost classifier. Only learning_rate actually varies;
# every other entry is a single-value list that pins the setting for all candidates.
# 'eval_metric' and cv=5 are spelled out so the cell reproduces the recorded results
# (the saved output lists eval_metric='mlogloss' in the grid and five CV splits).
params = {
    'objective': ['multi:softprob'],
    'eval_metric': ['mlogloss'],
    'tree_method': ['hist'],
    # NOTE(review): single_precision_histogram may not be accepted by newer XGBoost
    # releases — confirm against the installed version.
    'single_precision_histogram': [True],
    'max_depth': [5],
    'learning_rate': [0.115, 0.116, 0.117, 0.118, 0.119, 0.12],
    'n_estimators': [500],
    'colsample_bytree': [0.3334],
}

xgb_grid = GridSearchCV(estimator=xgb.XGBClassifier(),
                        param_grid=params,
                        cv=5,
                        n_jobs=2,
                        # candidates are ranked by micro-averaged F1
                        scoring=make_scorer(f1_score, average='micro'))
xgb_grid.fit(X, y)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None...
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             n_jobs=2,
             param_grid={'colsample_bytree': [0.3334],
                         'eval_metric': ['mlogloss'],
                         'lear

In [None]:
# Cross-validation results of the grid search as a table (one row per candidate).
cv_results = pd.DataFrame(xgb_grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_eval_metric,param_learning_rate,param_max_depth,param_n_estimators,param_objective,...,param_tree_method,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,169.44995,2.73588,1.80302,0.09436,0.3334,mlogloss,0.115,5,500,multi:softprob,...,hist,"{'colsample_bytree': 0.3334, 'eval_metric': 'm...",0.75741,0.75318,0.75909,0.7584,0.75812,0.75724,0.0021,2
1,183.92309,9.78951,1.96154,0.05072,0.3334,mlogloss,0.116,5,500,multi:softprob,...,hist,"{'colsample_bytree': 0.3334, 'eval_metric': 'm...",0.75789,0.75353,0.75867,0.75814,0.75787,0.75722,0.00187,3
2,177.03172,11.69542,1.95021,0.01806,0.3334,mlogloss,0.117,5,500,multi:softprob,...,hist,"{'colsample_bytree': 0.3334, 'eval_metric': 'm...",0.75816,0.75276,0.75908,0.75911,0.75781,0.75738,0.00237,1
3,182.56482,3.94454,1.99957,0.03785,0.3334,mlogloss,0.118,5,500,multi:softprob,...,hist,"{'colsample_bytree': 0.3334, 'eval_metric': 'm...",0.75691,0.75282,0.7585,0.75888,0.75777,0.75698,0.00218,6
4,175.9201,9.9234,1.99024,0.01939,0.3334,mlogloss,0.119,5,500,multi:softprob,...,hist,"{'colsample_bytree': 0.3334, 'eval_metric': 'm...",0.75691,0.75334,0.75821,0.75919,0.75735,0.757,0.00199,5
5,182.38205,11.75758,1.80982,0.39937,0.3334,mlogloss,0.12,5,500,multi:softprob,...,hist,"{'colsample_bytree': 0.3334, 'eval_metric': 'm...",0.75747,0.75238,0.75873,0.75842,0.75852,0.7571,0.0024,4


# Tomamos un modelo con bajo std_test_score y buen mean_test_score: learning_rate=0.116 (mean_test_score 0.75722, std_test_score 0.00187).

In [None]:
# Final classifier: the grid's fixed settings with the selected learning_rate,
# fitted on the full training set.
final_params = dict(
    objective='multi:softprob',
    tree_method='hist',
    single_precision_histogram=True,
    max_depth=5,
    learning_rate=0.116,
    n_estimators=500,
    colsample_bytree=0.3334,
)
xgb_cls = xgb.XGBClassifier(**final_params)
xgb_cls.fit(X, y)

# Micro-F1 on the same rows the model was fitted on, i.e. a training score,
# not a validation score.
train_predictions = xgb_cls.predict(X)
f1_score(y, train_predictions, average='micro')



0.7677445596908683

In [None]:
# Bar chart of the 10 most important features of the model.
# The importances come from xgb_cls, the classifier that is selected above and used for
# the submission (previously the chart showed xgb_grid.best_estimator_, a different
# candidate of the grid).
top_importances = (
    pd.Series(xgb_cls.feature_importances_, index=X.columns)
    .nlargest(10)  # sorted by importance, descending; keeps the first 10
)

# Styling is applied to this Axes only (plt.rc would change every later figure too).
fig, ax = plt.subplots(figsize=(18, 5.5))
ax.bar(top_importances.index, top_importances.to_numpy(), color='#ff9995', width=0.4)
ax.set_title("Features mas importantes", fontsize=22)
ax.set_ylabel("Importancia", fontsize=20)
ax.tick_params(axis='x', labelsize=8.5, labelrotation=45)
ax.tick_params(axis='y', labelsize=12)
plt.show()

In [None]:
# Build the submission frame in the layout of submission_format.csv.
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

test_features = get_features(test_values)
# Key each prediction by the building_id of the row it was computed from ...
predicted = pd.DataFrame(data=xgb_cls.predict(test_features),
                         columns=submission_format.columns,
                         index=pd.Index(test_features['building_id'],
                                        name=submission_format.index.name))

# ... and only then put the rows in the order of the submission format, so a different
# row order between test_values.csv and submission_format.csv cannot mislabel buildings.
missing_ids = submission_format.index.difference(predicted.index)
if len(missing_ids) > 0:
    raise ValueError(f"{len(missing_ids)} building_id(s) of submission_format.csv "
                     "are not present in test_values.csv")
submission = predicted.reindex(submission_format.index)
submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,3
745817,1
421793,3


In [None]:
submission.to_csv('submissions/submission8.csv')
!head submissions/submission8.csv

building_id,damage_grade
300051,3
99355,2
890251,3
745817,1
421793,3
871976,2
691228,1
896100,3
343471,2
