In [1]:
import os

import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import f1_score

%matplotlib inline

plt.style.use('default') # make the matplotlib charts look a bit nicer
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # set the seaborn grid style

pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation in displayed output

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# deprecation notices from the libraries used below included.
warnings.filterwarnings('ignore')

In [2]:
# Several of the columns loaded here are not used as model features yet.
# Column dtypes are spelled out explicitly (grouped by dtype) instead of being inferred.
features = pd.read_csv(
    'train_values.csv',
    dtype={
        'building_id': 'int32',
        **dict.fromkeys(['geo_level_2_id', 'geo_level_3_id', 'age'], 'int16'),
        **dict.fromkeys(['geo_level_1_id', 'count_floors_pre_eq', 'area_percentage',
                         'height_percentage', 'count_families', 'has_secondary_use'], 'int8'),
        **dict.fromkeys(['land_surface_condition', 'foundation_type', 'roof_type',
                         'ground_floor_type', 'other_floor_type', 'plan_configuration',
                         'position', 'legal_ownership_status'], 'category'),
        # has_superstructure_* flags
        **{'has_superstructure_' + suffix: 'int8'
           for suffix in ('adobe_mud', 'mud_mortar_stone', 'stone_flag', 'cement_mortar_stone',
                          'mud_mortar_brick', 'cement_mortar_brick', 'timber', 'bamboo',
                          'rc_non_engineered', 'rc_engineered', 'other')},
        # has_secondary_use_* flags ('use_police' yields the key 'has_secondary_use_use_police')
        **{'has_secondary_use_' + suffix: 'int8'
           for suffix in ('agriculture', 'hotel', 'rental', 'institution', 'school', 'industry',
                          'health_post', 'gov_office', 'use_police', 'other')},
    },
)
damage_grade = pd.read_csv(
    'train_labels.csv',
    dtype={'building_id': 'int32', 'damage_grade': 'int8'},
)

In [3]:
# Geo-level ids of every building joined with the labels; merge() uses the columns
# both frames share as the key (presumably only building_id - verify against the CSVs).
geo_level = features[['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']].merge(damage_grade)

In [4]:
# Here I compute a mean encoding, a mode encoding, and a third encoding that is the sum of the two.

In [5]:
def _smallest_mode(values):
    """Return the most frequent value of a Series; ties resolve to the smallest value.

    pandas' ``Series.mode`` is used rather than ``scipy.stats.mode`` because the
    shape of SciPy's result (array vs. scalar) differs between SciPy versions.
    """
    return values.mode().iloc[0]


def _geo_lookup(frame, col, how):
    """Build an id -> damage_grade statistic lookup array for one geo-level column.

    Parameters
    ----------
    frame : DataFrame containing ``col`` and ``damage_grade``.
    col : name of the geo-level id column to group by.
    how : aggregation passed to ``groupby(col)['damage_grade'].agg``
        (e.g. ``'mean'`` or a callable returning one value per group).

    Returns
    -------
    1-D float ndarray of length ``frame[col].max() + 1`` where position ``i`` holds
    the statistic for id ``i``. Ids with no rows in ``frame`` take the value of the
    closest smaller observed id; ids below the smallest observed id take the value
    of the closest larger one (instead of wrapping around to the last entry).
    """
    stat = frame.groupby(col)['damage_grade'].agg(how)
    all_ids = np.arange(int(frame[col].max()) + 1)
    # reindex() leaves NaN at unobserved ids; ffill/bfill then fills those gaps.
    return stat.reindex(all_ids).ffill().bfill().to_numpy(dtype=float)


geo_level_cols = ('geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id')

# One lookup array per geo level (index 0 -> level 1, 1 -> level 2, 2 -> level 3).
gl_by_mean = [_geo_lookup(geo_level, col, 'mean') for col in geo_level_cols]
gl_by_mode = [_geo_lookup(geo_level, col, _smallest_mode) for col in geo_level_cols]
# Third encoding: element-wise sum of the mean and mode encodings.
gl_combine = [by_mean + by_mode for by_mean, by_mode in zip(gl_by_mean, gl_by_mode)]

# Integer codes for two categorical columns.
foundation_dict={'i':0,'w':1,'u':2,'h':3,'r':4}
# NOTE(review): 'q' and 'x' share code 2 - confirm this grouping is intentional.
other_floor_dict={'s':0,'j':1,'q':2,'x':2}

In [6]:
# Columns kept for the model: the ids, a few structural descriptors and every
# has_superstructure_* flag. .copy() makes `short` independent of `features`.
train_base_cols = ['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
                   'foundation_type', 'age', 'roof_type', 'ground_floor_type',
                   'other_floor_type', 'count_floors_pre_eq']
train_superstructure_cols = [col for col in features.columns if col.startswith('has_superstructure')]
short = features[train_base_cols + train_superstructure_cols].copy()
short['height_to_area_rt'] = features['height_percentage'] / features['area_percentage']

# This is where the encoding used for each geo level is chosen.
# The lookup arrays are indexed directly by id (NumPy integer-array indexing).
short['geo_level_1_id'] = gl_by_mean[0][short['geo_level_1_id'].to_numpy()]
short['geo_level_2_id'] = gl_combine[1][short['geo_level_2_id'].to_numpy()]
short['geo_level_3_id'] = gl_combine[2][short['geo_level_3_id'].to_numpy()]

# The categorical variables are encoded this way because, in the previous assignment (TP1),
# these appeared to be the values with the most weight on damage_grade.
# Going through object dtype keeps the result a plain integer column; a value missing
# from the dict becomes NaN and makes astype() raise instead of passing silently.
short['other_floor_type'] = short['other_floor_type'].astype(object).map(other_floor_dict).astype('int8')
short['foundation_type'] = short['foundation_type'].astype(object).map(foundation_dict).astype('int8')
short['roof_type'] = short['roof_type'] == 'x'
short['ground_floor_type'] = short['ground_floor_type'] == 'v'

# Append the label as the last column; the next cell splits on that position.
short = short.merge(damage_grade, on='building_id')

In [7]:
# Features are every column but the last; the label is the last column of `short`.
# NOTE(review): building_id is the first column of `short`, so it is passed to the model as a feature.
X = short.iloc[:, :-1]
y = short.iloc[:, -1]

In [37]:
# eval_metric is set to 'mlogloss' explicitly only to get rid of the warning.
xg_cls = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    learning_rate=0.178,
    n_estimators=500,
    colsample_bytree=0.25,
    tree_method='hist',
)
xg_cls.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.25, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.178, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=2,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='hist', validate_parameters=1, verbosity=None)

In [38]:
# Micro-averaged F1 of the fitted model on the same rows it was trained on.
preds_train = xg_cls.predict(X)
f1_score(y_true=y, y_pred=preds_train, average='micro')

0.776201933223587

In [39]:
# Test-set feature values; unlike the training file, dtypes are left for pandas to infer.
test = pd.read_csv('test_values.csv')

In [11]:
# Same column selection and encodings as the training frame, applied to the test set.
test_base_cols = ['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
                  'foundation_type', 'age', 'roof_type', 'ground_floor_type',
                  'other_floor_type', 'count_floors_pre_eq']
test_superstructure_cols = [col for col in test.columns if col.startswith('has_superstructure')]
short_test = test[test_base_cols + test_superstructure_cols].copy()
short_test['height_to_area_rt'] = test['height_percentage'] / test['area_percentage']

# Geo-level ids are replaced through the lookup arrays built from the training data,
# indexed directly by id (NumPy integer-array indexing). An id beyond the end of a
# lookup array raises IndexError.
short_test['geo_level_1_id'] = gl_by_mean[0][short_test['geo_level_1_id'].to_numpy()]
short_test['geo_level_2_id'] = gl_combine[1][short_test['geo_level_2_id'].to_numpy()]
short_test['geo_level_3_id'] = gl_combine[2][short_test['geo_level_3_id'].to_numpy()]

# Going through object dtype keeps the result a plain integer column; a value missing
# from the dict becomes NaN and makes astype() raise instead of passing silently.
short_test['other_floor_type'] = short_test['other_floor_type'].astype(object).map(other_floor_dict).astype('int8')
short_test['foundation_type'] = short_test['foundation_type'].astype(object).map(foundation_dict).astype('int8')
short_test['roof_type'] = short_test['roof_type'] == 'x'
short_test['ground_floor_type'] = short_test['ground_floor_type'] == 'v'

In [12]:
# Every column of short_test is used as a feature (building_id included).
X_test = short_test.loc[:, :]

In [13]:
# Predicted damage grade for each test row.
preds_test = xg_cls.predict(X_test)

In [14]:
# Submission template, indexed by building_id; its index and columns are reused below.
submission_format = pd.read_csv(
    'submission_format.csv',
    index_col='building_id',
)

In [15]:
# Predictions are attached by position: row i of preds_test gets the i-th building_id of the template.
# NOTE(review): assumes test_values.csv and submission_format.csv list buildings in the same order - confirm.
my_submission = pd.DataFrame(
    preds_test,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [16]:
# Preview the first five rows of the submission.
my_submission.head(n=5)

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,3
745817,1
421793,3


In [17]:
# Create the output folder if needed so the write does not fail when it is missing.
os.makedirs('submissions', exist_ok=True)
my_submission.to_csv('submissions/submission6.csv')

In [18]:
# Shell out to `head` to eyeball the header and first rows of the file written above.
!head submissions/submission6.csv

building_id,damage_grade
300051,3
99355,2
890251,3
745817,1
421793,3
871976,2
691228,1
896100,3
343471,2
