In [14]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import joblib

In [3]:
# Load the training features and their labels from the working directory.
train_values, train_labels = (
    pd.read_csv(csv_name) for csv_name in ('train_values.csv', 'train_labels.csv')
)

In [4]:
# Attach each building's label to its feature row. The join key is named
# explicitly (instead of relying on whichever column names happen to overlap),
# and validate= makes pandas raise if a building_id is duplicated on either
# side, which would otherwise silently multiply rows.
data = train_values.merge(train_labels, on='building_id', validate='one_to_one')

In [5]:
# Columns kept for modelling, grouped by the kind of information their names
# describe. The order of this list determines the column order of the frame
# selected from it.
features_elegidos = [
    # Size, occupancy, age and location columns.
    'area_percentage',
    'height_percentage',
    'count_families',
    'count_floors_pre_eq',
    'age',
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    # Construction / site descriptors (presumably string-valued categoricals
    # that the later get_dummies call expands -- confirm dtypes).
    'foundation_type',
    'ground_floor_type',
    'plan_configuration',
    'roof_type',
    'land_surface_condition',
    'position',
    'other_floor_type',
    'legal_ownership_status',
    # has_superstructure_* flags.
    'has_superstructure_stone_flag',
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_timber',
    'has_superstructure_other',
    # has_secondary_use_* flags.
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_other',
    # Target label; it stays with the features until x and y are separated.
    'damage_grade',
]

# Keep only the chosen columns (features plus the target) from the merged frame.
train_values_short = data.loc[:, features_elegidos]

In [6]:
# One-hot encode the selected columns using pandas' default get_dummies settings.
train_values_short = pd.get_dummies(data=train_values_short)

In [7]:
# Split the data into training and test sets.

# x holds the independent features (every column except the target).
x = train_values_short.drop(columns='damage_grade')

# y holds the dependent variable, damage_grade.
y = train_values_short.loc[:, 'damage_grade']

# Default split proportions; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

Grid Search

In [8]:
# Two-step pipeline: standardise the features, then fit a seeded XGBoost
# classifier. make_pipeline names the steps after the lower-cased class names,
# which is what the 'xgbclassifier__' prefixes in the parameter grid refer to.
pipe = make_pipeline(
    StandardScaler(),
    XGBClassifier(random_state=2018),
)

# Display the assembled pipeline.
pipe

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('xgbclassifier',
                 XGBClassifier(base_score=None, booster=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, gamma=None, gpu_id=None,
                               importance_type='gain',
                               interaction_constraints=None, learning_rate=None,
                               max_delta_step=None, max_depth=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=None, num_parallel_tree=None,
                               random_state=2018, reg_alpha=None,
                               reg_lambda=None, scale_pos_weight=None,
                               subsample=None, tree_method=None,
                               validate_parameters=None, verbosity=None))])

In [9]:
# Search space for the 'xgbclassifier' pipeline step: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    xgbclassifier__n_estimators=[500, 550, 580],
    xgbclassifier__colsample_bytree=[0.4, 0.5, 0.6],
    xgbclassifier__learning_rate=[0.2, 0.3, 0.4],
)

# Exhaustive search over param_grid with 5-fold cross-validation; no scoring is
# passed, so the estimator's own default score is used to rank candidates.
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [10]:
# Run the search on the training split. With 27 parameter combinations and
# cv=5 this trains 135 pipelines (plus the final refit with the default
# refit setting), so expect this cell to be slow.
gs.fit(X=x_train, y=y_train)

































































































































































































































































































































































































































































































































































GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                

In [11]:
# Show the parameter combination that the grid search selected as best.
gs.best_params_

{'xgbclassifier__colsample_bytree': 0.6,
 'xgbclassifier__learning_rate': 0.3,
 'xgbclassifier__n_estimators': 580}

In [12]:
# Micro-averaged F1 of the selected model on the held-out test split.
f1_score(y_true=y_test, y_pred=gs.predict(x_test), average='micro')

0.7458365949870301

Guardamos el modelo

In [15]:
# Persist the whole fitted GridSearchCV object (not only the best estimator).
joblib.dump(value=gs, filename='modeloXGBoost_entrenado.pkl')

['modeloXGBoost_entrenado.pkl']

Preparamos la Submission

In [16]:
# Load the unlabeled rows to predict for the submission.
test_values = pd.read_csv(filepath_or_buffer='test_values.csv')

In [17]:
# Reuse the training feature list (minus the target) instead of keeping a
# second hand-written copy, so the two selections cannot drift apart.
selected_features = [col for col in features_elegidos if col != 'damage_grade']

# One-hot encode the test rows, then align them to the exact columns (and
# column order) the model was fitted on. Encoding the test file on its own can
# yield a different column set: a category level absent from the test file
# becomes an all-zero column here, and a level never seen in training is
# dropped because the fitted model has no input for it.
test_values_subset = test_values[selected_features]
test_values_subset = pd.get_dummies(test_values_subset).reindex(
    columns=x.columns, fill_value=0
)

In [18]:
# Predict a damage grade for every test row with the refitted best pipeline.
predictions = gs.predict(X=test_values_subset)

In [19]:
# Load the submission template, indexed by building_id; its index and columns
# define the layout of the file to be written.
submission_format = pd.read_csv(
    filepath_or_buffer='submission_format.csv',
    index_col='building_id',
)

In [20]:
# Wrap the predictions in the template's layout (same index and column labels).
# NOTE(review): this assumes the rows of test_values.csv are in the same order
# as the rows of submission_format.csv -- confirm.
my_submission = pd.DataFrame(
    predictions,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [21]:
# Count how many rows were assigned each predicted value, as a quick sanity
# check of the class distribution before exporting.
my_submission.value_counts()

damage_grade
2               55927
3               24439
1                6502
dtype: int64

In [22]:
# Write the submission file; the building_id index is written as the first column.
my_submission.to_csv(path_or_buf='submission7.csv')