In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train_values = pd.read_csv('/content/drive/MyDrive/train_values.csv')

train_labels = pd.read_csv('/content/drive/MyDrive/train_labels.csv')

In [4]:
data = train_values.merge(train_labels)

In [None]:
features_elegidos = ['area_percentage', 
                     'height_percentage',
                     'count_families',
                     'count_floors_pre_eq',
                     'foundation_type',
                     'ground_floor_type',
                     'plan_configuration',
                     'roof_type',
                     'land_surface_condition',
                     'has_superstructure_stone_flag',
                     'has_superstructure_adobe_mud',
                     'has_superstructure_mud_mortar_stone',
                     'has_superstructure_cement_mortar_stone',
                     'has_superstructure_cement_mortar_brick',
                     'has_superstructure_bamboo',
                     'has_superstructure_rc_non_engineered',
                     'has_superstructure_rc_engineered',
                     'has_secondary_use_agriculture',
                     'has_secondary_use_hotel',
                     'has_secondary_use_rental',
                     'has_secondary_use_other',
                     'damage_grade']

train_values_short = data[features_elegidos]

In [None]:
train_values_short = pd.get_dummies(train_values_short)

In [None]:
# Dividimos los datos en entrenamiento y prueba

# x contiene los features independientes
x = train_values_short.drop(['damage_grade'], axis = 1)

# y contiene la variable target damage_grade
y = train_values_short['damage_grade']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 123)

**XGBoost**

In [None]:
# Creamos el modelo de XGBoost

xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 10)

In [None]:
# Entrenamos el modelo

xgboost_model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='multi:softprob',
              objetive='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [None]:
# Evaluamos los resultados

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5740050037604949

Variamos los hiperparámetros

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5894460560850946

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.3, learning_rate = 0.1,
                              max_depth = 5, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5938972540713112

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 80,
                              colsample_bytree = 0.3, learning_rate = 0.1,
                              max_depth = 5, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5933600405212507

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 90,
                              colsample_bytree = 0.3, learning_rate = 0.1,
                              max_depth = 5, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5936977176098601

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.3, learning_rate = 0.1,
                              max_depth = 4, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5913186290310203

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.2, learning_rate = 0.1,
                              max_depth = 5, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5910423477767034

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.1,
                              max_depth = 5, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5951558686743105

Probando con la función objetivo reg:logistic se obtienen los mismos scores

In [None]:
xgboost_model = XGBClassifier(objetive = 'reg:logistic', n_estimators = 100,
                              colsample_bytree = 0.3, learning_rate = 0.1,
                              max_depth = 5, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5938972540713112

In [None]:
xgboost_model = XGBClassifier(objetive = 'reg:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.1,
                              max_depth = 5, alpha = 10)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5951558686743105

Variando el learning rate

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.1,
                              max_depth = 5, alpha = 10, seed = 123)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5957544780586638

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.2,
                              max_depth = 5, alpha = 10, seed = 123)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.5969823947445165

Agregamos más Features

In [None]:
features_elegidos = ['area_percentage', 
                     'height_percentage',
                     'count_families',
                     'count_floors_pre_eq',

                     'foundation_type',
                     'ground_floor_type',
                     'plan_configuration',
                     'roof_type',
                     'land_surface_condition',
                     'position',
                     'other_floor_type',

                     'has_superstructure_stone_flag',
                     'has_superstructure_adobe_mud',
                     'has_superstructure_mud_mortar_stone',
                     'has_superstructure_mud_mortar_brick',
                     'has_superstructure_cement_mortar_stone',
                     'has_superstructure_cement_mortar_brick',
                     'has_superstructure_bamboo',
                     'has_superstructure_rc_non_engineered',
                     'has_superstructure_rc_engineered',
                     'has_superstructure_timber',
                     'has_superstructure_other',

                     'has_secondary_use',
                     'has_secondary_use_agriculture',
                     'has_secondary_use_hotel',
                     'has_secondary_use_rental',
                     'has_secondary_use_institution',
                     'has_secondary_use_school',
                     'has_secondary_use_industry',
                     'has_secondary_use_health_post',
                     'has_secondary_use_gov_office',
                     'has_secondary_use_use_police',
                     'has_secondary_use_other',
                     'damage_grade']

train_values_short = data[features_elegidos]

In [None]:
train_values_short = pd.get_dummies(train_values_short)

In [None]:
# Dividimos los datos en entrenamiento y prueba

# x contiene los features independientes
x = train_values_short.drop(['damage_grade'], axis = 1)

# y contiene la variable dependiente damage_grade
y = train_values_short['damage_grade']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.2,
                              max_depth = 5, alpha = 10, seed = 123)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.6037205875581342

Algunos features de uso secundario se encuentran en pocos registros del dataset de entrenamiento, por lo que no tendrían demasiada importancia en el modelo y podrían causar overfitting.

In [25]:
features_elegidos = ['area_percentage', 
                     'height_percentage',
                     'count_families',
                     'count_floors_pre_eq',

                     'foundation_type',
                     'ground_floor_type',
                     'plan_configuration',
                     'roof_type',
                     'land_surface_condition',
                     'position',
                     'other_floor_type',

                     'has_superstructure_stone_flag',
                     'has_superstructure_adobe_mud',
                     'has_superstructure_mud_mortar_stone',
                     'has_superstructure_mud_mortar_brick',
                     'has_superstructure_cement_mortar_stone',
                     'has_superstructure_cement_mortar_brick',
                     'has_superstructure_bamboo',
                     'has_superstructure_rc_non_engineered',
                     'has_superstructure_rc_engineered',
                     'has_superstructure_timber',
                     'has_superstructure_other',

                     'has_secondary_use_agriculture',
                     'has_secondary_use_hotel',
                     'has_secondary_use_rental',
                     'has_secondary_use_other',
                     'damage_grade']

train_values_short = data[features_elegidos]

In [26]:
train_values_short = pd.get_dummies(train_values_short)

In [27]:
# Dividimos los datos en entrenamiento y prueba

# x contiene los features independientes
x = train_values_short.drop(['damage_grade'], axis = 1)

# y contiene la variable dependiente damage_grade
y = train_values_short['damage_grade']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)

In [None]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.2,
                              max_depth = 5, alpha = 10, seed = 123)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.6042117542324753

Agregamos geo_level_1_id

In [24]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.2,
                              max_depth = 5, alpha = 10, seed = 123)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.6846249481972648

Agregamos geo_level_2_id

In [28]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 100,
                              colsample_bytree = 0.4, learning_rate = 0.2,
                              max_depth = 5, alpha = 10, seed = 123)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.7012325213734248

Agregamos features restantes y aumentamos n_estimators a 200

In [9]:
features_elegidos = ['area_percentage', 
                     'height_percentage',
                     'count_families',
                     'count_floors_pre_eq',
                     'age',
                     'geo_level_1_id',
                     'geo_level_2_id',
                     'geo_level_3_id',

                     'foundation_type',
                     'ground_floor_type',
                     'plan_configuration',
                     'roof_type',
                     'land_surface_condition',
                     'position',
                     'other_floor_type',
                     'legal_ownership_status',

                     'has_superstructure_stone_flag',
                     'has_superstructure_adobe_mud',
                     'has_superstructure_mud_mortar_stone',
                     'has_superstructure_mud_mortar_brick',
                     'has_superstructure_cement_mortar_stone',
                     'has_superstructure_cement_mortar_brick',
                     'has_superstructure_bamboo',
                     'has_superstructure_rc_non_engineered',
                     'has_superstructure_rc_engineered',
                     'has_superstructure_timber',
                     'has_superstructure_other',

                     'has_secondary_use_agriculture',
                     'has_secondary_use_hotel',
                     'has_secondary_use_rental',
                     'has_secondary_use_institution',
                     'has_secondary_use_school',
                     'has_secondary_use_industry',
                     'has_secondary_use_health_post',
                     'has_secondary_use_gov_office',
                     'has_secondary_use_use_police',
                     'has_secondary_use_other',
                     'damage_grade']

train_values_short = data[features_elegidos]

In [10]:
train_values_short = pd.get_dummies(train_values_short)

In [11]:
# Dividimos los datos en entrenamiento y prueba

# x contiene los features independientes
x = train_values_short.drop(['damage_grade'], axis = 1)

# y contiene la variable dependiente damage_grade
y = train_values_short['damage_grade']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)

In [12]:
xgboost_model = XGBClassifier(objetive = 'binary:logistic', n_estimators = 200,
                              colsample_bytree = 0.4, learning_rate = 0.2,
                              max_depth = 5, alpha = 10, seed = 123)

xgboost_model.fit(x_train, y_train)

preds = xgboost_model.predict(x_test)
f1_score(y_test, preds, average = 'micro')

0.7214164018971313

Preparamos la Submission

In [13]:
test_values = pd.read_csv('/content/drive/MyDrive/test_values.csv')

In [14]:
selected_features = ['area_percentage', 
                     'height_percentage',
                     'count_families',
                     'count_floors_pre_eq',
                     'age',
                     'geo_level_1_id',
                     'geo_level_2_id',
                     'geo_level_3_id',

                     'foundation_type',
                     'ground_floor_type',
                     'plan_configuration',
                     'roof_type',
                     'land_surface_condition',
                     'position',
                     'other_floor_type',
                     'legal_ownership_status',

                     'has_superstructure_stone_flag',
                     'has_superstructure_adobe_mud',
                     'has_superstructure_mud_mortar_stone',
                     'has_superstructure_mud_mortar_brick',
                     'has_superstructure_cement_mortar_stone',
                     'has_superstructure_cement_mortar_brick',
                     'has_superstructure_bamboo',
                     'has_superstructure_rc_non_engineered',
                     'has_superstructure_rc_engineered',
                     'has_superstructure_timber',
                     'has_superstructure_other',

                     'has_secondary_use_agriculture',
                     'has_secondary_use_hotel',
                     'has_secondary_use_rental',
                     'has_secondary_use_institution',
                     'has_secondary_use_school',
                     'has_secondary_use_industry',
                     'has_secondary_use_health_post',
                     'has_secondary_use_gov_office',
                     'has_secondary_use_use_police',
                     'has_secondary_use_other']

test_values_subset = test_values[selected_features]
test_values_subset = pd.get_dummies(test_values_subset)

In [15]:
predictions = xgboost_model.predict(test_values_subset)

In [16]:
submission_format = pd.read_csv('/content/drive/MyDrive/submission_format.csv', index_col = 'building_id')

In [17]:
my_submission = pd.DataFrame(data = predictions,
                             columns = submission_format.columns,
                             index = submission_format.index)

In [18]:
my_submission.value_counts()

damage_grade
2               58880
3               22603
1                5385
dtype: int64

In [19]:
my_submission.to_csv('/content/drive/MyDrive/submission5.csv')

In [20]:
!head /content/drive/MyDrive/submission5.csv

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
871976,2
691228,1
896100,3
343471,2
