In [1]:
import optuna
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Specify dtype option on import or set low_memory=False.
pd.options.mode.chained_assignment = None  # default='warn'

import math
import scipy

# Datviz purposes
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
import missingno as msno

import xgboost
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv("train_data.csv", low_memory=False)
test_data = pd.read_csv("test_data.csv", low_memory=False) # For local
     

In [3]:
# copying the original dataset into X
X=train_data.copy()
# droping building_id and dependent variable damage_grade
# independant variables
X=X.drop(["damage_grade"],axis=1)
# dependent variable
y=train_data["damage_grade"]

In [4]:

x_train, x_test,y_train, y_test = train_test_split(X,y,test_size = 0.20,random_state = 42 )
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [5]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.05993474062246474, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.2, 1.0),
        'subsample': trial.suggest_float('subsample', 0.7770774265529939, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'max_delta_step' : trial.suggest_int('max_delta_step', 1, 3)
        
    }
    model = xgboost.XGBClassifier(**param)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    return f1_score(y_test, y_pred, average='micro')

In [6]:
study = optuna.create_study(direction='maximize', study_name='xgboost')
study.optimize(objective, n_trials=50)

XgbmPar = copy.deepcopy(study.best_params)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2023-04-12 21:33:54,144][0m A new study created in memory with name: xgboost[0m
[32m[I 2023-04-12 21:35:09,058][0m Trial 0 finished with value: 0.22335002682883204 and parameters: {'max_depth': 4, 'learning_rate': 0.6057641534943944, 'n_estimators': 171, 'min_child_weight': 7, 'gamma': 0.49524824949449936, 'subsample': 0.9702401492547568, 'colsample_bytree': 0.6638088002057182, 'reg_alpha': 0.18933352872509074, 'reg_lambda': 0.7811903206059074, 'max_delta_step': 1}. Best is trial 0 with value: 0.22335002682883204.[0m
[32m[I 2023-04-12 21:35:37,182][0m Trial 1 finished with value: 0.22729982710308233 and parameters: {'max_depth': 1, 'learning_rate': 0.8448379756572656, 'n_estimators': 312, 'min_child_weight': 5, 'gamma': 0.32793826835066336, 'subsample': 0.8830243665655956, 'colsample_bytree': 0.061524554262160955, 'reg_alpha': 0.11035642731055115, 'reg_lambda': 0.6299345557266762, 'max_delta_step': 2}. Best is trial 1 with value: 0.22729982710308233.[0m
[32m[I 2023-04-

KeyboardInterrupt: 

In [10]:
par = {'max_depth': 1, 'learning_rate': 0.8129569232934538, 'n_estimators': 631, 'min_child_weight': 5, 'gamma': 0.24077354978878754, 'subsample': 0.7934114236558842, 'colsample_bytree': 0.42821918862915, 'reg_alpha': 0.13073655673760248, 'reg_lambda': 0.5123984389075366, 'max_delta_step': 3}
model = xgboost.XGBClassifier(**par)
model.fit(x_train, y_train)

test_data=test_data.drop("id",axis=1)
rf_pred_test_data=model.predict(test_data)

In [13]:
submission=pd.read_csv("sample_submission.csv")
submission=submission.drop("damage_grade",axis=1)
rf_pred_test_data=pd.DataFrame(rf_pred_test_data)
submission["damage_grade"]=rf_pred_test_data
submission.head()
     

Unnamed: 0,id,damage_grade
0,0,4
1,1,4
2,2,4
3,3,3
4,4,1


In [14]:
submission.to_csv('submission.csv', index=False)

In [9]:
test_data.head()

Unnamed: 0,id,floors_before_eq(total),old_building,plinth_area(ft^2),height_before_eq(ft),position,has_secondary_use,type_of_reinforcement_concrete,no_family_residing,flexible_superstructure,ismorethanplintharea,reconstruction,major_repair,minor_repair,no_need,land_surface_condition_flat,land_surface_condition_moderate slope,land_surface_condition_steep slope,type_of_foundation_bamboo_or_timber,type_of_foundation_cement_stone_or_cement_brick,type_of_foundation_mixed,type_of_foundation_other,type_of_foundation_rc,type_of_roof_bamboo_or_timber_heavy,type_of_roof_bamboo_or_timber_light,type_of_roof_rcc_rb_rbc,type_of_ground_floor_brick_or_stone,type_of_ground_floor_clay_mud,type_of_ground_floor_other,type_of_ground_floor_rc,type_of_ground_floor_wood,type_of_other_floor_not applicable,type_of_other_floor_rcc_rb_rbc,type_of_other_floor_wood_or_bambo_mud,type_of_other_floor_wood_plank,building_plan_configuration_building with central courtyard,building_plan_configuration_e-shape,building_plan_configuration_h-shape,building_plan_configuration_l-shape,building_plan_configuration_multi-projected,building_plan_configuration_others,building_plan_configuration_rectangular,building_plan_configuration_square,building_plan_configuration_t-shape,building_plan_configuration_u-shape,residential_type_hotel/motel,residential_type_housing,residential_type_non-residential,residential_type_other,residential_type_other residential type,residential_type_rental residential,public_place_type_education_places,public_place_type_health_places,public_place_type_non-public,public_place_type_other institutional building,public_place_type_public_places,public_place_type_religious sites,industrial_use_type_agro,industrial_use_type_farm,industrial_use_type_food,industrial_use_type_forest-based,industrial_use_type_infrastructure,industrial_use_type_manufacturing,industrial_use_type_metallurgy,industrial_use_type_non-industrial,industrial_use_type_service/tourism,govermental_use_type_govermental buildings,govermental_use_type_non-govermental,govermental_use_type_police offices,wall_binding_0,wall_binding_1,wall_binding_2,wall_binding_3,wall_binding_5,wall_binding_7,wall_material_0,wall_material_1,wall_material_2,wall_material_3
0,0,2,7,418,14,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
1,1,3,13,396,21,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
2,2,3,40,400,18,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
3,3,2,25,378,20,2,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
4,4,2,5,375,20,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
