# LightGBM

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    fbeta_score,
    make_scorer,
)
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgbm

In [32]:
# Read the prepared train and test tables; both use building_id as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='building_id')
    for csv_path in ('train_ready.csv', 'test_ready.csv')
)

In [33]:
# Class balance of the target before the labels are shifted.
df_train.damage_grade.value_counts()

2    148259
3     87218
1     25124
Name: damage_grade, dtype: int64

In [34]:
# Shift damage_grade from the 1-based grades {1, 2, 3} to 0-based labels
# {0, 1, 2}. The author notes that the 1-based labels led to errors during
# the grid search.
#
# The shift is guarded on the smallest label being 1 so that re-running this
# cell does not shift the already-converted labels a second time.
if df_train['damage_grade'].min() == 1:
    df_train['damage_grade'] = df_train['damage_grade'] - 1

In [35]:
# Class balance of the target after the labels have been shifted.
df_train.damage_grade.value_counts()

1    148259
2     87218
0     25124
Name: damage_grade, dtype: int64

## Train/test split

In [36]:
# Target vector is the damage_grade column; the feature matrix is every other column.
y = df_train.loc[:, 'damage_grade']
X = df_train.drop('damage_grade', axis='columns')

In [37]:
# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

## Model fitting

In [38]:
# Baseline multiclass LightGBM model with hand-picked hyper-parameters.
classifier = lgbm.LGBMClassifier(
    objective='multiclass',
    learning_rate=0.05,
    max_depth=50,
)

In [39]:
# Bare expression: only displays the lgbm.Dataset class object. Nothing is
# assigned, so this cell has no effect on the data or the model.
lgbm.Dataset

lightgbm.basic.Dataset

In [1]:
# Fit the baseline model. This call had been commented out, which left
# `classifier` unfitted on a top-to-bottom run, so the predict cell that
# follows could not work on a fresh kernel.
#
# Early stopping is passed as a callback rather than as the
# `early_stopping_rounds` fit argument, which recent LightGBM releases no
# longer accept. Training stops once the evaluation metric has not improved
# for 10 consecutive rounds.
#
# NOTE(review): X_test is also the data the classification report is computed
# on, so using it for early stopping makes the reported scores optimistic.
# Consider a separate validation split.
classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    callbacks=[lgbm.early_stopping(stopping_rounds=10)],
)

In [41]:
# Predicted 0-based class labels of the baseline model on the held-out rows.
y_pred = classifier.predict(
    X_test
)

In [42]:
# Per-class precision / recall / F1 of the baseline model on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.64      0.38      0.48      7636
           1       0.69      0.85      0.76     44607
           2       0.71      0.53      0.61     25938

    accuracy                           0.69     78181
   macro avg       0.68      0.59      0.62     78181
weighted avg       0.69      0.69      0.68     78181



## Grid search

In [54]:
# Scorer used for model selection in the grid search (fbeta_score and
# make_scorer are imported in the import cell at the top of the notebook).
#
# beta is set to 0.5 so that the scorer matches its name; it was previously 2.
# With average='micro' on a single-label multiclass target, micro-averaged
# precision and recall are equal, so the F-beta value is the same for every
# beta and this change does not alter the scores the grid search sees.
fO5_scorer = make_scorer(fbeta_score, beta=0.5, average='micro')

In [55]:
# Fresh estimator for the grid search; the tuned hyper-parameters come from
# the parameter grid rather than from this constructor.
# NOTE(review): class_weight is an empty dict, presumably equivalent to no
# class weighting. Confirm, or set it explicitly.
classifier = lgbm.LGBMClassifier(
    objective='multiclass',
    class_weight={},
)

In [56]:
# Hyper-parameter grid explored by the grid search: every combination of the
# values below is evaluated.
params = dict(
    learning_rate=[0.001, 0.005, 0.05, 0.1],
    max_depth=[50, 100, 200, 500],
    n_estimators=[50, 100, 200, 500],
    num_leaves=[30, 50, 100],
    min_child_samples=[20, 50],
)

In [57]:
# 5-fold cross-validated search over the parameter grid, ranked by the custom scorer.
grid = GridSearchCV(
    classifier,
    param_grid=params,
    scoring=fO5_scorer,
    cv=5,
)

In [2]:
# Run the grid search. This call had been commented out, which left `grid`
# unfitted on a top-to-bottom run, so grid.predict and grid.best_params_
# could not work on a fresh kernel.
#
# The extra keyword arguments are forwarded to the estimator's fit() for every
# candidate. Early stopping is passed as a callback rather than as the
# `early_stopping_rounds` fit argument, which recent LightGBM releases no
# longer accept.
#
# NOTE(review): X_test is also used for the final classification report, so
# early stopping on it makes that report optimistic. Consider a separate
# validation split.
grid.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    callbacks=[lgbm.early_stopping(stopping_rounds=10)],
)

In [59]:
# Predicted 0-based class labels of the fitted grid search on the held-out rows.
pred = grid.predict(
    X_test
)

In [60]:
# Per-class precision / recall / F1 of the grid-search model on the held-out rows.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.69      0.50      0.58      7636
           1       0.74      0.85      0.79     44607
           2       0.76      0.63      0.69     25938

    accuracy                           0.74     78181
   macro avg       0.73      0.66      0.69     78181
weighted avg       0.74      0.74      0.74     78181



In [61]:
# Display the hyper-parameter combination that scored best in the grid search.
grid.best_params_

{'learning_rate': 0.1,
 'max_depth': 50,
 'min_child_samples': 50,
 'n_estimators': 500,
 'num_leaves': 100}

In [62]:
# Predicted 0-based class labels for the competition test table.
predict = grid.predict(
    df_test
)

In [63]:
# Load the submission template; its first column becomes the row index.
submit = pd.read_csv(
    'submission_format.csv',
    index_col=0,
)

In [64]:
# Discard the template's damage_grade column, keeping only the index.
submit = submit.drop(columns=['damage_grade'])

In [65]:
# Attach the predicted labels as the damage_grade column. The values are
# assigned positionally.
# NOTE(review): assumes df_test rows are in the same order as the submission
# template. Confirm.
submit = submit.assign(damage_grade=predict)

In [66]:
# Preview the first rows of the submission, still holding the 0-based labels.
submit.head(n=5)

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,1
890251,1
745817,0
421793,2


In [68]:
# Convert the model's 0-based labels {0, 1, 2} back to the original damage
# grades {1, 2, 3}.
#
# The column is recomputed from `predict` instead of being remapped in place,
# so re-running this cell always yields the same result; the previous chain of
# in-place replacements shifted the grades again on every re-run.
submit['damage_grade'] = predict + 1

In [69]:
# Preview the first rows of the submission after the grades were mapped back.
submit.head(n=5)

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


In [70]:
# Write the submission file; the index is written as the first column.
submit.to_csv(path_or_buf='lightGBM.csv')