In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Notebook-wide pandas settings: lift every display truncation limit so that
# wide columns, wide frames and long frames are rendered in full.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
# Switch off the chained-assignment warning (pandas' default is 'warn').
pd.options.mode.chained_assignment = None

import math
import scipy

# Data visualization
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
import missingno as msno

import tensorflow as tf
from tensorflow import keras

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.metrics import cohen_kappa_score

from sklearn.model_selection import train_test_split

In [None]:
# Load the cleaned train/test sets: Kaggle input directory first, local
# checkout as a fallback.
try:
    train_data = pd.read_csv("/kaggle/input/joints-cleaned/train_data.csv")
    test_data = pd.read_csv("/kaggle/input/joints-cleaned/test_data.csv")
    submission_sample_data = pd.read_csv("/kaggle/input/joints-cleaned/sample_submission.csv") # For Kaggle

except FileNotFoundError:
    # Only a missing Kaggle path triggers the fallback. Any other failure
    # (parse error, interrupt, ...) now surfaces instead of being swallowed
    # by a bare `except`.
    train_data = pd.read_csv("../4-FeatureEng/train_data.csv", low_memory=False)
    test_data = pd.read_csv("../4-FeatureEng/test_data.csv", low_memory=False) # For local
    # NOTE(review): submission_sample_data is NOT loaded in this branch, so the
    # cells that build the submission raise NameError when run locally.
    # TODO: point this at the local copy of sample_submission.csv.

In [None]:
# Quick sanity check: (rows, columns) of both frames right after loading.
train_shape, test_shape = train_data.shape, test_data.shape
print("Train data shape: ", train_shape)
print("Test data shape: ", test_shape)

In [None]:
# List the column labels of both frames so train/test differences are visible.
train_columns, test_columns = train_data.columns, test_data.columns
print("Train data columns: ", train_columns)
print("Test data columns: ", test_columns)

## One-hot encoding

In [None]:
# Categorical columns that are expanded into indicator (dummy) columns.
one_hot = ['building_plan_configuration', 'residential_type', 'public_place_type', 
           'industrial_use_type', 'govermental_use_type','legal_ownership_status']

train_encoded = pd.get_dummies(train_data, columns=one_hot)
test_encoded = pd.get_dummies(test_data, columns=one_hot)

# Each frame is encoded on its own, so a category that occurs in only one of
# them would give train and test different feature columns. Align the test
# dummies to the training ones: categories never seen in test become all-zero
# columns, categories never seen in train are dropped. When both frames hold
# the same categories this reindex leaves the test columns unchanged.
dummy_cols = [col for col in train_encoded.columns if col not in train_data.columns]
passthrough_cols = [col for col in test_data.columns if col not in one_hot]
test_encoded = test_encoded.reindex(columns=passthrough_cols + dummy_cols, fill_value=0)

train_data, test_data = train_encoded, test_encoded

## RobustScaler

In [None]:
# Rescale the numeric columns with scikit-learn's RobustScaler.

from sklearn.preprocessing import RobustScaler

# Columns that get rescaled; every other column is left untouched.
transform = [
    'floors_before_eq(total)',
    'old_building',
    'plinth_area(ft^2)',
    'height_before_eq(ft)',
    'height_per_floor',
    'pressure',
]

# The scaler statistics are learned from the training frame only and then
# re-used unchanged for the test frame.
scaler = RobustScaler().fit(train_data[transform])

train_data[transform] = scaler.transform(train_data[transform])
test_data[transform] = scaler.transform(test_data[transform])

# Data preparation

In [None]:
# Dependent variable: the damage grade of every training row.
y = train_data["damage_grade"]
# Independent variables: a new frame holding every remaining column
# (only the target column is removed here).
X = train_data.drop(columns=["damage_grade"])

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# XGBoost

In [None]:
# Train an XGBoost classifier and show its confusion matrix on the hold-out split.

from xgboost import XGBClassifier

# Hyper-parameters for the classifier.
# NOTE(review): these look like the output of a tuning run that is not part
# of this notebook — confirm their origin.
par = {'max_depth': 9, 
       'learning_rate': 0.09171368249775058, 
       'n_estimators': 2000, 
       'min_child_weight': 1, 
       'gamma': 0.8941301203825416, 
       'subsample': 0.9235417357436105, 
       'colsample_bytree': 0.3938564995197662, 
       'reg_alpha': 0.5575094496034125, 
       'reg_lambda': 0.21515479375548496, 
       'max_delta_step': 1}

# eval_metric belongs on the estimator: passing it to fit() was deprecated in
# XGBoost 1.6 and removed in 2.0.
XGB = XGBClassifier(**par, eval_metric='mlogloss')

# Shift the labels from 1,2,3,4,5 to 0,1,2,3,4, as zero-based class ids are
# what gets fed to the classifier. The shifted labels are rebuilt from the
# untouched `y` (via the row index kept by the split) so that re-running this
# cell does not subtract 1 a second time.
y_train = y.loc[x_train.index] - 1
y_test = y.loc[x_test.index] - 1
assert len(y_train) == len(x_train) and len(y_test) == len(x_test), \
    "row index is not unique; labels no longer line up with the features"

# Only the training split is monitored, so the recorded curve is a training curve.
evalset = [(x_train,y_train)]

# verbose=100 prints the metric every 100 boosting rounds instead of flooding
# the output with one line per round (n_estimators is 2000).
History = XGB.fit(x_train, 
                  y_train, 
                  eval_set=evalset,
                  verbose=100)

xgb_pred = XGB.predict(x_test)

# Pin the label set so the matrix is always 5x5, even when a grade is absent
# from both the hold-out labels and the predictions.
n_grades = 5
cm=confusion_matrix(y_test,xgb_pred,labels=list(range(n_grades)))
# Rows/columns are displayed with the original 1-based grades.
conf_matrix=pd.DataFrame(data=cm,
                         columns=[f'Predicted:{grade}' for grade in range(1, n_grades + 1)],
                         index=[f'Actual:{grade}' for grade in range(1, n_grades + 1)])

plt.figure(figsize = (8,5))
sns.heatmap(conf_matrix, annot=True,fmt='d',cmap="YlGnBu")
plt.title("confusion Matrix for  XGBoost", fontsize=15, fontweight='bold', pad=20)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Hold-out accuracy and per-class report. Both label vectors are zero-based
# here, so class k in the report is damage grade k+1.
separator = "-"*100
xgb_accuracy = accuracy_score(y_test,xgb_pred)
xgb_report = classification_report(y_test, xgb_pred)

print(separator)
print("Accuracy Score for XGBoost :",xgb_accuracy)
print(separator)
print("\n")
print("classification report for XGBoost :\n\n",xgb_report)
print(separator)

In [None]:
# Horizontal bar chart of the ten features the fitted model ranks highest.

# Express every importance as a percentage of the largest one.
feature_importance = XGB.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# argsort is ascending, so the last ten positions are the top ten features.
top_idx = np.argsort(feature_importance)[-10:]
bar_pos = np.arange(top_idx.shape[0]) + .5

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_pos, feature_importance[top_idx], align='center')
ax.set_yticks(bar_pos)
ax.set_yticklabels(X.columns[top_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance')
plt.show()

In [None]:
# Learning curve: the multi-class log loss recorded during fitting.
# 'validation_0' is the first (and only) entry of the eval_set handed to
# fit(), which is the training split — hence the 'train' label.
results = XGB.evals_result()
train_mlogloss = results['validation_0']['mlogloss']

fig, ax = plt.subplots()
ax.plot(train_mlogloss, label='train')
ax.set_xlabel('Epochs')
ax.set_ylabel('Mean Log Loss')
ax.legend()
plt.show()

# Make submission

In [None]:
# Predict on the competition test set.
# `id` is not a model feature, so it is removed first. errors="ignore" keeps
# the cell re-runnable: after the first run the column is already gone and a
# plain drop would raise KeyError.
test_data=test_data.drop(columns="id",errors="ignore")
# The model was fitted on zero-based labels, so these predictions are
# zero-based class ids as well.
xgb_pred_test_data=XGB.predict(test_data)

In [None]:
# Replace the placeholder target of the sample submission with the predictions.
submission_sample_data=submission_sample_data.drop("damage_grade",axis=1)
xgb_pred_test_data=pd.DataFrame(xgb_pred_test_data)

# +1 converts the zero-based class ids back to damage grades.
predicted_grades = xgb_pred_test_data.iloc[:, 0].to_numpy() + 1

# Assign by position rather than by index alignment: assigning a DataFrame
# aligns on the index and would silently produce NaN if the two frames did not
# line up. A row-count mismatch is reported explicitly instead.
# NOTE(review): assumes the sample submission rows are in the same order as
# test_data — confirm.
if len(predicted_grades) != len(submission_sample_data):
    raise ValueError(
        f"got {len(predicted_grades)} predictions for "
        f"{len(submission_sample_data)} submission rows"
    )
submission_sample_data["damage_grade"]=predicted_grades
submission_sample_data.head()

In [None]:
# Sanity check: the distinct grades that ended up in the submission.
submission_sample_data.loc[:, 'damage_grade'].unique()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission_sample_data.to_csv(path_or_buf='submission.csv', index=False)