In [2]:
# import packages for pipeline
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import eli5

In [3]:
# import and merge data
train_values = pd.read_csv('data/train_values.csv')
train_labels = pd.read_csv('data/train_labels.csv')
train_data = train_values.merge(train_labels, left_on='building_id', right_on='building_id')

In [5]:
# check data types
train_data.dtypes

building_id                                int64
geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_c

In [6]:
# dropping id for model training, and doing data prep
train_data = train_data.drop('building_id', axis=1) # axis 1 is to drop a column (0 would be for rows)
numeric_features = train_data.select_dtypes(include=['int64', 'float64']).drop(['damage_grade'], axis=1).columns
categorical_features = train_data.select_dtypes(include=['object']).columns
X = train_data.drop('damage_grade', axis=1)
y = train_data['damage_grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # test size does percentage split for train / test (20% in this case)

In [7]:
# pipeline that imputes any missing values, applies a standard scaler to the numerical features, converts any categorical features into numerical and then fits a classifier
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot', OneHotEncoder())])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',  LogisticRegression(class_weight='balanced', random_state=0))])
    
model = pipe.fit(X_train, y_train)



In [8]:
# inspect model quality
target_names = y_test.unique().astype(str)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

           3       0.45      0.61      0.52      5002
           1       0.63      0.64      0.63     29669
           2       0.49      0.44      0.46     17450

    accuracy                           0.57     52121
   macro avg       0.52      0.56      0.54     52121
weighted avg       0.57      0.57      0.57     52121



In [9]:
# one hot encoding to add cat variables to numerical feature list
onehot_columns = list(pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['one_hot'].get_feature_names(input_features=categorical_features))
numeric_features_list = list(numeric_features)
numeric_features_list.extend(onehot_columns)

In [10]:
# feature importance
eli5.explain_weights(pipe.named_steps['classifier'], top=50, feature_names=numeric_features_list)

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+0.520,plan_configuration_o,
+0.497,plan_configuration_n,
+0.363,other_floor_type_s,
+0.352,position_s,
+0.272,foundation_type_i,
+0.261,foundation_type_h,
+0.242,geo_level_1_id,
+0.163,ground_floor_type_v,
+0.157,has_secondary_use_hotel,
+0.152,has_superstructure_timber,

Weight?,Feature
+0.520,plan_configuration_o
+0.497,plan_configuration_n
+0.363,other_floor_type_s
+0.352,position_s
+0.272,foundation_type_i
+0.261,foundation_type_h
+0.242,geo_level_1_id
+0.163,ground_floor_type_v
+0.157,has_secondary_use_hotel
+0.152,has_superstructure_timber

Weight?,Feature
+0.447,plan_configuration_m
+0.445,foundation_type_w
+0.428,position_o
+0.367,ground_floor_type_m
+0.260,plan_configuration_f
+0.258,other_floor_type_q
+0.239,has_secondary_use
+0.232,plan_configuration_c
+0.172,legal_ownership_status_v
+0.162,foundation_type_u

Weight?,Feature
+0.859,plan_configuration_q
+0.527,foundation_type_h
+0.476,foundation_type_r
+0.357,plan_configuration_d
+0.236,ground_floor_type_f
+0.233,count_floors_pre_eq
+0.226,position_t
+0.185,ground_floor_type_x
+0.185,foundation_type_u
+0.162,has_secondary_use_agriculture
