In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sb

In [2]:
# Load the building records from a CSV file in the working directory.
# A later cell reads quakeData['damage_grade'], so this file is expected to
# contain the target column alongside the features.
quakeData = pd.read_csv('train_values.csv')
# Display (rows, columns) as a quick sanity check on the load.
quakeData.shape

(260601, 40)

In [3]:
# Identifier, geographic, numeric and secondary-use columns that are left out of
# this analysis; the remaining columns form the working frame.
columns = [
    'building_id',
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'legal_ownership_status',
    'count_families',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]
# drop() returns a new frame, so quakeData itself keeps every column.
quakeData_cat = quakeData.drop(columns=columns)

In [4]:
# Preview the first rows of the frame that remains after the column drop above.
quakeData_cat.head()

Unnamed: 0,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,damage_grade
0,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,3
1,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,2
2,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,3
3,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,2
4,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,3


In [5]:
# Print each column's dtype and non-null count (a quick check for missing values).
quakeData_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 19 columns):
land_surface_condition                    260601 non-null object
foundation_type                           260601 non-null object
roof_type                                 260601 non-null object
ground_floor_type                         260601 non-null object
other_floor_type                          260601 non-null object
position                                  260601 non-null object
plan_configuration                        260601 non-null object
has_superstructure_adobe_mud              260601 non-null int64
has_superstructure_mud_mortar_stone       260601 non-null int64
has_superstructure_stone_flag             260601 non-null int64
has_superstructure_cement_mortar_stone    260601 non-null int64
has_superstructure_mud_mortar_brick       260601 non-null int64
has_superstructure_cement_mortar_brick    260601 non-null int64
has_superstructure_timber                 260601 n

## Explore Categorical Variables 

In [6]:
# Cast the target and the listed predictors to the generic object dtype in a
# single call, so that the dtype-based filter in the next cell picks them up as
# categorical. roof_type and other_floor_type are not listed here; info() above
# already reports them as object.
quakeData_cat = quakeData_cat.astype(dict.fromkeys([
    "damage_grade",
    "land_surface_condition",
    "foundation_type",
    "ground_floor_type",
    "position",
    "plan_configuration",
    "has_superstructure_adobe_mud",
    "has_superstructure_mud_mortar_stone",
    "has_superstructure_stone_flag",
    "has_superstructure_cement_mortar_stone",
    "has_superstructure_mud_mortar_brick",
    "has_superstructure_cement_mortar_brick",
    "has_superstructure_timber",
    "has_superstructure_bamboo",
    "has_superstructure_rc_non_engineered",
    "has_superstructure_rc_engineered",
    "has_superstructure_other",
], 'object'))

In [7]:
# find categorical variables

# A column is treated as categorical when its dtype is the generic object dtype ('O').
categorical = [
    name
    for name, col_dtype in quakeData_cat.dtypes.items()
    if col_dtype == 'O'
]

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :\n\n', categorical)

There are 19 categorical variables

The categorical variables are :

 ['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 'other_floor_type', 'position', 'plan_configuration', 'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag', 'has_superstructure_cement_mortar_stone', 'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick', 'has_superstructure_timber', 'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered', 'has_superstructure_rc_engineered', 'has_superstructure_other', 'damage_grade']


In [8]:
# Preview the first rows, restricted to the columns collected in `categorical`.

quakeData_cat[categorical].head()

Unnamed: 0,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,damage_grade
0,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,3
1,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,2
2,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,3
3,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,2
4,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,3


### `damage_grade` is the target (response) variable; the other 18 of the 19 categorical variables are predictors.

In [9]:
# Count missing values per categorical column (isnull() marks them, sum() counts them).

quakeData_cat[categorical].isnull().sum()

land_surface_condition                    0
foundation_type                           0
roof_type                                 0
ground_floor_type                         0
other_floor_type                          0
position                                  0
plan_configuration                        0
has_superstructure_adobe_mud              0
has_superstructure_mud_mortar_stone       0
has_superstructure_stone_flag             0
has_superstructure_cement_mortar_stone    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_brick    0
has_superstructure_timber                 0
has_superstructure_bamboo                 0
has_superstructure_rc_non_engineered      0
has_superstructure_rc_engineered          0
has_superstructure_other                  0
damage_grade                              0
dtype: int64

In [10]:
# view frequency counts of values in categorical variables

# One value_counts() table per column: how many rows carry each label.
for column_name in categorical:
    label_counts = quakeData_cat[column_name].value_counts()
    print(label_counts)

t    216757
n     35528
o      8316
Name: land_surface_condition, dtype: int64
r    219196
w     15118
u     14260
i     10579
h      1448
Name: foundation_type, dtype: int64
n    182842
q     61576
x     16183
Name: roof_type, dtype: int64
f    209619
x     24877
v     24593
z      1004
m       508
Name: ground_floor_type, dtype: int64
q    165282
x     43448
j     39843
s     12028
Name: other_floor_type, dtype: int64
s    202090
t     42896
j     13282
o      2333
Name: position, dtype: int64
d    250072
q      5692
u      3649
s       346
c       325
a       252
o       159
m        46
n        38
f        22
Name: plan_configuration, dtype: int64
0    237500
1     23101
Name: has_superstructure_adobe_mud, dtype: int64
1    198561
0     62040
Name: has_superstructure_mud_mortar_stone, dtype: int64
0    251654
1      8947
Name: has_superstructure_stone_flag, dtype: int64
0    255849
1      4752
Name: has_superstructure_cement_mortar_stone, dtype: int64
0    242840
1     17761
Name: 

In [11]:
# view frequency distribution of categorical variables

# Divide each label count by the total number of rows to get that label's share.
# Python 3 `/` is already true division, so no float cast is needed; the former
# np.float(...) wrapper relied on an alias that was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, where it raises AttributeError.
n_rows = len(quakeData_cat)

for var in categorical:
    print(quakeData_cat[var].value_counts() / n_rows)

t    0.831758
n    0.136331
o    0.031911
Name: land_surface_condition, dtype: float64
r    0.841117
w    0.058012
u    0.054720
i    0.040595
h    0.005556
Name: foundation_type, dtype: float64
n    0.701617
q    0.236285
x    0.062099
Name: roof_type, dtype: float64
f    0.804368
x    0.095460
v    0.094370
z    0.003853
m    0.001949
Name: ground_floor_type, dtype: float64
q    0.634234
x    0.166722
j    0.152889
s    0.046155
Name: other_floor_type, dtype: float64
s    0.775477
t    0.164604
j    0.050967
o    0.008952
Name: position, dtype: float64
d    0.959597
q    0.021842
u    0.014002
s    0.001328
c    0.001247
a    0.000967
o    0.000610
m    0.000177
n    0.000146
f    0.000084
Name: plan_configuration, dtype: float64
0    0.911355
1    0.088645
Name: has_superstructure_adobe_mud, dtype: float64
1    0.761935
0    0.238065
Name: has_superstructure_mud_mortar_stone, dtype: float64
0    0.965668
1    0.034332
Name: has_superstructure_stone_flag, dtype: float64
0    0.981765

### Number of labels: Cardinality
### The number of distinct labels within a categorical variable is known as its cardinality. A variable with a large number of labels is said to have high cardinality. High cardinality can cause serious problems for a machine learning model (for example, one-hot encoding creates one new column per label), so I will check the cardinality of each variable.

In [12]:
# check for cardinality in categorical variables
# Cardinality here is the number of distinct values returned by unique() for each column.
for var in categorical:
    n_labels = len(quakeData_cat[var].unique())
    print(var, ' contains ', n_labels, ' labels')

land_surface_condition  contains  3  labels
foundation_type  contains  5  labels
roof_type  contains  3  labels
ground_floor_type  contains  5  labels
other_floor_type  contains  4  labels
position  contains  4  labels
plan_configuration  contains  10  labels
has_superstructure_adobe_mud  contains  2  labels
has_superstructure_mud_mortar_stone  contains  2  labels
has_superstructure_stone_flag  contains  2  labels
has_superstructure_cement_mortar_stone  contains  2  labels
has_superstructure_mud_mortar_brick  contains  2  labels
has_superstructure_cement_mortar_brick  contains  2  labels
has_superstructure_timber  contains  2  labels
has_superstructure_bamboo  contains  2  labels
has_superstructure_rc_non_engineered  contains  2  labels
has_superstructure_rc_engineered  contains  2  labels
has_superstructure_other  contains  2  labels
damage_grade  contains  3  labels


In [13]:
# Separate predictors from the target: remove damage_grade from the working
# frame to form X, and take y from the untouched quakeData frame.
column2 = ['damage_grade']
quakeData_cat = quakeData_cat.drop(columns=column2)

X, y = quakeData_cat, quakeData['damage_grade']

In [14]:
# Preview the predictor frame; damage_grade should no longer appear among the columns.
X.head()

Unnamed: 0,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other
0,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0
1,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0
2,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0
3,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0
4,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0


In [15]:
# Preview the target series taken from quakeData['damage_grade'].
y.head()

0    3
1    2
2    3
3    2
4    3
Name: damage_grade, dtype: int64

In [16]:
# split X and y into training and testing sets

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [17]:
# Show (rows, columns) for both splits; the column counts should be equal.

X_train.shape, X_test.shape

((182420, 18), (78181, 18))

### Feature Engineering is the process of transforming raw data into useful features that help us to understand our model better and increase its predictive power. I will carry out feature engineering on different types of variables. First, I will display the categorical and numerical variables again separately.

In [18]:
# List the dtype of every X_train column; the categorical/numerical filters
# in the following cells are based on these dtypes.

X_train.dtypes

land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud              object
has_superstructure_mud_mortar_stone       object
has_superstructure_stone_flag             object
has_superstructure_cement_mortar_stone    object
has_superstructure_mud_mortar_brick       object
has_superstructure_cement_mortar_brick    object
has_superstructure_timber                 object
has_superstructure_bamboo                 object
has_superstructure_rc_non_engineered      object
has_superstructure_rc_engineered          object
has_superstructure_other                  object
dtype: object

In [19]:
# display categorical variables

# Collect the X_train columns whose dtype is the generic object dtype ('O').
categorical = [
    name
    for name, col_dtype in X_train.dtypes.items()
    if col_dtype == 'O'
]

categorical

['land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'has_superstructure_adobe_mud',
 'has_superstructure_mud_mortar_stone',
 'has_superstructure_stone_flag',
 'has_superstructure_cement_mortar_stone',
 'has_superstructure_mud_mortar_brick',
 'has_superstructure_cement_mortar_brick',
 'has_superstructure_timber',
 'has_superstructure_bamboo',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_rc_engineered',
 'has_superstructure_other']

In [23]:
# display numerical variables

# Collect the X_train columns whose dtype is NOT the generic object dtype.
# NOTE(review): the X_train.dtypes cell above reports every column as object, so on
# a fresh top-to-bottom run this list should be empty. The list recorded under this
# cell names columns that were dropped near the top of the notebook, so it appears
# to come from a different kernel state (the execution count is also out of
# sequence). Re-run the notebook from a clean kernel to refresh it.
numerical = [col for col in X_train.columns if X_train[col].dtypes != 'O']

numerical

['building_id',
 'geo_level_1_id',
 'geo_level_2_id',
 'geo_level_3_id',
 'count_floors_pre_eq',
 'age',
 'area_percentage',
 'height_percentage',
 'count_families',
 'has_secondary_use',
 'has_secondary_use_other']

In [20]:
# Show the share of missing values per categorical column in the training set.
# isnull().mean() yields a fraction between 0 and 1 (not a 0-100 percentage).

X_train[categorical].isnull().mean()

land_surface_condition                    0.0
foundation_type                           0.0
roof_type                                 0.0
ground_floor_type                         0.0
other_floor_type                          0.0
position                                  0.0
plan_configuration                        0.0
has_superstructure_adobe_mud              0.0
has_superstructure_mud_mortar_stone       0.0
has_superstructure_stone_flag             0.0
has_superstructure_cement_mortar_stone    0.0
has_superstructure_mud_mortar_brick       0.0
has_superstructure_cement_mortar_brick    0.0
has_superstructure_timber                 0.0
has_superstructure_bamboo                 0.0
has_superstructure_rc_non_engineered      0.0
has_superstructure_rc_engineered          0.0
has_superstructure_other                  0.0
dtype: float64

In [21]:
# Count missing values per categorical column in the test set
# (absolute counts here, whereas the training-set check above shows fractions).

X_test[categorical].isnull().sum()

land_surface_condition                    0
foundation_type                           0
roof_type                                 0
ground_floor_type                         0
other_floor_type                          0
position                                  0
plan_configuration                        0
has_superstructure_adobe_mud              0
has_superstructure_mud_mortar_stone       0
has_superstructure_stone_flag             0
has_superstructure_cement_mortar_stone    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_brick    0
has_superstructure_timber                 0
has_superstructure_bamboo                 0
has_superstructure_rc_non_engineered      0
has_superstructure_rc_engineered          0
has_superstructure_other                  0
dtype: int64

In [22]:
#Encode categorical variables
import category_encoders as ce

In [23]:
# encode remaining variables with one-hot encoding

# Only the columns listed in `cols` are expanded into indicator columns.
encoder = ce.OneHotEncoder(
    cols=[
        'land_surface_condition',
        'foundation_type',
        'roof_type',
        'ground_floor_type',
        'other_floor_type',
        'position',
        'plan_configuration',
    ]
)

# Learn the label -> indicator mapping from the training split only, then reuse
# that same mapping for the test split.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [24]:
# Preview the encoded training frame to inspect the new indicator columns.
X_train.head()

Unnamed: 0,land_surface_condition_1,land_surface_condition_2,land_surface_condition_3,foundation_type_1,foundation_type_2,foundation_type_3,foundation_type_4,foundation_type_5,roof_type_1,roof_type_2,...,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other
244903,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
93952,1,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
48580,1,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
114125,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
175913,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0


In [25]:
# Shape of the training frame after encoding (the column count grows with the indicator columns).
X_train.shape

(182420, 45)

In [26]:
# Shape of the test frame after encoding; its column count should match X_train's.
X_test.shape

(78181, 45)

### We now have the training and testing sets ready for model building. Before that, we should map all the feature variables onto the same scale; this is called feature scaling.

In [27]:
from sklearn.preprocessing import RobustScaler

# Remember the column labels: the scaler returns plain NumPy arrays.
cols = X_train.columns

# Fit the scaler on the training split only, then apply the same scaling to the
# test split.
scaler = RobustScaler()

# Rebuild the DataFrames around the scaled arrays.
#  * columns=cols (not [cols]): wrapping the Index in a list makes pandas build
#    single-level MultiIndex columns, so every label becomes a 1-tuple.
#  * index=...: keep the original row labels so X_train / X_test stay aligned
#    with y_train / y_test, which still carry those labels.
# The right-hand sides are evaluated before the names are rebound, so
# X_train.index / X_test.index still refer to the unscaled frames here.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index)
X_train.head()

Unnamed: 0,land_surface_condition_1,land_surface_condition_2,land_surface_condition_3,foundation_type_1,foundation_type_2,foundation_type_3,foundation_type_4,foundation_type_5,roof_type_1,roof_type_2,...,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,-1.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


### We now have the X_train dataset ready to be fed into the Gaussian Naive Bayes classifier.

In [30]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB

# Create the classifier with its default settings and fit it in one step;
# fit() returns the estimator itself, so `gnb` is the fitted model.
gnb = GaussianNB().fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so its repr is displayed.
gnb

GaussianNB(priors=None, var_smoothing=1e-09)

In [31]:
# Predict a damage grade for every row of the held-out test set.
y_pred = gnb.predict(X_test)

# Print the (abbreviated) array of predicted labels.
print(y_pred)

[3 3 3 ... 3 3 3]


In [32]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label, printed to 4 decimals.
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.4001


In [33]:
# Predict on the training data as well, so that train-set and test-set accuracy
# can be compared: a large gap between the two would point to overfitting.

y_pred_train = gnb.predict(X_train)

y_pred_train

array([1, 3, 3, ..., 3, 3, 3])

In [34]:
# Accuracy on the training set, for comparison with the test-set accuracy above.
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

Training-set accuracy score: 0.3985


In [35]:
# print the scores on training and test set
# (gnb.score() is expected to report the same accuracy figures as the
# accuracy_score() calls above; the recorded outputs agree.)

print('Training set score: {:.4f}'.format(gnb.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(gnb.score(X_test, y_test)))

Training set score: 0.3985
Test set score: 0.4001


In [36]:
# Count how many test rows fall into each damage grade; the largest count is
# what the null-accuracy baseline in the next cell is built on.

y_test.value_counts()

2    44423
3    26301
1     7457
Name: damage_grade, dtype: int64

In [37]:
# check null accuracy score

# Null accuracy is the accuracy obtained by always predicting the most frequent
# class. Compute it from y_test rather than hard-coding class counts copied from
# a previous output, so the figure stays correct if the data or the split
# (test_size / random_state) changes.
null_accuracy = y_test.value_counts().max() / len(y_test)

print('Null accuracy score: {0:0.4f}'. format(null_accuracy))

Null accuracy score: 0.5682


In [41]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test set; a single overall
# accuracy figure does not show how the individual classes fare.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.32      0.65      0.43      7457
           2       0.71      0.06      0.11     44423
           3       0.40      0.90      0.55     26301

    accuracy                           0.40     78181
   macro avg       0.48      0.54      0.37     78181
weighted avg       0.57      0.40      0.29     78181

