In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Remove pandas' display truncation: full cell text, every column, every row.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

import math
import scipy

# Data visualisation: plotting libraries and their global styles
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
import missingno as msno

In [2]:
# Load the train/test CSVs. The Kaggle input directory is tried first; when
# those files do not exist (e.g. a local run) the copies under
# ../2-DataInterpolation are used instead.
try:
    train_data = pd.read_csv("/kaggle/input/penyisihan-joints-data-competition-2023/train.csv", low_memory=False)
    test_data = pd.read_csv("/kaggle/input/penyisihan-joints-data-competition-2023/test.csv", low_memory=False) # For Kaggle

except FileNotFoundError:
    # Only a missing file triggers the fallback. Any other failure (parse
    # error, interrupt, out-of-memory) is allowed to surface instead of being
    # masked by silently loading a different dataset.
    train_data = pd.read_csv("../2-DataInterpolation/train_data.csv", low_memory=False)
    test_data = pd.read_csv("../2-DataInterpolation/test_data.csv", low_memory=False) # For local

In [3]:
# One-column table listing the dtype pandas inferred for each training column.
train_data.dtypes.to_frame(name='Data Type')

Unnamed: 0,Data Type
floors_before_eq(total),float64
old_building,float64
plinth_area(ft^2),float64
height_before_eq(ft),float64
land_surface_condition,object
type_of_foundation,object
type_of_roof,object
type_of_ground_floor,object
type_of_other_floor,object
position,float64


In [4]:
# One-column table listing the dtype pandas inferred for each test column.
test_data.dtypes.to_frame(name='Data Type')

Unnamed: 0,Data Type
id,int64
floors_before_eq(total),int64
old_building,int64
plinth_area(ft^2),float64
height_before_eq(ft),int64
land_surface_condition,object
type_of_foundation,object
type_of_roof,object
type_of_ground_floor,object
type_of_other_floor,object


## Change Data Type

In [5]:
# Cast the listed training columns to int, one after another.
# NOTE(review): astype(int) drops any fractional part, and 'plinth_area(ft^2)'
# remains float64 in test_data according to the dtype listing above - confirm
# that this train/test difference is intended.
for column in (
    'floors_before_eq(total)',
    'old_building',
    'plinth_area(ft^2)',
    'height_before_eq(ft)',
    'position',
    'has_secondary_use',
    'no_family_residing',
    'flexible_superstructure',
):
    train_data[column] = train_data[column].astype(int)

In [6]:
# Re-display the training dtypes to confirm the integer casts took effect.
train_data.dtypes.to_frame(name='Data Type')

Unnamed: 0,Data Type
floors_before_eq(total),int64
old_building,int64
plinth_area(ft^2),int64
height_before_eq(ft),int64
land_surface_condition,object
type_of_foundation,object
type_of_roof,object
type_of_ground_floor,object
type_of_other_floor,object
position,int64


## Add a feature for the height per floor

In [10]:
# height_per_floor = building height / number of floors, scaled by 0.1,
# added to both the training and the test frame.
# NOTE(review): the 0.1 scale factor has no stated rationale, and a floor count
# of 0 would presumably produce inf/NaN here - confirm the data excludes it.
for frame in (train_data, test_data):
    storey_height = frame['height_before_eq(ft)'].div(frame['floors_before_eq(total)'])
    frame['height_per_floor'] = storey_height.mul(0.1)

In [11]:
# Summary statistics of the new feature on the training set (sanity check).
train_data.loc[:, 'height_per_floor'].describe()

count    335455.000000
mean          0.792138
std           0.209394
min           0.100000
25%           0.666667
50%           0.750000
75%           0.900000
max           9.900000
Name: height_per_floor, dtype: float64

## Sort ordinal variables by damage grade

In [15]:
def sort_by_damagegrade(df, cols):
    """Rank the categories of one column from least to most damaging.

    For every distinct value of ``df[cols]`` the share of its rows (in
    percent, rounded to three decimals) that falls into each damage grade
    1-5 is computed. The shares are combined into one score,
    ``1*share_1 + 2*share_2 + ... + 5*share_5``, and the categories are
    returned in ascending order of that score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cols`` and a ``'damage_grade'`` column.
    cols : str
        Name of the single categorical column to rank.

    Returns
    -------
    pandas.Index
        The distinct, non-missing values of ``df[cols]``, lowest score first.
        Missing values (NaN/None) are not categories and are left out.

    Raises
    ------
    KeyError
        If ``cols`` or ``'damage_grade'`` is not a column of ``df``. This is
        raised rather than swallowed, because an ordering computed without
        the target column would be meaningless.
    """
    grades = [1, 2, 3, 4, 5]
    damage_grade = df['damage_grade']

    # One 0/1 indicator column per grade. The per-category mean of an indicator
    # is the fraction of that category's rows with that grade; rows whose grade
    # is missing or outside 1-5 still count in the denominator.
    grade_hits = pd.DataFrame(
        {grade: damage_grade.eq(grade).astype(float) for grade in grades}
    )
    share = (
        grade_hits
        .groupby(df[cols], sort=False, observed=True)
        .mean()
        .mul(100)
        .round(3)
    )

    # Weight each grade's share by the grade itself: a higher score means the
    # category is associated with heavier damage.
    damagepoint = share.mul(grades, axis='columns').sum(axis='columns')

    # Stable sort keeps first-appearance order among categories with equal scores.
    ranked = damagepoint.sort_values(ascending=True, kind='mergesort')
    return ranked.index.rename(None)

### land_surface_condition

In [16]:
# Ordinal-encode land_surface_condition: categories are numbered 0, 1, 2, ...
# in the order returned by sort_by_damagegrade (computed on the training set),
# and the same numbering is written into the test set.
column = 'land_surface_condition'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

In [17]:
# Distinct values left in the training column after encoding.
pd.unique(train_data['land_surface_condition'])

array([0, 1, 2], dtype=object)

### type_of_foundation

In [18]:
# Ordinal-encode type_of_foundation: each category is replaced by its rank in
# the ordering returned by sort_by_damagegrade (computed on the training set);
# the test set receives the same codes.
column = 'type_of_foundation'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

train_data[column].unique()

array([1, 4, 0, 2, 3], dtype=object)

### type_of_roof

In [19]:
# Ordinal-encode type_of_roof: each category is replaced by its rank in the
# ordering returned by sort_by_damagegrade (computed on the training set);
# the test set receives the same codes.
column = 'type_of_roof'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

train_data[column].unique()

array([1, 2, 0], dtype=object)

### type_of_ground_floor	

In [20]:
# Ordinal-encode type_of_ground_floor: each category is replaced by its rank in
# the ordering returned by sort_by_damagegrade (computed on the training set);
# the test set receives the same codes.
column = 'type_of_ground_floor'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

train_data[column].unique()

array([4, 3, 0, 1, 2], dtype=object)

### type_of_other_floor

In [21]:
# Ordinal-encode type_of_other_floor: each category is replaced by its rank in
# the ordering returned by sort_by_damagegrade (computed on the training set);
# the test set receives the same codes.
column = 'type_of_other_floor'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

train_data[column].unique()

array([3, 1, 0, 2], dtype=object)

### type_of_reinforcement_concrete

In [22]:
# Ordinal-encode type_of_reinforcement_concrete: each category is replaced by
# its rank in the ordering returned by sort_by_damagegrade (computed on the
# training set); the test set receives the same codes.
column = 'type_of_reinforcement_concrete'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

train_data[column].unique()

array([3, 0, 2, 1], dtype=object)

### wall_binding

In [23]:
# Ordinal-encode wall_binding: each category is replaced by its rank in the
# ordering returned by sort_by_damagegrade (computed on the training set);
# the test set receives the same codes.
column = 'wall_binding'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

train_data[column].unique()

array([1, 5, 0, 4, 3, 2], dtype=object)

### wall_material

In [24]:
# Ordinal-encode wall_material: each category is replaced by its rank in the
# ordering returned by sort_by_damagegrade (computed on the training set);
# the test set receives the same codes.
column = 'wall_material'
for code, category in enumerate(sort_by_damagegrade(train_data, column)):
    for frame in (train_data, test_data):
        frame.loc[frame[column] == category, column] = code

train_data[column].unique()

array([1, 3, 0, 2], dtype=object)

## Add a feature that approximates the distribution of earthquake intensity

In [25]:
# Proxy feature built from plinth area and height, both converted to metric units.

gravity = 9.8

# NOTE(review): despite the "cubic" in its name, this value is the ft^2 -> m^2
# factor (0.3048 ** 2 ~= 0.092903). The name is kept because it is a
# notebook-level global.
ft_to_meter_cubic = 0.092903
ft_to_meter = 0.3048

# gravity * area [m^2] * height [m], stored under the column name 'pressure'
# in both the training and the test frame.
for frame in (train_data, test_data):
    area_m2 = frame['plinth_area(ft^2)'] * ft_to_meter_cubic
    height_m = frame['height_before_eq(ft)'] * ft_to_meter
    frame['pressure'] = gravity * area_m2 * height_m

In [26]:
# Report (rows, columns) of both frames after feature engineering.
print(
    f"Train data shape: {train_data.shape}",
    f"Test data shape: {test_data.shape}",
    sep="\n",
)

Train data shape: (335455, 30)
Test data shape: (242082, 30)


## Balancing the data

In [27]:
print(f"Train data shape: {train_data.shape}")

# Under-sample so that every damage grade contributes the same number of rows.
# The target size is the size of the rarest grade. The previous version assumed
# grade 1 was the rarest, and .sample(n=...) raises ValueError when asked for
# more rows than a grade has.
grade_frames = {
    grade: train_data[train_data['damage_grade'] == grade]
    for grade in (1, 2, 3, 4, 5)
}
samples_per_grade = min(len(frame) for frame in grade_frames.values())

# Grade 1 keeps all its rows in their original order when it is already the
# rarest grade; it is only down-sampled when some other grade is rarer.
one_damage_grade = grade_frames[1]
if len(one_damage_grade) > samples_per_grade:
    one_damage_grade = one_damage_grade.sample(n=samples_per_grade, random_state=42)

# Fixed random_state keeps the drawn subset reproducible across runs.
two_damage_grade = grade_frames[2].sample(n=samples_per_grade, random_state=42)
three_damage_grade = grade_frames[3].sample(n=samples_per_grade, random_state=42)
four_damage_grade = grade_frames[4].sample(n=samples_per_grade, random_state=42)
five_damage_grade = grade_frames[5].sample(n=samples_per_grade, random_state=42)

balanced_train = pd.concat(
    [one_damage_grade, two_damage_grade, three_damage_grade, four_damage_grade, five_damage_grade],
    ignore_index=True,
    sort=False,
)

print(f"Train data shape: {balanced_train.shape}")

## Export the cleaned data to csv file

In [28]:
# Write both processed frames to CSV in the working directory, without the row index.
# NOTE(review): balanced_train from the balancing step is not written here -
# confirm that exporting the unbalanced train_data is intended.
for filename, frame in (('train_data.csv', train_data), ('test_data.csv', test_data)):
    frame.to_csv(filename, index=False)