## Import Libraries and Data

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Specify dtype option on import or set low_memory=False.
pd.options.mode.chained_assignment = None  # default='warn'

import math
import scipy

# Datviz purposes
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px

In [5]:
try :
    train_data = pd.read_csv("/kaggle/input/penyisihan-joints-data-competition-2023/train.csv", low_memory=False)
    test_data = pd.read_csv("/kaggle/input/penyisihan-joints-data-competition-2023/test.csv", low_memory=False) # For Kaggle

except :
    train_data = pd.read_csv("../datasets/train.csv", low_memory=False)
    test_data = pd.read_csv("../datasets/test.csv", low_memory=False) # For local development

In [6]:
print(train_data.isnull().sum())
print("\n",train_data.shape)

Unnamed: 0                             0
floors_before_eq (total)          332806
old_building                      239204
plinth_area (ft^2)                421208
height_before_eq (ft)             332806
land_surface_condition            301606
type_of_foundation                239204
type_of_roof                      421208
type_of_ground_floor              332806
type_of_other_floor               301606
position                          312006
building_plan_configuration       301606
technical_solution_proposed       676014
legal_ownership_status            124802
has_secondary_use                 197604
type_of_reinforcement_concrete    291206
residential_type                  270404
no_family_residing                145602
public_place_type                      0
industrial_use_type               114402
govermental_use_type              249604
flexible_superstructure            62400
wall_binding                       62400
wall_material                     228804
damage_grade    

In [7]:
print(test_data.isnull().sum())
print("\n",test_data.shape)

id                                0
floors_before_eq (total)          0
old_building                      0
plinth_area (ft^2)                0
height_before_eq (ft)             0
land_surface_condition            0
type_of_foundation                0
type_of_roof                      0
type_of_ground_floor              0
type_of_other_floor               0
position                          0
building_plan_configuration       0
technical_solution_proposed       0
legal_ownership_status            0
has_secondary_use                 0
type_of_reinforcement_concrete    0
residential_type                  0
no_family_residing                0
public_place_type                 0
industrial_use_type               0
govermental_use_type              0
flexible_superstructure           0
wall_binding                      0
wall_material                     0
dtype: int64

 (242082, 24)


## Data Preprocessing

In [8]:
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [9]:
train_data.head(1).transpose()

Unnamed: 0,0
floors_before_eq (total),floor two
old_building,1.0
plinth_area (ft^2),256 ft^2
height_before_eq (ft),22.0
land_surface_condition,Flat
type_of_foundation,Bamboo or Timber
type_of_roof,Bamboo/Timber Light roof
type_of_ground_floor,Clay
type_of_other_floor,TImber/Bamboo-Mud
position,Not attached


### lower case and replace spaces

In [10]:
# remove the space in column names
train_data.columns = train_data.columns.str.replace(' ', '')
test_data.columns = test_data.columns.str.replace(' ', '')

In [11]:
string_columns = ['floors_before_eq(total)','land_surface_condition','type_of_foundation','type_of_roof','type_of_ground_floor',
                  'type_of_other_floor','position','building_plan_configuration' ,'technical_solution_proposed',
                  'legal_ownership_status','residential_type','public_place_type','industrial_use_type',
                  'govermental_use_type','flexible_superstructure']

In [12]:
# Make every values in every column to lowercase
def make_lower_case(column):
    column = str(column)
    return column.lower()

for col in string_columns:
    train_data[col] = train_data[col].apply(make_lower_case)

for col in string_columns:
    test_data[col] = test_data[col].apply(make_lower_case)

### floors column to be an integer

In [13]:
one_floors = ['one', '1']
two_floors = ['two', '2', 'second']
three_floors = ['three', '3', 'third']
four_floors = ['four', '4']
five_floors = ['five', '5', 'fifth']
six_floors = ['six', '6']
seven_floors = ['seven', '7']
eight_floors = ['eight', '8']
nine_floors = ['nine', '9']

def handle_floors_before_eq(total):
    if any(word in total for word in one_floors):
        return 1
    elif any(word in total for word in two_floors):
        return 2
    elif any(word in total for word in three_floors):
        return 3
    elif any(word in total for word in four_floors):
        return 4
    elif any(word in total for word in five_floors):
        return 5
    elif any(word in total for word in six_floors):
        return 6
    elif any(word in total for word in seven_floors):
        return 7
    elif any(word in total for word in eight_floors):
        return 8
    elif any(word in total for word in nine_floors):
        return 9
    else:
        return total

train_data['floors_before_eq(total)'] = train_data['floors_before_eq(total)'].apply(handle_floors_before_eq)
test_data['floors_before_eq(total)'] = test_data['floors_before_eq(total)'].apply(handle_floors_before_eq)

### 'plinth_area_sq_ft'

In [14]:
def handle_area(area):
    area = str(area)
    area = area.split(' ')[0]
    return area

train_data['plinth_area(ft^2)'] = train_data['plinth_area(ft^2)'].apply(handle_area)
test_data['plinth_area(ft^2)'] = test_data['plinth_area(ft^2)'].apply(handle_area)

### 'legal_ownership_status'

In [15]:
def make_eda_using_correlation_within_damagegrade(df, cols):
    correlate = pd.DataFrame(
    index=['1', '2','3','4','5'], 
    columns=df[cols].unique())

    for j  in df[cols].unique():
        try : 
            
            correlate.loc[correlate.index == '1', j]= \
                np.round(sum((df[cols]==j)&\
                            (df['damage_grade']==1))/sum(df[cols]==j) * 100, 3)
            
            correlate.loc[correlate.index == '2', j]= \
                np.round(sum((df[cols]==j)&\
                            (df['damage_grade']==2))/sum(df[cols]==j) * 100, 3)
            
            correlate.loc[correlate.index == '3', j]= \
                np.round(sum((df[cols]==j)&\
                            (df['damage_grade']==3))/sum(df[cols]==j) * 100, 3)
            
            correlate.loc[correlate.index == '4', j]= \
                np.round(sum((df[cols]==j)&\
                            (df['damage_grade']==4))/sum(df[cols]==j) * 100, 3)
            
            correlate.loc[correlate.index == '5', j]= \
                np.round(sum((df[cols]==j)&\
                            (df['damage_grade']==5))/sum(df[cols]==j) * 100, 3)
            
        except :
            pass
    return correlate

In [16]:
make_eda_using_correlation_within_damagegrade(train_data, 'legal_ownership_status')

Unnamed: 0,private use,private,nan,prvt,public space,privste,public,unknown,public use,institutionals,unspecified,institutional use,other,institutional
1,9.817,9.911,10.136,9.577,32.164,9.786,29.237,18.866,30.972,5.037,19.163,3.372,16.316,4.139
2,11.158,11.304,11.455,11.197,22.234,11.666,22.394,11.339,22.09,4.835,12.281,5.246,15.175,5.701
3,17.88,17.857,17.791,18.101,19.216,18.304,20.929,14.272,20.061,13.23,16.734,13.911,15.088,13.354
4,24.578,24.323,24.194,23.994,11.426,24.313,13.337,18.084,12.366,29.684,16.329,28.618,15.439,28.7
5,36.567,36.605,36.424,37.131,14.96,35.93,14.103,37.439,14.51,47.213,35.493,48.852,37.982,48.106


In [17]:
private_values = ['private use','private','prvt','privste']
public_values = ['public space','public','public use']
institutional_values = ['institutionals','institutional use','institutional']
others_values = ['other','unknown','unspecified']

def handle_legal_ownership_status(status):
    if status in private_values:
        return 'private'
    elif status in public_values:
        return 'public'
    elif status in institutional_values:
        return 'institutional'
    elif status in others_values:
        return 'other'
    else:
        return status

train_data['legal_ownership_status'] = train_data['legal_ownership_status'].apply(handle_legal_ownership_status)
test_data['legal_ownership_status'] = test_data['legal_ownership_status'].apply(handle_legal_ownership_status)

### 'type_of_foundation'

In [18]:
make_eda_using_correlation_within_damagegrade(train_data, 'type_of_foundation')

Unnamed: 0,bamboo or timber,clay sand mixed mortar-stone/brick,mud mortar-stone/brick,nan,clay mortar-stone/brick,reinforced concrete,cement-stone or cement-brick,bamboo/timber,rc,others,cement-stone/brick,other
1,30.518,4.734,4.78,10.285,4.642,58.656,29.515,29.598,59.615,29.45,30.289,28.657
2,23.708,8.863,9.008,11.53,8.951,27.207,22.609,23.014,26.931,9.27,22.174,9.151
3,22.412,17.913,17.855,17.697,18.033,10.173,18.498,23.202,9.816,10.911,18.264,10.295
4,14.224,26.669,26.767,24.187,26.871,2.199,14.026,14.952,2.193,15.176,13.825,15.051
5,9.138,41.82,41.589,36.301,41.503,1.765,15.352,9.234,1.445,35.193,15.448,36.845


In [19]:
bambo_or_timber = ['bamboo or timber','bamboo/timber']
mixed =  ['clay sand mixed mortar-stone/brick', 'mud mortar-stone/brick','clay mortar-stone/brick']
other = ['others','other']
rc = ['rc', 'reinforced concrete']
cement_stone_or_cement_brick = ['cement-stone or cement-brick','cement-stone/brick']

def handle_type_of_foundation(foundation):
    if foundation in bambo_or_timber:
        return 'bamboo_or_timber'
    elif foundation in other:
        return 'other'
    elif foundation in mixed:
        return 'mixed'
    elif foundation in rc:
        return 'rc'
    elif foundation in cement_stone_or_cement_brick:
        return 'cement_stone_or_cement_brick'
    else:
        return foundation

train_data['type_of_foundation'] = train_data['type_of_foundation'].apply(handle_type_of_foundation)
test_data['type_of_foundation'] = test_data['type_of_foundation'].apply(handle_type_of_foundation)

### 'type_of_roof'

In [20]:
make_eda_using_correlation_within_damagegrade(train_data, 'type_of_roof')

Unnamed: 0,bamboo/timber light roof,wood light roof or bamboo heavy roof,nan,wood light roof or bamboo light roof,bamboo or timber light roof,bamboo/timber-light roof,reinforced brick slab/rcc/rbc,bamboo/timber heavy roof,reinforced cement concrete/rb/rbc,bamboo or timber heavy roof,bamboo/timber-heavy roof,rcc/rb/rbc,reinforced brick concrete/rcc/rbc
1,8.167,6.383,10.27,8.303,7.985,8.144,55.065,6.186,54.697,6.62,6.201,55.446,52.965
2,10.355,10.416,11.564,10.384,10.994,10.022,28.363,10.064,27.494,9.175,10.888,27.048,29.167
3,17.875,19.377,17.835,17.595,17.683,18.3,10.94,20.01,11.866,19.338,19.124,12.215,11.699
4,25.924,24.257,24.139,25.893,25.663,25.758,3.606,24.149,3.558,24.448,23.598,3.012,3.686
5,37.678,39.568,36.192,37.824,37.674,37.776,2.026,39.591,2.385,40.418,40.189,2.28,2.484


In [21]:
bambo_or_timber_light = ['bamboo/timber light roof','bamboo or timber light roof',
                         'bamboo/timber-light roof','wood light roof or bamboo light roof']
bambo_or_timber_heavy = ['bamboo/timber heavy roof','bamboo or timber heavy roof','bamboo/timber-heavy roof',
                         'wood light roof or bamboo heavy roof']
rcc_rb_rbc = ['reinforced cement concrete/rb/rbc','rcc/rb/rbc', 'rcc_rbc', 
              'reinforced brick slab/rcc/rbc', 'reinforced brick concrete/rcc/rbc']

def handle_type_of_roof(roof):
    if roof in bambo_or_timber_light:
        return 'bamboo_or_timber_light'
    elif roof in bambo_or_timber_heavy:
        return 'bamboo_or_timber_heavy'
    elif roof in rcc_rb_rbc:
        return 'rcc_rb_rbc'
    else:
        return roof
    
train_data['type_of_roof'] = train_data['type_of_roof'].apply(handle_type_of_roof)
test_data['type_of_roof'] = test_data['type_of_roof'].apply(handle_type_of_roof)

### 'type_of_ground_floor'

In [22]:
make_eda_using_correlation_within_damagegrade(train_data, 'type_of_ground_floor')

Unnamed: 0,clay,nan,mud,brick or stone,reinforced concrete,rc,"soil, water, loam mixed",brick/stone,other,lumber,timber,wood
1,6.191,10.249,6.24,9.091,46.509,45.843,6.396,8.514,26.562,18.933,19.209,19.101
2,9.551,11.581,9.579,10.304,27.141,27.912,10.441,9.85,17.708,18.0,14.689,15.73
3,18.554,17.804,18.23,16.345,14.89,14.634,17.861,16.828,23.785,15.467,16.949,12.36
4,26.141,24.128,26.166,24.629,6.644,6.803,26.207,25.008,17.882,20.667,18.079,19.101
5,39.563,36.238,39.786,39.631,4.815,4.808,39.095,39.8,14.062,26.933,31.073,33.708


In [23]:
brick_or_stone = ['brick/stone','brick or stone']
rc = ['rc','reinforced cement', 'reinforced concrete']
wood = ['timber','lumber']
clay_mud = ['clay', 'mud', 'soil, water, loam mixed']

def handle_type_of_ground_floor(floor):
    if floor in brick_or_stone:
        return 'brick_or_stone'
    elif floor in rc:
        return 'rc'
    elif floor in wood:
        return 'wood'
    elif floor in clay_mud:
        return 'clay_mud'
    else:
        return floor
    
train_data['type_of_ground_floor'] = train_data['type_of_ground_floor'].apply(handle_type_of_ground_floor)
test_data['type_of_ground_floor'] = test_data['type_of_ground_floor'].apply(handle_type_of_ground_floor)

### 'type_of_other_floor'

In [24]:
make_eda_using_correlation_within_damagegrade(train_data, 'type_of_other_floor')

Unnamed: 0,timber/bamboo-mud,wood-mud or bamboo mud,timber mud or bamboo-mud,nan,not applicable,wood or bamboo mud,rcc/rb/rbc,wood-plank,timber-planck,lumber-plank,reinforced brick concrete/rcc/rbc,reinforced cement concrete/rb/rbc
1,4.108,4.325,4.06,10.223,25.901,4.067,51.432,9.321,9.331,9.349,53.425,50.502
2,8.917,8.957,8.84,11.566,17.719,9.515,27.507,11.296,11.165,10.642,27.397,27.648
3,19.153,19.047,19.006,17.811,15.376,19.049,12.959,16.648,17.029,17.024,12.877,12.709
4,28.729,28.981,28.794,24.157,12.919,29.496,4.588,21.742,21.068,22.006,3.699,5.351
5,39.093,38.69,39.301,36.243,28.086,37.873,3.514,40.994,41.408,40.979,2.603,3.79


In [25]:
wood_or_bambo_mud = ['wood-mud or bamboo mud','wood or bamboo mud','timber/bamboo-mud','timber mud or bamboo-mud']
rcc_rb_rbc = ['rcc/rb/rbc', 'reinforced cement concrete/rb/rbc', 'reinforced brick concrete/rcc/rbc']
wood_plank = ['wood-plank','timber-planck','lumber-plank']

def handle_type_of_other_floor(floor):
    if floor in wood_or_bambo_mud:
        return 'wood_or_bambo_mud'
    elif floor in rcc_rb_rbc:
        return 'rcc_rb_rbc'
    elif floor in wood_plank:
        return 'wood_plank'
    else:
        return floor

train_data['type_of_other_floor'] = train_data['type_of_other_floor'].apply(handle_type_of_other_floor)
test_data['type_of_other_floor'] = test_data['type_of_other_floor'].apply(handle_type_of_other_floor)

### nan -> np.nan

In [26]:
# Change 'nan' to np.nan
train_data = train_data.replace('nan', np.nan)
test_data = test_data.replace('nan', np.nan)

### 'plinth_area(ft^2)'

In [27]:
train_data[train_data['plinth_area(ft^2)'] == 'More'].shape

(6242, 24)

In [28]:
train_data['ismorethanplintharea'] = train_data['plinth_area(ft^2)'].apply(lambda x: 1 if x == 'More' else 0)
test_data['ismorethanplintharea'] = test_data['plinth_area(ft^2)'].apply(lambda x: 1 if x == 'More' else 0)

In [29]:
# Change 'more' to 0
train_data['plinth_area(ft^2)'] = train_data['plinth_area(ft^2)'].apply(lambda x: -1 if x == 'More' else float(x))
test_data['plinth_area(ft^2)'] = test_data['plinth_area(ft^2)'].apply(lambda x: -1 if x == 'More' else float(x))

### 'no_family_residing'

In [30]:
def handle_no_family_residing(no_family):
    if no_family == 'None':
        return 0
    elif no_family == np.nan:
        return np.nan
    else:
        return float(no_family)

train_data['no_family_residing'] = train_data['no_family_residing'].apply(handle_no_family_residing)
test_data['no_family_residing'] = test_data['no_family_residing'].apply(handle_no_family_residing)

### 'public_place_type'

In [31]:
make_eda_using_correlation_within_damagegrade(train_data, 'public_place_type')

Unnamed: 0,non-public,religious sites,other institutional building,drugstore,health clinics,shopping sites,hospital,high school,university,middle school,primary school,recreational park,museum
1,10.263,50.588,48.936,23.529,19.643,49.804,40.373,39.362,39.175,39.535,36.0,25.0,57.447
2,11.465,16.471,23.404,25.21,25.0,23.137,24.845,19.149,34.021,13.953,21.6,43.75,10.638
3,17.884,17.059,18.085,15.966,23.214,10.98,16.77,15.957,11.34,19.767,18.4,18.75,10.638
4,24.144,9.412,5.319,10.924,16.071,11.765,11.801,3.191,11.34,15.116,12.0,6.25,14.894
5,36.245,6.471,4.255,24.37,16.071,4.314,6.211,22.34,4.124,11.628,12.0,6.25,6.383


In [32]:
public_places = ['shopping sites','recreational park','museum']
education_places = ['high school','university','middle school','primary school']
health_places = ['drugstore','health clinics','hospital']

def handle_public_place_type(public_place):
    if public_place in public_places:
        return 'public_places'
    elif public_place in education_places:
        return 'education_places'
    elif public_place in health_places:
        return 'health_places'
    else:
        return public_place

train_data['public_place_type'] = train_data['public_place_type'].apply(handle_public_place_type)
test_data['public_place_type'] = test_data['public_place_type'].apply(handle_public_place_type)

### 'position'

In [33]:
make_eda_using_correlation_within_damagegrade(train_data, 'position')

Unnamed: 0,not attached,NaN,attached-1 side,attached-2 side,attached-3 side
1,10.36,,8.478,19.366,27.337
2,11.606,,9.65,15.36,18.272
3,18.344,,16.242,17.329,16.147
4,24.121,,24.848,20.193,23.229
5,35.569,,40.782,27.753,15.014


In [34]:
def handle_position(position):
    if position == 'not attached':
        return 0
    elif position == 'attached-1 side':
        return 1
    elif position == 'attached-2 side':
        return 2
    elif position == 'attached-3 side':
        return 3
    else:
        return position

train_data['position'] = train_data['position'].apply(handle_position)
test_data['position'] = test_data['position'].apply(handle_position)

### 'flexible_superstructure'

In [35]:
def handle_flexible_superstructure(flexible):
    if flexible == 'unavailable':
        return 0
    elif flexible == 'available':
        return 1
    else:
        return flexible

train_data['flexible_superstructure'] = train_data['flexible_superstructure'].apply(handle_flexible_superstructure)
test_data['flexible_superstructure'] = test_data['flexible_superstructure'].apply(handle_flexible_superstructure)

### 'type_of_reinforcement_concrete'

In [36]:
make_eda_using_correlation_within_damagegrade(train_data, 'type_of_reinforcement_concrete')

Unnamed: 0,0.0,NaN,2.0,1.0,3.0
1,8.224,,67.879,38.48,71.429
2,10.61,,22.935,26.133,18.719
3,18.218,,7.096,15.558,4.926
4,25.125,,1.148,8.14,2.956
5,37.823,,0.942,11.69,1.97


In [37]:
# 0	No reinforcement concrete
# 1	Has non-engineered reinforcement concrete
# 2	Has engineered reinforcement concrete
# 3	Has both

def make_int(x):
    if x == float(0):
        return "no_rc"
    elif x == float(1):
        return "non_engineered_rc"
    elif x == float(2):
        return "engineered_rc"
    elif x == float(3):
        return "both"
    else:
        return np.nan

train_data['type_of_reinforcement_concrete'] = train_data['type_of_reinforcement_concrete'].apply(make_int)
test_data['type_of_reinforcement_concrete'] = test_data['type_of_reinforcement_concrete'].apply(make_int)

### 'wall_binding'

In [38]:
make_eda_using_correlation_within_damagegrade(train_data, 'wall_binding')

Unnamed: 0,0.0,5.0,NaN,2.0,1.0,7.0,3.0
1,41.64,3.742,,44.035,7.269,10.172,16.216
2,20.616,8.58,,30.306,13.747,18.378,28.716
3,15.514,17.989,,15.832,27.994,28.784,28.716
4,9.969,27.248,,5.621,28.612,26.942,16.892
5,12.261,42.441,,4.206,22.379,15.725,9.459


In [39]:
def make_int(val):
    if val == float(0):
        return "other"
    
    elif val == float(1):
        return "clay"
    
    elif val == float(2):
        return "mortar_cement"

    elif val == float(3):
        return "mortar_cement_clay"
    
    elif val == float(5):
        return "mud_mortar_clay"
    
    elif val == float(7):
        return "mud_mortar_clay_cement_mortar"
    
    else:
        return np.nan

train_data['wall_binding'] = train_data['wall_binding'].apply(make_int)
test_data['wall_binding'] = test_data['wall_binding'].apply(make_int)

### wall_material

In [40]:
make_eda_using_correlation_within_damagegrade(train_data, 'wall_material')

Unnamed: 0,0.0,2.0,NaN,1.0,3.0
1,40.844,3.813,,38.423,15.055
2,21.161,8.487,,28.842,17.038
3,17.646,17.89,,18.325,24.522
4,11.33,27.009,,9.936,22.539
5,9.019,42.8,,4.475,20.845


In [41]:
# 0	Unknown/not stated
# 1	Red Bricks
# 2	Stone Bricks
# 3	Red Bricks, Stone Bricks

def make_int(val):
    if val == float(0):
        return "unknown"
    
    elif val == float(1):
        return "red_bricks"
    
    elif val == float(2):
        return "stone_bricks"

    elif val == float(3):
        return "red_stone_bricks"
    
    else:
        return np.nan

train_data['wall_material'] = train_data['wall_material'].apply(make_int)
test_data['wall_material'] = test_data['wall_material'].apply(make_int)

In [42]:
# Drop duplicated rows
print("---------before drop duplicated rows---------")
print("Shape of train data: ", train_data.shape)
print(f"Duplicated rows: {train_data.duplicated().sum()}")
train_data = train_data.drop_duplicates()

print("---------after drop duplicated rows---------")
print(f"Dropped duplicated rows: {train_data.duplicated().sum()}")
print("Shape of train data: ", train_data.shape)

---------before drop duplicated rows---------
Shape of train data:  (722815, 25)
Duplicated rows: 373833
---------after drop duplicated rows---------
Dropped duplicated rows: 0
Shape of train data:  (348982, 25)


In [43]:
# damage_grade change into int

train_data['damage_grade'] = train_data['damage_grade'].astype(int)
print(train_data['damage_grade'].unique())

[1 5 4 2 3]


## Export the cleaned data to csv file

In [44]:
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)