In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Remove pandas' display limits: None means no cap on cell width, on the
# number of columns, or on the number of rows shown when a frame is rendered.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Reminder taken from pandas' mixed-dtype warning for read_csv:
# specify the dtype option on import or set low_memory=False.
# The next line switches off the chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'

import math
import scipy

# Data visualization
import matplotlib.pyplot as plt
plt.style.use('bmh')
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px

In [32]:
# Load the competition data. The Kaggle input directory is tried first; when
# those files do not exist (local development) the local paths are used.
# NOTE(review): the local training file has a different name/location than the
# Kaggle one ("train_data.csv" vs "train.csv") - confirm both hold the same data.
try:
    train_data = pd.read_csv("/kaggle/input/penyisihan-joints-data-competition-2023/train.csv", low_memory=False)
    test_data = pd.read_csv("/kaggle/input/penyisihan-joints-data-competition-2023/test.csv", low_memory=False) # For Kaggle

except FileNotFoundError:
    # Only a missing Kaggle path triggers the fallback. Any other failure
    # (malformed CSV, interrupted kernel, ...) now surfaces instead of being
    # hidden behind a silent switch to the local files.
    train_data = pd.read_csv("../1-Preprocessing/train_data.csv", low_memory=False)
    test_data = pd.read_csv("../datasets/test.csv", low_memory=False) # For local development

# Missing Values

In [33]:
# (rows, columns) of the training frame before any rows are dropped.
print("Shape of train data: ", train_data.shape)

Shape of train data:  (349002, 25)


In [34]:
# Discard rows in which more than 40% of the columns are empty, i.e. keep a
# row only if at least 60% of its columns hold a value.
train_data = train_data.dropna(
    axis=0,
    thresh=math.ceil(len(train_data.columns) * 0.6),
)

In [35]:
# (rows, columns) of the training frame after the sparse rows were dropped.
print("Shape of train data: ", train_data.shape)

Shape of train data:  (337593, 25)


In [36]:
# Per-column count of missing values and its share of all rows (in percent).
print("Missing values in each column: \n\n")
df = train_data.isnull().sum().to_frame(name='Missing values')
df['percentage'] = df['Missing values'].div(len(train_data)).mul(100).round(2)
df

Missing values in each column: 




Unnamed: 0,Missing values,percentage
floors_before_eq(total),11033,3.27
old_building,0,0.0
plinth_area(ft^2),48202,14.28
height_before_eq(ft),11033,3.27
land_surface_condition,2135,0.63
type_of_foundation,0,0.0
type_of_roof,48202,14.28
type_of_ground_floor,11033,3.27
type_of_other_floor,2135,0.63
position,5145,1.52


### 'technical_solution_proposed'

In [37]:
# Handle 'technical_solution_proposed' column
def reconstruction(x):
    """Return 1 if ``x`` equals the label 'reconstruction', else 0."""
    return 1 if x == 'reconstruction' else 0
def major_repair(x):
    """Return 1 if ``x`` equals the label 'major repair', else 0."""
    return 1 if x == 'major repair' else 0
def minor_repair(x):
    """Return 1 if ``x`` equals the label 'minor repair', else 0."""
    return 1 if x == 'minor repair' else 0
def no_need(x):
    """Return 1 if ``x`` equals the label 'no need', else 0."""
    return 1 if x == 'no need' else 0

# Build one 0/1 indicator column per proposed technical solution, then remove
# the original text column.
solution_flags = {
    'reconstruction': reconstruction,
    'major_repair': major_repair,
    'minor_repair': minor_repair,
    'no_need': no_need,
}
for flag_col, flag_fn in solution_flags.items():
    train_data[flag_col] = train_data['technical_solution_proposed'].apply(flag_fn)

train_data = train_data.drop(columns=['technical_solution_proposed'])

In [38]:
# Summary statistics for every column, one row per column.
train_data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
floors_before_eq(total),326560.0,,,,2.083188,0.665017,1.0,2.0,2.0,2.0,9.0
old_building,337593.0,,,,24.449023,65.457712,0.0,8.0,16.0,27.0,999.0
plinth_area(ft^2),289391.0,,,,379.757798,172.67742,0.0,269.0,350.0,458.0,1000.0
height_before_eq(ft),326560.0,,,,16.085632,5.624452,6.0,12.0,16.0,18.0,99.0
land_surface_condition,335458.0,3.0,flat,275245.0,,,,,,,
type_of_foundation,337593.0,5.0,mixed,273380.0,,,,,,,
type_of_roof,289391.0,3.0,bamboo_or_timber_light,190650.0,,,,,,,
type_of_ground_floor,326560.0,5.0,clay_mud,261403.0,,,,,,,
type_of_other_floor,335458.0,4.0,wood_or_bambo_mud,208058.0,,,,,,,
position,332448.0,,,,0.259544,0.529636,0.0,0.0,0.0,0.0,3.0


In [39]:
# Separate copy for exploring missing values, so changes made to it do not
# affect train_data.
eda_for_missing_val = train_data.copy()

from sklearn.preprocessing import LabelEncoder
def encode_data(data):
    """Label-encode every object-dtype column of ``data``.

    Each object column is cast to ``str`` and replaced by the integer codes
    produced by a fresh ``LabelEncoder``. The frame is modified in place and
    the same object is returned.

    NOTE(review): because of the ``astype(str)`` cast, missing values appear
    to be encoded as a class of their own rather than staying missing -
    confirm this is intended.
    """
    object_columns = [name for name in data.columns if data[name].dtype == 'object']
    for name in object_columns:
        encoder = LabelEncoder()
        as_text = data[name].astype(str)
        data[name] = encoder.fit_transform(as_text)
    return data

# encode_data modifies the frame it is given and returns that same object, so
# only the EDA copy is encoded here.
eda_for_missing_val = encode_data(eda_for_missing_val)

In [40]:
# Descriptive statistics per column, extended with each column's first mode
# and its median.
cols = eda_for_missing_val.describe(include='all').T.assign(
    mode=eda_for_missing_val.mode().iloc[0],
    median=eda_for_missing_val.median(),
)

cols

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,mode,median
floors_before_eq(total),326560.0,2.083188,0.665017,1.0,2.0,2.0,2.0,9.0,2.0,2.0
old_building,337593.0,24.449023,65.457712,0.0,8.0,16.0,27.0,999.0,15.0,16.0
plinth_area(ft^2),289391.0,379.757798,172.67742,0.0,269.0,350.0,458.0,1000.0,300.0,350.0
height_before_eq(ft),326560.0,16.085632,5.624452,6.0,12.0,16.0,18.0,99.0,18.0,16.0
land_surface_condition,337593.0,0.260838,0.644158,0.0,0.0,0.0,0.0,3.0,0.0,0.0
type_of_foundation,337593.0,1.878164,0.747457,0.0,2.0,2.0,2.0,4.0,2.0,2.0
type_of_roof,337593.0,1.004831,0.767972,0.0,1.0,1.0,1.0,3.0,1.0,1.0
type_of_ground_floor,337593.0,1.263225,1.011238,0.0,1.0,1.0,1.0,5.0,1.0,1.0
type_of_other_floor,337593.0,2.776518,0.934797,0.0,3.0,3.0,3.0,4.0,3.0,3.0
position,332448.0,0.259544,0.529636,0.0,0.0,0.0,0.0,3.0,0.0,0.0


## Fill missing values manually with the group-wise mode

In [41]:
# # Fill missing values with mode

# train_data['residential_type'].\
#     fillna(train_data.groupby(['legal_ownership_status', 'no_family_residing', 
#                                'public_place_type', 'plinth_area(ft^2)', 'floors_before_eq(total)'])\
#                                 ['residential_type'].agg(lambda x : pd.Series.mode(x)[0]).reset_index()['residential_type'], inplace=True)

In [47]:
def fill_na_using_mode_categorical(df, col, groupby_cols=None):
    """Fill missing values of ``col`` with the mode of each row's own group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. ``col`` is overwritten on this object.
    col : str
        Name of the column whose missing values are filled.
    groupby_cols : str or list of str, optional
        Column(s) that define the groups. When omitted or empty, the mode of
        the whole column is used for every missing value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    When a group has several modes the first one returned by ``Series.mode``
    is used. Rows whose group contains no value at all stay missing.
    """
    def _first_mode(values):
        # Series.mode() ignores NaN, so it is empty for an all-missing group.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    if groupby_cols is None:
        group_keys = []
    elif isinstance(groupby_cols, str):
        group_keys = [groupby_cols]
    else:
        group_keys = list(groupby_cols)

    if group_keys:
        # transform() returns a Series on the original row index, so fillna
        # pairs every row with the statistic of its own group. The previous
        # agg(...).reset_index() produced a 0..n_groups-1 index that fillna
        # aligned against unrelated row labels.
        fill_values = df.groupby(group_keys)[col].transform(_first_mode)
    else:
        fill_values = _first_mode(df[col])

    # Assign the result back: fillna(inplace=True) on df[col] may act on a
    # temporary object and leave df unchanged.
    df[col] = df[col].fillna(fill_values)
    return df

def fill_na_using_mean_categorical(df, col, groupby_cols=None):
    """Fill missing values of ``col`` with the mean of each row's own group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. ``col`` is overwritten on this object.
    col : str
        Name of the numeric column whose missing values are filled.
    groupby_cols : str or list of str, optional
        Column(s) that define the groups. When omitted or empty, the mean of
        the whole column is used for every missing value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Rows whose group contains no value at all stay missing.
    """
    if groupby_cols is None:
        group_keys = []
    elif isinstance(groupby_cols, str):
        group_keys = [groupby_cols]
    else:
        group_keys = list(groupby_cols)

    if group_keys:
        # transform() keeps the original row index, so fillna matches each row
        # with its own group's mean. The previous agg(...).reset_index()
        # result was indexed 0..n_groups-1 and aligned to unrelated rows.
        fill_values = df.groupby(group_keys)[col].transform('mean')
    else:
        fill_values = df[col].mean()

    # Assign the result back: fillna(inplace=True) on df[col] may act on a
    # temporary object and leave df unchanged.
    df[col] = df[col].fillna(fill_values)
    return df

def fill_na_using_median_categorical(df, col, groupby_cols=None):
    """Fill missing values of ``col`` with the median of each row's own group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. ``col`` is overwritten on this object.
    col : str
        Name of the numeric column whose missing values are filled.
    groupby_cols : str or list of str, optional
        Column(s) that define the groups. When omitted or empty, the median
        of the whole column is used for every missing value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Rows whose group contains no value at all stay missing.
    """
    if groupby_cols is None:
        group_keys = []
    elif isinstance(groupby_cols, str):
        group_keys = [groupby_cols]
    else:
        group_keys = list(groupby_cols)

    if group_keys:
        # transform() keeps the original row index, so fillna matches each row
        # with its own group's median. The previous agg(...).reset_index()
        # result was indexed 0..n_groups-1 and aligned to unrelated rows.
        fill_values = df.groupby(group_keys)[col].transform('median')
    else:
        fill_values = df[col].median()

    # Assign the result back: fillna(inplace=True) on df[col] may act on a
    # temporary object and leave df unchanged.
    df[col] = df[col].fillna(fill_values)
    return df

In [49]:
# Impute 'residential_type' with the mode of rows sharing the same ownership
# status, family count, public place type, plinth area and floor count.
# The result is bound to train_data so the returned frame is not echoed as the
# cell output: with display.max_rows set to None every row would be rendered.
train_data = fill_na_using_mode_categorical(
    train_data,
    'residential_type',
    ['legal_ownership_status', 'no_family_residing', 'public_place_type',
     'plinth_area(ft^2)', 'floors_before_eq(total)'],
)

In [None]:
# Per-column count of missing values and its share of all rows (in percent),
# recomputed on the current state of train_data.
print("Missing values in each column: \n\n")
df = train_data.isnull().sum().to_frame(name='Missing values')
df['percentage'] = df['Missing values'].div(len(train_data)).mul(100).round(2)
df

Missing values in each column: 




Unnamed: 0,Missing values,percentage
floors_before_eq(total),19437,6.4
old_building,0,0.0
plinth_area(ft^2),55259,18.19
height_before_eq(ft),19437,6.4
land_surface_condition,11147,3.67
type_of_foundation,0,0.0
type_of_roof,55259,18.19
type_of_ground_floor,19437,6.4
type_of_other_floor,11147,3.67
position,13984,4.6
