In [None]:
pip install 

In [5]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from scipy.stats import chi2_contingency
from sklearn.feature_selection import VarianceThreshold

# models
import optuna
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# metric used for the competition
from sklearn.metrics import f1_score

ModuleNotFoundError: No module named 'matplotlib'

In [31]:
path = "..\\Datasets\\train.csv.zip"
data = pd.read_csv(path,compression="zip", index_col = "Unnamed: 0")
data = data.sort_index()
data = data.reset_index(drop=True)

test_path = "..\\Datasets\\test.csv.zip"
test = pd.read_csv(test_path,compression="zip", index_col='id')
test = test.sort_index()
test = test.reset_index(drop=True)

data.head(2)

  data = pd.read_csv(path,compression="zip", index_col = "Unnamed: 0")


Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,...,type_of_reinforcement_concrete,residential_type,no_family_residing,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material,damage_grade
0,floor two,1.0,256 ft^2,22.0,Flat,Bamboo or Timber,Bamboo/Timber Light roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,0.0,0.0,1.0
1,Floor 3,3.0,985 ft^2,18.0,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light Roof or Bamboo Heavy Roof,Clay,TImber/Bamboo-Mud,Not attached,...,0.0,Non-residential,1.0,Non-public,Non-industrial,Non-govermental,unavailable,5.0,2.0,5.0


In [32]:
data.isna().sum()

floors_before_eq (total)          332806
old_building                      239204
plinth_area (ft^2)                421208
height_before_eq (ft)             332806
land_surface_condition            301606
type_of_foundation                239204
type_of_roof                      421208
type_of_ground_floor              332806
type_of_other_floor               301606
position                          312006
building_plan_configuration       301606
technical_solution_proposed       676014
legal_ownership_status            124802
has_secondary_use                 197604
type_of_reinforcement_concrete    291206
residential_type                  270404
no_family_residing                145602
public_place_type                      0
industrial_use_type               114402
govermental_use_type              249604
flexible_superstructure            62400
wall_binding                       62400
wall_material                     228804
damage_grade                           0
dtype: int64

In [33]:
# Drop rows with NaN values in wall binding, industrial type, and legal ownership column
data.dropna(subset=['wall_binding'], inplace=True)
data.dropna(subset=['industrial_use_type'], inplace=True)
data.dropna(subset=['legal_ownership_status'], inplace=True)
data.dropna(subset=['has_secondary_use'], inplace=True)
data.dropna(subset=['wall_material'], inplace=True)
data.dropna(subset=['old_building'], inplace=True)
data.dropna(subset=['govermental_use_type'], inplace=True)

In [34]:
data.isna().sum()

floors_before_eq (total)           83202
old_building                           0
plinth_area (ft^2)                171604
height_before_eq (ft)              83202
land_surface_condition             52002
type_of_foundation                     0
type_of_roof                      171604
type_of_ground_floor               83202
type_of_other_floor                52002
position                           62402
building_plan_configuration        52002
technical_solution_proposed       426410
legal_ownership_status                 0
has_secondary_use                      0
type_of_reinforcement_concrete     41602
residential_type                   20800
no_family_residing                     0
public_place_type                      0
industrial_use_type                    0
govermental_use_type                   0
flexible_superstructure                0
wall_binding                           0
wall_material                          0
damage_grade                           0
dtype: int64

In [35]:
data.shape

(473211, 24)

In [36]:
def preprocessor_for_eda(df) :

    # Preprocessing Kolom public_place_type
    df['public_place_type'] = df['public_place_type'].apply(lambda x : 'Public' if x != 'Non-public' else x)

    # Preprocessing Kolom floors_before_eq
    def transform_floors_before_eq(value):
        if isinstance(value, str):
            value = value.lower()
            if any(x in value for x in ['one','first','1']) :
                return 1
            elif any(x in value for x in ['two','second','2']) :
                return 2
            elif any(x in value for x in ['three','third','3','3.00']) :
                return 3
            elif any (x in value for x in ['four','fourth','4']) :
                return 4
            elif any (x in value for x in ['five','fifth','5']) :
                return 5
            else :
                return value
        else :
            return value
    
    df['floors_before_eq (total)'] = df['floors_before_eq (total)'].apply(lambda x : transform_floors_before_eq(x))
    # Convert column to numeric and replace non-numeric values with NaN
    df['floors_before_eq (total)'] = pd.to_numeric(df['floors_before_eq (total)'], errors='coerce')

    # Convert column to 'Int64' to allow for null values
    df['floors_before_eq (total)'] = df['floors_before_eq (total)'].astype('Int64')
    
    # Preprocessing Kolom type_of_foundation
    df['type_of_foundation'] = df['type_of_foundation'].replace({
    'Cement-Stone or Cement-Brick' : 'Cement-Stone/Brick',
    'Bamboo or Timber' : 'Bamboo/Timber',
    'RC' : 'Reinforced Concrete',
    'Bamboo/TImber' : 'Bamboo/Timber',
    'Others' : 'Other'
    })
    
    # Preprocessing Kolom type_of_foundation
    df['type_of_roof'] = df['type_of_roof'].replace({
    'Bamboo or Timber-Light roof' : 'Bamboo/Timber Light roof',
    'Bamboo or Timber Light roof' : 'Bamboo/Timber Light roof',
    'Bamboo or Timber Heavy roof' : 'Bamboo/Timber Heavy roof',
    'Wood Light Roof or Bamboo Light Roof' : 'Wood Light roof/Bamboo Light roof',
    'Wood Light Roof or Bamboo Heavy Roof' : 'Wood Light roof/Bamboo Heavy roof',
    'Bamboo/TImber-Light Roof' : 'Bamboo/Timber Light roof',
    'reinforced cement concrete/rb/rbc' : 'rcc/rb/rbc',
    'Reinforced Brick Slab/rcc/rbc' : 'rbs/rcc/rbc',
    'Bamboo/TImber-Heavy Roof' : 'Bamboo/Timber Heavy roof',
    'Bamboo or Timber Heavy Roof' : 'Bamboo/Timber Heavy roof',
    'Reinforced brick concrete/rcc/rbc' : 'rb/rbc'
    })

    # Preprocessing Kolom type_of_ground_floor
    df['type_of_ground_floor'] = df['type_of_ground_floor'].replace({
    'mud' : 'Mud',
    'Brick or Stone' : 'Brick/Stone',
    'reinforced concrete' : 'Reinforced Concrete',
    'RC' : 'Reinforced Concrete',
    'brick/stone' : 'Brick/Stone',
    'TImber' : 'Timber'
    })

    # Preprocessing Kolom type_of_other_floor
    df['type_of_other_floor'] = df['type_of_other_floor'].replace({
    'Timber-Planck' : 'Timber-Plank',
    'TImber/Bamboo-Mud' : 'Timber/Bamboo-Mud',
    'Timber Mud or Bamboo-Mud' : 'Timber Mud/Bamboo-Mud',
    'Wood or Bamboo Mud' : 'Wood/Bamboo-Mud',
    'Wood-Mud or Bamboo Mud' : 'Wood-Mud/Bamboo Mud',
    'Reinforced brick concrete/rcc/rbc' : 'rcc/rbc',
    'reinforced cement concrete/rb/rbc' : 'rcc/rb/rbc'
    })

    # Preprocessing Kolom legal_ownership_status
    def transform_use(x) :
        if x == 'Private Use' or x == 'Private' or x == 'Prvt' or x == 'Privste' :
            return 'private'
        elif x == 'Public' or x =='Public Space' or x == 'Public Use' :
            return 'public'
        elif x == 'Unknown' or x == 'Unspecified' or x == 'Other' :
            return 'other'
        elif x == 'Institutional Use' or x == 'Institutionals' or x == 'Institutional' :
            return 'institutional'
    
    df['legal_ownership_status'] = df['legal_ownership_status'].apply(lambda x : transform_use(x))

    # Preprocessing Kolom plinth_area (ft^2)
    # Define a function to handle splitting the value
    def split_value(x):
        if pd.isna(x):  # Check if value is NaN
            return x  # Return NaN as is
        elif x != "More than 1000 ft^2":  # Check if value is not "More than 1000 ft^2"
            return str(x).split(" ")[0]  # Split and return the first part of the string
        else:
            return str(x).split(" ")[-2]  # Split and return the second-to-last part of the string

    # Apply the function to the column
    df['plinth_area (ft^2)'] = df['plinth_area (ft^2)'].apply(split_value)
    df['plinth_area (ft^2)'] = pd.to_numeric(df['plinth_area (ft^2)'])

    df['height_before_eq (ft)'] = pd.to_numeric(df['height_before_eq (ft)'], errors='coerce')
    df['floors_before_eq (total)'] = pd.to_numeric(df['floors_before_eq (total)'], errors='coerce')

    df['volume'] = df['plinth_area (ft^2)'] * df['height_before_eq (ft)']
    df['heightPerFloor'] = df['height_before_eq (ft)'] / df['floors_before_eq (total)']

    def convert_to_int(x):
        if pd.isna(x):  # Check if value is NaN
            return None  # Return 0 as default value for NaN
        elif x != 'None':  # Check if value is not 'None'
            return int(float(x))  # Convert value to float, then to integer
            
        else:
            return 0  # Return 0 as default value for 'None'

    # Apply the function to the column
    df['no_family_residing'] = df['no_family_residing'].apply(convert_to_int)    
    
    df['has_secondary_use'] = df['has_secondary_use'].apply(str)

    # Replace string 'nan' with np.nan
    df['type_of_reinforcement_concrete'] = df['type_of_reinforcement_concrete'].replace('nan', np.nan)
    df['type_of_reinforcement_concrete'] = df['type_of_reinforcement_concrete'].replace('<NA>', np.nan)

    # Convert column to int64 data type
    df['type_of_reinforcement_concrete'] = df['type_of_reinforcement_concrete'].astype('float').astype('Int64')
    df['type_of_reinforcement_concrete'] = df['type_of_reinforcement_concrete'].apply(str)

    df['wall_material'] = df['wall_material'].astype('Int64')
    df['wall_material'] = df['wall_material'].apply(str)

    df['wall_binding'] = df['wall_binding'].astype('float').astype('Int64')
    df['wall_binding'] = df['wall_binding'].apply(str)
    
    return df

In [37]:
df_after = preprocessor_for_eda(data)

In [38]:
test_after = preprocessor_for_eda(test)

In [39]:
numerical_df = df_after.select_dtypes(include='number').columns  # Get the column names of numerical columns
numerical_test = test_after.select_dtypes(include='number').columns  # Get the column names of numerical columns

categorical_df = df_after.select_dtypes(include='object').columns  # Get the column names of categorical columns
categorical_test = test_after.select_dtypes(include='object').columns  # Get the column names of categorical columns

In [40]:
df_after[numerical_df].head(2)

Unnamed: 0,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),no_family_residing,damage_grade,volume,heightPerFloor
0,2,1.0,256.0,22.0,1,1.0,5632.0,11.0
1,3,3.0,985.0,18.0,1,5.0,17730.0,6.0


In [41]:
df_after[categorical_df].head(2)

Unnamed: 0,land_surface_condition,type_of_foundation,type_of_roof,type_of_ground_floor,type_of_other_floor,position,building_plan_configuration,technical_solution_proposed,legal_ownership_status,has_secondary_use,type_of_reinforcement_concrete,residential_type,public_place_type,industrial_use_type,govermental_use_type,flexible_superstructure,wall_binding,wall_material
0,Flat,Bamboo/Timber,Bamboo/Timber Light roof,Clay,Timber/Bamboo-Mud,Not attached,Rectangular,,private,0.0,0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,0,0
1,Flat,Clay Sand Mixed mortar-Stone/Brick,Wood Light roof/Bamboo Heavy roof,Clay,Timber/Bamboo-Mud,Not attached,Square,,private,0.0,0,Non-residential,Non-public,Non-industrial,Non-govermental,unavailable,5,2


In [42]:
train_data, val_data = train_test_split(df_after, test_size=0.2, random_state=33)
TARGET = 'damage_grade'

In [43]:
for col in train_data.columns[train_data.isna().sum() > 0]:
    if col in categorical_df:
        train_data[col] = train_data[col].fillna(train_data.groupby(['type_of_foundation', 'public_place_type', 'damage_grade'])[col].transform(lambda x: x.mode().values[0] if not x.mode().empty else None).astype(train_data[col].dtype))

for col in val_data.columns[val_data.isna().sum() > 0]:
    if col in categorical_df:
        val_data[col] = val_data[col].fillna(val_data.groupby(['type_of_foundation', 'public_place_type', 'damage_grade'])[col].transform(lambda x: x.mode().values[0] if not x.mode().empty else None).astype(val_data[col].dtype))

In [44]:
def fillna_with_median(x):
    median = x.median()
    if pd.isna(median):
        return None  # or any other appropriate value
    else:
        return median

for col in train_data.columns[train_data.isna().sum() > 0]:
    if col in numerical_df:
        train_data[col] = train_data[col].fillna(train_data.groupby(['type_of_foundation', 'damage_grade'])[col].transform(fillna_with_median))

for col in val_data.columns[val_data.isna().sum() > 0]:
    if col in numerical_df:
        val_data[col] = val_data[col].fillna(val_data.groupby(['type_of_foundation', 'damage_grade'])[col].transform(fillna_with_median))

In [45]:
for col in train_data.columns[train_data.isna().sum() > 0]:
    train_data[col] = train_data[col].fillna(train_data.groupby(['damage_grade'])[col].transform(lambda x: x.mode().values[0] if not x.mode().empty else None).astype(train_data[col].dtype))

for col in val_data.columns[val_data.isna().sum() > 0]:
    val_data[col] = val_data[col].fillna(val_data.groupby(['damage_grade'])[col].transform(lambda x: x.mode().values[0] if not x.mode().empty else None).astype(val_data[col].dtype))

In [46]:
train_data.isna().sum()

floors_before_eq (total)          0
old_building                      0
plinth_area (ft^2)                0
height_before_eq (ft)             0
land_surface_condition            0
type_of_foundation                0
type_of_roof                      0
type_of_ground_floor              0
type_of_other_floor               0
position                          0
building_plan_configuration       0
technical_solution_proposed       0
legal_ownership_status            0
has_secondary_use                 0
type_of_reinforcement_concrete    0
residential_type                  0
no_family_residing                0
public_place_type                 0
industrial_use_type               0
govermental_use_type              0
flexible_superstructure           0
wall_binding                      0
wall_material                     0
damage_grade                      0
volume                            0
heightPerFloor                    0
dtype: int64

In [47]:
val_data.isna().sum()

floors_before_eq (total)          0
old_building                      0
plinth_area (ft^2)                0
height_before_eq (ft)             0
land_surface_condition            0
type_of_foundation                0
type_of_roof                      0
type_of_ground_floor              0
type_of_other_floor               0
position                          0
building_plan_configuration       0
technical_solution_proposed       0
legal_ownership_status            0
has_secondary_use                 0
type_of_reinforcement_concrete    0
residential_type                  0
no_family_residing                0
public_place_type                 0
industrial_use_type               0
govermental_use_type              0
flexible_superstructure           0
wall_binding                      0
wall_material                     0
damage_grade                      0
volume                            0
heightPerFloor                    0
dtype: int64

In [48]:
scaler3 = RobustScaler()
scaler4 = StandardScaler()

no_std = ['no_family_residing', 'floors_before_eq (total)']

train_num = train_data.select_dtypes(include='number').drop(TARGET, axis=1)
train_num = train_num.select_dtypes(include='number').drop(no_std, axis=1)
train_num = pd.DataFrame(scaler4.fit_transform(train_num), columns = train_num.columns)

val_num = val_data.select_dtypes(include='number').drop(TARGET, axis=1)
val_num = val_num.select_dtypes(include='number').drop(no_std, axis=1)
val_num = pd.DataFrame(scaler4.transform(val_num), columns = val_num.columns)

train_cat = train_data.select_dtypes('object')
val_cat = val_data.select_dtypes('object')

encoder = OneHotEncoder(drop='first')

train_concat = pd.concat([train_data[no_std].reset_index(drop=True), train_num.reset_index(drop=True), pd.DataFrame(encoder.fit_transform(train_cat).toarray(), columns=encoder.get_feature_names(train_cat.columns)).reset_index(drop=True), train_data[TARGET].reset_index(drop=True)], axis=1)
val_concat = pd.concat([val_data[no_std].reset_index(drop=True), val_num.reset_index(drop=True), pd.DataFrame(encoder.transform(val_cat).toarray(), columns=encoder.get_feature_names(val_cat.columns)).reset_index(drop=True), val_data[TARGET].reset_index(drop=True)], axis=1)



In [49]:
test_num = test.select_dtypes(include='number')
test_num = test_num.select_dtypes(include='number').drop(no_std, axis=1)
test_num = pd.DataFrame(scaler4.transform(test_num), columns = test_num.columns)

test_cat = test.select_dtypes(include='object')

test_concat = pd.concat([test[no_std],test_num, pd.DataFrame(encoder.transform(test_cat).toarray(), columns=encoder.get_feature_names(test_cat.columns))], axis=1)
test_concat



Unnamed: 0,no_family_residing,floors_before_eq (total),old_building,plinth_area (ft^2),height_before_eq (ft),volume,heightPerFloor,land_surface_condition_Moderate slope,land_surface_condition_Steep slope,type_of_foundation_Cement-Stone/Brick,...,govermental_use_type_Police Offices,flexible_superstructure_unavailable,wall_binding_1,wall_binding_2,wall_binding_3,wall_binding_5,wall_binding_7,wall_material_1,wall_material_2,wall_material_3
0,1,2,-0.267000,0.201956,-0.390914,-0.120140,-0.428674,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,3,-0.174639,0.059115,0.997900,0.538343,-0.428674,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1,3,0.240984,0.085086,0.402694,0.240102,-0.953607,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1,2,0.010082,-0.057754,0.799498,0.336309,1.146126,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,2,-0.297787,-0.077233,0.799498,0.320274,1.146126,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242077,1,2,-0.143852,-0.953754,-0.192512,-0.721969,-0.166207,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
242078,1,3,-0.313181,-1.382276,0.402694,-0.847038,-0.953607,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
242079,1,2,-0.159246,-0.077233,-0.390914,-0.281020,-0.428674,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
242080,1,2,-0.236213,0.643463,0.005890,0.394033,0.096260,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [50]:
X_train = train_concat.drop(TARGET, axis=1)
X_test = val_concat.drop(TARGET, axis=1)
y_train = train_concat[TARGET]
y_test = val_concat[TARGET]

In [51]:
X_test.isna().sum().sum()

0

In [52]:
# def objective(trial):
#     n_estimators = trial.suggest_int('n_estimators', 120, 130)
#     max_depth = int(trial.suggest_loguniform('max_depth', 90, 110))
#     clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
#     return cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3, scoring='f1_macro').mean()

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

In [53]:
model = RandomForestClassifier(n_estimators=128, max_depth=97)
model.fit(X_train, y_train)
result = model.predict(X_test)
score = f1_score(y_test, result, average='macro')
score

0.8680811074688229

In [57]:
test_result = model.predict(test_concat)

final_result = pd.DataFrame(test_result)

final_result = final_result.rename(columns={0 : 'damage_grade'})
final_result.insert(0, 'id', final_result.index)
final_result = final_result.set_index('id')
final_result['damage_grade'] = final_result['damage_grade'].astype('int')

In [58]:
final_result.head(3)

Unnamed: 0_level_0,damage_grade
id,Unnamed: 1_level_1
0,4
1,5
2,5


In [59]:
final_result.to_csv('percobaan_agak_hopeless_2.csv')