In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

smote = SMOTE()
near_miss = NearMiss()

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
kfold = KFold(n_splits=3)
stratifier_kfold = StratifiedKFold(n_splits=3)

In [6]:
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression()

In [7]:
from sklearn.svm import SVC
support_vector_clf = SVC()

In [19]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()

In [9]:
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier(n_estimators=50)

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
gradient_boost = GradientBoostingClassifier()

In [11]:
data = pd.read_csv("train_values.csv").drop(columns=["building_id"])
label = pd.read_csv("train_labels.csv")["damage_grade"]

In [12]:
data.dropna(inplace=True)

In [39]:
def label_encoder(data):
    for feature in data.columns:
        if data[feature].dtype == np.object:
            data[feature] = encoder.fit_transform(data[feature])
    return data

In [51]:
features = ["material_used"]
score = cross_val_score(random_forest, data[features], label, cv=3)

In [52]:
np.array(score).mean()

0.5752894271318989

In [26]:
np.array(score).mean()

0.7217547131438482

In [21]:
np.array(score).mean()

0.7136964171281001

In [48]:
data.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
     

In [59]:
features = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type']

data["area_height_percentage"] = data["area_percentage"] + data["height_percentage"]
data["overall_type"] = data["foundation_type"] + data["roof_type"] + data["ground_floor_type"] + \
                       data["other_floor_type"]

Unnamed: 0,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type
0,1,4,1,1,2,2,0,0,1
1,1,2,1,1,1,2,0,3,1
2,1,2,1,1,2,2,0,0,3
3,1,2,1,1,2,2,0,0,3
4,2,4,1,0,2,2,0,0,3


In [60]:
# New variable for condition of the house
feature = ['has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo']
cross_val_score(random_forest, data[["material_used"]], label, cv=3)

array([0.57422266, 0.57548897, 0.57615665])

In [58]:
np.array(score).mean()

0.5752894271318989

In [50]:
np.array(score).mean()

0.7009796585584859

In [34]:
np.array(score).mean()

0.6372500489253686

In [71]:
data["count_floors_pre_eq"] = data["count_floors_pre_eq"].apply(lambda floor: 4 if floor>=4 else floor)

In [72]:
upsampled_data, upsampled_label = smote.fit_sample(data, label)
undersample_data, undersample_label = near_miss.fit_sample(data, label)

In [73]:
score = cross_val_score(random_forest, upsampled_data, \
                                                    upsampled_label, cv=2, scoring="accuracy")

In [74]:
np.array(score).mean()

0.7770298758000675

In [75]:
score = cross_val_score(random_forest, undersample_data, \
                                                    undersample_label, cv=2, scoring="accuracy")

In [76]:
np.array(score).mean()

0.5721620761025314

In [22]:
data["material_used"] = data['has_superstructure_adobe_mud'] + data['has_superstructure_mud_mortar_stone'] + \
                        data['has_superstructure_stone_flag'] + data['has_superstructure_cement_mortar_stone'] + \
                        data['has_superstructure_mud_mortar_brick'] + data['has_superstructure_cement_mortar_brick']+\
                        data['has_superstructure_timber'] + data['has_superstructure_bamboo']

In [27]:
data["secondary_use"] = data['has_secondary_use'] + data['has_secondary_use_agriculture'] + \
                        data['has_secondary_use_hotel'] + data['has_secondary_use_rental'] + \
                        data['has_secondary_use_institution'] + data['has_secondary_use_school'] + \
                        data['has_secondary_use_industry'] + data['has_secondary_use_health_post'] + \
                        data['has_secondary_use_gov_office'] + data['has_secondary_use_use_police'] + \
                        data['has_secondary_use_other']

In [33]:
data["count_floors_pre_eq"] = data["count_floors_pre_eq"].apply(lambda floor: "4" if floor >= 4 else str(floor))

In [34]:
data["count_families"] = data["count_families"].apply(lambda floor: "4" if floor >= 4 else str(floor))

In [35]:
def new_age_variable(age):
    
    if (age <= 10):
        return "new"
    elif (age <= 25):
        return "moderate"
    elif (age <= 50):
        return "slightly old"
    elif (age <= 100):
        return "old"
    else:
        return "ancient"
    
data["age"] = data["age"].apply(new_age_variable)

In [36]:
def new_height_variable(height):
    
    if (height <= 4):
        return "small"
    elif (height <= 8):
        return "medium"
    else:
        return "large"
    
data["height_percentage"] = data["height_percentage"].apply(new_height_variable)

In [37]:
def new_area_variable(area):
    
    if (area <= 4):
        return "small"
    elif (area <= 10):
        return "medium"
    else:
        return "large"
    
data["area_percentage"] = data["area_percentage"].apply(new_area_variable)

In [44]:
data["condition"] = data["count_floors_pre_eq"] + data["count_families"] + data["area_percentage"] + \
                    data["height_percentage"] + data["age"]

In [41]:
data = label_encoder(data)

In [28]:
data["secondary_use"].value_counts()

0    231445
2     28599
3       557
Name: secondary_use, dtype: int64

In [42]:
data.columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
     

In [45]:
data["condition"].value_counts()

6     78423
5     61734
7     45546
8     33135
9     20037
4     15770
10     4135
3      1238
11      453
12       75
2        51
13        3
14        1
Name: condition, dtype: int64