In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the building features and the damage labels, both indexed by building_id.
df = pd.read_csv("Datasets/train_values.csv", index_col='building_id')
damages = pd.read_csv("Datasets/train_labels.csv", index_col='building_id')

# Functions for preprocessing

In this notebook I will use the conclusions from the exploration to process the dataset. I will create 10 cross-validation subsets. Each subset will be processed once as a train set and once as a test set (with slight differences between the two).

In [3]:
def geo_level_process(train, test, damages):
    """Replace the three geo-level id columns with two target-encoded features.

    For every ``geo_level_2_id`` found in the labelled training rows, the mean
    ``damage_grade`` and the share of rows with ``damage_grade == 2`` are
    computed, min-max scaled to [0, 1] across areas and stored as
    ``geo_level_mean`` and ``geo_level_ratio``.

    The statistics are computed from ``train`` only, so the labels of the
    held-out rows never influence the features. Areas that do not occur in the
    labelled training rows receive the (equally scaled) overall training
    statistic instead of raising a ``KeyError``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames holding ``geo_level_1_id``, ``geo_level_2_id`` and
        ``geo_level_3_id``. Neither frame is modified.
    damages : pandas.DataFrame
        Frame with a ``damage_grade`` column, matched to ``train`` by index.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames without the geo-level id columns.
    """
    key = 'geo_level_2_id'
    geo_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

    # Label statistics are taken from the training rows only.
    labelled = train[[key]].join(damages['damage_grade'], how='inner')
    is_grade_2 = labelled['damage_grade'].eq(2)
    per_area = pd.DataFrame({
        'mean': labelled.groupby(key)['damage_grade'].mean(),
        'ratio': is_grade_2.groupby(labelled[key]).mean(),
    })
    overall = pd.Series({'mean': labelled['damage_grade'].mean(),
                         'ratio': is_grade_2.mean()})

    # Min-max scale across areas; a zero range is replaced by 1 so that a
    # constant statistic maps to 0 instead of dividing by zero.
    low = per_area.min()
    span = (per_area.max() - low).replace(0, 1)
    per_area = (per_area - low) / span
    overall = (overall - low) / span

    encoded = []
    for frame in (train, test):
        features = {
            'geo_level_' + stat: frame[key].map(per_area[stat]).fillna(overall[stat])
            for stat in ('mean', 'ratio')
        }
        # assign() returns a new frame, so the caller's data stays untouched.
        encoded.append(frame.assign(**features).drop(columns=geo_columns))
    return encoded[0], encoded[1]

In [4]:
def count_floors_process(train, test, damages, max_floors=7):
    """Drop very tall training buildings and scale ``count_floors_pre_eq`` to [0, 1].

    Training rows with more than ``max_floors`` floors are removed. Test rows
    cannot be removed, so their floor count is capped at ``max_floors`` (the
    same treatment ``age_process`` gives to old buildings), which keeps the
    scaled test values inside the training range. Scaling uses the minimum and
    maximum of the filtered training column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``count_floors_pre_eq`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.
    max_floors : int, default 7
        Largest floor count kept in ``train`` and cap applied to ``test``.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "count_floors_pre_eq"
    train = train.loc[~train[column].gt(max_floors)]

    low = train[column].min()
    # A constant column has zero range; dividing by 1 maps it to 0.
    span = (train[column].max() - low) or 1

    train = train.assign(**{column: (train[column] - low) / span})
    test = test.assign(**{column: (test[column].clip(upper=max_floors) - low) / span})
    return train, test

In [5]:
def age_process(train, test, damages, max_age=100):
    """Drop very old training buildings and scale ``age`` to [0, 1].

    Training rows with ``age`` above ``max_age`` are removed; test rows are
    kept but their age is capped at ``max_age``. Scaling uses the minimum and
    maximum of the filtered training column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with an ``age`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.
    max_age : int, default 100
        Largest age kept in ``train`` and cap applied to ``test``.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    train = train.loc[~train["age"].gt(max_age)]

    low = train["age"].min()
    # A constant column has zero range; dividing by 1 maps it to 0.
    span = (train["age"].max() - low) or 1

    train = train.assign(age=(train["age"] - low) / span)
    test = test.assign(age=(test["age"].clip(upper=max_age) - low) / span)
    return train, test

In [6]:
def area_process(train, test, damages, max_area=17):
    """Cap ``area_percentage`` at ``max_area`` and scale it to [0, 1].

    Every value above ``max_area`` is set to ``max_area`` in both frames, so
    all large footprints share one value. For integer-valued data this is the
    same as the former "greater than 16 becomes 17" rule. Scaling uses the
    minimum and maximum of the capped training column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with an ``area_percentage`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.
    max_area : int, default 17
        Upper cap applied before scaling.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "area_percentage"
    train_capped = train[column].clip(upper=max_area)
    test_capped = test[column].clip(upper=max_area)

    low = train_capped.min()
    # A constant column has zero range; dividing by 1 maps it to 0.
    span = (train_capped.max() - low) or 1

    train = train.assign(**{column: (train_capped - low) / span})
    test = test.assign(**{column: (test_capped - low) / span})
    return train, test

In [7]:
def height_process(train, test, damages):
    """Scale ``height_percentage`` to [0, 1] using the training minimum and maximum.

    Test values outside the training range are not capped, so they can fall
    below 0 or above 1.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``height_percentage`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "height_percentage"
    low = train[column].min()
    # A constant column has zero range; dividing by 1 maps it to 0.
    span = (train[column].max() - low) or 1

    train_scaled = train.assign(**{column: (train[column] - low) / span})
    test_scaled = test.assign(**{column: (test[column] - low) / span})
    return train_scaled, test_scaled

In [8]:
def land_surface_process(train, test, damages):
    """One-hot encode ``land_surface_condition``.

    One 0.0/1.0 column named ``land_surface_condition_<category>`` is appended
    per category present in ``train``, and the original column is removed.
    ``test`` receives exactly the same columns: a category that only occurs in
    ``test`` is encoded as all zeros instead of raising, and a training
    category missing from ``test`` becomes an all-zero column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``land_surface_condition`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "land_surface_condition"
    train_flags = pd.get_dummies(train[column], prefix=column, dtype=float)
    # Align the test encoding with the categories learned from the training fold.
    test_flags = pd.get_dummies(test[column], prefix=column, dtype=float).reindex(
        columns=train_flags.columns, fill_value=0.0)
    train = pd.concat([train.drop(columns=column), train_flags], axis=1)
    test = pd.concat([test.drop(columns=column), test_flags], axis=1)
    return train, test

In [9]:
def fundation_process(train, test, damages):
    """One-hot encode ``foundation_type``.

    One 0.0/1.0 column named ``foundation_type_<category>`` is appended per
    category present in ``train``, and the original column is removed.
    ``test`` receives exactly the same columns: a category that only occurs in
    ``test`` is encoded as all zeros instead of raising, and a training
    category missing from ``test`` becomes an all-zero column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``foundation_type`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "foundation_type"
    train_flags = pd.get_dummies(train[column], prefix=column, dtype=float)
    # Align the test encoding with the categories learned from the training fold.
    test_flags = pd.get_dummies(test[column], prefix=column, dtype=float).reindex(
        columns=train_flags.columns, fill_value=0.0)
    train = pd.concat([train.drop(columns=column), train_flags], axis=1)
    test = pd.concat([test.drop(columns=column), test_flags], axis=1)
    return train, test

In [10]:
def roof_process(train, test, damages):
    """One-hot encode ``roof_type``.

    One 0.0/1.0 column named ``roof_type_<category>`` is appended per category
    present in ``train``, and the original column is removed. ``test``
    receives exactly the same columns: a category that only occurs in ``test``
    is encoded as all zeros instead of raising, and a training category
    missing from ``test`` becomes an all-zero column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``roof_type`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "roof_type"
    train_flags = pd.get_dummies(train[column], prefix=column, dtype=float)
    # Align the test encoding with the categories learned from the training fold.
    test_flags = pd.get_dummies(test[column], prefix=column, dtype=float).reindex(
        columns=train_flags.columns, fill_value=0.0)
    train = pd.concat([train.drop(columns=column), train_flags], axis=1)
    test = pd.concat([test.drop(columns=column), test_flags], axis=1)
    return train, test

In [11]:
def ground_floor_process(train, test, damages):
    """Merge ground-floor category ``m`` into ``z`` and one-hot encode the result.

    One 0.0/1.0 column named ``ground_floor_type_<category>`` is appended per
    category present in ``train`` after the merge, and the original column is
    removed. ``test`` receives exactly the same columns: a category that only
    occurs in ``test`` is encoded as all zeros instead of raising, and a
    training category missing from ``test`` becomes an all-zero column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``ground_floor_type`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "ground_floor_type"
    # replace() works on a copy of the column, leaving the caller's frame as it was.
    train_merged = train[column].replace("m", "z")
    test_merged = test[column].replace("m", "z")

    train_flags = pd.get_dummies(train_merged, prefix=column, dtype=float)
    # Align the test encoding with the categories learned from the training fold.
    test_flags = pd.get_dummies(test_merged, prefix=column, dtype=float).reindex(
        columns=train_flags.columns, fill_value=0.0)
    train = pd.concat([train.drop(columns=column), train_flags], axis=1)
    test = pd.concat([test.drop(columns=column), test_flags], axis=1)
    return train, test

In [12]:
def other_floor_process(train, test, damages):
    """One-hot encode ``other_floor_type``.

    One 0.0/1.0 column named ``other_floor_type_<category>`` is appended per
    category present in ``train``, and the original column is removed.
    ``test`` receives exactly the same columns: a category that only occurs in
    ``test`` is encoded as all zeros instead of raising, and a training
    category missing from ``test`` becomes an all-zero column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with an ``other_floor_type`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "other_floor_type"
    train_flags = pd.get_dummies(train[column], prefix=column, dtype=float)
    # Align the test encoding with the categories learned from the training fold.
    test_flags = pd.get_dummies(test[column], prefix=column, dtype=float).reindex(
        columns=train_flags.columns, fill_value=0.0)
    train = pd.concat([train.drop(columns=column), train_flags], axis=1)
    test = pd.concat([test.drop(columns=column), test_flags], axis=1)
    return train, test

In [13]:
def position_process(train, test, damages):
    """One-hot encode ``position``.

    One 0.0/1.0 column named ``position_<category>`` is appended per category
    present in ``train``, and the original column is removed. ``test``
    receives exactly the same columns: a category that only occurs in ``test``
    is encoded as all zeros instead of raising, and a training category
    missing from ``test`` becomes an all-zero column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``position`` column. Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    column = "position"
    train_flags = pd.get_dummies(train[column], prefix=column, dtype=float)
    # Align the test encoding with the categories learned from the training fold.
    test_flags = pd.get_dummies(test[column], prefix=column, dtype=float).reindex(
        columns=train_flags.columns, fill_value=0.0)
    train = pd.concat([train.drop(columns=column), train_flags], axis=1)
    test = pd.concat([test.drop(columns=column), test_flags], axis=1)
    return train, test

In [14]:
def plan_configuration_process(train, test, damages):
    """Replace ``plan_configuration`` with two target-encoded features.

    For every category found in the labelled training rows, the mean
    ``damage_grade`` and the share of rows with ``damage_grade == 2`` are
    computed, min-max scaled to [0, 1] across categories and stored as
    ``plan_configuration_mean`` and ``plan_configuration_ratio``.

    The statistics are computed from ``train`` only, so the labels of the
    held-out rows never influence the features. Categories that do not occur
    in the labelled training rows receive the (equally scaled) overall
    training statistic instead of raising a ``KeyError``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``plan_configuration`` column. Neither is modified.
    damages : pandas.DataFrame
        Frame with a ``damage_grade`` column, matched to ``train`` by index.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames without ``plan_configuration``.
    """
    key = 'plan_configuration'

    # Label statistics are taken from the training rows only.
    labelled = train[[key]].join(damages['damage_grade'], how='inner')
    is_grade_2 = labelled['damage_grade'].eq(2)
    per_category = pd.DataFrame({
        'mean': labelled.groupby(key)['damage_grade'].mean(),
        'ratio': is_grade_2.groupby(labelled[key]).mean(),
    })
    overall = pd.Series({'mean': labelled['damage_grade'].mean(),
                         'ratio': is_grade_2.mean()})

    # Min-max scale across categories; a zero range is replaced by 1 so that a
    # constant statistic maps to 0 instead of dividing by zero.
    low = per_category.min()
    span = (per_category.max() - low).replace(0, 1)
    per_category = (per_category - low) / span
    overall = (overall - low) / span

    encoded = []
    for frame in (train, test):
        features = {
            'plan_configuration_' + stat: frame[key].map(per_category[stat]).fillna(overall[stat])
            for stat in ('mean', 'ratio')
        }
        # assign() returns a new frame, so the caller's data stays untouched.
        encoded.append(frame.assign(**features).drop(columns=key))
    return encoded[0], encoded[1]

In [15]:
def secondary_use_process(train, test, damages):
    """Collapse eight secondary-use flags into three grouped 0/1 flags.

    * ``secondary_use_1``: institution or rental
    * ``secondary_use_2``: health post, school or government office
    * ``secondary_use_3``: hotel, industry or other

    A grouped flag is 1 when the sum of its source flags is positive. The
    eight source columns and ``has_secondary_use`` are then dropped; any other
    column is left as it is.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the ``has_secondary_use*`` columns listed above.
        Neither is modified.
    damages : pandas.DataFrame
        Unused; kept so all processing steps share one signature.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames.
    """
    groups = {
        "secondary_use_1": ["has_secondary_use_institution", "has_secondary_use_rental"],
        "secondary_use_2": ["has_secondary_use_health_post", "has_secondary_use_school",
                            "has_secondary_use_gov_office"],
        "secondary_use_3": ["has_secondary_use_hotel", "has_secondary_use_industry",
                            "has_secondary_use_other"],
    }
    obsolete = ["has_secondary_use"] + [flag for flags in groups.values() for flag in flags]

    def merge_flags(frame):
        # Build the grouped flags, then return a new frame without the source columns.
        merged = {name: frame[flags].sum(axis=1).gt(0).astype(int)
                  for name, flags in groups.items()}
        return frame.assign(**merged).drop(columns=obsolete)

    return merge_flags(train), merge_flags(test)

# 10-fold cross-validation subsets

In [16]:
# Build the cross-validation folds: fold i uses the i-th contiguous tenth of df
# as the test subset and all remaining rows as the train subset. Every fold is
# preprocessed independently and written to its own pair of CSV files.
N_FOLDS = 10
n_rows = len(df)
for i in range(N_FOLDS):
    # Integer arithmetic keeps the fold boundaries independent of floating-point rounding.
    fold_start = n_rows * i // N_FOLDS
    fold_stop = n_rows * (i + 1) // N_FOLDS
    # Copy the slice so that later column assignments never target a view of df.
    testCV = df.iloc[fold_start:fold_stop, :].copy()
    trainCV = df.drop(index=testCV.index)

    for step in (geo_level_process, count_floors_process, age_process,
                 area_process, height_process):
        trainCV, testCV = step(trainCV, testCV, damages)
    trainCV, testCV = trainCV.drop(columns="count_families"), testCV.drop(columns="count_families")

    for step in (land_surface_process, fundation_process, roof_process, ground_floor_process,
                 other_floor_process, position_process, plan_configuration_process):
        trainCV, testCV = step(trainCV, testCV, damages)
    trainCV, testCV = trainCV.drop(columns="legal_ownership_status"), testCV.drop(columns="legal_ownership_status")

    trainCV, testCV = secondary_use_process(trainCV, testCV, damages)
    trainCV.to_csv('Datasets/CV0_train_subset_{}.csv'.format(i))
    testCV.to_csv('Datasets/CV0_test_subset_{}.csv'.format(i))

# Damages scaling 

For regression purposes, I will use scaled targets.

In [17]:
# Linearly rescale the labels so that grade 1 maps to 0 and grade 3 maps to 1.
# The result gets its own name: rebinding `damages` would make this cell
# non-idempotent (a second run would rescale again) and would hand already
# scaled labels to the preprocessing functions if the fold loop were re-run.
damages_scaled = (damages - 1) / 2
damages_scaled.to_csv('Datasets/train_labels_scaled.csv')