# Merging Tables onto Train Data

## Import Packages and Data

In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import pandas as pd
import numpy as np

In [5]:
# Read the three challenge CSVs in one pass; the order of the paths matches
# the order of the names on the left-hand side.
train_values, train_labels, test_values = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/challenge_data/train_values.csv',
        '/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/challenge_data/train_labels.csv',
        '/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/challenge_data/test_values.csv',
    )
)

In [6]:
# Load the household- and building-level CSV tables.
# The household demographics load is enabled because a later cell calls
# household_dem.drop(...); with this line commented out that cell raises
# NameError when the notebook is run top to bottom on a fresh kernel.
household_dem = pd.read_csv('/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/household/csv_household_demographics.csv')
b_structure = pd.read_csv('/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/building/csv_building_structure.csv')
# low_memory=False makes pandas infer dtypes from the whole file instead of
# chunk by chunk, which avoids mixed-type columns.
b_damage_assessment = pd.read_csv('/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/building/csv_building_damage_assessment.csv', low_memory=False)
b_owner_use = pd.read_csv('/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/building/csv_building_ownership_and_use.csv')

## Drop Unrelated Info from DataFrames

### Building Structure

In [7]:
# Display the column labels of the building-structure table.
b_structure.columns

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'count_floors_pre_eq', 'count_floors_post_eq', 'age_building',
       'plinth_area_sq_ft', 'height_ft_pre_eq', 'height_ft_post_eq',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'condition_post_eq', 'damage_grade', 'technical_solution_proposed'],
      dtype='object')

In [8]:
# Columns of the building-structure table to retain; any column not listed
# here is dropped in the following cells.
b_str_keep = [
    # identifiers / location
    'building_id',
    'district_id',
    'vdcmun_id',
    'ward_id',
    # pre-earthquake building attributes
    'count_floors_pre_eq',
    'age_building',
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    # superstructure flags
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    # damage column
    'damage_grade',
]

In [9]:
# Every b_structure column that is absent from the keep-list, in the
# table's original column order.
drop = [col for col in b_structure.columns if col not in b_str_keep]

# Bare name as the last expression so the notebook displays the list.
drop

['count_floors_post_eq',
 'plinth_area_sq_ft',
 'height_ft_pre_eq',
 'height_ft_post_eq',
 'condition_post_eq',
 'technical_solution_proposed']

In [10]:
# Remove the columns collected in `drop` from the building-structure table.
b_structure = b_structure.drop(columns=drop)

### Building Damage Assessment (not needed; excluded from the merge below)

In [11]:
# Display the column labels of the damage-assessment table.
b_damage_assessment.columns

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'damage_overall_collapse', 'damage_overall_leaning',
       'damage_overall_adjacent_building_risk', 'damage_foundation_severe',
       'damage_foundation_moderate', 'damage_foundation_insignificant',
       'damage_roof_severe', 'damage_roof_moderate',
       'damage_roof_insignificant', 'damage_corner_separation_severe',
       'damage_corner_separation_moderate',
       'damage_corner_separation_insignificant',
       'damage_diagonal_cracking_severe', 'damage_diagonal_cracking_moderate',
       'damage_diagonal_cracking_insignificant',
       'damage_in_plane_failure_severe', 'damage_in_plane_failure_moderate',
       'damage_in_plane_failure_insignificant',
       'damage_out_of_plane_failure_severe',
       'damage_out_of_plane_failure_moderate',
       'damage_out_of_plane_failure_insignificant',
       'damage_out_of_plane_failure_walls_ncfr_severe',
       'damage_out_of_plane_failure_walls_ncfr_moderate',
   

In [12]:
# Columns to retain from the damage-assessment table.
b_dam_keep = ['building_id', 'damage_grade']

In [13]:
# Every damage-assessment column that is absent from the keep-list, in the
# table's original column order.
b_dam_drop = [col for col in b_damage_assessment.columns if col not in b_dam_keep]

# Bare name as the last expression so the notebook displays the list.
b_dam_drop

['district_id',
 'vdcmun_id',
 'ward_id',
 'damage_overall_collapse',
 'damage_overall_leaning',
 'damage_overall_adjacent_building_risk',
 'damage_foundation_severe',
 'damage_foundation_moderate',
 'damage_foundation_insignificant',
 'damage_roof_severe',
 'damage_roof_moderate',
 'damage_roof_insignificant',
 'damage_corner_separation_severe',
 'damage_corner_separation_moderate',
 'damage_corner_separation_insignificant',
 'damage_diagonal_cracking_severe',
 'damage_diagonal_cracking_moderate',
 'damage_diagonal_cracking_insignificant',
 'damage_in_plane_failure_severe',
 'damage_in_plane_failure_moderate',
 'damage_in_plane_failure_insignificant',
 'damage_out_of_plane_failure_severe',
 'damage_out_of_plane_failure_moderate',
 'damage_out_of_plane_failure_insignificant',
 'damage_out_of_plane_failure_walls_ncfr_severe',
 'damage_out_of_plane_failure_walls_ncfr_moderate',
 'damage_out_of_plane_failure_walls_ncfr_insignificant',
 'damage_gable_failure_severe',
 'damage_gable_failure

In [14]:
# Remove the columns collected in `b_dam_drop` from the damage-assessment table.
b_damage_assessment = b_damage_assessment.drop(columns=b_dam_drop)

### Building Ownership and Use

In [15]:
# Display the column labels of the ownership-and-use table.
b_owner_use.columns

Index(['building_id', 'district_id', 'vdcmun_id', 'ward_id',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_industry',
       'has_secondary_use_health_post', 'has_secondary_use_gov_office',
       'has_secondary_use_use_police', 'has_secondary_use_other'],
      dtype='object')

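Here we keep every column except the location identifiers (`district_id`, `vdcmun_id`, `ward_id`), which the building-structure table already provides.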

In [16]:
# Columns to retain from the ownership-and-use table; the join key comes
# first, followed by the ownership fields and the secondary-use flags.
b_owner_keep = [
    'building_id',
    'legal_ownership_status',
    'count_families',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

In [17]:
# Every ownership-and-use column that is absent from the keep-list, in the
# table's original column order.
b_owner_drop = [col for col in b_owner_use.columns if col not in b_owner_keep]

# Bare name as the last expression so the notebook displays the list.
b_owner_drop

['district_id', 'vdcmun_id', 'ward_id']

In [18]:
# Remove the columns collected in `b_owner_drop` from the ownership-and-use table.
b_owner_use = b_owner_use.drop(columns=b_owner_drop)

## Merging Tables

In [19]:
# Inner join on building_id: only rows whose building_id appears in both
# tables are kept.
merged = pd.merge(
    left=b_structure,
    right=b_owner_use,
    how='inner',
    on='building_id',
)
# Preview the first rows of the joined table.
merged.head()

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,age_building,land_surface_condition,foundation_type,roof_type,ground_floor_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,120101000011,12,1207,120703,1,9,Flat,Other,Bamboo/Timber-Light roof,Mud,...,0,0,0,0,0,0,0,0,0,0
1,120101000021,12,1207,120703,1,15,Flat,Other,Bamboo/Timber-Light roof,Mud,...,0,0,0,0,0,0,0,0,0,0
2,120101000031,12,1207,120703,1,20,Flat,Other,Bamboo/Timber-Light roof,Mud,...,0,0,0,0,0,0,0,0,0,0
3,120101000041,12,1207,120703,1,20,Flat,Other,Bamboo/Timber-Light roof,Mud,...,0,0,0,0,0,0,0,0,0,0
4,120101000051,12,1207,120703,1,30,Flat,Other,Bamboo/Timber-Light roof,Mud,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Count the missing values in each column of the merged table.
merged.isna().sum()

building_id                                0
district_id                                0
vdcmun_id                                  0
ward_id                                    0
count_floors_pre_eq                        0
age_building                               0
land_surface_condition                     0
foundation_type                            0
roof_type                                  0
ground_floor_type                          0
other_floor_type                           0
position                                   1
plan_configuration                         1
has_superstructure_adobe_mud               0
has_superstructure_mud_mortar_stone        0
has_superstructure_stone_flag              0
has_superstructure_cement_mortar_stone     0
has_superstructure_mud_mortar_brick        0
has_superstructure_cement_mortar_brick     0
has_superstructure_timber                  0
has_superstructure_bamboo                  0
has_superstructure_rc_non_engineered       0
has_supers

In [21]:
# Show the dtype of each column in the merged table.
merged.dtypes

building_id                                 int64
district_id                                 int64
vdcmun_id                                   int64
ward_id                                     int64
count_floors_pre_eq                         int64
age_building                                int64
land_surface_condition                     object
foundation_type                            object
roof_type                                  object
ground_floor_type                          object
other_floor_type                           object
position                                   object
plan_configuration                         object
has_superstructure_adobe_mud                int64
has_superstructure_mud_mortar_stone         int64
has_superstructure_stone_flag               int64
has_superstructure_cement_mortar_stone      int64
has_superstructure_mud_mortar_brick         int64
has_superstructure_cement_mortar_brick      int64
has_superstructure_timber                   int64


In [54]:
# Persist the merged table; index=False keeps the row index out of the file.
merged.to_csv(
    path_or_buf='/Users/chantalwuerschinger/code/chantalwuer/earthquake_damage/raw_data/merged_table.csv',
    index=False,
)

## Merge additional information

In [23]:
# Work on an independent (deep) copy so that changes to merged_df do not
# touch `merged`.
merged_df = merged.copy(deep=True)

## Check out contents of dataframes

In [8]:
# Column names written out for reference; they match the labels that
# train_values.columns displays in this notebook.
train_data_columns = [
    # identifiers / location
    'building_id',
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
    # building attributes
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    # superstructure flags
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    # ownership and secondary use
    'legal_ownership_status',
    'count_families',
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

In [10]:
# Household-table columns of interest.
# NOTE(review): household_columns is not referenced in the cells shown here.
household_columns = [
    'household_id',
    'gender_household_head',
    'age_household_head',
    'caste_household',
    'education_level_household_head',
    'income_level_household',
    'size_household',
]

# Household-table columns that the next cell removes.
columns_to_drop = [
    'district_id',
    'vdcmun_id',
    'ward_id',
    'is_bank_account_present_in_household',
]

In [13]:
# Reduced household table: household_dem without the columns named in
# columns_to_drop.
household_dem_s = household_dem.drop(columns=columns_to_drop)
# NOTE(review): this previews the original household_dem, not the reduced
# household_dem_s — confirm that is intended.
household_dem.head()

Unnamed: 0,household_id,district_id,vdcmun_id,ward_id,gender_household_head,age_household_head,caste_household,education_level_household_head,income_level_household,size_household,is_bank_account_present_in_household
0,12010100001101,12,1207,120703,Male,31.0,Rai,Illiterate,Rs. 10 thousand,3.0,0.0
1,12010100002101,12,1207,120703,Female,62.0,Rai,Illiterate,Rs. 10 thousand,6.0,0.0
2,12010100003101,12,1207,120703,Male,51.0,Gharti/Bhujel,Illiterate,Rs. 10 thousand,13.0,0.0
3,12010100004101,12,1207,120703,Male,48.0,Gharti/Bhujel,Illiterate,Rs. 10 thousand,5.0,0.0
4,12010100005101,12,1207,120703,Male,70.0,Gharti/Bhujel,Illiterate,Rs. 10 thousand,8.0,0.0


In [21]:
# Display the column labels of train_values.
train_values.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i