In [9]:
import pandas as pd
import pickle

In [10]:
# Read the test feature table and the submission template, both indexed by building_id.
# NOTE(review): these paths are relative to 'Data/' while the model and submission
# paths elsewhere in this notebook start with '../' — confirm the working directory.
df_test_values = pd.read_csv(
    'Data/Nepal_Earthquake_test_values.csv',
    index_col='building_id',
)
df_submission_format = pd.read_csv(
    'Data/Nepal_Earthquake_submission_format.csv',
    index_col='building_id',
)

In [11]:
# Group the feature columns by kind so later cells can refer to them by name.

# Every column whose name starts with 'has'.
binary_cols = [name for name in df_test_values.columns if name.startswith('has')]

# Columns that pandas loaded with object dtype.
cat_cols = df_test_values.select_dtypes(include='object').columns.tolist()

# Columns treated as integer features (listed by hand).
integer_cols = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

# The three geographic id columns.
geo_cols = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

# Full feature list, in the order geo -> categorical -> integer -> binary.
all_cols = [*geo_cols, *cat_cols, *integer_cols, *binary_cols]

In [12]:
# Print the number of distinct values and the min/max for each geographic id column.
separator = '---------------------------------'
for geo_col in geo_cols:
    values = df_test_values[geo_col]
    print(f'There are {values.nunique()} unique values in {geo_col}.')
    print(f'The range of values is: {values.min()}, {values.max()}.')
    print(separator)

There are 31 unique values in geo_level_1_id.
The range of values is: 0, 30.
---------------------------------
There are 1364 unique values in geo_level_2_id.
The range of values is: 0, 1427.
---------------------------------
There are 10213 unique values in geo_level_3_id.
The range of values is: 0, 12567.
---------------------------------


### Baseline Forest Model

In [5]:
# Load the pickled baseline forest model from disk.
# NOTE: pickle.load can execute arbitrary code — only load model files you trust.
with open('../Models/baseline_forest.pickle', 'rb') as model_file:
    baseline_forest = pickle.load(model_file)

In [6]:
# One-hot encode the object-dtype columns of the test features.
df_test_values_dummies = pd.get_dummies(df_test_values)

# get_dummies only creates columns for categories present in *this* frame, so the
# result can differ from the training matrix in column set or order. When the model
# recorded its training column names, align to them: dummies missing from the test
# data are added as all-zero columns and columns unseen at fit time are dropped.
# If the model exposes no feature names, the frame is left exactly as encoded.
_train_cols = getattr(baseline_forest, 'feature_names_in_', None)
if _train_cols is not None:
    df_test_values_dummies = df_test_values_dummies.reindex(
        columns=list(_train_cols),
        fill_value=0,
    )

In [7]:
# Run the baseline forest on the one-hot encoded test features.
preds = baseline_forest.predict(
    df_test_values_dummies,
)

In [8]:
# Put the predictions in a frame that reuses the submission template's
# column labels and building_id index.
my_submission = pd.DataFrame(
    preds,
    index=df_submission_format.index,
    columns=df_submission_format.columns,
)

In [12]:
# Write the baseline-forest submission (index included) to the contest folder.
my_submission.to_csv(
    path_or_buf='../Contest_submissions/submission_1.csv',
)

In [14]:
! head ../Contest_submissions/submission_1.csv

building_id,damage_grade
300051,2
99355,2
890251,2
745817,1
421793,3
871976,3
691228,1
896100,3
343471,2


### Forest_gs model

In [18]:
# Load the pickled forest_gs model from disk.
# NOTE: pickle.load can execute arbitrary code — only load model files you trust.
with open('../Models/forest_gs.pickle', 'rb') as model_file:
    forest_gs = pickle.load(model_file)

In [19]:
# Binary columns with every 'has_secondary*' column removed, then the single
# top-level 'has_secondary_use' flag added back.
# This is built with a comprehension on purpose: removing items from a list while
# iterating over that same list skips the element after each removal, which would
# leave some 'has_secondary*' columns behind and could duplicate
# 'has_secondary_use' when it is appended.
binary_cols_dropped = [
    col for col in binary_cols
    if not col.startswith('has_secondary')
]
binary_cols_dropped.append('has_secondary_use')

# Categorical columns minus the two excluded ones. list.remove raises ValueError
# if either name is absent, which surfaces a schema mismatch early.
cat_cols_dropped = cat_cols.copy()
for dropped_col in ('legal_ownership_status', 'plan_configuration'):
    cat_cols_dropped.remove(dropped_col)

In [20]:
# Run forest_gs directly on the raw (un-encoded) test feature frame.
preds = forest_gs.predict(
    df_test_values,
)

In [21]:
# Put the forest_gs predictions in a frame that reuses the submission
# template's column labels and building_id index.
my_submission = pd.DataFrame(
    preds,
    index=df_submission_format.index,
    columns=df_submission_format.columns,
)

In [22]:
# Write the forest_gs submission (index included) to the contest folder.
my_submission.to_csv(
    path_or_buf='../Contest_submissions/submission_forest_gs.csv',
)

In [23]:
! head ../Contest_submissions/submission_1.csv

building_id,damage_grade
300051,2
99355,2
890251,2
745817,1
421793,3
871976,3
691228,1
896100,3
343471,2


### XGBoost1 Model

In [24]:
# Load the pickled XGBoost1 model from disk.
# NOTE: pickle.load can execute arbitrary code — only load model files you trust.
with open('../Models/XGBoost1.pickle', 'rb') as model_file:
    XGBoost1 = pickle.load(model_file)

In [25]:
# Run XGBoost1 directly on the raw (un-encoded) test feature frame.
preds = XGBoost1.predict(
    df_test_values,
)

In [26]:
# Put the XGBoost1 predictions in a frame that reuses the submission
# template's column labels and building_id index.
my_submission = pd.DataFrame(
    preds,
    index=df_submission_format.index,
    columns=df_submission_format.columns,
)

In [27]:
# Write the XGBoost1 submission (index included) to the contest folder.
my_submission.to_csv(
    path_or_buf='../Contest_submissions/submission_XGBoost1.csv',
)

In [28]:
! head ../Contest_submissions/submission_XGBoost1.csv

building_id,damage_grade
300051,3
99355,2
890251,2
745817,1
421793,3
871976,2
691228,1
896100,3
343471,2
