In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

# Raise both display limits to 500 so wide/long frames are not truncated.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)



In [2]:
# Data preparation: load the raw tables, derive the damage-delta features and
# join everything onto the train / test building lists.

# --- Ownership / use table ---------------------------------------------------
df1 = pd.read_csv('Building_Ownership_Use.csv')
print(df1.shape)
print(df1.isnull().sum())

# Missing values are filled with 1.0 families and 0.0 (no secondary use).
df1['count_families'] = df1['count_families'].fillna(1.0)
df1['has_secondary_use'] = df1['has_secondary_use'].fillna(0.0)

# --- Structure table ---------------------------------------------------------
df2 = pd.read_csv('Building_Structure.csv')

# Pre-minus-post earthquake differences, floored at zero so a building that
# measures larger afterwards does not produce a negative value.
# `clip(lower=0)` replaces the deprecated `clip_lower(0)`.
df2['count_floors_diff'] = (
    df2['count_floors_pre_eq'] - df2['count_floors_post_eq']
).clip(lower=0)
df2['height_ft_diff'] = (
    df2['height_ft_pre_eq'] - df2['height_ft_post_eq']
).clip(lower=0)

df2['position'] = df2['position'].fillna('Not attached')
df2['plan_configuration'] = df2['plan_configuration'].fillna('Rectangular')

# The raw pre/post measurements are replaced by the two *_diff columns above.
df2 = df2.drop(
    ['count_floors_pre_eq', 'count_floors_post_eq',
     'height_ft_pre_eq', 'height_ft_post_eq'],
    axis=1,
)

# --- Train / test building lists ----------------------------------------------
df3 = pd.read_csv('train.csv')
df4 = pd.read_csv('test.csv')

# pd.merge defaults to an inner join: a building missing from either side is
# dropped from the result.
combined = pd.merge(df1, df2, on='building_id')
train_data = pd.merge(df3, combined, on='building_id')
test_data = pd.merge(df4, combined, on='building_id')


(1052948, 17)
building_id                       0
district_id                       0
vdcmun_id                         0
ward_id                           0
legal_ownership_status            0
count_families                    2
has_secondary_use                10
has_secondary_use_agriculture     0
has_secondary_use_hotel           0
has_secondary_use_rental          0
has_secondary_use_institution     0
has_secondary_use_school          0
has_secondary_use_industry        0
has_secondary_use_health_post     0
has_secondary_use_gov_office      0
has_secondary_use_use_police      0
has_secondary_use_other           0
dtype: int64


In [3]:
# Preprocessing: apply one shared clean-up to the merged train and test frames.

# Individual geotechnical-risk flags that are collapsed into a single count.
GEOTECHNICAL_RISK_FLAGS = [
    'has_geotechnical_risk_fault_crack',
    'has_geotechnical_risk_flood',
    'has_geotechnical_risk_land_settlement',
    'has_geotechnical_risk_landslide',
    'has_geotechnical_risk_liquefaction',
    'has_geotechnical_risk_other',
    'has_geotechnical_risk_rock_fall',
]

# Per-type secondary-use flags that are dropped from the feature set.
SECONDARY_USE_FLAGS = [
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

# Per-material superstructure flags that are dropped from the feature set.
SUPERSTRUCTURE_FLAGS = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
]

# Location identifiers that are cast to str (and so picked up as categorical).
LOCATION_ID_COLUMNS = ['district_id', 'vdcmun_id', 'ward_id_x']

# Suffixed duplicates of the location identifiers left behind by the merges.
DUPLICATE_LOCATION_COLUMNS = [
    'district_id_x', 'district_id_y',
    'vdcmun_id_x', 'vdcmun_id_y',
    'ward_id_y',
]


def prepare_features(merged, leading_columns):
    """Return a cleaned feature frame built from a merged building table.

    Parameters
    ----------
    merged : pandas.DataFrame
        A merged frame such as ``train_data`` or ``test_data``. It is copied,
        never modified, so the cell can be re-run safely.
    leading_columns : list of str
        Identifier / label columns placed first in the result
        (``['building_id', 'damage_grade']`` for train,
        ``['building_id']`` for test).

    Returns
    -------
    pandas.DataFrame
        ``leading_columns`` followed by the remaining columns in their original
        order, with ``has_geotechnical_risk_sum`` appended last.
    """
    frame = merged.copy()
    frame['has_repair_started'] = frame['has_repair_started'].fillna(0)

    # Add the flags one by one (not DataFrame.sum) to keep `+` semantics,
    # including NaN propagation.
    risk_sum = frame[GEOTECHNICAL_RISK_FLAGS[0]]
    for flag in GEOTECHNICAL_RISK_FLAGS[1:]:
        risk_sum = risk_sum + frame[flag]
    frame['has_geotechnical_risk_sum'] = risk_sum

    frame = frame.drop(
        columns=(['has_geotechnical_risk']
                 + GEOTECHNICAL_RISK_FLAGS
                 + SECONDARY_USE_FLAGS
                 + SUPERSTRUCTURE_FLAGS)
    )
    frame[LOCATION_ID_COLUMNS] = frame[LOCATION_ID_COLUMNS].astype(str)

    leading = frame[list(leading_columns)]
    features = frame.drop(columns=list(leading_columns) + DUPLICATE_LOCATION_COLUMNS)
    return pd.concat([leading, features], axis=1)


ptrain = prepare_features(train_data, ['building_id', 'damage_grade'])
ptest = prepare_features(test_data, ['building_id'])




In [4]:
# Work on a copy so this cell can be re-run: the `.str` calls below would fail
# on a label column that has already been converted to float.
train = ptrain.copy()

# Strip the 'Grade ' prefix and recode '5' as '0', giving classes 0.0 .. 4.0.
train['damage_grade'] = (
    train['damage_grade']
    .str.replace('Grade ', '')
    .str.replace('5', '0')
    .astype(float)
)

Y = train['damage_grade']
X = train.drop(['damage_grade', 'building_id'], axis=1)

# Location ids must be strings so they are detected as categorical below.
X[['district_id', 'vdcmun_id', 'ward_id_x']] = (
    X[['district_id', 'vdcmun_id', 'ward_id_x']].astype(str)
)

# Positions of every object-dtype column; passed to CatBoost as cat_features.
categorical_features_indices = np.where(X.dtypes == object)[0]
print(categorical_features_indices)
len(categorical_features_indices)

[ 0  1  3  4  5 10 11 12 13 14 15 16 17]


13

In [5]:
# Names of the stage-1 per-class probability columns (filled after fitting).
l = ['0_cb', '1_cb', '2_cb', '3_cb', '4_cb']

# Start from the building ids, attach the encoded label, then add
# zero-initialised placeholders for the probabilities.
stack = train[['building_id']].copy()
stack['target'] = Y
for proba_column in l:
    stack[proba_column] = 0



In [6]:
# Stage 1: five-class one-vs-all CatBoost classifier fitted on all of X / Y.
stage1_params = dict(
    iterations=30,
    depth=8,
    learning_rate=0.5,
    loss_function='MultiClassOneVsAll',
    classes_count=5,
)
model = CatBoostClassifier(**stage1_params)
model.fit(
    X,
    Y,
    cat_features=categorical_features_indices,
    plot=False,
    verbose=True,
)

<catboost.core.CatBoostClassifier at 0x7f320bf31350>

In [7]:
# Stage-1 class probabilities and hard predictions for the rows in X.
stack[l] = model.predict_proba(X)
Y_ = model.predict(X)
stack['cb'] = Y_

# NOTE: X / Y are the data the model was fitted on, so this is an in-sample score.
score = f1_score(Y, Y_, average='weighted')

In [8]:
# Weighted F1 of the stage-1 predictions against Y (computed in the previous cell).
score

0.8087300053770577

In [10]:
# Stage-1 feature importances paired with the feature names.
imp = model.get_feature_importance()
dic = X.columns
# Both lengths are printed so a mismatch between importances and columns is
# visible; str.format keeps the output identical on Python 2 and 3.
print('{} {}'.format(len(imp), len(dic)))

df = pd.DataFrame()
df['feature'] = dic
df['importance'] = imp
df.sort_values(by='importance', ascending=False)

21 21


Unnamed: 0,feature,importance
17,condition_post_eq,27.120198
19,height_ft_diff,19.28884
4,ward_id_x,15.548676
3,vdcmun_id,8.192835
8,age_building,6.378681
0,area_assesed,6.084757
2,has_repair_started,4.158398
1,district_id,3.202187
6,count_families,3.190809
11,foundation_type,2.210539


In [11]:
# Restrict to rows whose true class is 2, 3 or 4 and score stage 1 on those
# classes only.
stack_234 = stack[stack['target'] >= 2]

score = f1_score(
    stack_234['target'].values,
    stack_234['cb'].values,
    average='weighted',
    labels=[2, 3, 4],
)
print(score)

0.703167804038465


# Stage 2: model on top (trained on grades 2-4 only)

In [12]:
# Placeholder columns for the second-stage model, initialised to zero.
l_1 = ['0_cb_1', '1_cb_1', '2_cb_1', '3_cb_1', '4_cb_1']

for placeholder_column in l_1:
    stack[placeholder_column] = 0

In [13]:
# Stage 2: three-class model fitted only on rows whose encoded label is >= 2.
stage2_rows = train[train['damage_grade'] >= 2]

# Shift labels 2, 3, 4 down to 0, 1, 2 to match classes_count=3.
Y = stage2_rows['damage_grade'] - 2
X = stage2_rows.drop(['damage_grade', 'building_id'], axis=1)

location_columns = ['district_id', 'vdcmun_id', 'ward_id_x']
X[location_columns] = X[location_columns].astype(str)

stage2_params = dict(
    iterations=150,
    depth=8,
    learning_rate=0.6,
    loss_function='MultiClass',
    classes_count=3,
)
model_1 = CatBoostClassifier(**stage2_params)
# Reuses the categorical column positions computed for the stage-1 feature frame.
model_1.fit(
    X,
    Y,
    cat_features=categorical_features_indices,
    plot=False,
    verbose=True,
)

0:	learn: -0.9088339	total: 2.89s	remaining: 7m 11s
1:	learn: -0.8519402	total: 5.69s	remaining: 7m 1s
2:	learn: -0.8031821	total: 7.72s	remaining: 6m 18s
3:	learn: -0.7833708	total: 10.4s	remaining: 6m 20s
4:	learn: -0.7711471	total: 13.5s	remaining: 6m 31s
5:	learn: -0.7641692	total: 16.8s	remaining: 6m 42s
6:	learn: -0.7587333	total: 19.9s	remaining: 6m 45s
7:	learn: -0.7529756	total: 22.7s	remaining: 6m 42s
8:	learn: -0.7478828	total: 26s	remaining: 6m 47s
9:	learn: -0.7447530	total: 29.9s	remaining: 6m 58s
10:	learn: -0.7413014	total: 32.6s	remaining: 6m 52s
11:	learn: -0.7389965	total: 35.9s	remaining: 6m 52s
12:	learn: -0.7372846	total: 39.5s	remaining: 6m 56s
13:	learn: -0.7340848	total: 42.9s	remaining: 6m 57s
14:	learn: -0.7325419	total: 45.9s	remaining: 6m 53s
15:	learn: -0.7310151	total: 49.1s	remaining: 6m 50s
16:	learn: -0.7288148	total: 52.3s	remaining: 6m 49s
17:	learn: -0.7276080	total: 55.8s	remaining: 6m 48s
18:	learn: -0.7262278	total: 59.6s	remaining: 6m 50s
19:	le

<catboost.core.CatBoostClassifier at 0x7f3217b75350>

In [14]:
# Undo the -2 label shift on both truth and predictions before scoring.
Y1 = Y + 2
pred = model_1.predict(X) + 2

# X / Y are the stage-2 training rows, so this is an in-sample score.
score = f1_score(Y1, pred, average='weighted', labels=[2, 3, 4])
score

0.7178572876604519

In [15]:
# Persist the stage-2 model. The context manager guarantees the file is
# flushed and closed even if pickling fails.
filename = 'second_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model_1, model_file)
 

# Final model: two-stage prediction on the test set

In [16]:
# Test data: keep the building ids aside and use the rest as model features.
test = ptest
print(test.shape)
b_id = test['building_id']
test_x = test.drop(['building_id'], axis=1)


(421175, 22)


In [17]:
# Stage-1 prediction on the test set.
# Drop the bookkeeping columns if they already exist so re-running this cell
# never feeds 'target' / 'b_id' to the model as features.
stage1_features = test_x.drop(['target', 'b_id'], axis=1, errors='ignore')
test_y = model.predict(stage1_features)

# Flatten the prediction so a 1-D column is stored whatever shape predict() returns.
test_x['target'] = np.asarray(test_y).ravel()
test_x['b_id'] = b_id
test_x.head()

Unnamed: 0,area_assesed,district_id,has_repair_started,vdcmun_id,ward_id_x,legal_ownership_status,count_families,has_secondary_use,age_building,plinth_area_sq_ft,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,condition_post_eq,count_floors_diff,height_ft_diff,has_geotechnical_risk_sum,target,b_id
0,Both,7,1.0,701,70102,Private,1.0,0.0,28,454,Moderate slope,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,Not applicable,Attached-1 side,Rectangular,Damaged-Repaired and used,0,0,0,3.0,a3380c4f75
1,Both,7,1.0,701,70103,Private,1.0,0.0,25,542,Moderate slope,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,Not applicable,Attached-1 side,Rectangular,Damaged-Rubble unclear,1,9,0,0.0,a338a4e653
2,Building removed,7,1.0,701,70103,Private,1.0,0.0,35,589,Moderate slope,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,Damaged-Rubble Clear-New building built,2,18,0,0.0,a338a4e6b7
3,Both,7,1.0,701,70106,Private,1.0,0.0,22,468,Moderate slope,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,TImber/Bamboo-Mud,Attached-1 side,Rectangular,Damaged-Repaired and used,0,0,0,3.0,a33a6eaa3a
4,Building removed,7,1.0,701,70107,Private,1.0,0.0,24,426,Moderate slope,Mud mortar-Stone/Brick,Bamboo/Timber-Light roof,Mud,Not applicable,Attached-1 side,Rectangular,Damaged-Rubble Clear-New building built,1,9,0,0.0,a33b073ff6


In [18]:
# Rows that stage 1 assigned to class 2, 3 or 4. An explicit copy makes later
# column assignment on test_234 well defined (no SettingWithCopyWarning).
test_234 = test_x[test_x['target'] >= 2].copy()
test_234.shape

(245959, 23)

In [19]:
# Stage-2 prediction for the class 2-4 subset; +2 undoes the training label shift.
test_234_x = test_234.drop(['b_id', 'target'], axis=1)
t_234 = model_1.predict(test_234_x) + 2

# assign() builds a new frame rather than writing into a slice of test_x,
# which is what triggered pandas' SettingWithCopyWarning.
test_234 = test_234.assign(target=np.asarray(t_234).ravel())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
# Write the stage-2 predictions back by index label. Only 'target' differs
# between the frames, so touch only that column; DataFrame.update would
# rewrite every column and can upcast dtypes in the process (TODO confirm
# for the pandas version in use).
test_x.loc[test_234.index, 'target'] = test_234['target']

In [21]:
# Writing the submission file.
sample = pd.read_csv('sample_submission.csv')
print(sample.head())

# Look each prediction up by building id rather than relying on the row order
# of test_x happening to match the sample file.
predicted_by_building = (
    test_x.drop_duplicates('b_id').set_index('b_id')['target']
)
sample['damage_grade'] = sample['building_id'].map(predicted_by_building)

n_missing = int(sample['damage_grade'].isnull().sum())
if n_missing:
    raise ValueError(
        '%d building_id(s) in sample_submission.csv have no prediction' % n_missing
    )
print(sample.head())

# Decode the labels: class 0 was used for grade 5 during training.
sample['damage_grade'] = (
    sample['damage_grade'].astype(int).astype(str).str.replace('0', '5')
)
sample['damage_grade'] = 'Grade ' + sample['damage_grade']
sample.to_csv('submission.csv', index=False)
print(sample.head())

  building_id damage_grade
0  a3380c4f75      Grade 3
1  a338a4e653      Grade 1
2  a338a4e6b7      Grade 1
3  a33a6eaa3a      Grade 5
4  a33b073ff6      Grade 4
  building_id  damage_grade
0  a3380c4f75           3.0
1  a338a4e653           0.0
2  a338a4e6b7           0.0
3  a33a6eaa3a           3.0
4  a33b073ff6           0.0
  building_id damage_grade
0  a3380c4f75      Grade 3
1  a338a4e653      Grade 5
2  a338a4e6b7      Grade 5
3  a33a6eaa3a      Grade 3
4  a33b073ff6      Grade 5
