In [7]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV

In [2]:
# Read the training and test feature tables in one pass.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_dataset.csv', 'test_dataset.csv')
)

In [3]:
# Read the sample-submission file into a DataFrame.
sample = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [5]:
# Bernoulli Naive Bayes base model; the arguments below are the library
# defaults, written out so the configuration is visible in the notebook.
nb = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)

In [8]:
# Wrap the base model in an isotonic probability calibrator using 2-fold CV.
nb_isotonic = CalibratedClassifierCV(nb, method='isotonic', cv=2)

In [9]:
# Features are the label-sliced column range; the target is `fault_severity`.
train_features = train.loc[:, 'severity_type 1':'event_type 9']
train_target = train['fault_severity']
nb_isotonic.fit(train_features, train_target)

CalibratedClassifierCV(base_estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
            cv=2, method='isotonic')

In [10]:
# Hard class predictions for the test rows, using the same column range as training.
test_features = test.loc[:, 'severity_type 1':'event_type 9']
result = nb_isotonic.predict(test_features)

In [14]:
# Sanity check: show the distinct class labels present in the test predictions.
np.unique(result)

array([0, 1, 2])

In [12]:
# Calibrated class probabilities for the test rows (one column per class).
test_features = test.loc[:, 'severity_type 1':'event_type 9']
prob = nb_isotonic.predict_proba(test_features)

In [13]:
# Display the predicted probability matrix returned by predict_proba.
prob

array([[ 0.8500883 ,  0.1499117 ,  0.        ],
       [ 0.37775789,  0.2995236 ,  0.32271851],
       [ 0.82768393,  0.17231607,  0.        ],
       ..., 
       [ 0.46716002,  0.44933374,  0.08350624],
       [ 0.75605867,  0.24394133,  0.        ],
       [ 0.752324  ,  0.247676  ,  0.        ]])

In [15]:
# Build the submission frame: one row per test observation holding the
# calibrated probability of each of the three classes.
#
# The ids are taken from `test` itself, because `prob` was computed from
# `test` and therefore follows its row order.  Taking the ids from a
# separately loaded file would silently mislabel the predictions whenever
# that file is ordered differently from `test`.
submit = pd.DataFrame({
    'id': test.loc[:, 'id'].values,
    'predict_0': prob[:, 0],
    'predict_1': prob[:, 1],
    'predict_2': prob[:, 2]})

In [18]:
# Persist the Naive Bayes submission without the positional index.
submit.to_csv(path_or_buf='bernoulli_nb.csv', index=False)

In [16]:
# Load the fault-severity fraction table, using its first column as the index.
fault_location = pd.read_csv(
    filepath_or_buffer='fault_severity_frac.csv',
    index_col=[0],
)

In [19]:
# Move the index back into an ordinary column so it can serve as a merge key.
fault_location = fault_location.reset_index(drop=False)

In [21]:
# Attach each test row's `location` to the submission via a left join on `id`.
id_to_location = test[['id', 'location']]
submit = submit.merge(id_to_location, how='left', on='id')

In [29]:
# Left-join the per-location fractions onto the submission; any value left
# missing by the join is replaced with 0.
submit2 = (
    submit
    .merge(fault_location, how='left', on='location')
    .fillna(0)
)

In [54]:
# Preview the first five rows of the merged frame.
submit2.head(n=5)

Unnamed: 0_level_0,predict_0,predict_1,predict_2,location,0,1,2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11066,0.850088,0.149912,0.0,location 481,0.8,0.15,0.05
18000,0.377758,0.299524,0.322719,location 962,0.288889,0.266667,0.444444
16964,0.827684,0.172316,0.0,location 491,1.0,0.0,0.0
4795,0.453446,0.469111,0.077442,location 532,0.666667,0.333333,0.0
3392,0.467105,0.265996,0.266899,location 600,0.421875,0.125,0.453125


In [31]:
# Index the merged frame by `id` so later arithmetic keeps ids attached to rows.
submit2 = submit2.set_index(keys='id')

In [32]:
# Blend the calibrated Naive Bayes probabilities with the per-location class
# fractions by simple averaging (the fractions are added positionally via
# `.values`, so the result keeps the `predict_*` column names).
nb_probs = submit2.loc[:, ['predict_0', 'predict_1', 'predict_2']]
location_fracs = submit2.loc[:, ['0', '1', '2']].values
submit3 = nb_probs.add(location_fracs) / 2

# Rows whose location found no match in the fraction table carry all-zero
# fractions (they were zero-filled after the merge), so the plain average sums
# to roughly 0.5 there instead of 1.  Rescale each row to sum to 1 so that every
# row is a proper distribution and is weighted equally in later blends.
# A zero row total is mapped to 1 to avoid dividing by zero.
row_total = submit3.sum(axis=1).replace(0, 1)
submit3 = submit3.div(row_total, axis=0)

In [33]:
# Restore `id` as a column and write the Naive Bayes + location blend to disk.
submit3.reset_index(drop=False).to_csv(
    path_or_buf='bernoulli_location.csv',
    index=False,
)

In [34]:
# Preview the first five blended rows.
submit3.head(n=5)

Unnamed: 0_level_0,predict_0,predict_1,predict_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11066,0.825044,0.149956,0.025
18000,0.333323,0.283095,0.383581
16964,0.913842,0.086158,0.0
4795,0.560057,0.401222,0.038721
3392,0.44449,0.195498,0.360012


In [35]:
# Load the previously saved random-forest + location predictions.
rf = pd.read_csv(filepath_or_buffer='randomforest_location.csv')

In [37]:
# Index by `id` so the frame aligns row-wise with the other id-indexed frames.
rf = rf.set_index(keys='id')

In [39]:
# Element-wise sum of the two prediction frames, aligned on index and column
# labels; the average is formed in a later cell.
blr = submit3.add(rf)

In [40]:
# Preview the first five rows of the summed predictions.
blr.head(n=5)

Unnamed: 0_level_0,predict_0,predict_1,predict_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11066,1.725044,0.224956,0.05
18000,0.509514,0.416428,1.074058
16964,1.913842,0.086158,0.0
4795,1.298152,0.663127,0.038721
3392,0.728488,0.684937,0.586575


In [43]:
# Average of the two prediction frames.  The value is recomputed from its
# inputs rather than halving `blr` in place, so re-running this cell yields the
# same result instead of halving the probabilities a second time.
blr = (submit3 + rf) / 2

In [44]:
# Restore `id` as a column and write the blended predictions to disk.
blr.reset_index(drop=False).to_csv(
    path_or_buf='bernoulli_location_random_forest.csv',
    index=False,
)

In [51]:
# Load the grid-searched random-forest predictions.
# NOTE: this rebinds `rf`, replacing the id-indexed frame used in earlier cells.
rf = pd.read_csv(filepath_or_buffer='randomforest_after_grid_search.csv')

In [52]:
# Preview the first five random-forest prediction rows.
rf.head(n=5)

Unnamed: 0,id,predict_0,predict_1,predict_2
0,11066,1.0,0.0,0.0
1,18000,0.095238,0.0,0.904762
2,16964,1.0,0.0,0.0
3,4795,0.746032,0.253968,0.0
4,3392,0.191628,0.808372,0.0


Unnamed: 0,id,predict_0,predict_1,predict_2,location
0,11066,0.850088,0.149912,0.0,location 481
1,18000,0.377758,0.299524,0.322719,location 962
2,16964,0.827684,0.172316,0.0,location 491
3,4795,0.453446,0.469111,0.077442,location 532
4,3392,0.467105,0.265996,0.266899,location 600


In [55]:
# Put `id` back as a column, left-join the random-forest predictions on it,
# and replace any value left missing by the join with 0.
submit4 = (
    submit2
    .reset_index()
    .merge(rf, how='left', on='id')
    .fillna(0)
)

In [56]:
# Preview the first five rows of the three-source frame.
submit4.head(n=5)

Unnamed: 0,id,predict_0_x,predict_1_x,predict_2_x,location,0,1,2,predict_0_y,predict_1_y,predict_2_y
0,11066,0.850088,0.149912,0.0,location 481,0.8,0.15,0.05,1.0,0.0,0.0
1,18000,0.377758,0.299524,0.322719,location 962,0.288889,0.266667,0.444444,0.095238,0.0,0.904762
2,16964,0.827684,0.172316,0.0,location 491,1.0,0.0,0.0,1.0,0.0,0.0
3,4795,0.453446,0.469111,0.077442,location 532,0.666667,0.333333,0.0,0.746032,0.253968,0.0
4,3392,0.467105,0.265996,0.266899,location 600,0.421875,0.125,0.453125,0.191628,0.808372,0.0


In [57]:
# Keep only the numeric prediction columns, indexed by `id`.
submit4 = submit4.drop('location', axis=1).set_index('id')

In [58]:
# Preview the first five rows after indexing by `id` and dropping `location`.
submit4.head(n=5)

Unnamed: 0_level_0,predict_0_x,predict_1_x,predict_2_x,0,1,2,predict_0_y,predict_1_y,predict_2_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11066,0.850088,0.149912,0.0,0.8,0.15,0.05,1.0,0.0,0.0
18000,0.377758,0.299524,0.322719,0.288889,0.266667,0.444444,0.095238,0.0,0.904762
16964,0.827684,0.172316,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4795,0.453446,0.469111,0.077442,0.666667,0.333333,0.0,0.746032,0.253968,0.0
3392,0.467105,0.265996,0.266899,0.421875,0.125,0.453125,0.191628,0.808372,0.0


In [59]:
# Equal-weight blend of the three sources held in `submit4`:
#   predict_*_x : left-hand prediction columns from the merge
#   '0','1','2' : per-location class fractions
#   predict_*_y : random-forest prediction columns from the merge
# Columns are selected by name rather than by position so the blend does not
# silently change if the column layout of `submit4` ever shifts.
nb_cols = ['predict_0_x', 'predict_1_x', 'predict_2_x']
location_cols = ['0', '1', '2']
rf_cols = ['predict_0_y', 'predict_1_y', 'predict_2_y']

blr = (
    submit4.loc[:, nb_cols]
    .add(submit4.loc[:, location_cols].values)
    .add(submit4.loc[:, rf_cols].values)
) / 3

# A source that was zero-filled for a row (no match in an earlier left join)
# contributes nothing, leaving that row's total below 1.  Rescale each row to
# sum to 1; a zero row total is mapped to 1 to avoid dividing by zero.
row_total = blr.sum(axis=1).replace(0, 1)
blr = blr.div(row_total, axis=0)

In [61]:
# Give the blended columns the plain submission names.
blr.columns = pd.Index(['predict_0', 'predict_1', 'predict_2'])

In [64]:
# Restore `id` as a column and write the equal-weight blend to disk.
blr.reset_index(drop=False).to_csv(
    path_or_buf='bernoulli_location_random_forest_equal.csv',
    index=False,
)

In [65]:
# Preview the first five rows of the final blend.
blr.head(n=5)

Unnamed: 0_level_0,predict_0,predict_1,predict_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11066,0.883363,0.099971,0.016667
18000,0.253962,0.18873,0.557308
16964,0.942561,0.057439,0.0
4795,0.622048,0.352138,0.025814
3392,0.360203,0.399789,0.240008
