## Data Overview
* train: 7,381 X 3, 7,381 unique id's same as number of rows, 929 unique locations
  - fault_severity
    * 0: 4,784
    * 1: 1,871
    * 2:   726
* test: 11,171 X 2, 11,171 unique id's same as number of rows, 1,039 unique locations, 842 of them are in the train

Each of the following tables has 18,552 unique id's: 7,381 shared with train and 11,171 shared with test:

* event: 31,170 X 2, 18,552 unique id's, 53 unique event_type.
* logf: 58,671 X 3, 18,552 unique id's, 386 unique features, 341 unique volume values.
* sev: 18,552 X 2, id's all unique, 5 unique severity_type.
* resource: 21,076 X 2, 18,552 unique id's, 10 unique resource_type.

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# One binarizer instance shared by every multi-hot encoding cell below; each of
# those cells re-fits it with fit_transform, so mlb.classes_ always reflects the
# most recent fit only.
mlb = MultiLabelBinarizer()

In [4]:
# Load the training table (path is relative to the working directory).
train = pd.read_csv('train.csv')

In [30]:
# Row count per fault_severity value in train.
train['fault_severity'].value_counts()

0    4784
1    1871
2     726
dtype: int64

In [44]:
# Load the test table (path is relative to the working directory).
test = pd.read_csv('test.csv')

In [31]:
# (rows, columns) of the test table.
test.shape

(11171, 2)

In [32]:
# Number of distinct ids in test.
test['id'].nunique()

11171

In [33]:
# Number of distinct locations in test.
test['location'].nunique()

1039

In [28]:
# Number of distinct test locations that also occur in train.
test.loc[test['location'].isin(train['location']), 'location'].nunique()

842

In [23]:
# Number of distinct ids in test.
test['id'].nunique()

11171

In [26]:
# Number of distinct locations in test.
test['location'].nunique()

1039

In [5]:
# Load the event_type table (path is relative to the working directory).
event = pd.read_csv('event_type.csv')

In [6]:
# Preview the first rows of the event table.
event.head()

Unnamed: 0,id,event_type
0,6597,event_type 11
1,8011,event_type 15
2,2597,event_type 15
3,5022,event_type 15
4,5022,event_type 11


In [7]:
# (rows, columns) of the event table.
event.shape

(31170, 2)

In [36]:
# Number of distinct event ids that also occur in train.
event.loc[event['id'].isin(train['id']), 'id'].nunique()

7381

In [37]:
# Number of distinct event ids that also occur in test.
event.loc[event['id'].isin(test['id']), 'id'].nunique()

11171

In [22]:
# Number of distinct ids in the event table.
event['id'].nunique()

18552

In [8]:
# Number of distinct event_type values.
event['event_type'].nunique()

53

In [11]:
# Group event rows by id; sort=False keeps groups in order of first appearance
# instead of sorting by id.
event_by_id = event.groupby('id', sort=False)

In [12]:
# Gather each id's event_type values into one list, then multi-hot encode the
# lists: one row per id, one 0/1 column per distinct event_type.
event_agg = event_by_id['event_type'].apply(list)
event_bin = pd.DataFrame(
    mlb.fit_transform(event_agg.values),  # fit first so classes_ below is current
    columns=mlb.classes_,
    index=event_agg.index,
)

In [13]:
# Preview the multi-hot encoded event types.
event_bin.head()

Unnamed: 0_level_0,event_type 1,event_type 10,event_type 11,event_type 12,event_type 13,event_type 14,event_type 15,event_type 17,event_type 18,event_type 19,...,event_type 5,event_type 50,event_type 51,event_type 52,event_type 53,event_type 54,event_type 6,event_type 7,event_type 8,event_type 9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6597,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8011,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2597,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5022,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6852,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Load the log_feature table (path is relative to the working directory).
logf = pd.read_csv('log_feature.csv')

In [15]:
# Preview the first rows of the log feature table.
logf.head()

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [40]:
# Number of distinct log-feature ids that also occur in train.
logf.loc[logf['id'].isin(train['id']), 'id'].nunique()

7381

In [41]:
# Number of distinct log-feature ids that also occur in test.
logf.loc[logf['id'].isin(test['id']), 'id'].nunique()

11171

In [39]:
# Number of distinct ids in the log feature table.
logf['id'].nunique()

18552

In [38]:
# (rows, columns) of the log feature table.
logf.shape

(58671, 3)

In [11]:
# Number of distinct log_feature values.
logf['log_feature'].nunique()

386

In [12]:
# Number of distinct volume values.
logf['volume'].nunique()

341

In [13]:
# Summary statistics of the volume column.
logf['volume'].describe()

count    58671.000000
mean         9.685296
std         27.314433
min          1.000000
25%          1.000000
50%          2.000000
75%          7.000000
max       1310.000000
Name: volume, dtype: float64

In [16]:
# Group log feature rows by id; sort=False keeps groups in order of first
# appearance instead of sorting by id.
logf_by_id = logf.groupby('id', sort=False)

In [19]:
# Gather each id's log_feature values into one list, then multi-hot encode the
# lists: one row per id, one 0/1 column per distinct log_feature (volume is
# not used here).
logf_agg = logf_by_id['log_feature'].apply(list)
logf_bin = pd.DataFrame(
    mlb.fit_transform(logf_agg.values),  # fit first so classes_ below is current
    columns=mlb.classes_,
    index=logf_agg.index,
)

In [20]:
# Preview the multi-hot encoded log features.
logf_bin.head()

Unnamed: 0_level_0,feature 1,feature 10,feature 100,feature 101,feature 102,feature 103,feature 104,feature 105,feature 106,feature 107,...,feature 90,feature 91,feature 92,feature 93,feature 94,feature 95,feature 96,feature 97,feature 98,feature 99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8011,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Wide table of volumes: one row per id, one column per log_feature, with 0
# where an id has no entry for that feature. DataFrame.pivot raises ValueError
# if any (id, log_feature) pair occurs more than once.
logf_pivot = (
    logf
    .pivot(index='id', columns='log_feature', values='volume')
    .fillna(0)
)

In [36]:
# Tag the volume columns so they cannot be confused with the 0/1 indicator
# columns that share the same feature names. The suffix is added only where it
# is missing, so re-running this cell does not produce "... volume volume".
logf_pivot = logf_pivot.rename(
    columns=lambda name: name if name.endswith(' volume') else name + ' volume'
)

In [37]:
# Preview the per-feature volume table.
logf_pivot.head()

Unnamed: 0_level_0,feature 1 volume,feature 10 volume,feature 100 volume,feature 101 volume,feature 102 volume,feature 103 volume,feature 104 volume,feature 105 volume,feature 106 volume,feature 107 volume,...,feature 90 volume,feature 91 volume,feature 92 volume,feature 93 volume,feature 94 volume,feature 95 volume,feature 96 volume,feature 97 volume,feature 98 volume,feature 99 volume
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Load the resource_type table (path is relative to the working directory).
resource = pd.read_csv('resource_type.csv')

In [26]:
# Group resource rows by id; sort=False keeps groups in order of first
# appearance instead of sorting by id.
resource_by_id = resource.groupby('id', sort=False)

In [27]:
# Gather each id's resource_type values into one list, then multi-hot encode
# the lists: one row per id, one 0/1 column per distinct resource_type.
resource_agg = resource_by_id['resource_type'].apply(list)
resource_bin = pd.DataFrame(
    mlb.fit_transform(resource_agg.values),  # fit first so classes_ below is current
    columns=mlb.classes_,
    index=resource_agg.index,
)

In [28]:
# Preview the multi-hot encoded resource types.
resource_bin.head()

Unnamed: 0_level_0,resource_type 1,resource_type 10,resource_type 2,resource_type 3,resource_type 4,resource_type 5,resource_type 6,resource_type 7,resource_type 8,resource_type 9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6597,0,0,0,0,0,0,0,0,1,0
8011,0,0,0,0,0,0,0,0,1,0
2597,0,0,0,0,0,0,0,0,1,0
5022,0,0,0,0,0,0,0,0,1,0
6852,0,0,0,0,0,0,0,0,1,0


In [42]:
# Number of distinct resource ids that also occur in train.
resource.loc[resource['id'].isin(train['id']), 'id'].nunique()

7381

In [43]:
# Number of distinct resource ids that also occur in test.
resource.loc[resource['id'].isin(test['id']), 'id'].nunique()

11171

In [44]:
# (rows, columns) of the resource table.
resource.shape

(21076, 2)

In [45]:
# Number of distinct ids in the resource table.
resource['id'].nunique()

18552

In [15]:
# Number of distinct resource_type values.
resource['resource_type'].nunique()

10

In [35]:
# Preview the first rows of the resource table.
resource.head()

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [29]:
# Load the severity_type table (path is relative to the working directory).
sev = pd.read_csv('severity_type.csv')

In [30]:
# Group severity rows by id; sort=False keeps groups in order of first
# appearance instead of sorting by id.
sev_by_id = sev.groupby('id', sort=False)

In [31]:
# Gather each id's severity_type values into one list, then multi-hot encode
# the lists: one row per id, one 0/1 column per distinct severity_type.
sev_agg = sev_by_id['severity_type'].apply(list)
sev_bin = pd.DataFrame(
    mlb.fit_transform(sev_agg.values),  # fit first so classes_ below is current
    columns=mlb.classes_,
    index=sev_agg.index,
)

In [32]:
# Preview the encoded severity types.
sev_bin.head()

Unnamed: 0_level_0,severity_type 1,severity_type 2,severity_type 3,severity_type 4,severity_type 5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6597,0,1,0,0,0
8011,0,1,0,0,0
2597,0,1,0,0,0
5022,1,0,0,0,0
6852,1,0,0,0,0


In [48]:
# Number of distinct severity ids that also occur in train.
sev.loc[sev['id'].isin(train['id']), 'id'].nunique()

7381

In [49]:
# Number of distinct severity ids that also occur in test.
sev.loc[sev['id'].isin(test['id']), 'id'].nunique()

11171

In [46]:
# (rows, columns) of the severity table.
sev.shape

(18552, 2)

In [47]:
# Number of distinct ids in the severity table.
sev['id'].nunique()

18552

In [17]:
# Number of distinct severity_type values.
sev['severity_type'].nunique()

5

In [23]:
# Preview the first rows of the severity table.
sev.head()

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


In [25]:
# (rows, columns) of the severity table.
sev.shape

(18552, 2)

In [38]:
# Place the per-id feature blocks side by side. pandas aligns rows on the id
# index (outer join by default), so the differing row orders of the inputs do
# not matter.
features_agg = pd.concat(
    [event_bin, logf_bin, resource_bin, sev_bin, logf_pivot],
    axis='columns',
)

In [39]:
# Preview the combined per-id feature table.
features_agg.head()

Unnamed: 0_level_0,event_type 1,event_type 10,event_type 11,event_type 12,event_type 13,event_type 14,event_type 15,event_type 17,event_type 18,event_type 19,...,feature 90 volume,feature 91 volume,feature 92 volume,feature 93 volume,feature 94 volume,feature 95 volume,feature 96 volume,feature 97 volume,feature 98 volume,feature 99 volume
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Attach the engineered features to train. reset_index turns the id index back
# into a column to join on; the left join keeps every train row.
train_merge = train.merge(features_agg.reset_index(), how='left', on='id')

In [41]:
# (rows, columns) of the merged training set; the row count should equal train's.
train_merge.shape

(7381, 843)

In [42]:
# Write the merged training set to disk; index=False leaves the row index out of the file.
train_merge.to_csv('train_dataset.csv', index=False)

In [45]:
# Attach the engineered features to test. reset_index turns the id index back
# into a column to join on; the left join keeps every test row.
test_merge = test.merge(features_agg.reset_index(), how='left', on='id')

In [47]:
# (rows, columns) of the merged test set; the row count should equal test's.
test_merge.shape

(11171, 842)

In [48]:
# Write the merged test set to disk; index=False leaves the row index out of the file.
test_merge.to_csv('test_dataset.csv', index=False)