In [1]:
################
################   Clean Data + join Data + Generate Model + Evaluate
################   Telstra Network Disruptions Kaggle Competition
################

import warnings; warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import seaborn as sns
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from scipy import stats

# import other methods
from clean import *


In [2]:
%matplotlib inline

In [3]:
# read the data
event = pd.read_csv("../../../github_data/telstradisr_data/event_type.csv")
log = pd.read_csv("../../../github_data/telstradisr_data/log_feature.csv")
sample = pd.read_csv("../../../github_data/telstradisr_data/sample_submission.csv")
severity = pd.read_csv("../../../github_data/telstradisr_data/severity_type.csv")
resource = pd.read_csv("../../../github_data/telstradisr_data/resource_type.csv")
train = pd.read_csv("../../../github_data/telstradisr_data/train.csv")
test = pd.read_csv("../../../github_data/telstradisr_data/test.csv")

In [4]:
# obtain integers out of strings with redundant data
event['event_type'] = event['event_type'].str.split(' ').str[1]
log['log_feature'] = log['log_feature'].str.split(' ').str[1]
severity['severity_type'] = severity['severity_type'].str.split(' ').str[1]
resource['resource_type'] = resource['resource_type'].str.split(' ').str[1]
test['location'] = test['location'].str.split(' ').str[1]
train['location'] = train['location'].str.split(' ').str[1]

In [16]:
# check in each log table if ids repeated
tables = [log,event,severity,resource]
names = ['log','event','severity','resource']
event.columns[-1]
for i in range(len(tables)):
    check = unique_column(tables[i],tables[i].columns[-1])
    
    # AGGREGATE CATEGORICAL VALUES INTO A DICTIO
    if check == False:
        if len(tables[i].columns) == 2:
            a = tables[i].groupby([tables[i].columns[-2],tables[i].columns[-1]]).agg({tables[i].columns[-1]:'count'})
            a.index.names = ['id','cat']
            a = a.reset_index()
            a = a.set_index('cat')
            a = a.groupby('id').apply(lambda x: x.to_dict()[tables[i].columns[-1]])
            tables[i] = pd.DataFrame(a,columns=[tables[i].columns[-1]])
            
        elif len(tables[i].columns) == 3:
            a = tables[i].groupby([tables[i].columns[-3],tables[i].columns[-2],tables[i].columns[-1]]).agg({tables[i].columns[-1]:'count',tables[i].columns[-2]:'count'})
            a.index.names = ['id','cat1','cat2']
            a = a.reset_index()
            a = a.set_index('cat1')
            b = a.set_index('cat2')
            a = a.groupby('id').apply(lambda x: x.to_dict()[tables[i].columns[-1]])
            b = b.groupby('id').apply(lambda x: x.to_dict()[tables[i].columns[-2]])
            log1 = pd.DataFrame(a,columns=[tables[i].columns[-1]])
            log2 = pd.DataFrame(b,columns=[tables[i].columns[-2]])
            log1['volume'].apply(lambda x: {int(k):int(v) for k,v in x.items()})
event = tables[1]
severity = tables[2]
resource = tables[3]

TypeError: unhashable type: 'dict'

In [6]:
# join tables with train data 
train = train.merge(event, left_on='id', right_index=True, how='left')
train = train.merge(severity, left_on='id', right_index=True, how='left')
train = train.merge(resource, left_on='id', right_index=True, how='left')
train = train.merge(log1, left_on='id', right_index=True, how='left')
train = train.merge(log2, left_on='id', right_index=True, how='left')
train.head()

Unnamed: 0,id,location,fault_severity,event_type,severity_type,resource_type,volume,log_feature
0,14121,118,1,"{'34': 1, '35': 1}",{'2': 1},{'2': 1},"{'232': 1, '312': 1}",{19: 1}
1,9320,91,0,"{'34': 1, '35': 1}",{'2': 1},{'2': 1},"{'235': 1, '315': 1}","{200: 1, 116: 1}"
2,14394,152,1,"{'34': 1, '35': 1}",{'2': 1},{'2': 1},"{'221': 1, '301': 1}",{1: 1}
3,8218,931,1,"{'11': 1, '15': 1}",{'1': 1},{'8': 1},"{'203': 1, '80': 1, '82': 1}","{1: 1, 12: 1, 9: 1}"
4,14804,120,0,"{'11': 1, '34': 1, '36': 1, '20': 1}",{'1': 1},"{'8': 1, '2': 1}","{'232': 1, '181': 1, '117': 1, '237': 1, '219'...","{1: 1, 2: 1}"


In [7]:
# join tables with test data
test = test.merge(event, left_on='id', right_index=True, how='left')
test = test.merge(severity, left_on='id', right_index=True, how='left')
test = test.merge(resource, left_on='id', right_index=True, how='left')
test = test.merge(log1, left_on='id', right_index=True, how='left')
test = test.merge(log2, left_on='id', right_index=True, how='left')
test.head()

Unnamed: 0,id,location,event_type,severity_type,resource_type,volume,log_feature
0,11066,481,"{'34': 1, '35': 1}",{'2': 1},{'2': 1},"{'310': 1, '230': 1, '308': 1, '228': 1}","{24: 1, 28: 1, 26: 1, 20: 1}"
1,18000,962,"{'11': 1, '15': 1}",{'1': 1},{'8': 1},"{'203': 1, '82': 1}","{9: 1, 20: 1}"
2,16964,491,"{'34': 1, '35': 1}",{'2': 1},{'2': 1},"{'235': 1, '315': 1}","{10: 1, 11: 1}"
3,4795,532,"{'10': 1, '27': 1}",{'5': 1},"{'9': 1, '3': 1}","{'38': 1, '240': 1, '37': 1}",{1: 1}
4,3392,600,{'15': 1},{'2': 1},{'8': 1},"{'203': 1, '82': 1}","{2: 1, 6: 1}"


In [8]:
##### split data  
tr_a, te_a = train_test_split(train, train_size = 0.8)

## define variables 
y_train = tr_a.id
y_test = te_a.id
columns = train.columns
x_train = tr_a[columns[3:]]
x_test = te_a[columns[2:]]

In [10]:
## change null to nan
nton = NullToNaNTrans()
nton = nton.fit(x_train)
x_tr_ntont = nton.transform(x_train)
x_te_ntont = nton.transform(x_test)

NullToNaNTrans fit done.
NullToNaNTrans transform done.
NullToNaNTrans transform done.


In [13]:
## check if nan values
nantr = ifNaN(x_tr_ntont)
nante = ifNaN(x_te_ntont)
print(nantr,nante)

False False


In [14]:
## transform each category in dicts to binary
enc = DictVectorizer()
enc = enc.fit(x_tr_ntont)
x_tr_catobit = enc.transform(x_tr_ntont)
x_te_catobit = enc.transform(x_te_ntont)

AttributeError: 'str' object has no attribute 'items'

In [9]:
##### generate pipeline 
call = PipelineTelstra(RandomForestClassifier)
call.set_params()
call = call.fit(x_train,y_train)

pipeline done.
NullToNaNTrans fit done.
NullToNaNTrans transform done.


TypeError: unhashable type: 'dict'

In [None]:
##### generate y_predict
y_predict = call.predict(x_test)

In [None]:
##### first evaluation confusion matrix
cm1 = confusion_matrix(y_test,y_predict)
cm1

In [None]:
ax = sns.heatmap(cm1)
ax.set_title('confusion matrix 1')

In [None]:
## save confusion matrix 1
fig = ax.get_figure()
fig.savefig("cm1_preCV.png")

In [None]:
##### cross validation 3 folds
kf = KFold(91456, n_folds=3)
itr = defaultdict(list)
ite = defaultdict(list)
c = 0
for trai, tes in kf:
    print("%s %s" % (trai, tes))
    itr[c] = trai
    ite[c] = tes
    c += 1

In [None]:
##### redefine my samples

#### 2.1 samples
x_train1 = x_train.iloc[itr[0],:]
x_test1 = x_train.iloc[ite[0],:]
y_train1 = tr_a.target.iloc[itr[0]]
y_test1 = tr_a.target.iloc[ite[0]]

In [None]:
#### 2.1 make prediction
call = PipelineTelstra(RandomForestClassifier)
call = call.fit(x_train1,y_train1)

In [None]:
y_predict1 = call.predict(x_test1)

In [None]:
#### 2.1 confusion matrix
cm21 = confusion_matrix(y_test1,y_predict1)
cm21

In [None]:
ax = sns.heatmap(cm21)
ax.set_title('confusion matrix 2.1')

In [None]:
fig = ax.get_figure()
fig.savefig("cm21_posCV.png")

In [None]:
#### 2.2 samples
x_train2 = x_train.iloc[itr[1],:]
x_test2 = x_train.iloc[ite[1],:]
y_train2 = tr_a.target.iloc[itr[1]]
y_test2 = tr_a.target.iloc[ite[1]]

In [None]:
#### 2.2 make prediction
call = PipelineTelstra(RandomForestClassifier)
call = call.fit(x_train2,y_train2)

In [None]:
y_predict2 = call.predict(x_test2)

In [None]:
#### 2.2 confusion matrix
cm22 = confusion_matrix(y_test2,y_predict2)
cm22

In [None]:
ax = sns.heatmap(cm22)
ax.set_title('confusion matrix 2.2')

In [None]:
fig = ax.get_figure()
fig.savefig("cm22_posCV.png")

In [None]:
#### 2.3 samples
x_train3 = x_train.iloc[itr[2],:]
x_test3 = x_train.iloc[ite[2],:]
y_train3 = tr_a.target.iloc[itr[2]]
y_test3 = tr_a.target.iloc[ite[2]]

In [None]:
#### 2.3 make prediction
call = PipelineTelstra(RandomForestClassifier)
call = call.fit(x_train3,y_train3)

In [None]:
y_predict3 = call.predict(x_test3)

In [None]:
#### 2.3 confusion matrix
cm23 = confusion_matrix(y_test3,y_predict3)
cm23

In [None]:
ax = sns.heatmap(cm23)
ax.set_title('confusion matrix 2.3')

In [None]:
fig = ax.get_figure()
fig.savefig("cm23_posCV.png")