In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to figures drawn after this point.
plt.style.use('ggplot')

In [2]:
def _read_table(label, filename, data_dir='dataset/telstra'):
    """Read one CSV table and report its shape.

    Parameters
    ----------
    label : str
        Name printed in front of the shape (the variable the frame is bound to).
    filename : str
        File name of the CSV inside ``data_dir``.
    data_dir : str
        Directory holding the CSV files.

    Returns
    -------
    pandas.DataFrame
        The parsed table, exactly as returned by ``pd.read_csv``.
    """
    frame = pd.read_csv('%s/%s' % (data_dir, filename))
    # %-formatting into a single string prints "label (rows, cols)" on both
    # Python 2 and Python 3; the bare `print a, b` statement is Python 2 only.
    print('%s %s' % (label, frame.shape))
    return frame


# Load every raw table; each call prints the table's shape as it is read.
train_df = _read_table('train_df', 'train.csv')
test_df = _read_table('test_df', 'test.csv')
event_df = _read_table('event_df', 'event_type.csv')
log_df = _read_table('log_df', 'log_feature.csv')
resource_df = _read_table('resource_df', 'resource_type.csv')
severity_df = _read_table('severity_df', 'severity_type.csv')

train_df (7381, 3)
test_df (11171, 2)
event_df (31170, 2)
log_df (58671, 3)
resource_df (21076, 2)
severity_df (18552, 2)


In [3]:
# Re-index every table by its 'id' column; the column is dropped from the
# data and each frame is modified in place.
raw_tables = (train_df, test_df, event_df, log_df, resource_df, severity_df)
for table in raw_tables:
    table.set_index('id', drop=True, inplace=True)

In [4]:
# Pull the label column out of the training frame: pop() returns the
# 'fault_severity' series and removes that column from train_df in place.
y = train_df.pop('fault_severity')

# Stack training rows on top of test rows so both get identical encoding.
frames_to_stack = [train_df, test_df]
all_data = pd.concat(frames_to_stack)

In [5]:
def _one_hot(frame, column):
    """Cast ``column`` of ``frame`` to category dtype (in place) and return
    the frame expanded by ``pd.get_dummies``."""
    frame[column] = frame[column].astype('category')
    return pd.get_dummies(frame)


# Turn each categorical feature into dummy (indicator) columns.
# The raw tables carry very few features, so this expansion also widens
# the feature set considerably.
all_data = _one_hot(all_data, 'location')
event_df = _one_hot(event_df, 'event_type')
resource_df = _one_hot(resource_df, 'resource_type')
severity_df = _one_hot(severity_df, 'severity_type')

In [6]:
def _sum_per_id(frame):
    """Collapse ``frame`` to one row per 'id' index value by summing the
    columns of all rows that share that id."""
    return frame.groupby(level='id').sum()


# An id can appear on several rows of these tables; summing the indicator
# columns per id leaves exactly one row for each id.
events = _sum_per_id(event_df)
resources = _sum_per_id(resource_df)
severities = _sum_per_id(severity_df)

In [7]:
# Attach each per-id feature table to the combined frame, matching rows on
# the index. Inner joins keep only the ids present on both sides.
for feature_table in (events, resources, severities):
    all_data = all_data.merge(
        feature_table,
        left_index=True,
        right_index=True,
        how='inner',
    )

In [8]:
# Recover the two partitions from the combined feature frame by looking up
# the ids that belonged to each original table.
train_ids, test_ids = train_df.index, test_df.index
train = all_data.loc[train_ids]
test = all_data.loc[test_ids]

In [9]:
# Hold out 30% of the labelled rows as a validation ("cv") set; the fixed
# random_state keeps the partition the same on every run.
holdout_split = train_test_split(
    train,
    y,
    test_size=0.3,
    random_state=7,
)
X_train, X_cv, y_train, y_cv = holdout_split

In [10]:
# Sweep min_samples_split over the even values 2, 4, ..., 48 and record the
# score of a decision tree on the hold-out set for each setting.
n_splits = np.arange(2, 50, 2)
scores = []
for n in n_splits:
    # Without a fixed random_state the tree breaks ties between equally good
    # splits at random, so the scores (and hence the chosen setting) could
    # change from run to run. 7 is the same seed used for the train/cv split.
    dtc = DecisionTreeClassifier(min_samples_split=n, random_state=7)
    dtc.fit(X_train, y_train)
    score = dtc.score(X_cv, y_cv)
    scores.append(score)

In [11]:
# Choose the min_samples_split value with the highest hold-out score
# (list.index returns the first position, so ties go to the smallest value).
best_split = n_splits[scores.index(max(scores))]

# Refit with the chosen setting. The fixed random_state makes this fit, the
# report derived from it and the final predictions repeatable for a given
# best_split instead of varying with the tree's random tie-breaking.
dtc = DecisionTreeClassifier(min_samples_split=best_split, random_state=7)
dtc.fit(X_train, y_train)
y_cv_pred = dtc.predict(X_cv)

In [12]:
# Summarise hold-out performance: a per-class metrics table followed by the
# raw confusion matrix. Each print takes one parenthesised argument, which
# produces the same output on Python 2 and Python 3 (the bare print
# statement is a SyntaxError on Python 3).
print("classification report")
print(classification_report(y_cv, y_cv_pred))
print("confusion matrix")
print(confusion_matrix(y_cv, y_cv_pred))

classification report
             precision    recall  f1-score   support

          0       0.73      0.85      0.79      1401
          1       0.46      0.30      0.36       585
          2       0.47      0.39      0.43       229

avg / total       0.63      0.66      0.64      2215

confusion matrix
[[1196  157   48]
 [ 354  176   55]
 [  88   51   90]]


In [13]:
# How to read the confusion matrix: rows are the true classes, columns are
# the predicted classes, and margins=True appends the 'All' totals.
cv_crosstab = pd.crosstab(
    y_cv,
    y_cv_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)
cv_crosstab

Predicted,0,1,2,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1196,157,48,1401
1,354,176,55,585
2,88,51,90,229
All,1638,384,193,2215


In [14]:
# Solution
# Predict a class for every test id.
y_test = dtc.predict(test)

# One row per test id, indexed by 'id', holding the predicted class.
sol = pd.DataFrame({'id': test.index, 'predict': y_test}).set_index('id')

# Pin the categories to every class the model was fitted on. Inferring them
# from the predictions alone would silently drop the 'predict_<k>' column of
# any class that never happens to be predicted on the test set.
sol['predict'] = pd.Categorical(sol['predict'], categories=dtc.classes_)

# One indicator column per class, cast to int so the CSV always holds 0/1
# regardless of the dtype get_dummies returns in the installed pandas version.
sol = pd.get_dummies(sol).astype(int)
sol.to_csv('outputs/telstra.csv')