# PBL Project 3: Predicting Service Disruptions

## Business Understanding
Company is interested in developing an advance predictive model for service disruptions

## Data Understanding
The dataset is in a relational format, split among multiple files. The following provides a description of data in each file

* Event Type Data
* Log Feature Data
* Resource Type Data
* Severity Type Data
* Training Data
* Testing Data

In [1]:
from patsy import dmatrices
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
evetype = pd.read_csv('event_type.csv')
logfeat = pd.read_csv('log_feature.csv')
restype = pd.read_csv('resource_type.csv')
sevtype = pd.read_csv('severity_type.csv')
train = pd.read_csv('train.csv')

df1 = pd.merge(evetype, logfeat)
df2 = pd.merge(restype, sevtype)
df3 = pd.merge(df1, df2)

df = pd.merge(df3, train)

df.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,event_type 15,feature 68,7,resource_type 8,severity_type 2,location 1,0
1,2588,event_type 15,feature 82,9,resource_type 8,severity_type 1,location 1,0
2,2588,event_type 15,feature 201,5,resource_type 8,severity_type 1,location 1,0
3,2588,event_type 15,feature 80,15,resource_type 8,severity_type 1,location 1,0
4,2588,event_type 15,feature 203,5,resource_type 8,severity_type 1,location 1,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Data columns (total 8 columns):
id                61839 non-null int64
event_type        61839 non-null object
log_feature       61839 non-null object
volume            61839 non-null int64
resource_type     61839 non-null object
severity_type     61839 non-null object
location          61839 non-null object
fault_severity    61839 non-null int64
dtypes: int64(3), object(5)
memory usage: 4.2+ MB


In [4]:
print "unique event_type: ", df['event_type'].nunique(), "unique log_feature: ", df['log_feature'].nunique(), "unique resource_type: ", df['resource_type'].nunique(), "unique severity_type: ", df['severity_type'].nunique(), "unique location: ", df['location'].nunique(), "unique fault_severity: ", df['fault_severity'].nunique()


unique event_type:  49 unique log_feature:  331 unique resource_type:  10 unique severity_type:  5 unique location:  929 unique fault_severity:  3


In [5]:
# Clean the data by removing unnecessary text
df['event_type'] = df['event_type'].map(lambda x: x.lstrip('event_type '))
df['log_feature'] = df['log_feature'].map(lambda x: x.lstrip('feature '))
df['resource_type'] = df['resource_type'].map(lambda x: x.lstrip('resource_type '))
df['severity_type'] = df['severity_type'].map(lambda x: x.lstrip('severity_type '))
df['location'] = df['location'].map(lambda x: x.lstrip('location '))

df.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,15,68,7,8,2,1,0
1,2588,15,82,9,8,1,1,0
2,2588,15,201,5,8,1,1,0
3,2588,15,80,15,8,1,1,0
4,2588,15,203,5,8,1,1,0


In [6]:
# Create the dummy variables using dmatrices
y, X = dmatrices('fault_severity ~ C(event_type) + C(log_feature) + C(resource_type) + C(severity_type) + C(location) + volume', df, return_type = 'dataframe')
print X

       Intercept  C(event_type)[T.10]  C(event_type)[T.11]  \
0            1.0                  0.0                  0.0   
1            1.0                  0.0                  0.0   
2            1.0                  0.0                  0.0   
3            1.0                  0.0                  0.0   
4            1.0                  0.0                  0.0   
5            1.0                  0.0                  1.0   
6            1.0                  0.0                  1.0   
7            1.0                  0.0                  1.0   
8            1.0                  0.0                  1.0   
9            1.0                  0.0                  1.0   
10           1.0                  0.0                  1.0   
11           1.0                  0.0                  1.0   
12           1.0                  0.0                  1.0   
13           1.0                  0.0                  1.0   
14           1.0                  0.0                  0.0   
15      

## Modeling

In [7]:
# Split the data into 70% Training Data and 30% Testing Data with seed(0)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 0)

In [8]:
# Logistic Regression
lr = LogisticRegression(fit_intercept = False, C = 1e9)
logmodel = lr.fit(X_train, y_train)

lr.coef_

y_predlog = logmodel.predict(X_test)

acclog = accuracy_score(y_predlog, y_test)

print acclog

  y = column_or_1d(y, warn=True)


0.771021992238


In [9]:
# Gaussian Naive Bayes
nb = GaussianNB()
gnbmodel = nb.fit(X_train, y_train)

y_predgnb = gnbmodel.predict(X_test)

accgnb = accuracy_score(y_predgnb, y_test)

print accgnb

  y = column_or_1d(y, warn=True)


0.598318240621


In [10]:
# AdaBoosted Decision Trees
adt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=.1)
adtmodel = adt.fit(X_train, y_train)

y_predadt = adtmodel.predict(X_test)

accadt = accuracy_score(y_predadt, y_test)

print accadt

  y = column_or_1d(y, warn=True)


0.755713669685


In [11]:
# Random Forest
rf = RandomForestClassifier(max_depth=10)
rfmodel = rf.fit(X_train, y_train)

y_predrf = rfmodel.predict(X_test)

accrf = accuracy_score(y_predrf, y_test)

print accrf

  app.launch_new_instance()


0.625215610177


In [12]:
# Gradient Boosting
params = {'n_estimators': 600, 'max_depth': 1,
          'learning_rate': 0.1, 'min_samples_leaf': 1, 'random_state': 3}
gbc = GradientBoostingClassifier(**params)
gbcmodel = gbc.fit(X_train, y_train)

y_predgbc = gbcmodel.predict(X_test)

accgbc = accuracy_score(y_predgbc, y_test)

print accgbc

  y = column_or_1d(y, warn=True)


0.721108236309


In [13]:
print y_predlog

[ 0.  2.  2. ...,  1.  1.  0.]


In [14]:
print coef

[[  1.56825711e+00  -2.33336361e+00  -2.29406750e+00 ...,   3.78724888e-01
    1.85484889e+00   2.30763479e-03]
 [ -3.26152900e+00   2.75201908e+00   2.15788768e+00 ...,   7.31104614e-01
   -3.20104327e-01  -2.64016852e-02]
 [ -3.16436972e+00   1.83568572e+00   2.34105594e+00 ...,  -1.08129577e+00
   -6.39788699e+00   1.47470441e-02]]


In [15]:
RMSElog = mean_squared_error(y_test, y_predlog)**0.5
RMSEgnb = mean_squared_error(y_test, y_predgnb)**0.5
RMSEadt = mean_squared_error(y_test, y_predadt)**0.5
RMSErf = mean_squared_error(y_test, y_predrf)**0.5
RMSEgbc = mean_squared_error(y_test, y_predgbc)**0.5


print RMSElog, RMSEgnb, RMSEadt, RMSErf, RMSEgbc

0.606535130199 0.86919391364 0.603952412531 0.868418387589 0.677281810728
