# PBL Project 3: Predicting Service Disruptions

## Business Understanding
The company is interested in developing an advanced predictive model for service disruptions.

## Data Understanding
The dataset is in a relational format, split across multiple files. The following files are provided:

* Event Type Data
* Log Feature Data
* Resource Type Data
* Severity Type Data
* Training Data
* Testing Data

In [1]:
from patsy import dmatrices
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

First, we need to read the CSV files and merge them into one data frame.

In [2]:
# Load each relational table from its CSV file.
evetype = pd.read_csv('event_type.csv')
logfeat = pd.read_csv('log_feature.csv')
restype = pd.read_csv('resource_type.csv')
sevtype = pd.read_csv('severity_type.csv')
train = pd.read_csv('train.csv')

# Join on `id` explicitly rather than relying on pd.merge's default of
# joining on whatever columns the two frames happen to share.
# NOTE(review): assumes `id` is the only column the tables have in common
# (the merged frame shows no duplicated/suffixed columns) -- confirm
# against the CSV headers.
# NOTE(review): `id` is not unique within the feature tables, so each id
# produces one row per combination of its event/feature/resource values;
# the preview below shows id 2588 repeated across several rows.
df1 = pd.merge(evetype, logfeat, on='id')
df2 = pd.merge(restype, sevtype, on='id')
df3 = pd.merge(df1, df2, on='id')

# Ids that are absent from train.csv are dropped by this (inner) join.
df = pd.merge(df3, train, on='id')

df.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,event_type 15,feature 68,7,resource_type 8,severity_type 2,location 1,0
1,2588,event_type 15,feature 82,9,resource_type 8,severity_type 1,location 1,0
2,2588,event_type 15,feature 201,5,resource_type 8,severity_type 1,location 1,0
3,2588,event_type 15,feature 80,15,resource_type 8,severity_type 1,location 1,0
4,2588,event_type 15,feature 203,5,resource_type 8,severity_type 1,location 1,0


Let's explore the data set to understand the data types and the number of unique values in each column.

In [3]:
# Summarise the merged frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Data columns (total 8 columns):
id                61839 non-null int64
event_type        61839 non-null object
log_feature       61839 non-null object
volume            61839 non-null int64
resource_type     61839 non-null object
severity_type     61839 non-null object
location          61839 non-null object
fault_severity    61839 non-null int64
dtypes: int64(3), object(5)
memory usage: 4.2+ MB


In [4]:
# Report the number of distinct values in each categorical column and in the
# target. The text is assembled into one string and printed with a single
# parenthesised argument so the statement is valid on Python 2 and Python 3
# and emits the same line as before.
_count_columns = ['event_type', 'log_feature', 'resource_type',
                  'severity_type', 'location', 'fault_severity']
print(" ".join("unique {0}:  {1}".format(_col, df[_col].nunique())
               for _col in _count_columns))


unique event_type:  49 unique log_feature:  331 unique resource_type:  10 unique severity_type:  5 unique location:  929 unique fault_severity:  3


In [5]:
# Clean the data by removing the textual prefix from each categorical value,
# keeping only the code that follows it (e.g. 'event_type 15' -> '15').
def _strip_prefix(value, prefix):
    """Return *value* without a leading *prefix*; unchanged if it is absent."""
    if value.startswith(prefix):
        return value[len(prefix):]
    return value


# str.lstrip(chars) removes any leading characters found in the *set* `chars`,
# not a literal prefix. It only worked here because the remaining values start
# with digits; a value beginning with one of the prefix letters would be
# silently truncated. Strip the exact prefix instead.
_PREFIXES = [
    ('event_type', 'event_type '),
    ('log_feature', 'feature '),
    ('resource_type', 'resource_type '),
    ('severity_type', 'severity_type '),
    ('location', 'location '),
]
for _column, _prefix in _PREFIXES:
    # Bind the prefix as a default argument so each lambda keeps its own value.
    df[_column] = df[_column].map(
        lambda value, prefix=_prefix: _strip_prefix(value, prefix))

df.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,15,68,7,8,2,1,0
1,2588,15,82,9,8,1,1,0
2,2588,15,201,5,8,1,1,0
3,2588,15,80,15,8,1,1,0
4,2588,15,203,5,8,1,1,0


In [6]:
# Build the response (y) and design (X) frames with patsy. Terms wrapped in
# C(...) are treated as categorical and expanded into indicator columns;
# `volume` enters as a plain numeric column. patsy also adds an Intercept
# column to X.
design_formula = (
    'fault_severity ~ '
    'C(event_type) + C(log_feature) + C(resource_type) + '
    'C(severity_type) + C(location) + volume'
)
y, X = dmatrices(design_formula, df, return_type='dataframe')
X.head()

Unnamed: 0,Intercept,C(event_type)[T.10],C(event_type)[T.11],C(event_type)[T.12],C(event_type)[T.13],C(event_type)[T.14],C(event_type)[T.15],C(event_type)[T.18],C(event_type)[T.19],C(event_type)[T.2],...,C(location)[T.989],C(location)[T.99],C(location)[T.990],C(location)[T.991],C(location)[T.994],C(location)[T.995],C(location)[T.996],C(location)[T.998],C(location)[T.999],volume
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [7]:
# Preview the response frame returned by dmatrices (one fault_severity column).
y.head()

Unnamed: 0,fault_severity
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Modeling

In [8]:
# Split the data into 70% Training Data and 30% Testing Data with seed(0)
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; import from model_selection and fall back only on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# NOTE(review): the merged frame holds several rows per `id`, and this is a
# row-level random split, so rows for the same id can land in both the
# training and the test set. That may inflate the accuracies reported
# below -- consider a split grouped by id.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=0)

In [9]:
# Logistic Regression
# fit_intercept=False because X still carries patsy's own Intercept column;
# C is the inverse regularisation strength, so C=1e9 leaves the fit
# effectively unpenalised.
lr = LogisticRegression(fit_intercept=False, C=1e9)

# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
logmodel = lr.fit(X_train, y_train.values.ravel())

# Coefficients of the fitted model. As a bare expression in the middle of a
# cell this is evaluated but not displayed.
lr.coef_

y_predlog = logmodel.predict(X_test)
y_predproba = logmodel.predict_proba(X_test)

# accuracy_score's signature is (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but the conventional order avoids mistakes if this
# is later swapped for an asymmetric metric.
acclog = accuracy_score(y_test.values.ravel(), y_predlog)

# Single parenthesised argument: valid on both Python 2 and Python 3.
print(acclog)

  y = column_or_1d(y, warn=True)


0.771021992238


In [10]:
# Remove patsy's constant Intercept column from the design matrix before
# fitting the remaining models.
X = X.drop('Intercept', axis='columns')

In [11]:
# Split the data into 70% Training Data and 30% Testing Data with seed(0)
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; import from model_selection and fall back only on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Re-split because X no longer contains the Intercept column. The seed and
# train_size match the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=0)

In [12]:
# AdaBoosted Decision Trees
# 600 boosting rounds over depth-2 trees with a 0.1 learning rate.
# random_state is fixed so repeated runs give the same model, in line with
# the seeded Random Forest and Gradient Boosting cells.
adt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                         n_estimators=600, learning_rate=0.1,
                         random_state=0)

# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
adtmodel = adt.fit(X_train, y_train.values.ravel())

y_predadt = adtmodel.predict(X_test)

# accuracy_score expects (y_true, y_pred).
accadt = accuracy_score(y_test.values.ravel(), y_predadt)

# Single parenthesised argument: valid on both Python 2 and Python 3.
print(accadt)

  y = column_or_1d(y, warn=True)


0.75706123329


In [13]:
# Random Forest
# 25 unpruned trees (max_depth=None); each split considers 40% of the
# features (max_features=0.4). Seeded for reproducibility.
rf = RandomForestClassifier(n_estimators=25, max_depth=None,
                            max_features=0.4, random_state=42)

# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
rfmodel = rf.fit(X_train, y_train.values.ravel())

y_predrf = rfmodel.predict(X_test)

# accuracy_score expects (y_true, y_pred).
accrf = accuracy_score(y_test.values.ravel(), y_predrf)

# Single parenthesised argument: valid on both Python 2 and Python 3.
print(accrf)

  app.launch_new_instance()


0.811071582579


In [14]:
# Gradient Boosting
# 600 boosting stages of depth-1 trees (stumps) with a 0.1 learning rate.
# Seeded for reproducibility.
params = {'n_estimators': 600, 'max_depth': 1,
          'learning_rate': 0.1, 'min_samples_leaf': 1, 'random_state': 3}
gbc = GradientBoostingClassifier(**params)

# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
gbcmodel = gbc.fit(X_train, y_train.values.ravel())

y_predgbc = gbcmodel.predict(X_test)

# accuracy_score expects (y_true, y_pred).
accgbc = accuracy_score(y_test.values.ravel(), y_predgbc)

# Single parenthesised argument: valid on both Python 2 and Python 3.
print(accgbc)

  y = column_or_1d(y, warn=True)


0.721108236309


In the end, Random Forest achieved the highest accuracy of the algorithms tested, at 81.1%. Logistic Regression had the second-highest accuracy (77.1%) and was also the quickest to train and predict, making it less computationally expensive than the ensemble methods.