# Service Disruption
### The objective of this project was not necessarily to make predictions but to clean the data to prep it for modeling. 

#### The data provided for a telecom project that is attempting to predict which cellular towers need repairs without disrupting service for accessing essential services. 

#### First the necessary libraries were imported

In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')



#### The data sets are loaded

In [2]:
event = pd.read_csv('event_type.csv')
log = pd.read_csv('log_feature.csv')
resource = pd.read_csv('resource_type.csv')
severity = pd.read_csv('severity_type.csv')
train = pd.read_csv('train.csv')

#### The data sets are merged so that it is easier to take out the first string and convert all of the categorical variables into dummy variables

In [3]:
raw_data = event.merge(log, on='id')
raw_data = raw_data.merge(resource, on='id')
raw_data = raw_data.merge(severity, on='id')
raw_data = raw_data.merge(train, on='id')
raw_data.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,event_type 15,feature 68,7,resource_type 8,severity_type 2,location 1,0
1,2588,event_type 15,feature 82,9,resource_type 8,severity_type 1,location 1,0
2,2588,event_type 15,feature 201,5,resource_type 8,severity_type 1,location 1,0
3,2588,event_type 15,feature 80,15,resource_type 8,severity_type 1,location 1,0
4,2588,event_type 15,feature 203,5,resource_type 8,severity_type 1,location 1,0


#### A data frame for the clean data set is created 

In [4]:
clean_data = pd.DataFrame(columns=raw_data.columns)
clean_data['id'] = raw_data['id']
clean_data['fault_severity'] = raw_data['fault_severity']
clean_data['volume'] = raw_data['volume']

#### The first string in the value is stripped and remains an object type to be used as dummy variables later for each column

In [5]:
location = []
for i in range(len(raw_data)):
    l = raw_data['location'][i].strip('location ')
    location.append(l)
    
clean_data['location'] = location    

In [6]:
log_feature = []

for i in range(len(raw_data)):
    lf = raw_data['log_feature'][i].strip('feature')
    log_feature.append(lf)

clean_data['log_feature'] = log_feature    

In [7]:
event_type = []

for i in range(len(raw_data)):
    et = raw_data['event_type'][i].strip('event_type')
    event_type.append(et)

clean_data['event_type'] = event_type    

In [8]:
resource_type = []

for i in range(len(raw_data)):
    rt = raw_data['resource_type'][i].strip('resource_type ')
    resource_type.append(rt)

clean_data['resource_type'] = resource_type    

In [9]:
severity_type = []

for i in range(len(raw_data)):
    st = raw_data['severity_type'][i].strip('severity_type ')
    severity_type.append(st)
    
clean_data['severity_type'] = severity_type    

In [10]:
clean_data.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,15,68,7,8,2,1,0
1,2588,15,82,9,8,1,1,0
2,2588,15,201,5,8,1,1,0
3,2588,15,80,15,8,1,1,0
4,2588,15,203,5,8,1,1,0


In [11]:
categories = clean_data.copy()
categories.drop('fault_severity', axis=1, inplace=True)
categories.drop('id', axis=1, inplace=True)
categories.drop('volume', axis=1, inplace=True)

categories.head()

Unnamed: 0,event_type,log_feature,resource_type,severity_type,location
0,15,68,8,2,1
1,15,82,8,1,1
2,15,201,8,1,1
3,15,80,8,1,1
4,15,203,8,1,1


#### The dummy variables are created with the column values

In [12]:
cat_cols = categories.columns

prefixes = ['event', 'log_f', 'resource', 'severity', 'location']
for i in range(len(prefixes)):
    dummies = pd.get_dummies(categories[cat_cols[i]], prefix=prefixes[i])
    categories = categories.join(dummies)
    categories.drop(cat_cols[i], axis=1, inplace=True)
categories.head()    

Unnamed: 0,event_ 1,event_ 10,event_ 11,event_ 12,event_ 13,event_ 14,event_ 15,event_ 18,event_ 19,event_ 2,...,location_987,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### The duplicate data is removed using a groupby on the location

In [13]:
df = categories.copy()
df['location'] = clean_data['location']
df = df.groupby(by='location').sum()

column_length = df.columns[0:len(df.columns)+1].unique()
for i in range(len(column_length)):
    df[column_length[i]] = df[column_length[i]].apply(lambda x: 1 if x > 0 else 0)
        

df1 = clean_data[['location', 'fault_severity']]
df1 = df1.groupby(by='location').mean()
df1['fault_severity'] = df1['fault_severity'].apply(lambda x: round(x,0))

df2 = clean_data[['location', 'volume']]
df2 = df2.groupby(by='location').sum()

#### The x and y variables are defined with y being the fault severity

In [14]:
X = df2.join(df)
y = df1.fault_severity

#### The data set is split into training and testing sets

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7)

#### Gradient Boosting is used as the model used for prediction 
#### The model is fitted to the training set and the prediction is made

In [16]:
model = GradientBoostingClassifier(n_estimators = 10)
model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(pred, y_test)

0.64157706093189959

#### The objective was to make a prediction on the fault severity level which consists of 0, 1, 2, in addition to the probability of each severity level. So a separate data frame is created so display the prediction and the probabilities and stored into a csv file.

In [17]:
model_cols = ['location', 'predicted', '0', '1', '2']
pred_model = pd.DataFrame(columns = model_cols)
pred_model['location'] = np.unique(location)

for i in range(len(pred_model)):
    x = X.ix[i]
    x = x.values.reshape(1, -1)
    pred_model[model_cols[1]][i] = model.predict(x).item(0)
    pred_model[model_cols[2]][i] = model.predict_proba(x).item(0)
    pred_model[model_cols[3]][i] = model.predict_proba(x).item(1)
    pred_model[model_cols[4]][i] = model.predict_proba(x).item(2)

In [18]:
pred_model.head(20)

Unnamed: 0,location,predicted,0,1,2
0,1,1,0.164565,0.755929,0.0795057
1,10,0,0.540291,0.33695,0.122759
2,100,0,0.678033,0.222254,0.0997128
3,1000,0,0.540291,0.33695,0.122759
4,1002,0,0.63951,0.264226,0.0962638
5,1005,0,0.63951,0.264226,0.0962638
6,1006,1,0.210197,0.666532,0.123271
7,1007,1,0.361394,0.48336,0.155246
8,1008,1,0.238812,0.645811,0.115377
9,1009,0,0.610458,0.28552,0.104022


In [19]:
pred_model.to_csv('output_result.csv')