# Service Disruption
### The objective of this project was not to make predictions but to clean the data to prep it for modeling. 

#### The data provided for a telecom project that is attempting to predict which cellular towers need repairs without disrupting service for accessing essential services. 

#### First the necessary libraries were imported

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
import xgboost

  from pandas.core import datetools


#### The data sets are loaded

In [2]:
event = pd.read_csv('event_type.csv')
log_feature = pd.read_csv('log_feature.csv')
resource = pd.read_csv('resource_type.csv')
severity = pd.read_csv('severity_type.csv')
train = pd.read_csv('train.csv')

#### For each of the data sets the first string is removed and the type remains an object so that the values can be used as dummy variables

In [3]:
event.info()
event['event_type'] = event['event_type'].str.strip("event_type")
event.reset_index()
event.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31170 entries, 0 to 31169
Data columns (total 2 columns):
id            31170 non-null int64
event_type    31170 non-null object
dtypes: int64(1), object(1)
memory usage: 487.1+ KB


Unnamed: 0,id,event_type
0,6597,11
1,8011,15
2,2597,15
3,5022,15
4,5022,11


In [4]:
log_feature.info()
log_feature['log_feature'] = log_feature['log_feature'].str.strip("log_feature")
log_feature.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58671 entries, 0 to 58670
Data columns (total 3 columns):
id             58671 non-null int64
log_feature    58671 non-null object
volume         58671 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


Unnamed: 0,id,log_feature,volume
0,6597,68,6
1,8011,68,7
2,2597,68,1
3,5022,172,2
4,5022,56,1


In [5]:
resource.info()
resource['resource_type'] = resource['resource_type'].str.strip('resource_type ')
resource.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21076 entries, 0 to 21075
Data columns (total 2 columns):
id               21076 non-null int64
resource_type    21076 non-null object
dtypes: int64(1), object(1)
memory usage: 329.4+ KB


Unnamed: 0,id,resource_type
0,6597,8
1,8011,8
2,2597,8
3,5022,8
4,6852,8


In [6]:
severity.info()
severity['severity_type'] = severity['severity_type'].str.strip('severity_type ')
severity.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18552 entries, 0 to 18551
Data columns (total 2 columns):
id               18552 non-null int64
severity_type    18552 non-null object
dtypes: int64(1), object(1)
memory usage: 289.9+ KB


Unnamed: 0,id,severity_type
0,6597,2
1,8011,2
2,2597,2
3,5022,1
4,6852,1


In [7]:
train.info()
train['location'] = train['location'].str.strip('location ')
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7381 entries, 0 to 7380
Data columns (total 3 columns):
id                7381 non-null int64
location          7381 non-null object
fault_severity    7381 non-null int64
dtypes: int64(2), object(1)
memory usage: 173.1+ KB


Unnamed: 0,id,location,fault_severity
0,14121,118,1
1,9320,91,0
2,14394,152,1
3,8218,931,1
4,14804,120,0


#### The data sets are merged and fault severity is purposely left out since it is the target variable in this case.  

In [8]:
features = event.merge(log_feature)
features = features.merge(resource)
features = features.merge(severity)
features = features.merge(train[['id', 'location']])
target = train[['fault_severity']]
features.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location
0,8011,15,68,7,8,2,1
1,2588,15,82,9,8,1,1
2,2588,15,201,5,8,1,1
3,2588,15,80,15,8,1,1
4,2588,15,203,5,8,1,1


In [9]:
target.head()

Unnamed: 0,fault_severity
0,1
1,0
2,1
3,1
4,0


#### The dummy variables are created and the merged data frame is grouped by ID to deal with the problem with duplicate IDs 

In [10]:
df = pd.get_dummies(features)
df = df.groupby('id').sum()

In [11]:
df.head()

Unnamed: 0_level_0,volume,event_type_ 1,event_type_ 10,event_type_ 11,event_type_ 12,event_type_ 13,event_type_ 14,event_type_ 15,event_type_ 18,event_type_ 19,...,location_987,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20,0.0,0.0,6.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### The merged data frame is assigned as the features or x and the target is assigned as y.

In [12]:
x = df
y = target
x.shape, y.shape

((7381, 1325), (7381, 1))

#### The data set is split into training and testing sets for fitting and making predictions

In [13]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.6, random_state = 10)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4428, 1325), (2953, 1325), (4428, 1), (2953, 1))

#### Extra Gradient Boosting classifier was used to make the predictions

In [14]:
xgb = xgboost.XGBClassifier()
xgb = xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
accuracy_score(y_test, pred)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.65052488994243141