In [50]:
#general
import datetime
import pandas as pd
import numpy as np
import os
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
#AWS
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.estimator import Estimator
from sagemaker.model import Model

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, fbeta_score
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
import pickle

In [3]:
# Gather the AWS / SageMaker handles and locations used by the rest of the notebook.
# execution role of this notebook
role = get_execution_role()
# SageMaker session (its boto session is used for the region and account lookups below)
sess = sagemaker.Session()
# AWS region of the session; read as a global by get_estimator when building the image URI
region = sess.boto_session.region_name
# AWS account number, looked up through STS
account = sess.boto_session.client('sts').get_caller_identity()['Account']
# S3 bucket name
bucket = 'distribution-reliability-nlp'
# low-level SageMaker client object (used later to search training jobs)
smclient = boto3.client(service_name='sagemaker')
# S3 folder for model output; read as a global by get_estimator
model_location = 's3://{}/modeling'.format(bucket)
print(model_location)

s3://distribution-reliability-nlp/modeling


# Import and Prep Data

In [5]:
# Load the non-text outage table from the local CSV and preview the first rows.
we_outage = pd.read_csv('We_Cleaned.csv')
we_outage.head()

Unnamed: 0,Volt_D,Volt_E,Volt_F,Volt_H,Volt_J,Volt_M,Volt_R,Volt_SD,Volt_TV,Volt_US,...,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018,outage category,outage subcategory
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Equipment,
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Equipment,Conductor/Cable
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Vegetation,Out Clearance Zone (ROW)
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Equipment,Conductor/Cable
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Equipment,


In [13]:
# Load the text table (outage category, subcategory and the free-text 'Mobile Data Remarks')
# from the local CSV and preview the first rows.
outage = pd.read_csv('We_WPS_comments_categories.csv')
outage.head()

Unnamed: 0,outage category,outage subcategory,Mobile Data Remarks
0,Equipment,Conductor/Cable,one leg of urd is dead customer said we can f...
1,Vegetation,Out Clearance Zone (ROW),removed tree from line nearpole63 1688 and cl...
2,Equipment,Conductor/Cable,repaired bad primary urd cables cable repaire...
3,Equipment,,bad connector on cust serv pole conn replaced...
4,Equipment,Simple Interrupting Device,replaced broken cutout and installed wlp taq ...


In [7]:
# Build the non-text modelling table: drop the subcategory, turn the category into
# integer codes, then move the category column to the front. The re-ordering relies on
# 'outage category' being the last column once the subcategory has been dropped.
we_outage_limited = we_outage.drop('outage subcategory', axis=1)
we_outage_limited['outage category'] = pd.factorize(we_outage_limited['outage category'])[0]
we_outage_limited = we_outage_limited[
    ['outage category'] + we_outage_limited.columns[:-1].tolist()
]
we_outage_limited.head()

Unnamed: 0,outage category,Volt_D,Volt_E,Volt_F,Volt_H,Volt_J,Volt_M,Volt_R,Volt_SD,Volt_TV,...,week_9,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [14]:
# Set the rows labelled 'Unknown' aside; they are scored with the trained model later on.
all_unknowns = outage[outage['outage category'] == 'Unknown']
#save off unknowns
#all_unknowns.to_csv('Unknown Outcomes.csv')
# Remove the categories that are not usable as training labels. The explicit .copy()
# makes `outage` an independent frame, so the column assignment below does not write
# to a slice of the original frame (which triggers pandas' SettingWithCopyWarning).
outage = outage[~outage['outage category'].isin(['Unknown', 'Not Reportable'])].copy()

# Convert the target to integer codes and keep the lookup that pd.factorize returns:
# outage_category_labels[i] is the category name that was encoded as code i.
category_codes, outage_category_labels = pd.factorize(outage['outage category'])
outage['outage category'] = category_codes
outage.head()

Unnamed: 0,outage category,outage subcategory,Mobile Data Remarks
0,0,Conductor/Cable,one leg of urd is dead customer said we can f...
1,1,Out Clearance Zone (ROW),removed tree from line nearpole63 1688 and cl...
2,0,Conductor/Cable,repaired bad primary urd cables cable repaire...
3,0,,bad connector on cust serv pole conn replaced...
4,0,Simple Interrupting Device,replaced broken cutout and installed wlp taq ...


In [8]:
# Train/test split of the non-text table: 10% held out, fixed seed.
# NOTE(review): the saved output of this cell is a NameError, i.e. it was last executed
# before `we_outage_limited` existed. `train_data` / `test_data` are also re-assigned by
# the text split further down, so later cells see whichever split ran most recently.
train_data, test_data = train_test_split(we_outage_limited, test_size=0.1, random_state=42)
train_data.head()

NameError: name 'we_outage_limited' is not defined

In [19]:
# Drop the rows that have no remark text, printing the shape before and after.
print(outage.shape)
outage = outage.dropna(subset=['Mobile Data Remarks'])
print(outage.shape)

(139826, 3)
(139785, 3)


In [20]:
# Bag-of-words features: split the text table first (10% held out, fixed seed).
train_data, test_data = train_test_split(outage, test_size=0.1, random_state=42)
# count vectorizer with English stop words removed
count_vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer on the training remarks ('Mobile Data Remarks') and vectorize them
X_train = count_vectorizer.fit_transform(train_data['Mobile Data Remarks'])

# Vectorize the test remarks with the already-fitted vectorizer (no re-fit)
X_test = count_vectorizer.transform(test_data['Mobile Data Remarks'])
type(X_train)

scipy.sparse.csr.csr_matrix

In [51]:
# TF-IDF features: same split parameters (10% held out, seed 42) as the count-vectorizer cell.
tfidf_train_data, tfidf_test_data = train_test_split(outage, test_size=0.1, random_state=42)
# TF-IDF vectorizer with English stop words removed
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the training remarks ('Mobile Data Remarks') and vectorize them
tfidf_X_train = tfidf_vectorizer.fit_transform(tfidf_train_data['Mobile Data Remarks'])

# Vectorize the test remarks with the already-fitted vectorizer (no re-fit)
tfidf_X_test = tfidf_vectorizer.transform(tfidf_test_data['Mobile Data Remarks'])

In [52]:
# Hashed features: same split parameters (10% held out, seed 42) as the count-vectorizer cell.
hash_train_data, hash_test_data = train_test_split(outage, test_size=0.1, random_state=42)
# hashing vectorizer with English stop words removed
hash_vectorizer = HashingVectorizer(stop_words='english')

# Vectorize the training remarks ('Mobile Data Remarks')
hash_X_train = hash_vectorizer.fit_transform(hash_train_data['Mobile Data Remarks'])

# Vectorize the test remarks with the same vectorizer
hash_X_test = hash_vectorizer.transform(hash_test_data['Mobile Data Remarks'])

In [21]:
# Persist the fitted count vectorizer. The context manager closes the file handle as
# soon as the dump has finished instead of leaving it open.
with open('count-vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

In [11]:
# Write the `decoder` table to decoder.csv without the index.
# NOTE(review): no visible cell assigns `decoder`; confirm where it is created so that
# this cell works on a fresh kernel.
decoder.to_csv('decoder.csv', index=False)

In [22]:
# Integer-coded targets that go with X_train / X_test (read from the same split frames).
y_train = train_data['outage category']
y_test = test_data['outage category']

50083     4
37297     1
125019    4
5111      3
77071     1
Name: outage category, dtype: int64

In [53]:
# Integer-coded targets that go with tfidf_X_train / tfidf_X_test.
tfidf_y_train = tfidf_train_data['outage category']
tfidf_y_test = tfidf_test_data['outage category']

In [54]:
# Integer-coded targets that go with hash_X_train / hash_X_test.
hash_y_train = hash_train_data['outage category']
hash_y_test = hash_test_data['outage category']

In [None]:
# Adapt the test data for the later batch transform: keep the first 1000 rows and drop
# the target column.
# NOTE(review): this rebinds `test_data` to the 1000-row subset, so any cell that reads
# `test_data` after this one (for example the y_test cell, if re-run) sees the truncated frame.
test_data = test_data.head(1000)
test_data_chopped = test_data.drop(['outage category'], axis=1)
test_data_chopped.head()

In [None]:
# Write the train frame and the chopped test frame to CSV without header or index.
# NOTE(review): `train_data` is assigned by both the non-text split and the text split, so
# the 'Non-Text' files contain whichever split was executed most recently — confirm.
train_data.to_csv('Non-Text_train.csv', header=False, index=False)
test_data_chopped.to_csv('Non-Text_test.csv', header=False, index=False)

In [23]:
# Save the count-vectorizer matrices as .npz files and the training target as a CSV
# without header or index.
scipy.sparse.save_npz('train.npz', X_train)
y_train.to_csv('train_target.csv', header=False, index=False)
scipy.sparse.save_npz('test.npz', X_test)

In [55]:
# Save the TF-IDF matrices as .npz files and the training target as a CSV without
# header or index.
scipy.sparse.save_npz('tfidf_train.npz', tfidf_X_train)
tfidf_y_train.to_csv('tfidf_train_target.csv', header=False, index=False)
# the test file must hold the TF-IDF features, not the count vectorizer's X_test
scipy.sparse.save_npz('tfidf_test.npz', tfidf_X_test)

In [56]:
# Save the hashed matrices as .npz files and the training target as a CSV without
# header or index. Both files must hold the hashing vectorizer's features, not the
# count vectorizer's X_train / X_test.
scipy.sparse.save_npz('hash_train.npz', hash_X_train)
hash_y_train.to_csv('hash_train_target.csv', header=False, index=False)
scipy.sparse.save_npz('hash_test.npz', hash_X_test)

In [7]:
# Send the non-text CSV files to S3 under the 'sagemaker/data' prefix of the bucket.
# SageMaker takes its training data from S3.
trainpath = sess.upload_data(
    path='Non-Text_train.csv', bucket=bucket,
    key_prefix='sagemaker/data')

testpath = sess.upload_data(
    path='Non-Text_test.csv', bucket=bucket,
    key_prefix='sagemaker/data')

In [24]:
# Send the count-vectorizer files to S3 under the 'sagemaker/data' prefix of the bucket.
# SageMaker takes its training data from S3.
# NOTE: train_path / train_target_path / test_path are also assigned by the TF-IDF and
# hashing upload cells; the tuning loop uses whichever of these cells ran last.
train_path = sess.upload_data(
    path='train.npz', bucket=bucket,
    key_prefix='sagemaker/data')

train_target_path = sess.upload_data(
    path='train_target.csv', bucket=bucket,
    key_prefix='sagemaker/data')

test_path = sess.upload_data(
    path='test.npz', bucket=bucket,
    key_prefix='sagemaker/data')

In [64]:
# Send the TF-IDF files to S3 under the 'sagemaker/data' prefix of the bucket.
# SageMaker takes its training data from S3.
# NOTE: train_path / train_target_path / test_path are also assigned by the count and
# hashing upload cells; the tuning loop uses whichever of these cells ran last.
train_path = sess.upload_data(
    path='tfidf_train.npz', bucket=bucket,
    key_prefix='sagemaker/data')

train_target_path = sess.upload_data(
    path='tfidf_train_target.csv', bucket=bucket,
    key_prefix='sagemaker/data')

test_path = sess.upload_data(
    path='tfidf_test.npz', bucket=bucket,
    key_prefix='sagemaker/data')

In [60]:
# Send the hashing-vectorizer files to S3 under the 'sagemaker/data' prefix of the bucket.
# SageMaker takes its training data from S3.
# NOTE: train_path / train_target_path / test_path are also assigned by the count and
# TF-IDF upload cells; the tuning loop uses whichever of these cells ran last.
train_path = sess.upload_data(
    path='hash_train.npz', bucket=bucket,
    key_prefix='sagemaker/data')

train_target_path = sess.upload_data(
    path='hash_train_target.csv', bucket=bucket,
    key_prefix='sagemaker/data')

test_path = sess.upload_data(
    path='hash_test.npz', bucket=bucket,
    key_prefix='sagemaker/data')

# Train

In [25]:
def get_estimator(algorithm_name, algorithm_type, account, sess, role):
    '''
    Return the SageMaker Estimator for one of the algorithm images stored in ECR.

    Parameters:
        algorithm_name: display name of the model, e.g. 'Random Forest'. Logistic
            regression is registered under the existing spelling 'Logitstic Regression'.
        algorithm_type: 'Classification' or 'Regression'.
        account: AWS account number that owns the ECR repositories.
        sess: sagemaker session handed to the Estimator.
        role: execution role handed to the Estimator.

    Besides its arguments the function reads the notebook globals `region` (for the
    image URI) and `model_location` (base S3 output folder).

    Returns:
        Estimator configured with the selected image.

    Raises:
        ValueError: if algorithm_type or algorithm_name is not supported. Failing here
            keeps a None estimator from reaching the tuner.
    '''
    # ECR repository name for every supported (type, algorithm) combination
    image_names = {
        'Classification': {
            'Neural Network': 'sagemaker-basic-neural-network-classification',
            'Decision Tree': 'sagemaker-decision-tree-classification',
            'Gradient Boosting': 'sagemaker-gradient-boosting-classification',
            'K-Nearest Neighbors': 'sagemaker-k-nearest-neighbors-classification',
            'Logitstic Regression': 'sagemaker-logistic-regression',
            'Naive Bayes': 'sagemaker-naive-bayes-multinomial',
            'Random Forest': 'sagemaker-random-forest-classification',
            'Stochastic Gradient Descent': 'sagemaker-stochastic-gradient-descent-classification',
            'Support Vector Machine': 'sagemaker-support-vector-classification',
        },
        'Regression': {
            'Neural Network': 'sagemaker-basic-neural-network-regression',
            'Decision Tree': 'sagemaker-decision-tree-regression',
            'Gradient Boosting': 'sagemaker-gradient-boosting-regression',
            'K-Nearest Neighbors': 'sagemaker-k-nearest-neighbors-regression',
            'Random Forest': 'sagemaker-random-forest-regression',
            'Stochastic Gradient Descent': 'sagemaker-stochastic-gradient-descent-regression',
            'Support Vector Machine': 'sagemaker-support-vector-regression',
            'Lasso Regression': 'sagemaker-lasso-regression',
            'Linear Regression': 'sagemaker-linear-regression',
            'Ridge Regression': 'sagemaker-ridge-regression',
        },
    }

    if algorithm_type not in image_names:
        raise ValueError('ModelTypeError: The algorithm type entered is not available. '
                         'Please select Classification or Regression.')
    if algorithm_name not in image_names[algorithm_type]:
        raise ValueError('ModelTypeError: The model entered is not available. '
                         'Please select a new model type.')
    algorithm_name_type = image_names[algorithm_type][algorithm_name]

    #get docker image name in ECR
    image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, algorithm_name_type)

    #update where the model will be stored (one sub-folder per algorithm)
    updated_model_location = model_location + '/' + algorithm_name

    #create estimator with the image selected; training runs inside the given subnets
    #and security group
    est = Estimator(image_name=image,
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c5.18xlarge',
                    output_path=updated_model_location,
                    sagemaker_session=sess,
                    subnets=['subnet-535e1629', 'subnet-28756740', 'subnet-2253e36e'],
                    security_group_ids=['sg-3168f952'])
    #https://sagemaker.readthedocs.io/en/stable/overview.html#secure-training-and-inference-with-vpc
    #https://docs.aws.amazon.com/sagemaker/latest/dg/train-vpc.html
    return est

In [69]:
def get_turner(algorithm_name, algorithm_type, data_type, eval_metric, eval_direction, estimator):
    '''
    Return the hyperparameter tuner for one algorithm, searching that algorithm's full
    hyperparameter ranges.

    Parameters:
        algorithm_name: display name of the model, e.g. 'Random Forest' (logistic
            regression uses the existing spelling 'Logitstic Regression').
        algorithm_type: only used to build the tuning job name.
        data_type: 'text' selects the text data_type choices, anything else the numeric ones.
        eval_metric: 'F1-Score', 'Accuracy', 'Recall' or 'Precision'. It is both the
            objective metric name and the label searched for in the training log.
        eval_direction: passed to the tuner as objective_type.
        estimator: sagemaker estimator the tuner launches training jobs with.

    Returns:
        HyperparameterTuner limited to 10 training jobs.

    Raises:
        ValueError: if eval_metric or algorithm_name is not supported. Without these
            checks the function failed further down on an unassigned local variable.
    '''

    #ContinuousParameter, IntegerParameter, CategoricalParameter

    #validate the metric before anything else so a typo fails with a clear message
    supported_metrics = ('F1-Score', 'Accuracy', 'Recall', 'Precision')
    if eval_metric not in supported_metrics:
        raise ValueError('MetricError: The metric entered is not available. '
                         'Please select one of: ' + ', '.join(supported_metrics))

    #get hyperparmeter ranges
    if algorithm_name == 'Neural Network':
        hyperparameter_ranges = {'hidden_layer_sizes': IntegerParameter(2, 250),
                                 'activation': CategoricalParameter(['identity', 'logistic', 'tanh', 'relu']),
                                 'solver': CategoricalParameter(['lbfgs', 'sgd', 'adam']),
                                 'alpha': ContinuousParameter(0.0001, 0.1),
                                 'batch_size': IntegerParameter(100, 1000),
                                 'learning_rate': CategoricalParameter(['constant', 'invscaling', 'adaptive']),
                                 'learning_rate_init': ContinuousParameter(0.0001, 0.1),
                                 'power_t': ContinuousParameter(0.1, 0.8),
                                 'max_iter': IntegerParameter(100, 250),
                                 'beta_1': ContinuousParameter(0.1, 0.9),
                                 'beta_2': ContinuousParameter(0.1, 0.999)}

    elif algorithm_name == 'Decision Tree':
        hyperparameter_ranges = {'criterion': CategoricalParameter(['gini', 'entropy']),
                                 'splitter': CategoricalParameter(['best', 'random']),
                                 'max_depth': IntegerParameter(10, 100),
                                 'min_samples_split': IntegerParameter(2, 100),
                                 'min_samples_leaf': IntegerParameter(1, 100),
                                 'min_weight_fraction_leaf': ContinuousParameter(0, .5),
                                 'max_features': IntegerParameter(1, 437),
                                 'max_leaf_nodes': IntegerParameter(2, 100),
                                 'min_impurity_decrease': ContinuousParameter(0, .25)}

    elif algorithm_name == 'Random Forest':
        hyperparameter_ranges = {'n_estimators': IntegerParameter(250, 1000),
                                 'criterion': CategoricalParameter(['gini', 'entropy']),
                                 #'max_depth': IntegerParameter(10, 100),
                                 #'min_samples_split': IntegerParameter(2, 100),
                                 #'min_samples_leaf': IntegerParameter(1, 100),
                                 #'min_weight_fraction_leaf': ContinuousParameter(0, .5),
                                 #'max_features' : IntegerParameter(1, 437),
                                 #'max_leaf_nodes': IntegerParameter(2, 100),
                                 #'min_impurity_decrease': ContinuousParameter(0, .25),
                                 'bootstrap': CategoricalParameter([True, False])}

    elif algorithm_name == 'Gradient Boosting':
        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.001, .2),
                                 'n_estimators': IntegerParameter(250, 1000),
                                 'subsample': ContinuousParameter(.1, 1.0),
                                 'criterion': CategoricalParameter(['friedman_mse', 'mse', 'mae']),
                                 'max_depth': IntegerParameter(10, 100),
                                 'min_samples_split': IntegerParameter(2, 100),
                                 'min_samples_leaf': IntegerParameter(1, 100),
                                 'min_weight_fraction_leaf': ContinuousParameter(0, .5),
                                 'max_features': IntegerParameter(1, 437),
                                 'max_leaf_nodes': IntegerParameter(2, 100),
                                 'min_impurity_decrease': ContinuousParameter(0, 0.25)
                                 }

    elif algorithm_name == 'K-Nearest Neighbors':
        hyperparameter_ranges = {'n_neighbors': IntegerParameter(3, 250),
                                 'weights': CategoricalParameter(['uniform', 'distance']),
                                 'algorithm': CategoricalParameter(['auto', 'ball_tree', 'kd_tree', 'brute']),
                                 'leaf_size': IntegerParameter(10, 50),
                                 'p': IntegerParameter(0, 5)}

    elif algorithm_name == 'Logitstic Regression':
        hyperparameter_ranges = {'C': ContinuousParameter(0.1, 1.0),
                                 'multi_class': CategoricalParameter(['multinomial', 'auto']),
                                 'solver': CategoricalParameter(['newton-cg', 'sag', 'saga', 'lbfgs'])}

    elif algorithm_name == 'Naive Bayes':
        hyperparameter_ranges = {'alpha': ContinuousParameter(0, 1.0),
                                 'fit_prior': CategoricalParameter([True, False])}

    elif algorithm_name == 'Stochastic Gradient Descent':
        hyperparameter_ranges = {'loss': CategoricalParameter(['hinge', 'log', 'modified_huber', 'squared_hinge',
                                                               'perceptron']),
                                 'penalty': CategoricalParameter(['none', 'l2', 'l1', 'elasticnet']),
                                 'alpha': ContinuousParameter(0.0001, 0.01),
                                 'l1_ratio': ContinuousParameter(0.01, 1.0),
                                 'learning_rate': CategoricalParameter(['constant', 'optimal', 'invscaling', 'adaptive']),
                                 'eta0': ContinuousParameter(0.0, 0.1),
                                 'power_t': ContinuousParameter(0.1, 1.0)}

    elif algorithm_name == 'Support Vector Machine':
        hyperparameter_ranges = {'C': ContinuousParameter(0.1, 1.0),
                                 'kernel': CategoricalParameter(['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']),
                                 'degree': IntegerParameter(1, 5),
                                 'gamma': IntegerParameter(1, 5),
                                 'shrinking': CategoricalParameter([True, False]),
                                 'probability': CategoricalParameter([True, False]),
                                 'decision_function_shape': CategoricalParameter(['ovr', 'ovo'])}

    else:
        raise ValueError('ModelTypeError: The model entered is not available. '
                         'Please select a new model type.')

    #add the data type to the hyperparamter dict
    if data_type == 'text':
        hyperparameter_ranges['data_type'] = CategoricalParameter(['text', 'word'])
    else:
        hyperparameter_ranges['data_type'] = CategoricalParameter(['numeric', 'numbers'])

    #create metrics definitions: the value is captured from a training log line of the
    #form '<metric name>: <number>'
    metric_defs = [{'Name': eval_metric,
                    'Regex': '{}: ([0-9.]+).*$'.format(eval_metric)}]

    #make name for tuning job
    tuning_name = algorithm_name.replace(' ', '-') + '-' + algorithm_type

    #set up hyperparamater tuner
    tuner = HyperparameterTuner(estimator=estimator,
                                hyperparameter_ranges=hyperparameter_ranges,
                                base_tuning_job_name=tuning_name,
                                objective_type=eval_direction,
                                objective_metric_name=eval_metric,
                                metric_definitions=metric_defs,
                                max_jobs=10)

    return tuner

In [62]:
# All model options, split into two batches. Uncomment an entry to include it in the
# run; every entry ends with a comma so that neighbouring names never merge into one string.
algorithms_1 = [
    # 'K-Nearest Neighbors',
    # 'Naive Bayes',
    # 'Support Vector Machine',
    # 'Logitstic Regression',
    # 'Decision Tree',
]
algorithms_2 = [
    # 'Stochastic Gradient Descent',
    'Random Forest',
    # 'Gradient Boosting',
    # 'Neural Network',
]

algorithms = list(algorithms_1)
algorithms.extend(algorithms_2)
print(algorithms)

# settings shared by every tuning job started below
algorithm_type = 'Classification'
eval_metric = 'Accuracy'
eval_direction = 'Maximize'
data_type = 'text'

['Random Forest']


In [68]:
# Start one hyperparameter tuning job per selected algorithm.
for algorithm in algorithms:
    print(algorithm)
    #get estimator for this algorithm's image
    est = get_estimator(algorithm, algorithm_type, account, sess, role)
    #get hyperparameter tuner wrapping that estimator
    tuner = get_turner(algorithm_name=algorithm, 
                       algorithm_type=algorithm_type,
                       data_type=data_type,
                       eval_metric=eval_metric, 
                       eval_direction=eval_direction,
                       estimator=est)
    #start training without blocking the notebook (wait=False); the two channels point at
    #whatever train_path / train_target_path currently hold
    tuner.fit({'training': train_path,
               'target': train_target_path},
               wait=False)
    print('Model Started Training.')

Random Forest
Model Started Training.


# Testing

In [73]:
# Search for completed training jobs whose input data S3 URI contains the bucket name,
# most recently modified first.
# NOTE(review): only one page of at most 100 results is requested; no follow-up call is
# made for further pages.
search_params={
   "MaxResults": 100,
   "Resource": "TrainingJob",
   "SearchExpression": { 
      "Filters": [ 
         { 
            "Name": "InputDataConfig.DataSource.S3DataSource.S3Uri",
            "Operator": "Contains",
             
             # set this to have a word that is in your bucket name
            "Value": '{}'.format(bucket)
         },
        { 
            "Name": "TrainingJobStatus",
            "Operator": "Equals",
            "Value": 'Completed'
         }, 
    ],
     
   },
    
    "SortBy": "LastModifiedTime",
    "SortOrder": "Descending"
}
results = smclient.search(**search_params)

In [74]:
#look at validation results
# The four lists are filled in lock-step (one entry per kept training job) so that they
# can be combined into a single DataFrame afterwards.
images = []
scores = []
hypers = []
models = []
for each in results['Results']:
    try:
        job = each['TrainingJob']
        final_metric = job['FinalMetricDataList'][0]
        #only keep jobs whose first reported metric is the one being compared
        if final_metric['MetricName'] != eval_metric:
            continue
        #find job name
        job_name = job['TrainingJobName']
        #find model artifact
        artifact = job['ModelArtifacts']['S3ModelArtifacts']
        # get training image
        image = job['AlgorithmSpecification']['TrainingImage']
        #repository name without registry and tag; [10:] drops the leading 'sagemaker-'
        image_label = image.split('/')[1].split(':')[0][10:]
        score = final_metric['Value']
        hyper = job['HyperParameters']
    except (KeyError, IndexError):
        #the job record lacks one of the fields read above: skip this job only
        continue

    #make a sagemaker model
    m = Model(artifact, image, role = role, sagemaker_session = sess, name = job_name)

    images.append(image_label)
    scores.append(score)
    hypers.append(hyper)
    #append the sagemaker model to the models list
    models.append(m)

In [76]:
# Tabulate the collected tuning results, best metric value first, with a fresh 0..n-1 index.
modeling_df = (
    pd.DataFrame({'Model Name': images,
                  'Metric Results': scores,
                  'HyperParameters': hypers,
                  'Actual Model': models})
    .sort_values('Metric Results', ascending=False)
    .reset_index(drop=True)
)
modeling_df

Unnamed: 0,Model Name,Metric Results,HyperParameters,Actual Model
0,logistic-regression,0.826418,"{'C': '0.865280262065544', '_tuning_objective_...",<sagemaker.model.Model object at 0x7f2a89ec75c0>
1,stochastic-gradient-descent-classification,0.826418,"{'_tuning_objective_metric': 'Accuracy', 'alph...",<sagemaker.model.Model object at 0x7f2842e7dd68>
2,logistic-regression,0.826339,"{'C': '0.9995955195233935', '_tuning_objective...",<sagemaker.model.Model object at 0x7f2842e7d048>
3,logistic-regression,0.826339,"{'C': '1.0', '_tuning_objective_metric': 'Accu...",<sagemaker.model.Model object at 0x7f2a8a488908>
4,logistic-regression,0.826339,"{'C': '0.8778731048356974', '_tuning_objective...",<sagemaker.model.Model object at 0x7f2a8a488e10>
5,logistic-regression,0.826339,"{'C': '0.8682699554216349', '_tuning_objective...",<sagemaker.model.Model object at 0x7f2842e7d978>
6,logistic-regression,0.826339,"{'C': '0.9199331634294474', '_tuning_objective...",<sagemaker.model.Model object at 0x7f2a89ec7320>
7,basic-neural-network-classification,0.826259,"{'_tuning_objective_metric': 'Accuracy', 'acti...",<sagemaker.model.Model object at 0x7f2a89851cf8>
8,logistic-regression,0.826259,"{'C': '0.9105947845231974', '_tuning_objective...",<sagemaker.model.Model object at 0x7f2842e7d630>
9,logistic-regression,0.826101,"{'C': '0.9741506927263224', '_tuning_objective...",<sagemaker.model.Model object at 0x7f2a8a488748>


In [79]:
# Print the model name and hyperparameters of the top-ranked rows of modeling_df.
how_many_models_to_test = 3
for row_num in range(how_many_models_to_test):
    print('{}:\n{}'.format(modeling_df.loc[row_num, 'Model Name'],
                           str(modeling_df.loc[row_num, 'HyperParameters'])))

logistic-regression:
{'C': '0.865280262065544', '_tuning_objective_metric': 'Accuracy', 'data_type': 'text', 'multi_class': 'multinomial', 'solver': 'newton-cg'}
stochastic-gradient-descent-classification:
{'_tuning_objective_metric': 'Accuracy', 'alpha': '0.0036573825316065425', 'data_type': 'word', 'eta0': '0.046752434891641693', 'l1_ratio': '0.4414895769855142', 'learning_rate': 'adaptive', 'loss': 'log', 'penalty': 'none', 'power_t': '0.3536959988101933'}
logistic-regression:
{'C': '0.9995955195233935', '_tuning_objective_metric': 'Accuracy', 'data_type': 'word', 'multi_class': 'auto', 'solver': 'sag'}


In [7]:
# Hyperparameters of the two best tuning jobs, copied from the printout above. The
# values stay strings here and are converted where the models are built.
logit_hyper = dict(
    C='0.865280262065544',
    multi_class='multinomial',
    solver='newton-cg',
)
sgd_hyper = dict(
    alpha='0.0036573825316065425',
    eta0='0.046752434891641693',
    l1_ratio='0.4414895769855142',
    learning_rate='adaptive',
    loss='log',
    penalty='none',
    power_t='0.3536959988101933',
)

In [10]:
# Train the logistic regression locally with the tuned hyperparameters (C is stored as
# a string, hence the float conversion) on the count-vectorizer features.
logit = LogisticRegression(C=float(logit_hyper['C']),
                            solver=logit_hyper['solver'],
                            max_iter=10000,
                            multi_class=logit_hyper['multi_class'])

logit = logit.fit(X_train, y_train)

In [86]:
# Train the SGD classifier locally with the tuned hyperparameters (numeric values are
# stored as strings, hence the float conversions) on the count-vectorizer features.
sgd = SGDClassifier(loss=sgd_hyper['loss'],
                    penalty=sgd_hyper['penalty'],
                    alpha=float(sgd_hyper['alpha']),
                    l1_ratio=float(sgd_hyper['l1_ratio']),
                    max_iter=10000,
                    learning_rate=sgd_hyper['learning_rate'],
                    eta0=float(sgd_hyper['eta0']),
                    power_t=float(sgd_hyper['power_t']),
                    # fixed seed (same value as the train/test split) so the fit is
                    # repeatable from run to run
                    random_state=42)


sgd = sgd.fit(X_train, y_train)



In [87]:
# Score the held-out count-vectorizer features with the logistic regression.
print('Testing Model.')
y_pred_logit = logit.predict(X_test)

# Model results: confusion matrix, accuracy, and recall / precision / F1 computed with
# average='weighted'.
print('Model Results')
print(confusion_matrix(y_test, y_pred_logit))
print('Accuracy: ' + str(accuracy_score(y_test, y_pred_logit)))
print('Recall: ' + str(recall_score(y_test, y_pred_logit, average='weighted')))
print('Precision: ' + str(precision_score(y_test, y_pred_logit, average='weighted')))
print('F1-Score: ' + str(f1_score(y_test, y_pred_logit, average='weighted')))

Testing Model.
Model Results
[[4011  311   35   26  214   11    0   28]
 [ 226 4118   45   25  195    1    0   12]
 [ 115  128  728   16   15    1    0   12]
 [  80   53   27  934  152    6    0    0]
 [ 292  236   14   31 1233    2    0    5]
 [  39   22    5    6   14   25    0    3]
 [   0    0    0    0    0    0   18    1]
 [  33   28    8    0    3    0    0  462]]
Accuracy: 0.8232059978579079
Recall: 0.8232059978579079
Precision: 0.8229488327335591
F1-Score: 0.8211746065116802


In [88]:
# Score the held-out count-vectorizer features with the SGD classifier.
print('Testing Model.')
y_pred_sgd = sgd.predict(X_test)

# Model results: confusion matrix, accuracy, and recall / precision / F1 computed with
# average='weighted'.
print('Model Results')
print(confusion_matrix(y_test, y_pred_sgd))
print('Accuracy: ' + str(accuracy_score(y_test, y_pred_sgd)))
print('Recall: ' + str(recall_score(y_test, y_pred_sgd, average='weighted')))
print('Precision: ' + str(precision_score(y_test, y_pred_sgd, average='weighted')))
print('F1-Score: ' + str(f1_score(y_test, y_pred_sgd, average='weighted')))

Testing Model.
Model Results
[[3703  375  106   81  309   29    0   33]
 [ 362 3807   85   55  288    9    0   16]
 [ 119  111  706   31   31    8    1    8]
 [  88   78   37  929  117    3    0    0]
 [ 294  297   34   67 1091   17    0   13]
 [  38   12    8   12    9   33    0    2]
 [   1    0    0    0    0    0   17    1]
 [  29   29   10    1    3    4    0  458]]
Accuracy: 0.7671545876472688
Recall: 0.7671545876472688
Precision: 0.7670445127403522
F1-Score: 0.7669836866347125


In [93]:
# Save off the best model (the logistic regression). The context manager closes the
# file handle as soon as the dump has finished instead of leaving it open.
with open('text-model.pkl', 'wb') as model_file:
    pickle.dump(logit, model_file)

In [96]:
# Vectorize the remarks of the 'Unknown' outages with the fitted count vectorizer and
# predict a category code for each of them.
# NOTE(review): `all_unknowns` was taken before the rows without remark text were
# dropped from `outage` — confirm it contains no missing remarks.
unknowns = count_vectorizer.transform(all_unknowns['Mobile Data Remarks'])
unknown_pred = logit.predict(unknowns)

In [119]:
# Pair each unknown remark with its predicted category code. The remarks' index is
# reset so that it lines up with the 0..n-1 index of the prediction Series.
unknown_pred_df = pd.DataFrame({'comments': all_unknowns['Mobile Data Remarks'].reset_index(drop=True),
                                   'predictions': pd.Series(data=unknown_pred)})
unknown_pred_df.head()

Unnamed: 0,comments,predictions
0,25 ALF BLOWN AT PT 72 5159 UNKNOWN CAUSE BACK...,0
1,8 ATF BLOWN AT PT 73 263 UNKNOWN CAUSE REPLAC...,0
2,8 AMP TRANS FUSE WAS BLOWN UNKNOWN CAUSE AT P...,0
3,CLOSED RISER FUSE REPLACED FUSE,4
4,CLOSED 12K LINE FUSE ON POLE 71 2346 TIME WAS...,1


In [125]:
# Turn the decoder table into nested dicts of the form {column: {row index: value}}.
# NOTE(review): no visible cell assigns `decoder`; confirm where it is created.
decoder_dict = decoder.to_dict()
decoder_dict

{'Word': {0: 'Equipment',
  1: 'Vegetation',
  2: 'Public',
  3: 'Wildlife',
  4: 'Weather',
  5: 'Other',
  6: 'Power Supply',
  7: 'Planned'},
 'Number': {0: 0, 1: 7, 2: 5, 3: 9, 4: 8, 5: 2, 6: 4, 7: 3}}

In [126]:
def category_name(data):
    '''
    Return the entry stored under key `data` in the 'Word' mapping of the notebook
    global `decoder_dict`. Raises KeyError when `data` is not a key of that mapping.
    '''
    word_lookup = decoder_dict['Word']
    return word_lookup[data]

# Add the decoded name of every predicted code as a new column.
unknown_pred_df['outage_category_pred'] = unknown_pred_df['predictions'].apply(category_name)
unknown_pred_df.head()

Unnamed: 0,comments,predictions,outage_category_pred
0,25 ALF BLOWN AT PT 72 5159 UNKNOWN CAUSE BACK...,0,Equipment
1,8 ATF BLOWN AT PT 73 263 UNKNOWN CAUSE REPLAC...,0,Equipment
2,8 AMP TRANS FUSE WAS BLOWN UNKNOWN CAUSE AT P...,0,Equipment
3,CLOSED RISER FUSE REPLACED FUSE,4,Weather
4,CLOSED 12K LINE FUSE ON POLE 71 2346 TIME WAS...,1,Vegetation


In [128]:
# Write the predictions for the unknown outages to a local CSV (no index) ...
unknown_pred_df.to_csv('Unknown Outage Category Predictions.csv', index=False)

# ... and send that file to S3 under the 'sagemaker/results' prefix of the bucket.
output_path = sess.upload_data(
    path='Unknown Outage Category Predictions.csv', bucket=bucket,
    key_prefix='sagemaker/results')

In [27]:
# Logistic-regression coefficients per vocabulary term: one row per term of the count
# vectorizer, one column per category. Column i is filled from logit.coef_[i], so the
# list below has to follow the integer coding of the target.
best_words = pd.DataFrame(
    {category: logit.coef_[class_position]
     for class_position, category in enumerate(['Equipment', 'Vegetation', 'Public', 'Wildlife',
                                                'Weather', 'Other', 'Power Supply', 'Planned'])},
    index=count_vectorizer.get_feature_names())

# store the table locally, then send it to S3 under the 'sagemaker/results' prefix
best_words.to_csv('Best Words.csv')

output_path = sess.upload_data(
    path='Best Words.csv', bucket=bucket,
    key_prefix='sagemaker/results')

In [59]:
def run_batch_transform(model, test_data_location):
    '''
    Run the test data through a batch transform job of the given model.

    The job uses one ml.c5.4xlarge instance, reads `test_data_location` as text/csv and
    writes to the 'batch_results/<model name>' folder of the notebook global `bucket`.
    Nothing is returned.
    '''
    results_location = 's3://{}/batch_results/{}'.format(bucket, model.name)

    batch_job = model.transformer(instance_count=1,
                                  instance_type='ml.c5.4xlarge',
                                  output_path=results_location)

    batch_job.transform(data=test_data_location, content_type='text/csv')


# Run the batch transform for the top-ranked model(s) of modeling_df, using whatever
# test file `test_path` currently points to.
how_many_models_to_test = 1
for model in modeling_df['Actual Model'][:how_many_models_to_test]:
    run_batch_transform(model, test_path)

Using already existing model: Stochastic-Gradient--191016-1544-003-625076be


In [None]:
# Copy the batch transform results from S3 to the local instance with the AWS CLI.
# NOTE(review): the local target is a hard-coded absolute path inside a
# 'customer-retention' project folder — confirm it is the intended location for this
# notebook. The exit status of the command is not checked.
os.system('aws s3 sync s3://{}/batch_results/ /home/ec2-user/SageMaker/AmazonSageMaker-we-energies-customer-retention/batch_results/'.format(bucket))

In [None]:
def get_dataframe(results_dir='/home/ec2-user/SageMaker/AmazonSageMaker-we-energies-customer-retention/batch_results',
                  result_name='test_data_180.csv'):
    '''
    Loops through the directory on your local notebook instance where the batch results
    were stored, and generates a dataframe where each column is the output from a
    different model.

    Parameters:
        results_dir: folder holding one sub-folder per model. Defaults to the location
            the batch results are synced to.
        result_name: name of the result file without the '.out' suffix that batch
            transform appends.

    For every model sub-folder '<result_name>.out' is copied to '<result_name>' next to
    it and read as a single column named after the sub-folder. Sub-folders are visited
    in sorted order so the column order is the same on every run.

    Returns:
        DataFrame with one column per model sub-folder.

    Raises:
        ValueError: if results_dir contains no model sub-folder.
        FileNotFoundError: if a sub-folder has no '<result_name>.out' file.
    '''
    import shutil  # local import: only needed here for the file copy

    frames = []

    for sub_dir in sorted(os.listdir(results_dir)):
        # skip notebook checkpoint folders and stray .out files
        if '.ipynb' in sub_dir or '.out' in sub_dir:
            continue
        model_dir = os.path.join(results_dir, sub_dir)
        # anything that is not a folder cannot hold a model's results
        if not os.path.isdir(model_dir):
            continue

        new_file = os.path.join(model_dir, result_name)
        old_file = new_file + '.out'

        # remove the .out file format; copying in Python avoids building a shell
        # command from path names
        shutil.copyfile(old_file, new_file)

        frames.append(pd.read_csv(new_file, names=[sub_dir]))

    if not frames:
        raise ValueError('No batch result folders found in {}'.format(results_dir))

    return pd.concat(frames, axis=1)

In [None]:
def consolidate_results(df):
    '''
    Find the range of predictions for every row.

    Adds three columns to `df` (in place) and returns the same frame:
        'max'  - largest prediction of the row across the model columns
        'min'  - smallest prediction of the row across the model columns
        'diff' - 'max' minus 'min'

    Only the model columns take part in the calculation. The summary columns themselves
    are excluded, so their values cannot distort the minimum / maximum and calling the
    function again on its own output gives the same result.
    '''
    summary_columns = ('max', 'min', 'diff')
    prediction_columns = [column for column in df.columns if column not in summary_columns]
    predictions = df[prediction_columns]

    # row-wise, vectorised instead of looping over the rows
    row_max = predictions.max(axis=1)
    row_min = predictions.min(axis=1)

    df['max'] = row_max
    df['min'] = row_min
    df['diff'] = row_max - row_min

    return df

# Load the per-model batch results and add the per-row max / min / diff columns.
bare_df = get_dataframe()
consolidated_df = consolidate_results(bare_df)

In [None]:
def add_label_to_results(df, y_true=None, label_column='close_in_180'):
    '''
    Add the true outcomes to the dataframe.

    Parameters:
        df: frame of predictions; a 'y_true' column is added to it in place.
        y_true: optional sequence of true labels, one per row of df. When omitted the
            labels are read from column `label_column` of the notebook global `test_data`.
        label_column: column of `test_data` used when y_true is omitted.

    Returns:
        The same frame with the extra 'y_true' column.
    '''
    #test_data = pd.read_csv('test_data_180.csv')
    if y_true is None:
        y_true = test_data[label_column].values.tolist()
    # a plain list is assigned by position, so the index of the labels is irrelevant
    df['y_true'] = list(y_true)
    return df
    
    
# Attach the true labels to the consolidated predictions.
# NOTE(review): the labels come from test_data['close_in_180']; confirm `test_data` holds
# that column when this cell runs.
testing_df = add_label_to_results(consolidated_df)

In [None]:
def get_confusion_matrix(df, model_column, accuracy=None):
    '''
    Create a confusion matrix for one model column and print its precision and recall.

    Parameters:
        df: frame holding the true labels in 'y_true' and the predictions in `model_column`.
        model_column: column with the predictions; they are rounded before being tabulated.
        accuracy: when truthy, the overall accuracy is printed as well.

    The metrics are read from fixed positions of the cross table, so it has to be a
    binary problem with both classes present in the actuals and in the rounded predictions.

    Returns:
        The cross table, actuals as rows and predictions as columns.
    '''

    mx = pd.crosstab(index=df['y_true'], columns=np.round(df[model_column]), rownames=['actuals'], colnames=['predictions'])

    # lower right corner: actual positive, predicted positive
    tps = mx.iloc[1, 1]

    # upper right corner: actual negative, predicted positive
    fps = mx.iloc[0, 1]

    # lower left corner: actual positive, predicted negative
    fns = mx.iloc[1, 0]

    # precision looks at everything predicted positive, recall at everything actually positive
    precision = np.round(tps / (tps + fps), 4) * 100

    recall = np.round(tps / (tps + fns), 4) * 100

    print ('Precision = {}%, Recall = {}%'.format(precision, recall))

    if accuracy:

        # upper left corner: actual negative, predicted negative
        tns = mx.iloc[0, 0]

        accuracy = (tps + tns) / (fns + fps + tps + tns) * 100

        print ('Overall binary classification accuracy = {}%'.format(accuracy))

    return mx

In [None]:
# Confusion matrix, precision, recall and accuracy for a single model column of
# testing_df (the column is named after that model's batch result folder).
get_confusion_matrix(testing_df,'Random-Forest-Classi-191009-1813-006-ec402e88', accuracy=True)

In [None]:
# Ensembling? Evaluate the row-wise maximum prediction across all models.
get_confusion_matrix(testing_df, 'max', accuracy=True)