In [1]:
#general
import datetime
import pandas as pd
import numpy as np
import os
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
#AWS
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.estimator import Estimator
from sagemaker.model import Model

In [2]:
# Shared AWS / SageMaker handles and S3 locations used by the cells below.

# SageMaker session; the region and account number are both read from its boto session
sess = sagemaker.Session()
region = sess.boto_session.region_name
account = sess.boto_session.client('sts').get_caller_identity()['Account']

# execution role returned by the SageMaker SDK
role = get_execution_role()

# low-level SageMaker client
smclient = boto3.client('sagemaker')

# S3 bucket and the prefix used as the base output path for trained models
bucket = 'distribution-reliability-nlp'
model_location = 's3://{}/modeling'.format(bucket)
print(model_location)

s3://distribution-reliability-nlp/modeling


# Import and Prep Data

In [24]:
# load the outage remarks together with their category labels
outage = pd.read_csv('We_WPS_comments_categories.csv')
# preview the first five rows
outage.head(5)

Unnamed: 0,outage category,outage subcategory,Mobile Data Remarks,Company
0,Equipment,Conductor/Cable,one leg of urd is dead customer said we can f...,WE
1,Vegetation,Out Clearance Zone (ROW),removed tree from line nearpole63 1688 and cl...,WE
2,Equipment,Conductor/Cable,repaired bad primary urd cables cable repaire...,WE
3,Equipment,,bad connector on cust serv pole conn replaced...,WE
4,Equipment,Simple Interrupting Device,replaced broken cutout and installed wlp taq ...,WE


In [25]:
#separating out unknown causes
all_unknowns = outage[outage['outage category'] == 'Unknown']
#save off unknowns
#all_unknowns.to_csv('Unknown Outcomes.csv')
#remove unknowns and not reportable
#.copy() makes the filtered frame independent, so the column assignment below
#does not raise SettingWithCopyWarning
excluded_categories = ['Unknown', 'Not Reportable']
outage = outage[~outage['outage category'].isin(excluded_categories)].copy()

#convert target to numeric
#category_labels[i] is the original label for code i; it is kept so numeric
#predictions can be mapped back to category names
category_codes, category_labels = pd.factorize(outage['outage category'])
outage['outage category'] = category_codes
outage.head()

Unnamed: 0,outage category,outage subcategory,Mobile Data Remarks,Company
0,0,Conductor/Cable,one leg of urd is dead customer said we can f...,WE
1,1,Out Clearance Zone (ROW),removed tree from line nearpole63 1688 and cl...,WE
2,0,Conductor/Cable,repaired bad primary urd cables cable repaire...,WE
3,0,,bad connector on cust serv pole conn replaced...,WE
4,0,Simple Interrupting Device,replaced broken cutout and installed wlp taq ...,WE


In [26]:
# drop rows without remark text, printing the shape before and after
print(outage.shape)
outage = outage.dropna(subset=['Mobile Data Remarks'])
print(outage.shape)

(139826, 4)
(139785, 4)


In [27]:
# company whose outages are modeled in this run
company = 'WPS'

# keep only that company's rows, printing the shape before and after
print(outage.shape)
outage = outage[outage['Company'] == company]
print(outage.shape)

# word-count vectorizer with English stop words removed
count_vectorizer = CountVectorizer(stop_words='english')
# fit the vocabulary on the remarks and encode them as a sparse count matrix
X_train = count_vectorizer.fit_transform(outage['Mobile Data Remarks'])
# the outage category column is the target
y_train = outage['outage category']

# local file names for the feature matrix and the target
train_file = 'train_{}.npz'.format(company)
train_target_file = 'train_target_{}.csv'.format(company)

# write both to local disk (target has no header and no index column)
scipy.sparse.save_npz(train_file, X_train)
y_train.to_csv(train_target_file, header=False, index=False)

# upload both files to S3; the returned paths are passed to the tuner as input channels
train_path = sess.upload_data(path=train_file,
                              bucket=bucket,
                              key_prefix='sagemaker/data')
train_target_path = sess.upload_data(path=train_target_file,
                                     bucket=bucket,
                                     key_prefix='sagemaker/data')

(139785, 4)
(37917, 4)


# Train

In [8]:
def get_estimator(algorithm_name, algorithm_type, account, sess, role):
    '''
    Build the SageMaker Estimator for one of the algorithm images stored in ECR.

    Parameters
    ----------
    algorithm_name : str
        Display name of the algorithm, e.g. 'Random Forest'. Logistic regression
        is accepted both under the historical spelling 'Logitstic Regression'
        and as 'Logistic Regression'.
    algorithm_type : str
        'Classification' or 'Regression'.
    account : str
        AWS account number that owns the ECR repository.
    sess : sagemaker.Session
        Session used for training; the ECR region is read from its boto session.
    role : str
        Execution role passed to the Estimator.

    Returns
    -------
    Estimator
        Configured for one ml.c5.18xlarge instance inside the listed subnets and
        security group, writing model output under the module-level
        ``model_location`` + '/' + algorithm_name.

    Raises
    ------
    ValueError
        If algorithm_type is not supported, or algorithm_name is not available
        for that type. (Previously these cases printed a message and returned
        None, or failed with UnboundLocalError.)
    '''
    #ECR repository name for every supported (type, algorithm) pair
    image_names = {
        'Classification': {
            'Neural Network': 'sagemaker-basic-neural-network-classification',
            'Decision Tree': 'sagemaker-decision-tree-classification',
            'Gradient Boosting': 'sagemaker-gradient-boosting-classification',
            'K-Nearest Neighbors': 'sagemaker-k-nearest-neighbors-classification',
            #misspelled key kept because existing callers pass it
            'Logitstic Regression': 'sagemaker-logistic-regression',
            'Logistic Regression': 'sagemaker-logistic-regression',
            'Naive Bayes': 'sagemaker-naive-bayes-multinomial',
            'Random Forest': 'sagemaker-random-forest-classification',
            'Stochastic Gradient Descent': 'sagemaker-stochastic-gradient-descent-classification',
            'Support Vector Machine': 'sagemaker-support-vector-classification',
        },
        'Regression': {
            'Neural Network': 'sagemaker-basic-neural-network-regression',
            'Decision Tree': 'sagemaker-decision-tree-regression',
            'Gradient Boosting': 'sagemaker-gradient-boosting-regression',
            'K-Nearest Neighbors': 'sagemaker-k-nearest-neighbors-regression',
            'Random Forest': 'sagemaker-random-forest-regression',
            'Stochastic Gradient Descent': 'sagemaker-stochastic-gradient-descent-regression',
            'Support Vector Machine': 'sagemaker-support-vector-regression',
            'Lasso Regression': 'sagemaker-lasso-regression',
            'Linear Regression': 'sagemaker-linear-regression',
            'Ridge Regression': 'sagemaker-ridge-regression',
        },
    }

    #fail loudly instead of handing None (or an unbound name) to the tuner
    if algorithm_type not in image_names:
        raise ValueError(
            "ModelTypeError: algorithm_type must be 'Classification' or 'Regression', "
            "got {!r}.".format(algorithm_type))
    if algorithm_name not in image_names[algorithm_type]:
        raise ValueError('ModelTypeError: The model entered is not available. '
                         'Please select a new model type.')
    algorithm_name_type = image_names[algorithm_type][algorithm_name]

    #get docker image name in ECR; region comes from the session that was passed in
    ecr_region = sess.boto_session.region_name
    image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, ecr_region, algorithm_name_type)

    #update where the model will be stored (model_location is defined at notebook level)
    updated_model_location = model_location + '/' + algorithm_name

    #create estimator with the image selected
    #VPC settings, see:
    #https://sagemaker.readthedocs.io/en/stable/overview.html#secure-training-and-inference-with-vpc
    #https://docs.aws.amazon.com/sagemaker/latest/dg/train-vpc.html
    est = Estimator(image_name=image,
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c5.18xlarge',
                    output_path=updated_model_location,
                    sagemaker_session=sess,
                    subnets=['subnet-535e1629', 'subnet-28756740', 'subnet-2253e36e'],
                    security_group_ids=['sg-3168f952'])
    return est

In [9]:
def _get_hyperparameter_ranges(algorithm_name):
    '''
    Return the hyperparameter search space (name -> SageMaker parameter range)
    for one algorithm.

    Raises ValueError when algorithm_name has no search space defined here.
    '''
    #NOTE(review): max_features upper bound 437 is presumably a feature count
    #from an earlier dataset -- confirm it still fits the current vocabulary
    if algorithm_name == 'Neural Network':
        return {'hidden_layer_sizes': IntegerParameter(2, 250),
                'activation': CategoricalParameter(['identity', 'logistic', 'tanh', 'relu']),
                'solver': CategoricalParameter(['lbfgs', 'sgd', 'adam']),
                'alpha': ContinuousParameter(0.0001, 0.1),
                'batch_size': IntegerParameter(100, 1000),
                'learning_rate': CategoricalParameter(['constant', 'invscaling', 'adaptive']),
                'learning_rate_init': ContinuousParameter(0.0001, 0.1),
                'power_t': ContinuousParameter(0.1, 0.8),
                'max_iter': IntegerParameter(100, 250),
                'beta_1': ContinuousParameter(0.1, 0.9),
                'beta_2': ContinuousParameter(0.1, 0.999)}

    if algorithm_name == 'Decision Tree':
        return {'criterion': CategoricalParameter(['gini', 'entropy']),
                'splitter': CategoricalParameter(['best', 'random']),
                'max_depth': IntegerParameter(10, 100),
                'min_samples_split': IntegerParameter(2, 100),
                'min_samples_leaf': IntegerParameter(1, 100),
                'min_weight_fraction_leaf': ContinuousParameter(0, .5),
                'max_features': IntegerParameter(1, 437),
                'max_leaf_nodes': IntegerParameter(2, 100),
                'min_impurity_decrease': ContinuousParameter(0, .25)}

    if algorithm_name == 'Random Forest':
        return {'n_estimators': IntegerParameter(250, 1000),
                'criterion': CategoricalParameter(['gini', 'entropy']),
                #tree-shape parameters currently left out of the search:
                #'max_depth': IntegerParameter(10, 100),
                #'min_samples_split': IntegerParameter(2, 100),
                #'min_samples_leaf': IntegerParameter(1, 100),
                #'min_weight_fraction_leaf': ContinuousParameter(0, .5),
                #'max_features' : IntegerParameter(1, 437),
                #'max_leaf_nodes': IntegerParameter(2, 100),
                #'min_impurity_decrease': ContinuousParameter(0, .25),
                'bootstrap': CategoricalParameter([True, False])}

    if algorithm_name == 'Gradient Boosting':
        return {'learning_rate': ContinuousParameter(0.001, .2),
                'n_estimators': IntegerParameter(250, 1000),
                'subsample': ContinuousParameter(.1, 1.0),
                'criterion': CategoricalParameter(['friedman_mse', 'mse', 'mae']),
                'max_depth': IntegerParameter(10, 100),
                'min_samples_split': IntegerParameter(2, 100),
                'min_samples_leaf': IntegerParameter(1, 100),
                'min_weight_fraction_leaf': ContinuousParameter(0, .5),
                'max_features': IntegerParameter(1, 437),
                'max_leaf_nodes': IntegerParameter(2, 100),
                'min_impurity_decrease': ContinuousParameter(0, 0.25)}

    if algorithm_name == 'K-Nearest Neighbors':
        #NOTE(review): the range for 'p' starts at 0 -- confirm the training
        #container accepts p=0
        return {'n_neighbors': IntegerParameter(3, 250),
                'weights': CategoricalParameter(['uniform', 'distance']),
                'algorithm': CategoricalParameter(['auto', 'ball_tree', 'kd_tree', 'brute']),
                'leaf_size': IntegerParameter(10, 50),
                'p': IntegerParameter(0, 5)}

    #misspelled name kept because existing callers pass it
    if algorithm_name in ('Logitstic Regression', 'Logistic Regression'):
        return {'C': ContinuousParameter(0.1, 1.0),
                'multi_class': CategoricalParameter(['multinomial', 'auto']),
                'solver': CategoricalParameter(['newton-cg', 'sag', 'saga', 'lbfgs'])}

    if algorithm_name == 'Naive Bayes':
        return {'alpha': ContinuousParameter(0, 1.0),
                'fit_prior': CategoricalParameter([True, False])}

    if algorithm_name == 'Stochastic Gradient Descent':
        return {'loss': CategoricalParameter(['hinge', 'log', 'modified_huber', 'squared_hinge',
                                              'perceptron']),
                'penalty': CategoricalParameter(['none', 'l2', 'l1', 'elasticnet']),
                'alpha': ContinuousParameter(0.0001, 0.01),
                'l1_ratio': ContinuousParameter(0.01, 1.0),
                'learning_rate': CategoricalParameter(['constant', 'optimal', 'invscaling', 'adaptive']),
                'eta0': ContinuousParameter(0.0, 0.1),
                'power_t': ContinuousParameter(0.1, 1.0)}

    if algorithm_name == 'Support Vector Machine':
        #NOTE(review): 'precomputed' is in the kernel list -- confirm the
        #container can train on the uploaded feature matrix with that kernel
        return {'C': ContinuousParameter(0.1, 1.0),
                'kernel': CategoricalParameter(['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']),
                'degree': IntegerParameter(1, 5),
                'gamma': IntegerParameter(1, 5),
                'shrinking': CategoricalParameter([True, False]),
                'probability': CategoricalParameter([True, False]),
                'decision_function_shape': CategoricalParameter(['ovr', 'ovo'])}

    raise ValueError('ModelTypeError: no hyperparameter ranges are defined for '
                     '{!r}.'.format(algorithm_name))


def get_turner(algorithm_name, algorithm_type, data_type, eval_metric, eval_direction, estimator):
    '''
    Build the HyperparameterTuner for one algorithm.

    Parameters
    ----------
    algorithm_name : str
        Algorithm whose search space is used, e.g. 'Random Forest'.
    algorithm_type : str
        Only used in the tuning job name (e.g. 'Classification').
    data_type : str
        'text' adds a data_type range of ['text', 'word']; any other value adds
        ['numeric', 'numbers'].
    eval_metric : str
        Objective metric: 'F1-Score', 'Accuracy', 'Recall' or 'Precision'. It is
        extracted from the training log with the regex '<metric>: ([0-9.]+).*$'.
    eval_direction : str
        Passed to the tuner as objective_type (e.g. 'Maximize').
    estimator : Estimator
        Estimator the tuner launches training jobs with.

    Returns
    -------
    HyperparameterTuner
        Limited to 10 training jobs. The base job name is built from the
        module-level ``company``, the algorithm name and the algorithm type.

    Raises
    ------
    ValueError
        If eval_metric or algorithm_name is not supported. (Previously these
        cases failed later with UnboundLocalError.)
    '''
    #validate the metric first so a typo fails before any AWS object is built
    supported_metrics = ('F1-Score', 'Accuracy', 'Recall', 'Precision')
    if eval_metric not in supported_metrics:
        raise ValueError('eval_metric must be one of {}, got {!r}.'.format(
            ', '.join(supported_metrics), eval_metric))

    #get hyperparameter ranges (raises ValueError for an unknown algorithm)
    hyperparameter_ranges = _get_hyperparameter_ranges(algorithm_name)

    #add the data type to the hyperparameter dict
    #NOTE(review): the two values in each list look like synonyms used only to
    #get the data type through the tuner to the container -- confirm
    if data_type == 'text':
        hyperparameter_ranges['data_type'] = CategoricalParameter(['text', 'word'])
    else:
        hyperparameter_ranges['data_type'] = CategoricalParameter(['numeric', 'numbers'])

    #create metrics definitions; every supported metric is logged as '<name>: <value>'
    metric_defs = [{'Name': eval_metric,
                    'Regex': '{}: ([0-9.]+).*$'.format(eval_metric)}]

    #make name for tuning job (company is defined at notebook level)
    tuning_name = company + '-' + algorithm_name.replace(' ', '-') + '-' + algorithm_type

    #set up hyperparameter tuner
    tuner = HyperparameterTuner(estimator=estimator,
                                hyperparameter_ranges=hyperparameter_ranges,
                                base_tuning_job_name=tuning_name,
                                objective_type=eval_direction,
                                objective_metric_name=eval_metric,
                                metric_definitions=metric_defs,
                                max_jobs=10)

    return tuner

In [10]:
# all model options -- uncomment the algorithms to tune in this run.
# Every entry ends with a comma so that uncommenting neighbouring lines can
# never fuse two names through implicit string concatenation.
# 'Logitstic Regression' is spelled this way on purpose: it is the exact key
# that get_estimator and get_turner compare against.
algorithms_1 = [
    #'K-Nearest Neighbors',
    #'Naive Bayes',
    #'Support Vector Machine',
    'Logitstic Regression',
    #'Decision Tree',
]
algorithms_2 = [
    #'Stochastic Gradient Descent',
    #'Random Forest',
    #'Gradient Boosting',
    #'Neural Network',
]

algorithms = algorithms_1 + algorithms_2
print(algorithms)

# settings shared by every tuning job
algorithm_type = 'Classification'
eval_metric = 'Accuracy'
eval_direction = 'Maximize'
data_type = 'text'

['Logitstic Regression']


In [28]:
# start one hyperparameter tuning job per selected algorithm
for algorithm in algorithms:
    print(algorithm)
    # estimator for this algorithm's image
    est = get_estimator(algorithm, algorithm_type, account, sess, role)
    # tuner built around that estimator
    tuner = get_turner(algorithm, algorithm_type, data_type,
                       eval_metric, eval_direction, est)
    # S3 inputs: feature matrix and target uploaded earlier
    input_channels = {'training': train_path,
                      'target': train_target_path}
    # wait=False: presumably returns without blocking on the job -- confirm in the SDK docs
    tuner.fit(input_channels, wait=False)
    print('Model Started Training.')

Logitstic Regression
Model Started Training.
