In [7]:
#general
import datetime
import pandas as pd
import numpy as np
import os
import re
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#AWS
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.estimator import Estimator
from sagemaker.model import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, fbeta_score
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
# Shared AWS / SageMaker context that the later cells rely on

# execution role attached to this notebook
role = get_execution_role()

# SageMaker session object
sess = sagemaker.Session()

# region of the session's underlying boto session
region = sess.boto_session.region_name

# account number, looked up through STS
account = sess.boto_session.client('sts').get_caller_identity()['Account']

# S3 bucket holding the data and the model artifacts
bucket = 'distribution-reliability-nlp'

# low-level SageMaker client
smclient = boto3.client('sagemaker')

# S3 prefix under which trained models are stored
model_location = 's3://{bucket}/modeling'.format(bucket=bucket)
print(model_location)

s3://distribution-reliability-nlp/modeling


In [5]:
# Read the outage comment export into a frame and preview the first rows
outage = pd.read_csv(filepath_or_buffer='We-WPS-Comments.csv')
outage.head(5)

Unnamed: 0,outage category,outage subcategory,Mobile Data Remarks
0,Equipment,,ON LtOut TRBL SAYS {OTHER SEE COMMENTS}
1,Equipment,Conductor/Cable,ON LtOut TRBL SAYS {ONE LEG OF URD IS DEAD. CU...
2,Vegetation,Out Clearance Zone (ROW),ON HAZ TRBL SAYS {REMOVED TREE FROM LINE NEARP...
3,Equipment,Conductor/Cable,ON LtOut TRBL SAYS {REPAIRED BAD PRIMARY URD C...
4,Equipment,,ON LtOut TRBL SAYS {BAD CONNECTOR ON CUST. SER...


In [9]:
def clean_comments(text):
    '''
    Strip the boilerplate dispatch phrases from a remark and normalise it.

    The known boilerplate phrases are removed one after another (wherever
    they occur), every '@' is spelled out as 'at', and each run of
    characters other than letters and digits becomes a single space.
    '''
    boilerplate_patterns = (
        r"ON LtOut WE\d{4,5} SAYS\s",
        r"ON LtOut TRBL SAYS",
        r"ON HAZ TRBL SAYS",
    )

    stripped = text
    # Order matters only in that it mirrors the original sequence of substitutions
    for boilerplate in boilerplate_patterns:
        stripped = re.sub(boilerplate, '', stripped)

    spelled_out = stripped.replace('@', 'at')
    return re.sub(r"[^a-zA-Z0-9]+", ' ', spelled_out)

# Clean every remark that is actually present. Missing remarks stay NaN
# instead of being stringified to the literal text 'nan', so the dropna()
# inside get_sub_category can discard them rather than letting the models
# train on a fake 'nan' token.
raw_remarks = outage['Mobile Data Remarks']
outage['Mobile Data Remarks'] = raw_remarks.where(
    raw_remarks.isna(),                         # keep the original value where it is missing
    raw_remarks.map(str).apply(clean_comments)  # otherwise use the cleaned text
)

In [10]:
# Remove the 'Unknown' and 'Not Reportable' categories
excluded_categories = ['Unknown', 'Not Reportable']
outage = outage[~outage['outage category'].isin(excluded_categories)]

In [46]:
outage_cats = ['Equipment', 'Vegetation', 'Public', 'Wildlife', 'Weather',
               'Other', 'Power Supply', 'Planned']

# One frame per general outage category, each selected with a boolean mask
_category_col = outage['outage category']

equip_outage = outage.loc[_category_col.eq('Equipment')]
veg_outage = outage.loc[_category_col.eq('Vegetation')]
pub_outage = outage.loc[_category_col.eq('Public')]
wild_outage = outage.loc[_category_col.eq('Wildlife')]
weather_outage = outage.loc[_category_col.eq('Weather')]
other_outage = outage.loc[_category_col.eq('Other')]
ps_outage = outage.loc[_category_col.eq('Power Supply')]
plan_outage = outage.loc[_category_col.eq('Planned')]

In [47]:
#convert target to numeric
def get_sub_category(data, general_category):
    '''
    Prepare the rows of one general category for training.

    Keeps only the 'outage subcategory' and 'Mobile Data Remarks' columns,
    discards rows with a missing value in either, writes the distinct
    subcategory names (one per line, in order of first appearance) to
    '<general_category>.txt', and returns the frame with the subcategory
    replaced by its integer code. A code is the zero-based position of the
    subcategory's name in that file.
    '''
    labelled = data[['outage subcategory', 'Mobile Data Remarks']].dropna()

    # One factorize call gives both the integer codes and the names they stand for
    codes, names = pd.factorize(labelled['outage subcategory'])

    with open(general_category + '.txt', 'w') as label_file:
        label_file.writelines('%s\n' % name for name in names)

    return labelled.assign(**{'outage subcategory': codes})

In [76]:
# Encode the subcategories of every general category (also writes one '<category>.txt' label file each)
equip_outage_cleaned = get_sub_category(data=equip_outage, general_category='Equipment')
veg_outage_cleaned = get_sub_category(data=veg_outage, general_category='Vegetation')
pub_outag_cleaned = get_sub_category(data=pub_outage, general_category='Public')
wild_outage_cleaned = get_sub_category(data=wild_outage, general_category='Wildlife')
weather_outage_cleaned = get_sub_category(data=weather_outage, general_category='Weather')
other_outage_cleaned = get_sub_category(data=other_outage, general_category='Other')
ps_outage_cleaned = get_sub_category(data=ps_outage, general_category='Power Supply')
plan_outage_cleaned = get_sub_category(data=plan_outage, general_category='Planned')

In [51]:
# Write each encoded frame to its own CSV, without header row or index column
csv_targets = (
    (equip_outage_cleaned, 'Equipment.csv'),
    (veg_outage_cleaned, 'Vegetation.csv'),
    (pub_outag_cleaned, 'Public.csv'),
    (wild_outage_cleaned, 'Wildlife.csv'),
    (weather_outage_cleaned, 'Weather.csv'),
    (other_outage_cleaned, 'Other.csv'),
    (ps_outage_cleaned, 'Power_Supply.csv'),
    (plan_outage_cleaned, 'Planned.csv'),
)
for cleaned_frame, csv_name in csv_targets:
    cleaned_frame.to_csv(csv_name, header=False, index=False)

In [52]:
# Upload every training CSV under the same key prefix of the project bucket
textdata_prefix = 'sagemaker/textdata'

equip_path = sess.upload_data(path='Equipment.csv', bucket=bucket, key_prefix=textdata_prefix)

veg_path = sess.upload_data(path='Vegetation.csv', bucket=bucket, key_prefix=textdata_prefix)

pub_path = sess.upload_data(path='Public.csv', bucket=bucket, key_prefix=textdata_prefix)

wild_path = sess.upload_data(path='Wildlife.csv', bucket=bucket, key_prefix=textdata_prefix)

weather_path = sess.upload_data(path='Weather.csv', bucket=bucket, key_prefix=textdata_prefix)

other_path = sess.upload_data(path='Other.csv', bucket=bucket, key_prefix=textdata_prefix)

ps_path = sess.upload_data(path='Power_Supply.csv', bucket=bucket, key_prefix=textdata_prefix)

plan_path = sess.upload_data(path='Planned.csv', bucket=bucket, key_prefix=textdata_prefix)

In [54]:
def start_hyper_tune(train_path, general_category):
    '''
    Submit a SageMaker hyperparameter tuning job for the logistic regression
    container on one general outage category.

    Parameters
    ----------
    train_path : str
        Location of the training data, handed to the 'training' channel
        (the notebook passes the value returned by sess.upload_data).
    general_category : str
        Category label used to build the tuning job's base name. Every run
        of characters other than letters and digits is replaced by a single
        hyphen, so 'Power Supply' and 'Power-Supply' produce the same name.

    Returns
    -------
    None
        'Tuning Started' is printed after the job is submitted; the call is
        made with wait=False.
    '''
    # NOTE(review): the misspelling is part of the S3 output prefix; correcting
    # it would change where model artifacts are written, so it is kept as is.
    algorithm_name = 'Logitstic Regression'
    algorithm_name_type = 'sagemaker-logistic-regression'

    #get docker image name in ECR
    image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, algorithm_name_type)

    #update where the model will be stored
    updated_model_location = model_location + '/' + algorithm_name

    #create estimator with the image selected
    est = Estimator(image_name=image,
                     role=role, 
                     train_instance_count=1, 
                     train_instance_type ='ml.c5.18xlarge',
                     output_path=updated_model_location,
                     sagemaker_session=sess,
                     subnets=['subnet-535e1629', 'subnet-28756740', 'subnet-2253e36e'],
                     security_group_ids=['sg-3168f952'])

    hyperparameter_ranges = {'C': ContinuousParameter(0.1, 1.0),
                            'multi_class': CategoricalParameter(['multinomial', 'auto']),
                            'solver': CategoricalParameter(['newton-cg', 'sag', 'saga', 'lbfgs']),
                            'data_type': CategoricalParameter(['text', 'word'])}

    # the container is expected to log a line such as 'Accuracy: 0.91'
    metric_defs = [{'Name': 'Accuracy',
                    'Regex': "Accuracy: ([0-9.]+).*$"}]

    #make name for tuning job; job names may only contain letters, digits and
    #hyphens, so anything else in the label (e.g. a space) becomes a hyphen
    safe_category = re.sub(r'[^a-zA-Z0-9]+', '-', general_category).strip('-')
    tuning_name = 'LR-' + safe_category

    #set up hyperparameter tuner
    tuner = HyperparameterTuner(estimator=est,
                                hyperparameter_ranges=hyperparameter_ranges,
                                base_tuning_job_name=tuning_name,
                                objective_type='Maximize',
                                objective_metric_name='Accuracy',
                                metric_definitions=metric_defs, 
                                max_jobs=15)

    #submit without blocking the notebook
    tuner.fit({'training': train_path}, wait=False)

    print('Tuning Started')

In [37]:
start_hyper_tune(train_path=equip_path, general_category='Equipment')

Tuning Started


In [53]:
start_hyper_tune(train_path=veg_path, general_category='Vegetation')

Tuning Started


In [56]:
start_hyper_tune(train_path=pub_path, general_category='Public')

Tuning Started


In [57]:
start_hyper_tune(train_path=wild_path, general_category='Wildlife')

Tuning Started


In [58]:
start_hyper_tune(train_path=weather_path, general_category='Weather')

Tuning Started


In [60]:
start_hyper_tune(train_path=other_path, general_category='Other')

Tuning Started


In [61]:
start_hyper_tune(train_path=ps_path, general_category='Power-Supply')

Tuning Started


In [62]:
start_hyper_tune(train_path=plan_path, general_category='Planned')

Tuning Started


In [69]:
def get_hyper_prams(general_category):
    '''
    Look up the logistic regression hyperparameters chosen for a category.

    Returns a dict with the keys 'C' and 'solver' for the six supported
    categories. For any other value a message is printed and None is
    returned.
    '''
    tuned_params = (
        ('Equipment', {'C': 1.0,  'solver': 'saga'}),
        ('Vegetation', {'C': 0.9213044405167753, 'solver': 'newton-cg'}),
        ('Public', {'C': 0.8505176838173545, 'solver': 'saga'}),
        ('Wildlife', {'C': 0.16984521446826012, 'solver': 'newton-cg'}),
        ('Weather', {'C': 0.9907417422692157, 'solver': 'newton-cg'}),
        ('Other', {'C': 0.3550248362985299, 'solver': 'lbfgs'}),
    )

    # The table is rebuilt on every call, so each caller receives its own dict
    for supported_category, params in tuned_params:
        if general_category == supported_category:
            return params

    return print('general category not supported')

In [66]:
def retrain_model(data, general_category):
    '''
    Refit the subcategory classifier for one general outage category and
    pickle the fitted vectorizer and model into the working directory.

    'Planned' and 'Power Supply' are not modelled: the fixed labels
    'New Construction' and 'Transmission' are returned for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the 'Mobile Data Remarks' (text) and
        'outage subcategory' (target) columns.
    general_category : str
        Category name; selects the hyperparameters and prefixes the two
        pickle file names ('<category>-count-vectorizer.pkl' and
        '<category>-model.pkl').

    Returns
    -------
    str or None
        The fixed label for 'Planned' / 'Power Supply'; otherwise None,
        after the accuracy on the 25% hold-out split has been printed.

    Raises
    ------
    ValueError
        If get_hyper_prams has no entry for general_category.
    '''
    if general_category == 'Planned':
        return 'New Construction'
    if general_category == 'Power Supply':
        return 'Transmission'

    # Resolve the hyperparameters first so that an unsupported category fails
    # with a clear error before anything is fitted or written to disk
    logit_hyper = get_hyper_prams(general_category)
    if logit_hyper is None:
        raise ValueError('general category not supported: ' + repr(general_category))

    X_train, X_test, y_train, y_test = train_test_split(data['Mobile Data Remarks'],
                                                        data['outage subcategory'],
                                                        test_size=0.25, random_state=42)

    #create count object
    count_vectorizer = CountVectorizer(stop_words='english')

    #learn the vocabulary on the training remarks only, then reuse it for the test remarks
    X_train = count_vectorizer.fit_transform(X_train)
    X_test = count_vectorizer.transform(X_test)

    #persist the fitted vectorizer; the with-block closes the file handle
    with open(general_category + '-count-vectorizer.pkl', 'wb') as vectorizer_file:
        pickle.dump(count_vectorizer, vectorizer_file)

    #train the logit
    logit = LogisticRegression(C=float(logit_hyper['C']),
                            solver=logit_hyper['solver'],
                            max_iter=10000,
                            multi_class='multinomial')

    logit = logit.fit(X_train, y_train)

    #persist the fitted model
    with open(general_category + '-model.pkl', 'wb') as model_file:
        pickle.dump(logit, model_file)

    y_pred = logit.predict(X_test)

    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))

In [67]:
retrain_model(data=equip_outage_cleaned, general_category='Equipment')

Accuracy: 0.9063403781979977


In [68]:
retrain_model(data=veg_outage_cleaned, general_category='Vegetation')

Accuracy: 0.8295033996040967


In [70]:
retrain_model(data=pub_outag_cleaned, general_category='Public')

Accuracy: 0.8800847457627119


In [71]:
retrain_model(data=wild_outage_cleaned, general_category='Wildlife')

Accuracy: 0.9075661572866195


In [72]:
retrain_model(data=weather_outage_cleaned, general_category='Weather')

Accuracy: 0.7685325264750378


In [73]:
retrain_model(data=other_outage_cleaned, general_category='Other')

Accuracy: 0.8755186721991701


In [74]:
retrain_model(data=ps_outage_cleaned, general_category='Power Supply')

'Transmission'

In [75]:
retrain_model(data=plan_outage_cleaned, general_category='Planned')

'New Construction'