In [1]:
#cleaning data
import pandas as pd
import numpy as np
import datetime
import re
import os
import time
import boto3
#prep for model
from sklearn.feature_extraction.text import CountVectorizer
#pulling in the model
from sklearn.externals import joblib

In [2]:
def clean_comments(text):
    '''
    This function will remove the common text before the comments
    '''
    pattern_1 = r"ON LtOut WE\d{4,5} SAYS\s"
    regex_1 = re.sub(pattern_1, '', text)
    pattern_2 = r"ON LtOut TRBL SAYS"
    regex_2 = re.sub(pattern_2, '', regex_1)
    pattern_3 = r"ON HAZ TRBL SAYS"
    regex_3 = re.sub(pattern_3, '', regex_2)
    new_text = regex_3.replace('@', 'at')
    pattern_4 = r"[^a-zA-Z0-9]+"
    return re.sub(pattern_4, ' ', new_text)

In [3]:
def category_name(data, decoder_obj):
    '''
    This function takes model predictions and returns the word category
    '''
    result = []
    for pred in data:
        result.append(decoder_obj[pred])
    return result

In [4]:
def get_decoder(general_category):

    decoder_list = []
    f = open(general_category + '.txt', 'r')
    f1 = f.readlines()
    for x in f1:
        x = x[:-1]
        decoder_list.append(x)

    decoder_dict = {}
    for i in range(0, len(decoder_list)):
        decoder_dict[i] = decoder_list[i]
        
    return decoder_dict

In [5]:
def predict_category(data, category):
    if category == 'Planned':
        data['Subcategory Predictions'] = 'New Construction'
        return data
    elif category == 'Power-Supply':
        data['Subcategory Predictions'] = 'Transmission'
        return data
    else:
        if category == 'General':
            var_name = 'General'
        else:
            var_name = 'Subcategory'
        
        
        #import text vectorizer
        count_vectorizer = joblib.load(category + '-count-vectorizer.pkl')
        #import model
        model = joblib.load(category +'-model.pkl')  
        
        if category != 'General':
        #limit the data
            data = data[data['General Predictions'] == str(category)]
            
        #vectorize text
        X = count_vectorizer.transform(data['Mobile Data Remarks'])

        # Use the loaded model to make predictions
        try:
            pred = model.predict(X)
            pred_prob = model.predict_proba(X)
            
            confidence = []
            for i in range(0,len(pred_prob)):
                confidence.append(pred_prob[i][pred[i]])
        except:
            data[var_name + ' Predictions'] = ''
            data[var_name + ' Confidence'] = ''
            return data

        decoder = get_decoder(category)
        

        data[var_name + ' Predictions'] = category_name(pred, decoder)
        data[var_name + ' Confidence'] = confidence

        return data

In [None]:
os.chdir(path)

today = datetime.datetime.today()

we_file = 'We-Outages_{}_{}_{}.xlsx'.format(today.year, 
                                            today.month,
                                           today.day)

wps_file = 'WPS-Outages_{}_{}_{}.xlsx'.format(today.year, 
                                            today.month,
                                           today.day)


all_files = [we_file, wps_file]

In [26]:
bucket = 'distribution-reliability-nlp'
#make an s3 object
s3 = boto3.client('s3')
asset_files = s3.list_objects(Bucket=bucket, Prefix='testing/assets/')['Contents']

for file in asset_files:
    the_file = file['Key']
    file_name = the_file.split('/')[-1]
    import_location = '/home/ec2-user/SageMaker/AmazonSageMaker-distribution-reliability-nlp/test/{}'.format(file_name)
    s3.download_file(bucket, the_file, import_location)

In [None]:
#copy model results from s3 to instance
for file in all_files:
    import_location = 'testing/import/{}'.format(file)
    s3.download_file(bucket, import_location, file)
                                                            
#import data
we_outages = pd.read_excel(we_file)
wps_outages = pd.read_excel(wps_file)


#save raw data to new s3 location
we_response = s3.upload_file(we_file,
                                Bucket='distribution-reliability-nlp', 
                                Key='testing/raw/{}'.format(we_file))

wps_response = s3.upload_file(wps_file,
                                Bucket='distribution-reliability-nlp', 
                                Key='testing/raw/{}'.format(wps_file))

In [7]:
code_time =[]
t0 = time.time()
we_keep = ['Outage', 'Mobile Data Remarks']
wps_keep = ['Event', 'ClosureRemarks']

we_save = we_outages[we_keep]
wps_save = wps_outages[wps_keep]

we_save['Company'] = 'We Energies'
wps_save['Company'] = 'WPS'

we_save = we_save.rename(columns={"Outage": "Outage ID"})
wps_save = wps_save.rename(columns={"Event": "Outage ID", 'ClosureRemarks': 'Mobile Data Remarks'})

all_outages = pd.concat([we_save, wps_save])


#make sure comments are strings
all_outages['Mobile Data Remarks'] = all_outages['Mobile Data Remarks'].map(str)
all_outages['Mobile Data Remarks'] = all_outages['Mobile Data Remarks'].apply(clean_comments)


t1 = time.time()
code_time.append(t1-t0)
print(code_time[-1])

0.40266895294189453


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
t0 = time.time()
all_outages = predict_category(all_outages, 'General')

outage_types = ['Equipment', 'Vegetation', 'Public', 'Wildlife', 'Weather', 
                'Other', 'Power-Supply', 'Planned']

results_list = []
for types in outage_types:
    results_list.append(predict_category(all_outages, types))

results_df = pd.concat(results_list, sort=True)

results_df = results_df[['Outage ID', 'Company', 
                         'General Predictions', 'General Confidence', 
                        'Subcategory Predictions', 'Subcategory Confidence']]

results_df['Was the Outage Reportable?'] = ''
results_df['Reportable Reference Outage ID'] = ''
results_df['Did You Change the General Category?'] = ''
results_df['What was the Correct General Category?'] = ''
results_df['Did You Change the Subcategory Category?'] = ''
results_df['What was the Correct Subcategory Category?'] = ''

t1 = time.time()
code_time.append(t1-t0)
print(code_time[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


3.3985447883605957


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [None]:
results_file_location = ('NLP-Results_{}_{}_{}.csv').format(today.year, 
                                                         today.month,
                                                        today.day)

#save to csv
results_df.to_csv(results_file_location, header=False, index=False)
    
results_response = s3.upload_file(Filename=results_file_location,
                                Bucket='distribution-reliability-nlp', 
                                Key='testing/export/{}'.format(results_file_location))
    

In [2]:
#make an s3 object
s3 = boto3.client('s3')
#get the bucket from the event object
bucket = 'distribution-reliability-nlp' #event['Records'][0]['s3']['bucket']['name']


#get to day's date
today = datetime.datetime.today()

#file names in s3
model_file = 'Model_{}_{}_{}.csv'.format(today.year, 
                                        today.month,
                                         today.day)


import_location = 'testing/to_model/{}'.format(model_file)
s3.download_file(bucket, import_location, model_file)

#import data
all_outages = pd.read_csv(model_file)

Unnamed: 0,Company,Mobile Data Remarks,Outage ID
0,We Energies,ON LtOut W41569 SAYS XFMR AT 84 24423 HAD MULT...,2793281
1,We Energies,120V LEG OF SERVICE DROP DAMAGED BY GUNFIRE I...,2793285
2,We Energies,1 INSULINK PULLED OFF REPAIRED POWER OK 0755 ...,2793323
3,We Energies,A PHASE RECLOSER DEFECTIVE POLE 80 03632 BYPA...,2793337
4,We Energies,A PHASE RECLOSER DEFECTIVE POLE 80 03632 BYPA...,2793337


In [11]:
all_outages = all_outages.dropna()

In [12]:
all_outages = all_outages[all_outages['Mobile Data Remarks'] != 'nan']
all_outages = all_outages[all_outages['Mobile Data Remarks'] != ' ']

In [13]:
all_outages

Unnamed: 0,Company,Mobile Data Remarks,Outage ID
0,We Energies,ON LtOut W41569 SAYS XFMR AT 84 24423 HAD MULT...,2793281
1,We Energies,120V LEG OF SERVICE DROP DAMAGED BY GUNFIRE I...,2793285
2,We Energies,1 INSULINK PULLED OFF REPAIRED POWER OK 0755 ...,2793323
3,We Energies,A PHASE RECLOSER DEFECTIVE POLE 80 03632 BYPA...,2793337
4,We Energies,A PHASE RECLOSER DEFECTIVE POLE 80 03632 BYPA...,2793337
5,We Energies,ON HAZ XXWB13 SAYS CUSTOMER CUT TREE FELL INTO...,2793413
6,We Energies,ON HAZ XXWB13 SAYS CUSTOMER CUT TREE FELL INTO...,2793413
7,We Energies,ON HAZ XXWB13 SAYS CUSTOMER CUT TREE FELL INTO...,2793413
8,We Energies,REPLACED DEFT INSULINKS AT WEATHERHEAD POWER ...,2793436
9,We Energies,GARAGE FIRE SERVICE DROPS DOWN TO BOTH 4620 A...,2793456


In [14]:
comments = list(all_outages['Mobile Data Remarks'])

for comment in comments:
    if len(str(comment)) < 10:
        print(comment)

planned
planned
no
squirrel
squirrel
tree fell
