In [1]:
#general
import datetime
import pandas as pd
import numpy as np
import os
import scipy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the labelled outage comments. The raw frame is kept untouched so this
# cell can be re-run without re-reading state left over from a previous run.
outage_raw = pd.read_csv('We_WPS_comments_categories.csv')

# Separate out the rows whose cause is unknown.
all_unknowns = outage_raw[outage_raw['outage category'] == 'Unknown']
# Optionally save the unknown rows for later inspection.
#all_unknowns.to_csv('Unknown Outcomes.csv')

# Remove unknown and not-reportable outages. The explicit .copy() makes
# `outage` an independent frame, so the column assignment below does not
# write into a slice of `outage_raw` (avoids SettingWithCopyWarning).
excluded_categories = ['Unknown', 'Not Reportable']
outage = outage_raw[~outage_raw['outage category'].isin(excluded_categories)].copy()

# Convert the target to integer codes. factorize() runs before the remark
# filter below, exactly as before, so the category -> code mapping is unchanged.
outage['outage category'] = pd.factorize(outage['outage category'])[0]

# Keep only rows that actually have remark text.
outage = outage[outage['Mobile Data Remarks'].notna()]
outage.head()

Unnamed: 0,outage category,outage subcategory,Mobile Data Remarks
0,0,Conductor/Cable,one leg of urd is dead customer said we can f...
1,1,Out Clearance Zone (ROW),removed tree from line nearpole63 1688 and cl...
2,0,Conductor/Cable,repaired bad primary urd cables cable repaire...
3,0,,bad connector on cust serv pole conn replaced...
4,0,Simple Interrupting Device,replaced broken cutout and installed wlp taq ...


In [3]:
# Hold out 10% of the cleaned rows for evaluation; the fixed seed keeps the
# split identical across runs.
train_data, test_data = train_test_split(
    outage,
    test_size=0.1,
    random_state=42,
)

In [4]:
def vectorizing(vectorizing_type):
    """Return a new, unfitted text vectorizer for the requested scheme.

    Parameters
    ----------
    vectorizing_type : str
        One of 'Count', 'TFIDF' or 'Hash'.

    Returns
    -------
    CountVectorizer, TfidfVectorizer or HashingVectorizer
        Configured with English stop-word removal.

    Raises
    ------
    ValueError
        If ``vectorizing_type`` is not one of the supported names. Failing
        here is clearer than returning None and letting the caller crash
        later on ``None.fit_transform``.
    """
    if vectorizing_type == 'Count':
        return CountVectorizer(stop_words='english')
    if vectorizing_type == 'TFIDF':
        return TfidfVectorizer(stop_words='english')
    if vectorizing_type == 'Hash':
        return HashingVectorizer(stop_words='english')
    raise ValueError(
        "Vectorizer Not an option: {!r} (expected 'Count', 'TFIDF' or 'Hash')".format(vectorizing_type)
    )

In [5]:
def unsupervised_model(algorithm_name, components):
    """Return a new, unfitted dimensionality-reduction model.

    Parameters
    ----------
    algorithm_name : str
        'NMF' for non-negative matrix factorization or 'SVD' for truncated SVD.
    components : int
        Passed through as ``n_components`` to the chosen model.

    Returns
    -------
    NMF or TruncatedSVD

    Raises
    ------
    ValueError
        If ``algorithm_name`` is not 'NMF' or 'SVD'. Previously this case
        printed a message and then failed with UnboundLocalError on return.
    """
    if algorithm_name == 'NMF':
        return NMF(n_components=components)
    if algorithm_name == 'SVD':
        return TruncatedSVD(n_components=components)
    raise ValueError(
        "Algorithm is not an option: {!r} (expected 'NMF' or 'SVD')".format(algorithm_name)
    )

In [6]:
def logit_model(training_data, testing_data, train_labels=None, test_labels=None):
    """Fit a multinomial logistic regression and return its test accuracy.

    Parameters
    ----------
    training_data : array-like or sparse matrix
        Feature matrix used to fit the classifier.
    testing_data : array-like or sparse matrix
        Feature matrix the fitted classifier is scored on.
    train_labels : array-like, optional
        Targets aligned with ``training_data``. Defaults to the notebook-level
        ``train_data['outage category']``, which is what this function always
        used before the parameter existed.
    test_labels : array-like, optional
        Targets aligned with ``testing_data``. Defaults to the notebook-level
        ``test_data['outage category']``.

    Returns
    -------
    float
        Accuracy of the predictions on ``testing_data``.
    """
    # Fall back to the notebook globals only when the caller passes no labels,
    # so existing two-argument calls keep working unchanged.
    if train_labels is None:
        train_labels = train_data['outage category']
    if test_labels is None:
        test_labels = test_data['outage category']

    # Fixed hyperparameters, with the same values as before.
    # NOTE(review): newer scikit-learn releases deprecate `multi_class`;
    # confirm against the installed version before upgrading.
    classifier = LogisticRegression(C=0.865280262065544,
                                    solver='newton-cg',
                                    max_iter=10000,
                                    multi_class='multinomial')
    classifier.fit(training_data, train_labels)

    predictions = classifier.predict(testing_data)
    return accuracy_score(test_labels, predictions)

In [7]:
# Result accumulators: parallel lists holding the measurements of each
# experiment run; every name is bound to its own fresh empty list.
vector_time, unsupervised_train_time, unsupervised_apply_time = [], [], []
model_time, model_accuracy = [], []
this_vector, this_algorithm, this_component = [], [], []

# Experiment grid. Only the active options are listed; the alternatives that
# are currently switched off are noted next to each list.
vectorizing_types = ['Count']  # switched off: 'TFIDF', 'Hash'
algorithm_names = ['SVD']      # switched off: 'NMF', 'None'
components = [25000]

In [None]:
# Run every (vectorizer, unsupervised algorithm, component count) combination
# and record timings and accuracy. Each iteration appends exactly one entry to
# every result list so the lists stay aligned.
for vectorizing_type in vectorizing_types:
    for algorithm_name in algorithm_names:
        for component in components:
            print('Start')
            start = datetime.datetime.now()

            print('Vectorizing')
            # Fit the vectorizer on the training remarks only, then apply it to both splits.
            this_vector.append(vectorizing_type)
            vectorizer = vectorizing(vectorizing_type)
            X_train = vectorizer.fit_transform(train_data['Mobile Data Remarks'])
            X_test = vectorizer.transform(test_data['Mobile Data Remarks'])

            vector_times_actual = datetime.datetime.now()
            vector_time.append(vector_times_actual - start)

            this_algorithm.append(algorithm_name)
            # Skipped stages are recorded as a zero timedelta rather than the
            # int 0, so the timing columns hold a single type.
            no_time = datetime.timedelta(0)

            if vectorizing_type == 'Hash' and algorithm_name == 'NMF':
                # NOTE(review): presumably skipped because HashingVectorizer's
                # default output can be negative while NMF needs non-negative
                # input; confirm.
                print('Current Match Will not Work.\n')
                this_component.append(0)
                unsupervised_train_time.append(no_time)
                unsupervised_apply_time.append(no_time)
                model_accuracy.append(0)
                model_time.append(no_time)
                continue

            this_component.append(component)

            if algorithm_name == 'None':
                print('No Unsupervised')
                unsupervised_train_time.append(no_time)
                unsupervised_apply_time.append(no_time)
                features_train, features_test = X_train, X_test
                # The classifier starts right after vectorizing here. The old
                # code subtracted `apply_time`, which is undefined (or stale
                # from an earlier iteration) on this path.
                logit_start = vector_times_actual
            else:
                print('Unsupervised')
                # Fit and apply are kept as separate steps so each can be timed.
                model = unsupervised_model(algorithm_name, component)
                model.fit(X_train)
                train_time = datetime.datetime.now()
                unsupervised_train_time.append(train_time - vector_times_actual)

                features_train = model.transform(X_train)
                features_test = model.transform(X_test)
                apply_time = datetime.datetime.now()
                unsupervised_apply_time.append(apply_time - train_time)
                logit_start = apply_time

            print('Logit')
            # Supervised step: train the classifier and score it on the held-out split.
            model_accuracy.append(logit_model(features_train, features_test))
            model_time.append(datetime.datetime.now() - logit_start)

            print('Done.\n')

Start
Vectorizing
Unsupervised


In [None]:
# Collect the parallel result lists into one table, one row per experiment run.
# The dict's insertion order fixes the column order of the frame.
result_columns = {
    'Vector': this_vector,
    'Vector Time': vector_time,
    'Unsupervised Algorithm': this_algorithm,
    'HyperTune Parameter': this_component,
    'Unsupervised Train Time': unsupervised_train_time,
    'Unsupervised Apply Time': unsupervised_apply_time,
    'Model Time': model_time,
    'Model Accuracy': model_accuracy,
}
results_df = pd.DataFrame(result_columns)

results_df

In [None]:
# Count SVD 20000 50min 55min 27min 0.818657
# Count None                  28min 0.822734