In [82]:
#Cleaning Data
import os
import pandas as pd
import numpy as np
import datetime
#viz
import matplotlib.pyplot as plt
#modeling tools
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
#model training
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.amazon.amazon_estimator import get_image_uri

In [83]:
#get iam role
role = 'arn:aws:iam::435708786586:role/service-role/AmazonSageMaker-ExecutionRole-20190829T165674'
#create sagemaker session
sess = sagemaker.Session()
#use the region the session is in
region = sess.boto_session.region_name
#bucket name
bucket =  'distribution-reliability-nlp'
#model folder name
model_location = 's3://{}/model'.format(bucket)
#Sagemaker model container
container = get_image_uri(region, 'xgboost', repo_version='latest')

	get_image_uri(region, 'xgboost', '0.90-1').


# Import - We

In [84]:
#import data
we_outage = pd.read_csv('We_Cleaned.csv')
we_outage.head()

Unnamed: 0,Volt_D,Volt_E,Volt_F,Volt_H,Volt_J,Volt_M,Volt_R,Volt_SD,Volt_TV,Volt_US,...,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018,outage category,outage subcategory
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Equipment,
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Equipment,Conductor/Cable
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Vegetation,Out Clearance Zone (ROW)
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Equipment,Conductor/Cable
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Equipment,


In [85]:
#split features and target
X = we_outage.drop(['outage category', 'outage subcategory'], axis=1)
Y = pd.factorize(we_outage['outage category'])[0]
#Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state=53)
X_train, X_vaild, y_train, y_vaild = train_test_split(X_train, y_train, test_size = 0.1, random_state=53)

In [86]:
#add the target back into the dataset
X_train['target'] = y_train
X_vaild['target'] = y_vaild
X_test['target'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [87]:
#reordering columns for sagemaker
cols = list(X_train)[-1:] + list(X_train)[:-1]
X_train = X_train[cols]
X_vaild = X_vaild[cols]
X_test = X_test[cols]

In [88]:
X_train

Unnamed: 0,target,Volt_D,Volt_E,Volt_F,Volt_H,Volt_J,Volt_M,Volt_R,Volt_SD,Volt_TV,...,week_9,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
8595,3,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
184604,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
235887,5,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
195509,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
188053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
292869,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
278195,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
157454,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
35237,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
289225,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [89]:
#Save to CSVs
X_train.to_csv('Non-Text_train.csv', header=False, index=False)
X_vaild.to_csv('Non-Text_vaild.csv', header=False, index=False)
X_test.to_csv('Non-Text_test.csv', header=False, index=False)

In [103]:
# send data to S3. SageMaker will take training data from s3
trainpath = sess.upload_data(
    path='Non-Text_train.csv', bucket=bucket,
    key_prefix='data/Non-Text/train')

testpath = sess.upload_data(
    path='Non-Text_vaild.csv', bucket=bucket,
    key_prefix='data/Non-Text/vaild')

testpath = sess.upload_data(
    path='Non-Text_test.csv', bucket=bucket,
    key_prefix='data/Non-Text/test')

# Training

In [123]:
#set up sagemaker model
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.c5.18xlarge',
                                    output_path=model_location,
                                    sagemaker_session=sess)

In [124]:
#set up hyperparameters
xgb.set_hyperparameters(objective='multi:softprob',
                        num_class = '7',
                        num_round=100,
                        rate_drop=0.3,
                        tweedie_variance_power=1.4)

In [125]:
#set hyperparms range
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1),
                        'min_child_weight': ContinuousParameter(1, 10),
                        'alpha': ContinuousParameter(0, 2),
                        'max_depth': IntegerParameter(1, 10)}

In [126]:
#set up tuner
objective_metric_name = 'validation:auc'
tuner = HyperparameterTuner(xgb,
                            objective_metric_name,
                            hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=3)

In [127]:
#location of training data
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/data/Non-Text/train'.format(bucket), content_type='csv')
s3_input_vaild = sagemaker.s3_input(s3_data='s3://{}/data/Non-Text/vaild'.format(bucket), content_type='csv')

In [128]:
#train model
tuner.fit({'train': s3_input_train, 'validation': s3_input_vaild}, include_cls_metadata=False)

# Analyze Results

In [None]:
#get results back as df
tuner = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

full_df = tuner.dataframe()

In [102]:
's3://{}/data/Non-Text/'.format(bucket)

's3://distribution-reliability-nlp/data/Non-Text/'