Let's import the necessary libraries and files to get started

In [16]:
import numpy as np
import pandas as pd

import boto3
import sagemaker
from sagemaker import get_execution_role

# S3 location of the preprocessed CEO-turnover dataset.
bucket = 'ceo-turnover-data'
data_key = 'pre_processed_v4_CEO.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

# Execution role of this notebook; handed to the SageMaker estimator later on.
role = get_execution_role()

# Read the CSV straight from its s3:// URL into a DataFrame.
data = pd.read_csv(data_location)

`data` is a pandas DataFrame storing our preprocessed dataset. Let's print out `data.head()` to check that it imported correctly and to see how it's structured.

In [17]:
# Show the first five rows as plain text to sanity-check the import.
print(data.head())

   Age                                       Company Name   Director Name  \
0   73  COSTCO WHOLESALE CORP (Costco Companies Inc pr...     Jim Sinegal   
1   28            Morris & Garritano Insurance Agency Inc  Brendan Morris   
2   29  Madison Industries Inc (Madison Capital Partne...   Larry Gies Jr   
3   33                              Crowley Maritime Corp  Tom Crowley Jr   
4   33                     Enterprise Solutions Group Inc     Savas Karas   

   Number of Records               Role Name           Seniority  \
0                  1           President/CEO  Executive Director   
1                  1                     CEO  Executive Director   
2                  1  Chairman/President/CEO  Executive Director   
3                  1  Chairman/President/CEO  Executive Director   
4                  1           President/CEO  Executive Director   

   Tenure (Years) Turnover (YES/NO)  Year  
0               7                NO  2000  
1               7                NO  200

Now we're cooking with gas! Let's trim our data to only include predictors and labels.

In [18]:
# Keep only the label column and the two predictor columns.
kept_columns = ['Turnover (YES/NO)', 'Age', 'Tenure (Years)']
data = data[kept_columns]

print(data.head())

# Report the dimensions of the trimmed frame.
n_rows, n_cols = data.shape
print("\n We have {} rows of {} columns".format(n_rows, n_cols))

  Turnover (YES/NO)  Age  Tenure (Years)
0                NO   73               7
1                NO   28               7
2                NO   29               7
3                NO   33               7
4                NO   33               7

 We have 291294 rows of 3 columns


Binary-encode our label data (`YES` → 1, anything else → 0) and store it in a separate array

In [19]:
# Encode the turnover label as a float column vector of shape (n_rows, 1):
# 1.0 where the label is exactly 'YES', 0.0 for everything else.
# The element-wise comparison replaces a row-by-row Python loop and yields
# the same array.
pre_y = np.array(data[['Turnover (YES/NO)']])
y = (pre_y == 'YES').astype(float)

# `data` itself is not modified by this cell; the labels live in `y`.
print(data.head())

  Turnover (YES/NO)  Age  Tenure (Years)
0                NO   73               7
1                NO   28               7
2                NO   29               7
3                NO   33               7
4                NO   33               7


In [29]:
# Attach the numeric label as a "pred" column. `assign` returns a new frame,
# so we never write into a frame that is itself a column selection of
# another frame (which triggers pandas' SettingWithCopyWarning).
data = data.assign(pred=y)

# Label goes first, followed by the two predictors; the CSVs written later
# are header-less, so this column order is what the training job receives.
model_data = data[["pred", "Age", "Tenure (Years)"]]

print(model_data.head())

   pred  Age  Tenure (Years)
0   0.0   73               7
1   0.0   28               7
2   0.0   29               7
3   0.0   33               7
4   0.0   33               7


Now time to import sagemaker and instantiate the model (Boilerplate imports)

In [30]:
import boto3
import re
from sagemaker import get_execution_role

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
from sagemaker.predictor import csv_serializer

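Shuffle the data, split it into training (70%), validation (20%), and test (10%) sets, and save the training and validation sets as CSV files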

In [32]:
# TODO: we should come up with a better way to split the data
# (e.g. by company, by individual).

# Shuffle every row with a fixed seed so the split is reproducible.
shuffled = model_data.sample(frac=1, random_state=1729)

# Cut points for a 70% / 20% / 10% train / validation / test split.
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))

# Positional slices give the same three frames as np.split(shuffled, [...])
# without going through the DataFrame.swapaxes path deprecated in recent pandas.
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Written without header or index: one row per sample, label in the first column.
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

# The test set is not written to disk for now.
# test_data.to_csv('test.csv', header=False, index=False)

Now to upload these files to S3... fingers crossed!

In [41]:
# Handle to the project bucket, shared by both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

# Upload the training and validation CSVs under the train/ prefix.
# The test set is not uploaded for now.
s3_bucket.Object('train/train_data_0.csv').upload_file('train.csv')
s3_bucket.Object('train/validation_data_0.csv').upload_file('validation.csv')

Get the XGBoost container image for the model

In [42]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the XGBoost (version 0.90-1) image URI for the session's region.
region = boto3.Session().region_name
container = get_image_uri(region, 'xgboost', repo_version='0.90-1')

In [43]:
# Training-job input channels, pointing at the CSV objects uploaded above.
# The validation URI now carries the full object key (including ".csv") so it
# matches the uploaded object exactly instead of relying on prefix matching.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/train/train_data_0.csv'.format(bucket), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/train/validation_data_0.csv'.format(bucket), content_type='csv')

In [44]:
# SageMaker session object; passed to the estimator as `sagemaker_session`.
sess = sagemaker.Session()

In [45]:
# Settings for the training job: a single ml.m4.xlarge instance, with
# output written under the bucket's output/ prefix.
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/output'.format(bucket),
    sagemaker_session=sess,
)

# Generic estimator wrapping the XGBoost container image and execution role.
xgb = sagemaker.estimator.Estimator(container, role, **estimator_settings)

In [49]:
# Hyperparameters forwarded to the XGBoost training job.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 10,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [50]:
# Map each input channel name to its S3 input, then launch the training job.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(training_channels)

2020-02-25 06:12:32 Starting - Starting the training job...
2020-02-25 06:12:33 Starting - Launching requested ML instances...
2020-02-25 06:13:30 Starting - Preparing the instances for training......
2020-02-25 06:14:20 Downloading - Downloading input data...
2020-02-25 06:14:45 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[06:15:11] 203905x2 matrix with 407810 entries loaded from /opt/ml/input/data/tr