In [None]:
!pip install -q boto3
#!pip install -q scikit-learn==0.20.3
#!pip install -q nltk==3.4.5

In [None]:
import boto3
import sagemaker
import pandas as pd

# A single boto3 session provides both the region name and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name

# SageMaker SDK session, the bucket returned by default_bucket(), and the
# role returned by get_execution_role().
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

sm = boto_session.client(service_name='sagemaker', region_name=region)

# Copy the datasets from S3 to this notebook instance

In [None]:
# Each split is stored under the same key prefix in S3 and mirrored locally:
#   $S3_BUCKET/feature-store/amazon-reviews/balanced-tfidf-without-header/<split>/data.csv
import os

prefix_train = 'feature-store/amazon-reviews/balanced-tfidf-without-header/train'
prefix_validation = 'feature-store/amazon-reviews/balanced-tfidf-without-header/validation'
prefix_test = 'feature-store/amazon-reviews/balanced-tfidf-without-header/test'

# Local download targets, relative to the current working directory.
local_path_template = './{}/data.csv'
balanced_tfidf_without_header_train_path = local_path_template.format(prefix_train)
balanced_tfidf_without_header_validation_path = local_path_template.format(prefix_validation)
balanced_tfidf_without_header_test_path = local_path_template.format(prefix_test)

# Create the local directories up front; exist_ok keeps the cell re-runnable.
for split_prefix in (prefix_train, prefix_validation, prefix_test):
    os.makedirs(split_prefix, exist_ok=True)

# S3 objects that correspond to the local files above.
s3_uri_template = 's3://{}/{}/data.csv'
balanced_tfidf_without_header_train_s3_uri = s3_uri_template.format(bucket, prefix_train)
balanced_tfidf_without_header_validation_s3_uri = s3_uri_template.format(bucket, prefix_validation)
balanced_tfidf_without_header_test_s3_uri = s3_uri_template.format(bucket, prefix_test)

In [None]:
!aws s3 cp $balanced_tfidf_without_header_train_s3_uri $balanced_tfidf_without_header_train_path
!aws s3 cp $balanced_tfidf_without_header_validation_s3_uri $balanced_tfidf_without_header_validation_path
!aws s3 cp $balanced_tfidf_without_header_test_s3_uri $balanced_tfidf_without_header_test_path

# Load the data

In [None]:
def load_dataset(path, sep, label_column='is_positive_sentiment'):
    """Read a delimited file and split it into features and labels.

    The file may or may not carry a header row:

    * If a header is present and contains ``label_column``, that column is
      returned as the labels and every other column as the features.
    * Otherwise the file is treated as headerless with the label in the first
      column, i.e. the layout ``(is_positive_sentiment, f0, f1, ...)``.  The
      columns are then named ``label_column, 'f0', 'f1', ...``.

    Args:
        path: Location of the delimited text file to read.
        sep: Field separator passed to ``pandas.read_csv``.
        label_column: Name of the label column (defaults to
            ``'is_positive_sentiment'``).

    Returns:
        A ``(features, labels)`` tuple: a DataFrame without the label column
        and a Series holding the label column.
    """
    data = pd.read_csv(path, sep=sep)

    if label_column not in data.columns:
        # No usable header: pandas consumed the first data row as column
        # names.  Re-read so that row is kept, then name the columns by
        # position with the label first.
        data = pd.read_csv(path, sep=sep, header=None)
        feature_names = ['f{}'.format(i) for i in range(data.shape[1] - 1)]
        data.columns = [label_column] + feature_names

    labels = data[label_column]
    features = data.drop([label_column], axis=1)

    return features, labels

# Set hyperparameters and load the datasets
Loading the datasets may take a few minutes.  Please be patient.

In [None]:
# Hyperparameters used when the classifier is built later in the notebook.
objective = 'binary:logistic'
max_depth = 5
num_round = 1

# Read every split of the transformed features (is_positive_sentiment, f0, f1, ...)
X_train, y_train = load_dataset(path=balanced_tfidf_without_header_train_path, sep=',')
X_validation, y_validation = load_dataset(path=balanced_tfidf_without_header_validation_path, sep=',')
X_test, y_test = load_dataset(path=balanced_tfidf_without_header_test_path, sep=',')


# Train model with XGBoost

## Install xgboost

In [None]:
!pip install -q xgboost==0.90

# Train the model
_This will take a few minutes.  Please be patient._

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

# The scikit-learn wrapper takes the number of boosting rounds as `n_estimators`.
# `num_round` is not one of XGBClassifier's parameters, so passing it by that
# name left the round count at the library default and the configured value
# was silently ignored.  Raise `num_round` above to train more trees.
model = XGBClassifier(objective=objective,
                      n_estimators=num_round,
                      max_depth=max_depth)

model.fit(X_train, y_train)

# Save Model

In [None]:
import os

import pickle as pkl

# See https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
# The fitted estimator is pickled because `xgb.save_model()` does not save
# feature_names.
model_dir  = './model/notebook/'
model_path = os.path.join(model_dir, 'xgboost-model')

# The target directory is not created by any of the cells shown above; without
# this, open() raises FileNotFoundError on a fresh working directory.
os.makedirs(model_dir, exist_ok=True)

# The context manager flushes and closes the file even if pickling fails.
with open(model_path, 'wb') as model_file:
    pkl.dump(model, model_file)

print('Wrote model to {}'.format(model_path))

# Plot the feature importance for this model
TODO:  Display the values of the features!

In [None]:
import matplotlib.pyplot as plt
import xgboost

# Plot at most 30 features, ranked by gain, on a 12x12-inch figure.
fig, ax = plt.subplots(figsize=(12, 12))
xgboost.plot_importance(
    model,
    ax=ax,
    importance_type='gain',
    max_num_features=30,
    height=0.8,
    show_values=True,
)
# Set after plot_importance so this title replaces the one it draws.
ax.set_title('Feature Importance')
plt.show()

# TODO:  Explain Model

# Restore Model 
This simulates restoring a model within an application.

In [None]:
import pickle as pkl

def model_fn(model_dir):
    """Load the pickled model stored in ``model_dir``.

    Args:
        model_dir: Directory containing a pickle file named ``xgboost-model``.

    Returns:
        The object unpickled from ``<model_dir>/xgboost-model``.
    """
    model_path = os.path.join(model_dir, 'xgboost-model')
    # NOTE: unpickling can execute arbitrary code; only load files from a
    # trusted source.
    # The context manager closes the file handle once loading finishes.
    with open(model_path, 'rb') as model_file:
        return pkl.load(model_file)

In [None]:
# Reload the model from the directory it was written to.
model_restored = model_fn(model_dir=model_dir)

# TODO:  Perform hyperparameter tuning?

# Calculate Validation Metrics

In [None]:
# Preview the first five rows of the validation features.
X_validation.head(n=5)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, roc_auc_score

# `model.score()` returns mean accuracy, not AUC.  ROC AUC is computed from the
# predicted probability of the positive class (second column of predict_proba).
probs_validation = model_restored.predict_proba(X_validation)[:, 1]
auc = roc_auc_score(y_validation, probs_validation)
print('Validation AUC: ', auc)

# Hard class predictions drive the threshold-based metrics below.
preds_validation = model_restored.predict(X_validation)
print('Validation Accuracy: ', accuracy_score(y_validation, preds_validation))
# average=None reports one precision value per class.
print('Validation Precision: ', precision_score(y_validation, preds_validation, average=None))

In [None]:
# Text summary of the per-class metrics on the validation split.
report_validation = classification_report(y_validation, preds_validation)
print(report_validation)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Rows are the true classes, columns the predicted classes.
df_cm_validation = confusion_matrix(y_validation, preds_validation)

sn.set(font_scale=1.4) # for label size
# fmt='d' prints the exact integer counts; the default annotation format
# rounds them to two significant digits.
ax_cm_validation = sn.heatmap(df_cm_validation, annot=True, fmt='d', annot_kws={"size": 16}) # font size
ax_cm_validation.set_xlabel('Predicted label')
ax_cm_validation.set_ylabel('True label')
ax_cm_validation.set_title('Validation confusion matrix')

# TODO:  Replace the numeric tick labels with class names (False, True / False, True)

plt.show()

# Test Metrics

In [None]:
# Preview the first five rows of the test features.
X_test.head(n=5)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, roc_auc_score

# `model.score()` returns mean accuracy, not AUC.  ROC AUC is computed from the
# predicted probability of the positive class (second column of predict_proba).
probs_test = model_restored.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, probs_test)
print('Test AUC ', auc)

# Hard class predictions drive the threshold-based metrics below.
preds_test = model_restored.predict(X_test)
print('Test Accuracy: ', accuracy_score(y_test, preds_test))
# average=None reports one precision value per class.
print('Test Precision: ', precision_score(y_test, preds_test, average=None))

In [None]:
# Text summary of the per-class metrics on the test split.
report_test = classification_report(y_test, preds_test)
print(report_test)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Rows are the true classes, columns the predicted classes.
df_cm_test = confusion_matrix(y_test, preds_test)

sn.set(font_scale=1.4) # for label size
# fmt='d' prints the exact integer counts; the default annotation format
# rounds them to two significant digits.
ax_cm_test = sn.heatmap(df_cm_test, annot=True, fmt='d', annot_kws={"size": 16}) # font size
ax_cm_test.set_xlabel('Predicted label')
ax_cm_test.set_ylabel('True label')
ax_cm_test.set_title('Test confusion matrix')

# TODO:  Replace the numeric tick labels with class names (False, True / False, True)

plt.show()

# TODO:  Deploy the model
1. Create a SageMaker endpoint using this model.

2. Define the predict function to transform raw text into TF-IDF features.