In [None]:
# Install the AWS SDK for Python. The scikit-learn and nltk pins below are left disabled.
!pip install -q boto3
#!pip install -q scikit-learn==0.20.3
#!pip install -q nltk==3.4.5

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, plus the default bucket and execution role it resolves to.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region of the default boto3 session; the low-level SageMaker client is pinned to it.
region = boto3.Session().region_name
sm = boto3.Session().client(region_name=region, service_name='sagemaker')

# Copy the datasets from S3 to this notebook instance

In [None]:
import os

# Each split lives at
#   $S3_BUCKET/feature-store/amazon-reviews/csv/balanced-tfidf-without-header/<split>/data.csv
# and is mirrored locally under the same relative prefix.
prefix_train = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/train'
prefix_validation = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/validation'
prefix_test = 'feature-store/amazon-reviews/csv/balanced-tfidf-without-header/test'

local_path_template = './{}/data.csv'
s3_uri_template = 's3://{}/{}/data.csv'

# Local file each split will be downloaded to.
balanced_tfidf_without_header_train_path = local_path_template.format(prefix_train)
balanced_tfidf_without_header_validation_path = local_path_template.format(prefix_validation)
balanced_tfidf_without_header_test_path = local_path_template.format(prefix_test)

# Make sure the local target directories exist before anything is copied into them.
for local_dir in (prefix_train, prefix_validation, prefix_test):
    os.makedirs(local_dir, exist_ok=True)

# S3 object each split is read from.
balanced_tfidf_without_header_train_s3_uri = s3_uri_template.format(bucket, prefix_train)
balanced_tfidf_without_header_validation_s3_uri = s3_uri_template.format(bucket, prefix_validation)
balanced_tfidf_without_header_test_s3_uri = s3_uri_template.format(bucket, prefix_test)

In [None]:
# Download each split's data.csv from S3 to its matching local path.
!aws s3 cp $balanced_tfidf_without_header_train_s3_uri $balanced_tfidf_without_header_train_path
!aws s3 cp $balanced_tfidf_without_header_validation_s3_uri $balanced_tfidf_without_header_validation_path
!aws s3 cp $balanced_tfidf_without_header_test_s3_uri $balanced_tfidf_without_header_test_path

# Load the data
_Note:  `header=None`_

In [None]:
def load_dataset(path, sep, header):
    """Read a delimited file and split it into features and labels.

    The first column is treated as the label; every remaining column is a feature.

    Args:
        path: Location of the file, passed straight to `pd.read_csv`.
        sep: Field delimiter.
        header: Row number to use as the column names, or `None` when the file
            has no header row.

    Returns:
        Tuple `(features, labels)`: a DataFrame holding every column except the
        first, and a Series holding the first column.
    """
    data = pd.read_csv(path, sep=sep, header=header)

    labels = data.iloc[:, 0]
    features = data.drop(data.columns[0], axis=1)

    # Identity check: `== None` would be evaluated element-wise for array-like values.
    if header is None:
        # Headerless files get integer column names. After dropping column 0 the
        # features would start at 1, so renumber them 0 .. len(features.columns) - 1.
        features.columns = list(range(len(features.columns)))

    return features, labels

In [None]:
# The CSVs have no header row: column 0 is the label (is_positive_sentiment)
# and the remaining columns are the transformed features (f0, f1, ...).
csv_options = dict(sep=',', header=None)

X_train, y_train = load_dataset(path=balanced_tfidf_without_header_train_path, **csv_options)
X_validation, y_validation = load_dataset(path=balanced_tfidf_without_header_validation_path, **csv_options)
X_test, y_test = load_dataset(path=balanced_tfidf_without_header_test_path, **csv_options)


In [None]:
# Preview the first five rows of the training features.
X_train.head(5)

# Train model with XGBoost
This may take a few minutes.  Please be patient.

## Install xgboost

In [None]:
# Install the pinned xgboost release that the training cell below imports.
!pip install -q xgboost==0.90

# Train the model
_This will take a few minutes.  Please be patient._

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

objective  = 'binary:logistic'
max_depth  = 5
num_round  = 1

# The scikit-learn wrapper sets the number of boosting rounds through
# `n_estimators`. `num_round` is not one of its parameters: passing it as a
# keyword is silently ignored and the default of 100 rounds is trained instead.
# Raise `num_round` above if a stronger model is wanted.
model = XGBClassifier(objective=objective,
                      n_estimators=num_round,
                      max_depth=max_depth)

model.fit(X_train, y_train)

# Save Model

In [None]:
import os

import pickle as pkl

# See https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
# Need to save with joblib or pickle.  `xgb.save_model()` does not save feature_names
model_dir  = './models/notebook/'

os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'xgboost-model')

# The context manager flushes and closes the file, so the pickle is complete on
# disk before any later cell reads it back.
with open(model_path, 'wb') as model_file:
    pkl.dump(model, model_file)
print('Wrote model to {}'.format(model_path))

# TODO:  Explain Model

# Restore Model 
This simulates restoring a model within an application.

In [None]:
import pickle as pkl

def model_fn(model_dir):
    """Load the pickled model stored as `xgboost-model` inside `model_dir`.

    Args:
        model_dir: Directory containing the `xgboost-model` pickle file.

    Returns:
        The object that was pickled into that file.

    Raises:
        FileNotFoundError: If `model_dir` does not contain an `xgboost-model` file.
    """
    model_path = os.path.join(model_dir, 'xgboost-model')
    # Unpickling can execute arbitrary code: only point this at model files you trust.
    # The context manager closes the handle even if unpickling fails.
    with open(model_path, 'rb') as model_file:
        return pkl.load(model_file)

In [None]:
# Reload the model from the directory it was pickled into.
model_dir  = './models/notebook/'
model_restored = model_fn(model_dir)

# Plot the feature importance for this model
TODO:  Display the values of the features!

In [None]:
import matplotlib.pyplot as plt
import xgboost

# Show at most 30 features, ranked by the 'gain' importance type, with values printed on the bars.
importance_options = dict(
    importance_type='gain',
    max_num_features=30,
    height=0.8,
    show_values=True,
)

fig, ax = plt.subplots(figsize=(12, 12))
xgboost.plot_importance(model_restored, ax=ax, **importance_options)
# Set after plotting so this title replaces the one plot_importance puts on the axes.
ax.set_title('Feature Importance')
plt.show()

# TODO:  Perform hyperparameter tuning?

# Calculate Validation Metrics

In [None]:
# Preview the first five rows of the validation features.
X_validation.head(5)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, roc_auc_score

# `score()` on a classifier returns mean accuracy, not AUC. Compute the ROC AUC
# explicitly from the predicted probability of the positive class (column 1).
probs_validation = model_restored.predict_proba(X_validation)[:, 1]
auc = roc_auc_score(y_validation, probs_validation)
print('Validation AUC: ', auc)

preds_validation = model_restored.predict(X_validation)
print('Validation Accuracy: ', accuracy_score(y_validation, preds_validation))
# average=None reports one precision value per class instead of a single aggregate.
print('Validation Precision: ', precision_score(y_validation, preds_validation, average=None))

In [None]:
# Text report of the per-class metrics on the validation split.
print(classification_report(y_validation, preds_validation))

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Despite the `df_` prefix this is the array returned by `confusion_matrix`, not a DataFrame.
# Arguments are (true labels, predicted labels).
df_cm_validation = confusion_matrix(y_validation, preds_validation)
df_cm_validation


In [None]:
#plt.figure(figsize = (10,7))
sn.set(font_scale=1.4) # for label size
# fmt='d' prints the raw integer counts; the default '.2g' format switches to
# scientific notation (e.g. 1.2e+03) once a cell reaches three digits.
ax = sn.heatmap(df_cm_validation, annot=True, fmt='d', annot_kws={"size": 16}) # font size

# confusion_matrix(y_true, y_pred) puts true labels on rows and predictions on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

# TODO:  Add labels to each quadrant (False, True / False, True)

plt.show()

# Test Metrics

In [None]:
# Preview the first five rows of the test features.
X_test.head(5)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, roc_auc_score

# `score()` on a classifier returns mean accuracy, not AUC. Compute the ROC AUC
# explicitly from the predicted probability of the positive class (column 1).
probs_test = model_restored.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, probs_test)
print('Test AUC ', auc)

preds_test = model_restored.predict(X_test)
print('Test Accuracy: ', accuracy_score(y_test, preds_test))
# average=None reports one precision value per class instead of a single aggregate.
print('Test Precision: ', precision_score(y_test, preds_test, average=None))

In [None]:
# Text report of the per-class metrics on the test split.
print(classification_report(y_test, preds_test))

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Despite the `df_` prefix this is the array returned by `confusion_matrix`, not a DataFrame.
# Arguments are (true labels, predicted labels).
df_cm_test = confusion_matrix(y_test, preds_test)
df_cm_test

In [None]:
#plt.figure(figsize = (10,7))
sn.set(font_scale=1.4) # for label size
# fmt='d' prints the raw integer counts; the default '.2g' format switches to
# scientific notation (e.g. 1.2e+03) once a cell reaches three digits.
ax = sn.heatmap(df_cm_test, annot=True, fmt='d', annot_kws={"size": 16}) # font size

# confusion_matrix(y_true, y_pred) puts true labels on rows and predictions on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

# TODO:  Add labels to each quadrant (False, True / False, True)

plt.show()

# Predict in Notebook

Create the `feature_transform_fn()` function (the same one used during the `prepare` phase).

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

def feature_transform_fn(df_text, column_name, n_components):
    """Build an unfitted pipeline that turns one text column into dense features.

    The stages are: TF-IDF over the chosen column, TruncatedSVD down to
    `n_components` dimensions, then standard scaling.

    Args:
        df_text: DataFrame whose columns include `column_name`; only used to
            look up that column's position.
        column_name: Name of the text column to vectorize.
        n_components: Number of SVD components to keep.

    Returns:
        The assembled, not-yet-fitted scikit-learn `Pipeline`.
    """
    vectorizer = TfidfVectorizer(
        max_df=0.25,
        min_df=.0025,
        analyzer='word',
        max_features=10000
    )
    text_pipeline = Pipeline(steps=[('tfidfvectorizer', vectorizer)])

    # The column is addressed by integer position rather than by name.
    text_column_position = df_text.columns.get_loc(column_name)
    select_and_vectorize = ColumnTransformer(
        transformers=[('text_processing', text_pipeline, text_column_position)]
    )

    reduce_dimensions = TruncatedSVD(n_components=n_components)

    return Pipeline(
        steps=[
            ('column_transformer', select_and_vectorize),
            ('svd', reduce_dimensions),
            ('standardscaler', StandardScaler())
        ]
    )

In [None]:
# Raw reviews live at $S3_BUCKET/feature-store/amazon-reviews/csv/scrubbed-raw-with-header/data.csv
# and are mirrored locally under the same relative prefix.

prefix_raw = 'feature-store/amazon-reviews/csv/scrubbed-raw-with-header'

scrubbed_raw_path = './{}/data.csv'.format(prefix_raw)

import os
# Create the local target directory before the download cell copies into it.
os.makedirs(prefix_raw, exist_ok=True)

scrubbed_raw_s3_uri = 's3://{}/{}/data.csv'.format(bucket, prefix_raw)

In [None]:
# Download the raw-reviews CSV from S3 to its local path.
!aws s3 cp $scrubbed_raw_s3_uri $scrubbed_raw_path

In [None]:
# header=0: the first row supplies the column names. The first column comes back
# as y_raw and all remaining columns as X_raw.
X_raw, y_raw = load_dataset(path=scrubbed_raw_path, sep=',', header=0)
X_raw.head(5)

In [None]:
# Preview the first five raw labels.
y_raw.head(5)

In [None]:
# Fit a fresh TF-IDF -> SVD -> scaler pipeline on the raw reviews and reduce them to 300 components.
# NOTE(review): the pipeline is re-fit here on X_raw instead of reusing a transformer fitted
# during the prepare phase, so these 300 columns may not line up with the features the model
# was trained on -- confirm before trusting predictions made from df_tfidf.
np_tfidf = feature_transform_fn(X_raw, 'review_body', 300).fit_transform(X_raw)
df_tfidf = pd.DataFrame(np_tfidf)
df_tfidf.shape

In [None]:
# Preview the first five rows of the transformed features.
df_tfidf.head(5)

In [None]:
# Show the first five raw feature rows again.
X_raw.head(5)

In [None]:
# Show the first five raw labels again.
y_raw.head(5)

In [None]:
# Predict a class for every transformed raw review and preview the first five predictions.
preds = model_restored.predict(df_tfidf)
df_preds = pd.DataFrame(preds)
df_preds.head(5)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, roc_auc_score

# `score()` on a classifier returns mean accuracy, not AUC. Compute the ROC AUC
# explicitly from the predicted probability of the positive class (column 1).
probs_raw = model_restored.predict_proba(df_tfidf)[:, 1]
auc = roc_auc_score(y_raw, probs_raw)
# These metrics are for the raw dataset, so label them as such rather than "Test".
print('Raw Data AUC: ', auc)

preds_raw = model_restored.predict(df_tfidf)
print('Raw Data Accuracy: ', accuracy_score(y_raw, preds_raw))
# average=None reports one precision value per class instead of a single aggregate.
print('Raw Data Precision: ', precision_score(y_raw, preds_raw, average=None))

In [None]:
# Re-read the raw CSV with pandas defaults and display it, label column included.
pd.read_csv(scrubbed_raw_path)

# TODO:  Deploy the model
1. Create a SageMaker endpoint using this model.

2. Define the predict function to transform raw text into TF/IDF.

In [None]:
# # TODO:  1) update this to do TF/IDF
# #        2) use this in other versions of the model
# # Derived from the following:
# #   https://aim357.readthedocs.io/en/latest/GluePySparkMLFeatureEngineering/GluePySparkMLFeatureEngineering.html#deepar-deep-dive

# class XGBoostPredictor(sagemaker.predictor.RealTimePredictor):

#     def __init__(self, *args, **kwargs):
#         super().__init__(*args, content_type=sagemaker.content_types.CONTENT_TYPE_CSV, **kwargs)

#     def predict(self, df):
#         """Requests the prediction of for the time series listed in `ts`, each with the (optional)
#         corresponding category listed in `cat`.

#         df -- `pandas.Series` object, the data frame to predict

#         Return value: list of `pandas.DataFrame` objects, each containing the predictions
#         """
#         req = self.__encode_request(df)
#         # TODO:  change this
#         res = predict(req) # super(DeepARPredictor, self).predict(req)
#         return self.__decode_response(res)

#     def __encode_request(self, df):
#         # TODO:  Add transform
# #        df = feature_transform
#         encoded_request = pd.DataFrame([0,1])
#         return encoded_request

#     def __decode_response(self, response):
#         predictions = response
#         return pd.DataFrame(data=predictions)

In [None]:
# xgb_endpoint_name = prefix + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# xgb_predictor = xgb_estimator.deploy(
#                      initial_instance_count=1, 
#                      instance_type='local',
# #                     instance_type='ml.m4.xlarge',
#                      predictor_cls=XGBoostPredictor,
#                      endpoint_name=xgb_endpoint_name)