# Note:  This notebook takes a long time due to the amount of data involved.  Do not run this.  Use the more-scalable mechanisms presented in this material.

# Train an XGBoost Model with Jupyter Notebook
We will train a custom XGBoost model directly in this notebook to predict the sentiment of Amazon customer reviews.

In [None]:
# Quietly (-q) install the AWS SDK and a pinned XGBoost release used by this notebook
!pip install -q boto3
!pip install -q xgboost==0.90

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session; its default bucket is used below to build the dataset S3 URIs
sess   = sagemaker.Session()
bucket = sess.default_bucket()
# Execution role as reported by the SageMaker SDK
role = sagemaker.get_execution_role()
# Region of the default boto3 session
region = boto3.Session().region_name

# SageMaker service client bound to that region
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

# Copy the datasets from S3 to this notebook instance

In [None]:
%store -r spark_processing_job_s3_output_prefix

In [None]:
# The stored value is the S3 output prefix of the earlier Spark processing job (not its name)
print('Previous Spark Processing Job S3 Output Prefix: {}'.format(spark_processing_job_s3_output_prefix))

In [None]:
# Key prefixes of the three TF-IDF splits written by the Spark processing job
prefix_train = '{}/output/tfidf-train'.format(spark_processing_job_s3_output_prefix)
prefix_validation = '{}/output/tfidf-validation'.format(spark_processing_job_s3_output_prefix)
prefix_test = '{}/output/tfidf-test'.format(spark_processing_job_s3_output_prefix)

# Local directories (relative to the notebook) that mirror the S3 prefixes
tfidf_train_path = './{}'.format(prefix_train)
tfidf_validation_path = './{}'.format(prefix_validation)
tfidf_test_path = './{}'.format(prefix_test)

# S3 locations of the same splits in the session's default bucket
tfidf_train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)
tfidf_validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)
tfidf_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)

# Create the local download targets up front; exist_ok makes the cell safe to re-run
import os
os.makedirs(tfidf_train_path, exist_ok=True)
os.makedirs(tfidf_validation_path, exist_ok=True)
os.makedirs(tfidf_test_path, exist_ok=True)

print(tfidf_train_s3_uri)
print(tfidf_validation_s3_uri)
print(tfidf_test_s3_uri)

In [None]:
!aws s3 ls $tfidf_train_s3_uri/ 

In [None]:
!aws s3 ls $tfidf_validation_s3_uri/

In [None]:
!aws s3 ls $tfidf_test_s3_uri/

In [None]:
# Recursively copy each TF-IDF split from its S3 URI into the matching local directory
!aws s3 cp --recursive $tfidf_train_s3_uri $tfidf_train_path
!aws s3 cp --recursive $tfidf_validation_s3_uri $tfidf_validation_path
!aws s3 cp --recursive $tfidf_test_s3_uri $tfidf_test_path

# Load the data

In [None]:
import glob
import pandas as pd

def load_dataset(path, sep, header):
    """Load every ``*.csv`` file directly under ``path`` into features and labels.

    The files are read in sorted filename order and concatenated row-wise, so
    the resulting row order is reproducible across runs. The first column is
    treated as the label; all remaining columns are the features.

    Args:
        path: Directory containing the CSV part files.
        sep: Field delimiter passed to ``pandas.read_csv``.
        header: ``header`` argument passed to ``pandas.read_csv`` (``None``
            for header-less files, ``0`` when the first row holds names).

    Returns:
        Tuple ``(features, labels)``: a DataFrame of all columns except the
        first, and a Series holding the first column.

    Raises:
        ValueError: If ``path`` contains no ``*.csv`` files.
    """
    # glob returns files in arbitrary order; sort so the row order is deterministic
    csv_files = sorted(glob.glob('{}/*.csv'.format(path)))
    if not csv_files:
        raise ValueError('No CSV files found under {}'.format(path))

    data = pd.concat(
        [pd.read_csv(csv_file, sep=sep, header=header) for csv_file in csv_files],
        ignore_index=True,
    )

    labels = data.iloc[:, 0]
    features = data.drop(data.columns[0], axis=1)

    if header is None:
        # Header-less frames get integer column names; after dropping column 0
        # renumber them as 0 (inclusive) to len(features.columns) (exclusive)
        features.columns = list(range(len(features.columns)))

    return features, labels

In [None]:
# Balanced
X_train, y_train = load_dataset(path=tfidf_train_path, sep=',', header=None)
X_validation, y_validation = load_dataset(path=tfidf_validation_path, sep=',', header=None)
X_test, y_test = load_dataset(path=tfidf_test_path, sep=',', header=None)

In [None]:
X_train.shape

In [None]:
X_train.head(5)

# Train the model
_This will take a few minutes.  Please be patient._

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

objective  = 'binary:logistic'
max_depth  = 5
num_round  = 1

# The scikit-learn wrapper takes the number of boosting rounds as `n_estimators`.
# `num_round` is not one of its parameters, so passing it under that name would
# leave the wrapper's default number of trees in force.
xgb_estimator = XGBClassifier(objective=objective,
                              n_estimators=num_round,
                              max_depth=max_depth)

xgb_estimator.fit(X_train, y_train)

# Save Model

In [None]:
import os

import pickle as pkl

# See https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
# Need to save with joblib or pickle.  `xgb.save_model()` does not save feature_names
model_dir  = './models/notebook/'

os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'xgboost-model')

# Use a context manager so the file is flushed and closed before anything reads it back
with open(model_path, 'wb') as model_file:
    pkl.dump(xgb_estimator, model_file)

print('Wrote model to {}'.format(model_path))

# Restore Model 
This simulates restoring a model within an application.

In [None]:
import pickle as pkl
import os

model_dir  = './models/notebook/'
model_path = os.path.join(model_dir, 'xgboost-model')

# NOTE: unpickling can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle once the estimator is loaded.
with open(model_path, 'rb') as model_file:
    xgb_estimator_restored = pkl.load(model_file)

type(xgb_estimator_restored)

# Plot the feature importance for this model

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import xgboost

# Draw the top 30 features ranked by gain on one large square canvas
fig, ax = plt.subplots(figsize=(12, 12))
xgboost.plot_importance(
    xgb_estimator_restored,
    ax=ax,
    height=0.8,
    importance_type='gain',
    max_num_features=30,
    show_values=True,
)
ax.set_title('Feature Importance')
plt.show()

# Calculate Validation Metrics

In [None]:
X_validation.head(5)

In [None]:
preds_validation = xgb_estimator_restored.predict(X_validation)
preds_validation.shape

In [None]:
import numpy as np
preds_validation_0_or_1 = np.where(preds_validation > 0.5, 1, 0)
preds_validation_0_or_1.shape

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

print('Validation Accuracy: ', accuracy_score(y_validation, preds_validation_0_or_1))
print('Validation Precision: ', precision_score(y_validation, preds_validation_0_or_1, average=None))

In [None]:
pd.DataFrame(preds_validation).head(5)

In [None]:
print(classification_report(y_validation, preds_validation_0_or_1))

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

df_cm_validation = confusion_matrix(y_validation, preds_validation_0_or_1)
df_cm_validation

In [None]:
import itertools
import numpy as np

def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Args:
        cm: 2-D array of integer counts; printed, shown with ``imshow`` and
            written into each cell.
        classes: Sequence of class names used as tick labels on both axes.
        title: Title of the plot.
        cmap: Matplotlib colormap for the heat map (defaults to Greens).
    """
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    # Cells above half the maximum count are drawn dark, so switch to white text there
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Label the axes once, then lay out so the labels are taken into account
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Plot non-normalized confusion matrix
# Create exactly one figure; plot_conf_mat draws on the current axes
fig, ax = plt.subplots(figsize=(10,5))
plot_conf_mat(df_cm_validation,
              classes=['Not Positive Sentiment', 'Positive Sentiment'],
              title='Confusion matrix')
plt.show()

In [None]:
from sklearn import metrics

#print(metrics.f1_score(y_validation, preds_validation))

# ROC/AUC need a continuous score. XGBClassifier.predict() returns hard class labels,
# which would collapse the curve to a single threshold, so use the probability of the
# positive class (column 1 of predict_proba) instead.
scores_validation = xgb_estimator_restored.predict_proba(X_validation)[:, 1]

auc = round(metrics.roc_auc_score(y_validation, scores_validation), 4)
print('AUC is ' + repr(auc))

fpr, tpr, _ = metrics.roc_curve(y_validation, scores_validation)

plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b',
         label='AUC = %0.2f'% auc)
plt.legend(loc='lower right')
# Diagonal reference line for a random classifier
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Calculate Test Metrics

In [None]:
X_test.head(5)

In [None]:
preds_test = xgb_estimator_restored.predict(X_test)


In [None]:
preds_test

In [None]:
import numpy as np
preds_test_0_or_1 = np.where(preds_test > 0.5, 1, 0)
preds_test_0_or_1.shape

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

print('Test Accuracy: ', accuracy_score(y_test, preds_test_0_or_1))
print('Test Precision: ', precision_score(y_test, preds_test_0_or_1, average=None))

In [None]:
print(classification_report(y_test, preds_test_0_or_1))

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

df_cm_test = confusion_matrix(y_test, preds_test_0_or_1)
df_cm_test

In [None]:
import itertools
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Args:
        cm: 2-D array of integer counts; printed, shown with ``imshow`` and
            written into each cell.
        classes: Sequence of class names used as tick labels on both axes.
        title: Title of the plot.
        cmap: Matplotlib colormap for the heat map (defaults to Greens).
    """
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    # Cells above half the maximum count are drawn dark, so switch to white text there
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Label the axes once, then lay out so the labels are taken into account
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Plot non-normalized confusion matrix
# Create exactly one figure; plot_conf_mat draws on the current axes
fig, ax = plt.subplots(figsize=(6,4))
plot_conf_mat(df_cm_test,
              classes=['Not Positive Sentiment', 'Positive Sentiment'],
              title='Confusion matrix')
plt.show()

In [None]:
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# ROC/AUC need a continuous score. XGBClassifier.predict() returns hard class labels,
# which would collapse the curve to a single threshold, so use the probability of the
# positive class (column 1 of predict_proba) instead.
scores_test = xgb_estimator_restored.predict_proba(X_test)[:, 1]

auc = round(metrics.roc_auc_score(y_test, scores_test), 4)
print('AUC is ' + repr(auc))

fpr, tpr, _ = metrics.roc_curve(y_test, scores_test)

plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b',
         label='AUC = %0.2f'% auc)
plt.legend(loc='lower right')
# Diagonal reference line for a random classifier
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Predict in Notebook

Create the `feature_transform_fn()` function (the same one used during the `prepare` phase).

In [None]:
# Use TruncatedSVD vs. PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

def feature_transform_fn(df_text, column_name, num_components):
    """Build an unfitted pipeline that turns one text column into dense features.

    The pipeline applies a TF-IDF vectorizer to the selected column, reduces
    the result with TruncatedSVD, and standard-scales the components.

    Args:
        df_text: DataFrame used only to look up the position of the text column.
        column_name: Name of the text column to vectorize.
        num_components: Number of TruncatedSVD components to keep.

    Returns:
        The assembled scikit-learn ``Pipeline`` (not yet fitted).
    """
    # Stage 1: word-level TF-IDF restricted by document frequency and vocabulary size
    tfidf = TfidfVectorizer(
        max_df=0.25,
        min_df=.0025,
        analyzer='word',
        max_features=10000
    )
    text_processors = Pipeline(steps=[('tfidfvectorizer', tfidf)])

    # The column transformer addresses the text column by position, not by name
    text_column_position = df_text.columns.get_loc(column_name)
    column_transformer = ColumnTransformer(
        transformers=[('text_processing', text_processors, text_column_position)]
    )

    # Stages 2 and 3: dimensionality reduction followed by scaling
    stages = [
        ('column_transformer', column_transformer),
        ('dimension_reducer', TruncatedSVD(n_components=num_components)),
        ('standard_scaler', StandardScaler()),
    ]
    return Pipeline(steps=stages)

In [None]:
# $S3_BUCKET/feature-store/amazon-reviews/scrubbed-raw-with-header

prefix_raw = 'feature-store/amazon-reviews/raw-labeled-split-balanced-header-test/'

scrubbed_raw_path = './{}'.format(prefix_raw)

import os
os.makedirs(prefix_raw, exist_ok=True)

scrubbed_raw_s3_uri = 's3://{}/{}'.format(bucket, prefix_raw)

In [None]:
!aws s3 cp --recursive $scrubbed_raw_s3_uri $scrubbed_raw_path

In [None]:
X_raw, y_raw = load_dataset(path=scrubbed_raw_path, sep=',', header=0)
X_raw.head(5)

In [None]:
y_raw.head(5)

# Transform raw text to TF-IDF features (already done earlier; shown again for clarity)

In [None]:
# Build the text pipeline and fit it on the raw reviews, producing 300 components per row.
# NOTE(review): this fits a brand-new TF-IDF/SVD/scaler pipeline on X_raw instead of reusing
# one fitted on the training data, so these components may not line up with the features
# the model was trained on — confirm before trusting the metrics computed from them.
np_tfidf = feature_transform_fn(df_text=X_raw, column_name='review_body', num_components=300).fit_transform(X_raw)
df_tfidf = pd.DataFrame(np_tfidf)
df_tfidf.shape

In [None]:
df_tfidf.head(5)

In [None]:
X_raw.head(5)

In [None]:
y_raw.head(5)

In [None]:
preds_raw = xgb_estimator_restored.predict(df_tfidf)
df_preds_raw = pd.DataFrame(preds_raw)
df_preds_raw.head(5)

In [None]:
# TODO:  This isn't needed anymore

In [None]:
import numpy as np
preds_raw_0_or_1 = np.where(preds_raw > 0.5, 1, 0)
preds_raw_0_or_1.shape

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

print('Test Accuracy: ', accuracy_score(y_raw, preds_raw_0_or_1))
print('Test Precision: ', precision_score(y_raw, preds_raw_0_or_1, average=None))

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

df_cm_raw = confusion_matrix(y_raw, preds_raw_0_or_1)
df_cm_raw

In [None]:
import itertools

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Args:
        cm: 2-D array of integer counts; printed, shown with ``imshow`` and
            written into each cell.
        classes: Sequence of class names used as tick labels on both axes.
        title: Title of the plot.
        cmap: Matplotlib colormap for the heat map (defaults to Greens).
    """
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    # Cells above half the maximum count are drawn dark, so switch to white text there
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Label the axes once, then lay out so the labels are taken into account
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Plot non-normalized confusion matrix
# Create exactly one figure; plot_conf_mat draws on the current axes
fig, ax = plt.subplots(figsize=(6,4))
plot_conf_mat(df_cm_raw,
              classes=['Not Positive Sentiment', 'Positive Sentiment'],
              title='Confusion matrix')
plt.show()

In [None]:
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# ROC/AUC need a continuous score. XGBClassifier.predict() returns hard class labels,
# which would collapse the curve to a single threshold, so use the probability of the
# positive class (column 1 of predict_proba) instead.
scores_raw = xgb_estimator_restored.predict_proba(df_tfidf)[:, 1]

auc = round(metrics.roc_auc_score(y_raw, scores_raw), 4)
print('AUC is ' + repr(auc))

fpr, tpr, _ = metrics.roc_curve(y_raw, scores_raw)

plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b',
         label='AUC = %0.2f'% auc)
plt.legend(loc='lower right')
# Diagonal reference line for a random classifier
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()