This notebook is based on the following posts:
* https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0
* https://github.com/keisukeirie/Amazon_review_helpfulness_prediction
* https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/
* https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

In [None]:
!pip install -q boto3
!pip install -q xgboost==0.90
!pip install -q scikit-learn==0.20.3
!pip install -q nltk==3.4.5

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session plus the default bucket and execution role it resolves.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
# Region name as reported by a default boto3 session.
region = boto3.Session().region_name

# Low-level SageMaker client bound to that region.
# NOTE(review): `sm` is not referenced in the visible part of this notebook — confirm it is still needed.
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [None]:
!head ./data/amazon90000.tsv

In [None]:
# Read the tab-separated review file into a DataFrame and display its dimensions.
reviews_path = './data/amazon90000.tsv'
df = pd.read_csv(reviews_path, sep='\t')
df.shape

In [None]:
df.head(5)

# Drop any NaNs

In [None]:
df.shape

In [None]:
df.isna().values.any()

In [None]:
# Remove every row that contains a missing value, then renumber the rows.
# drop=True discards the old index; without it reset_index() inserts the old
# index as an extra "index" column that is carried through every later frame.
df_cleaned_and_filtered = df.dropna().reset_index(drop=True)
df_cleaned_and_filtered.shape

In [None]:
# Keep only the reviews that received at least one helpful vote.
# .copy() makes the filtered frame an independent object, so the column
# assignments performed on it afterwards do not raise SettingWithCopyWarning.
df_cleaned_and_filtered = df_cleaned_and_filtered.query('helpful_votes > 0').copy()
df_cleaned_and_filtered.shape

In [None]:
df_cleaned_and_filtered.head(5)

In [None]:
# Fraction of all votes on a review that were "helpful" votes.
helpful_votes = df_cleaned_and_filtered['helpful_votes']
total_votes = df_cleaned_and_filtered['total_votes']
df_cleaned_and_filtered['pct_helpful_votes'] = helpful_votes / total_votes
df_cleaned_and_filtered.shape

In [None]:
df_cleaned_and_filtered.head(5)

In [None]:
# Flag a review as helpful when more than 75% of its votes were helpful votes.
helpful_ratio = df_cleaned_and_filtered['pct_helpful_votes']
df_cleaned_and_filtered['is_helpful'] = helpful_ratio.gt(0.75)
df_cleaned_and_filtered.head(5)

In [None]:
# Binary sentiment label: 1 for star ratings of 4 or more, 0 otherwise.
star_ratings = df_cleaned_and_filtered['star_rating']
df_cleaned_and_filtered['is_positive_sentiment'] = star_ratings.ge(4).astype(int)
df_cleaned_and_filtered.head(5)

In [None]:
import seaborn as sns

sns.countplot(x='is_positive_sentiment', data=df_cleaned_and_filtered)


## Balance the dataset

In [None]:
from sklearn.utils import resample

# Split the reviews by sentiment label.
is_negative_sentiment_df = df_cleaned_and_filtered.query('is_positive_sentiment == 0')
is_positive_sentiment_df = df_cleaned_and_filtered.query('is_positive_sentiment == 1')

# Both classes are cut down to the size of the smaller one. Sampling without
# replacement cannot draw more rows than a class contains, so using the
# minimum keeps this cell working whichever class happens to be the majority.
n_per_class = min(len(is_negative_sentiment_df), len(is_positive_sentiment_df))

is_positive_downsampled_df = resample(is_positive_sentiment_df,
                                      replace = False, # sample without replacement
                                      n_samples = n_per_class, # match minority n
                                      random_state = 27) # reproducible results

# The negative class is only resampled when it is the larger one, so the
# result is unchanged in the usual case where positives are the majority.
is_negative_downsampled_df = is_negative_sentiment_df
if len(is_negative_sentiment_df) > n_per_class:
    is_negative_downsampled_df = resample(is_negative_sentiment_df,
                                          replace = False,
                                          n_samples = n_per_class,
                                          random_state = 27)

df_cleaned_and_filtered = pd.concat([is_negative_downsampled_df, is_positive_downsampled_df])


In [None]:
sns.countplot(x='is_positive_sentiment', data=df_cleaned_and_filtered)


## Create X (features) and y (labels)

In [None]:
# Features: a one-column DataFrame holding the review text.
# Labels: the binary sentiment column as a Series.
feature_columns = ['review_body']
X = df_cleaned_and_filtered[feature_columns]
y = df_cleaned_and_filtered['is_positive_sentiment']

# Report the dimensions of both objects.
print('X.shape:  {}'.format(X.shape))
print('y.shape:  {}'.format(y.shape))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing.
# random_state makes the split repeatable from run to run (the resampling
# above is seeded too), and stratify=y keeps the class proportions the same
# in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=27, stratify=y)

In [None]:
X_train.head(5)

In [None]:
y_train.head(5)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a single field out of its input.

    ``transform`` returns ``X[field]``; ``fit`` learns nothing.
    """

    def __init__(self, field):
        # Key used to index the input in transform().
        self.field = field

    def fit(self, X, y=None):
        """No fitting is required; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return the configured field of ``X``."""
        selected = X[self.field]
        return selected

In [None]:
import nltk
import re
def Tokenizer(str_input):
    """Split a string into lower-cased, Porter-stemmed tokens.

    Every character other than ASCII letters, digits and ``-`` is replaced
    by a space before the text is lower-cased and split on whitespace.
    Returns the list of stemmed tokens.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

### Create Scikit-Learn Pipeline with XGBClassifier

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier

# Text branch: select the review body, vectorize it with TF-IDF over 1- to
# 3-grams, then reduce the result to 300 components for XGBoost.
body_pipeline = Pipeline([
    ('body_text_selector', TextSelector('review_body')),
    ('tfidf_vectorizer', TfidfVectorizer(tokenizer=Tokenizer,
                                         stop_words="english",
                                         min_df=.0025,
                                         max_df=0.25,
                                         ngram_range=(1,3))),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
])

# The feature union currently holds just the one text branch.
feature_union = FeatureUnion([
    ('body', body_pipeline),
])

# Gradient-boosted binary classifier fed by the features above.
xgb_classifier = XGBClassifier(objective='binary:logistic',
                               max_depth=5,
                               n_estimators=300)

pipeline = Pipeline([
    ('features', feature_union),
    ('classifier', xgb_classifier),
])

### Fit the model

_Note:  This will take a couple of minutes.  Please be patient._

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
pipeline.steps[1][1]

In [None]:
##
# TODO:  https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd
#        https://gitlab.com/juliensimon/awsdevdays2020/-/blob/master/mls1/XGBoost.ipynb
##

#plot feature importance

import matplotlib.pyplot as plt
import xgboost

# Draw the 30 highest-gain features of the pipeline's classifier step
# on a 12x12 figure.
fig, ax = plt.subplots(figsize=(12,12))
fitted_classifier = pipeline.named_steps['classifier']
xgboost.plot_importance(
    fitted_classifier,
    ax=ax,
    importance_type='gain',
    max_num_features=30,
    height=0.8,
    show_values=True,
)
plt.title('Feature Importance')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

# Predict on the held-out rows, then report overall accuracy and the
# per-class precision (average=None yields one value per class).
test_preds = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, test_preds)
test_precision = precision_score(y_test, test_preds, average=None)

print('Test Accuracy: ', test_accuracy)
print('Test Precision: ', test_precision)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
df_cm = confusion_matrix(y_test, test_preds)
df_cm

In [None]:
# TODO:  Explain the classifier

# Test Metrics

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Confusion-matrix heatmap.
sn.set(font_scale=1.4) # for label size

# Class 0 is negative sentiment and class 1 is positive sentiment.
class_names = ['Negative (0)', 'Positive (1)']

# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation once a count has more than two digits.
ax = sn.heatmap(df_cm,
                annot=True,
                annot_kws={"size": 16}, # font size
                fmt='d',
                xticklabels=class_names,
                yticklabels=class_names)

# confusion_matrix puts the true classes on rows and the predicted classes on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

plt.show()