Based on these posts: 
* https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0
* https://github.com/keisukeirie/Amazon_review_helpfulness_prediction
* https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/
* https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

In [None]:
!pip install -q boto3
!pip install -q xgboost==0.90
!pip install -q scikit-learn==0.20.3
!pip install -q nltk==3.4.5

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, execution role, region and default S3 bucket for this notebook.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket = sess.default_bucket()

# Low-level SageMaker client (currently disabled):
#sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [None]:
!head ./data/amazon90000.tsv

In [None]:
# Load the tab-separated reviews file and display its (rows, columns) shape.
df = pd.read_csv('./data/amazon90000.tsv', sep='\t')
df.shape

In [None]:
# Preview the first five raw rows.
df.head(n=5)

# TODO:  Clean commas from raw text

In [2]:
# ...

# Drop NaNs

In [None]:
# Shape of the raw frame before any rows are dropped.
df.shape

In [None]:
# True if any cell in any column is missing.
df.isna().any().any()

In [None]:
# Drop every row that contains a missing value, then renumber the rows 0..n-1.
# drop=True discards the old row labels; a bare reset_index() would keep them as
# an extra 'index' column that flows into downstream frames and full-frame CSV exports.
df_scrubbed = df.dropna().reset_index(drop=True)
df_scrubbed.shape

In [None]:
# Start the labeled frame from a copy of the scrubbed data; df_labeled is not
# defined by any earlier cell, so assigning a column to it directly raises NameError.
df_labeled = df_scrubbed.copy()

# Binary target: 1 for reviews rated 4 stars or more, 0 otherwise.
df_labeled['is_positive_sentiment'] = (df_labeled['star_rating'] >= 4).astype(int)
df_labeled.head(5)

In [None]:
import seaborn as sns

# Bar chart of how many reviews fall in each sentiment class before balancing.
sns.countplot(data=df_labeled, x='is_positive_sentiment')


# Balance the Dataset between Classes

In [None]:
from sklearn.utils import resample

# Split the labeled reviews by class.
is_negative_sentiment_df = df_labeled[df_labeled['is_positive_sentiment'] == 0]
is_positive_sentiment_df = df_labeled[df_labeled['is_positive_sentiment'] == 1]

# Randomly keep as many positive reviews as there are negative ones.
# Sampling is without replacement, so this assumes the negative class is the
# smaller one; the fixed seed makes the selection reproducible.
is_positive_downsampled_df = resample(
    is_positive_sentiment_df,
    replace=False,
    n_samples=is_negative_sentiment_df.shape[0],
    random_state=27,
)

# Balanced dataset: all negatives followed by the sampled positives.
df_labeled_and_balanced = pd.concat([is_negative_sentiment_df, is_positive_downsampled_df])


In [None]:
# Plot the class counts of the balanced frame built above.
# (The previous name, df_cleaned_and_filtered, is not defined by any cell.)
sns.countplot(x='is_positive_sentiment', data=df_labeled_and_balanced)


In [None]:
# $S3_BUCKET/feature-store/amazon-reviews/labeled-balanced/data.csv

# Export the balanced frame: the target path is 'labeled-balanced', whereas
# df_labeled is the frame from before the classes were balanced.
df_labeled_and_balanced.to_csv('./feature-store/amazon-reviews/labeled-balanced/data.csv', index=False, header=True)

In [None]:
from sklearn.model_selection import train_test_split

# Features (review text) and labels come from the balanced frame.
# (df_cleaned_and_filtered, the name used previously, is not defined by any cell.)
X = df_labeled_and_balanced[['review_body']]
y = df_labeled_and_balanced['is_positive_sentiment']

# NOTE(review): the full feature frame is written under .../train/ and the full
# label series under .../validation/ of a differently named directory. The paths
# are kept unchanged here; confirm they are what downstream consumers expect.
X.to_csv('./feature-store/amazon-reviews/scrubbed-and-balanced-features/csv/train/data.csv', index=False, header=True)
y.to_csv('./feature-store/amazon-reviews/labels-with-scrubbed-features/csv/validation/data.csv', index=False, header=True)


In [None]:
# Report the size of the full dataset before splitting.
for _template, _data in (('X.shape:  {}', X), ('y.shape:  {}', y)):
    print(_template.format(_data.shape))

# Seeded two-stage split: hold out 10% of the rows, then cut that holdout in
# half to obtain the validation and test sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.1, random_state=0)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0)

# Print the shape of every resulting piece.
_shape_report = (
    ('X.shape={}', X),
    ('y.shape={}', y),
    ('X_train.shape={}', X_train),
    ('y_train.shape={}', y_train),
    ('X_validation.shape={}', X_validation),
    ('y_validation.shape={}', y_validation),
    ('X_test.shape={}', X_test),
    ('y_test.shape={}', y_test),
)
for _template, _data in _shape_report:
    print(_template.format(_data.shape))


In [None]:
# Write the train and validation feature frames (text only, no label column).
# NOTE(review): a later cell writes df_train / df_validation to these same two
# paths, replacing these files.
for _frame, _path in (
    (X_train, './feature-store/amazon-reviews/labels-with-text-features/csv/train/data.csv'),
    (X_validation, './feature-store/amazon-reviews/labels-with-text-features/csv/validation/data.csv'),
):
    _frame.to_csv(_path, index=False, header=True)


In [None]:
# Preview the first five training feature rows.
X_train.head(n=5)

In [None]:
# Preview the first five training labels.
y_train.head(n=5)

In [None]:
# TODO:  Remove this in favor of explicitly merging these wherever they are needed

# Re-attach each label series to its feature frame by joining on the row index.
df_train = pd.merge(X_train, y_train, left_index=True, right_index=True)
df_train.head(5)

df_validation = pd.merge(X_validation, y_validation, left_index=True, right_index=True)
df_validation.head(5)

# Persist the labeled text splits, with a header row and without the index.
df_train.to_csv(
    './feature-store/amazon-reviews/labels-with-text-features/csv/train/data.csv',
    header=True,
    index=False,
)
df_validation.to_csv(
    './feature-store/amazon-reviews/labels-with-text-features/csv/validation/data.csv',
    header=True,
    index=False,
)

In [None]:
# S3 key prefix under which the train / validation CSVs are uploaded.
prefix = 'sagemaker/xgboost/data'

# NOTE(review): no visible cell writes ./data/train.csv or ./data/validation.csv;
# confirm these local files exist before running the uploads.
train_data_uri = sess.upload_data(
    key_prefix=prefix + "/train",
    path="./data/train.csv",
)
validation_data_uri = sess.upload_data(
    key_prefix=prefix + "/validation",
    path="./data/validation.csv",
)

# Show the URIs returned by the uploads.
print(train_data_uri)
print(validation_data_uri)

#s3_input_train_data = sagemaker.s3_input(s3_data=train_data_uri, content_type='text/csv')
#s3_input_validation_data = sagemaker.s3_input(s3_data=validation_data_uri, content_type='text/csv')


# AutoML

Typical dataset split:  `train` => `validation` => `test`

However, we only specify `train` and `test` (not `validation`) for AutoML.

AutoML will use all of the `train` data - splitting into `train` and `validation` on its own.  

We hold out `test` to test the final model after AutoML generates the best model candidates.

In [None]:
# AutoML training frame: keep only the label and the review text, label column first.
df_train_automl = df_train.loc[:, ['is_positive_sentiment', 'review_body']]
df_train_automl.shape


In [None]:
# Preview the first five AutoML training rows.
df_train_automl.head(n=5)

In [None]:
# TODO:  Change file name to indicate automl
# TODO:  Name `-train.csv` to distinguish from `-test.csv` next
# Write the AutoML training data without the index column.
df_train_automl.to_csv('./data/amazon-digital-video-download-cleaned-and-filtered.csv', index=False)

In [None]:
# df_test is not defined by any earlier cell, so build it here from the held-out
# test split by joining the labels back onto the features on the row index.
df_test = X_test.merge(y_test, left_index=True, right_index=True)

# AutoML test frame: label first, then the review text.
df_test_automl = df_test[['is_positive_sentiment', 'review_body']]
df_test_automl.shape


In [None]:
# Preview the first five AutoML test rows.
df_test_automl.head(n=5)

In [None]:
# TODO:  Change filename to indicate automl
# Write the AutoML test data without the index column.
df_test_automl.to_csv('./data/amazon-digital-video-download-cleaned-and-filtered-test.csv', index=False)

In [None]:
# TODO:  Unify these locations
prefix = 'xgboost-reviews/autopilot'

# TODO:  Change to `-train.csv` and remove the old files from S3 to avoid confusion
# Upload both AutoML CSVs under <prefix>/input in the session's default bucket.
sess.upload_data(
    bucket=bucket,
    key_prefix=prefix + '/input',
    path='./data/amazon-digital-video-download-cleaned-and-filtered.csv',
)
sess.upload_data(
    bucket=bucket,
    key_prefix=prefix + '/input',
    path='./data/amazon-digital-video-download-cleaned-and-filtered-test.csv',
)

In [None]:
!aws s3 ls $bucket/$prefix/input/

# Built-In

### Create X (features) and y (labels)

In [None]:
# Features (review text) and labels come from the balanced frame.
# (df_cleaned_and_filtered, the name used previously, is not defined by any cell.)
X = df_labeled_and_balanced[['review_body']]
y = df_labeled_and_balanced['is_positive_sentiment']

print('X.shape:  {}'.format(X.shape))
print('y.shape:  {}'.format(y.shape))

# TODO:  Split again to separate `validation` from `test`

In [None]:
from sklearn.model_selection import train_test_split

# Single seeded 90/5/5 split: hold out 10% of the rows, then halve the holdout
# into validation and test. An extra unseeded train/test split used to run
# first, but all four of its outputs were overwritten by these two calls.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.1, random_state=0)
X_validation, X_test, y_validation, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=0)

print('X.shape={}'.format(X.shape))
print('y.shape={}'.format(y.shape))

print('X_train.shape={}'.format(X_train.shape))
print('y_train.shape={}'.format(y_train.shape))

print('X_validation.shape={}'.format(X_validation.shape))
print('y_validation.shape={}'.format(y_validation.shape))

print('X_test.shape={}'.format(X_test.shape))
print('y_test.shape={}'.format(y_test.shape))


In [None]:
# Preview the first five training feature rows.
X_train.head(n=5)

In [None]:
# Preview the first five training labels.
y_train.head(n=5)

In [None]:
# scikit-Learn==0.20.3
# nltk==3.4.5

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin

import nltk
import re

class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks a single field out of its input.

    `field` is the key used to index the input in `transform`
    (for example a DataFrame column name).
    """

    def __init__(self, field):
        self.field = field

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return `X[self.field]`."""
        selected = X[self.field]
        return selected
    
def Tokenizer(str_input):
    """Turn a string into a list of lower-cased, Porter-stemmed tokens.

    Every character other than an ASCII letter, a digit or '-' is replaced by
    a space; the result is lower-cased, split on whitespace, and each token is
    stemmed with NLTK's PorterStemmer.
    """
    stemmer = nltk.PorterStemmer()
    normalized = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower()
    return [stemmer.stem(token) for token in normalized.split()]

# Text pipeline for the review body: select the column, build TF-IDF features
# over 1- to 3-grams, then reduce them to 300 SVD components (for XGB).
body_pipeline = Pipeline([
    ('body_text_selector', TextSelector('review_body')),
    ('tfidf_vectorizer', TfidfVectorizer(
        tokenizer=Tokenizer,
        stop_words="english",
        min_df=.0025,
        max_df=0.25,
        ngram_range=(1,3),
    )),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
])

feature_union = FeatureUnion([('body', body_pipeline)])

# Fit on the training features and show the shape of the transformed matrix.
X_train_transformed = feature_union.fit_transform(X_train)
X_train_transformed.shape

In [None]:
# For CSV training, the XGBoost built-in algorithm assumes that the target variable is in the first column and that the CSV does not have a header record.

X_train_new_pd = pd.DataFrame(X_train_transformed)

# Insert the labels as a plain array so they are assigned by position.
# Passing the y_train Series makes pandas align it on index labels, and this
# frame has a fresh 0..n-1 index while y_train keeps the shuffled original row
# labels: rows would receive the wrong label or NaN. The fillna(0) that used to
# follow then turned those NaNs into negative labels, so it is removed.
X_train_new_pd.insert(0, 'is_positive_sentiment', y_train.values)

X_train_new_pd.head(5)


# TODO:  Rename this to _builtin.csv or _tfidf.csv

In [None]:
# Write the label-first training matrix with neither header row nor index column.
X_train_new_pd.to_csv('./data/train_transformed.csv', header=False, index=False)

In [None]:
# Apply the transformers already fitted on the training data. Calling
# fit_transform here would refit TF-IDF and SVD on the held-out rows, producing
# features in a different space from the one the model is trained on.
X_test_transformed = feature_union.transform(X_test)
X_test_transformed.shape

In [None]:
# For CSV training, the XGBoost built-in algorithm assumes that the target variable is in the first column and that the CSV does not have a header record.

X_test_new_pd = pd.DataFrame(X_test_transformed)

# Insert the labels as a plain array so they are assigned by position.
# Passing the y_test Series makes pandas align it on index labels, and this
# frame has a fresh 0..n-1 index while y_test keeps the shuffled original row
# labels: rows would receive the wrong label or NaN. The fillna(0) that used to
# follow then turned those NaNs into negative labels, so it is removed.
X_test_new_pd.insert(0, 'is_positive_sentiment', y_test.values)

X_test_new_pd.head(5)


# TODO:  Rename this to _builtin.csv or _tfidf.csv

In [None]:
# Write the label-first held-out matrix with neither header row nor index column.
# NOTE(review): the file is named "validation" but holds the X_test / y_test split.
X_test_new_pd.to_csv('./data/validation_transformed.csv', header=False, index=False)