Based on these posts: 

https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0

https://github.com/keisukeirie/Amazon_review_helpfulness_prediction

In [None]:
!pip install -q boto3
!pip install -q xgboost

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session plus the values derived from it / from boto3:
# the session's default S3 bucket, the notebook's execution role and the
# region of the default boto3 session.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Low-level boto3 client for the SageMaker service in that region.
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [None]:
!head ./amazon20000.csv

In [None]:
# Load the reviews CSV into a DataFrame and display its (rows, columns) shape.
df = pd.read_csv('amazon20000.csv')
df.shape

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

# Drop any NaNs

In [None]:
# Shape before dropping rows, for comparison with the cleaned frame.
df.shape

In [None]:
# True if at least one cell anywhere in the DataFrame is missing.
df.isna().values.any()

In [None]:
# Drop every row that contains at least one missing value, then show the
# resulting shape.
df_dropna = df.dropna()
df_dropna.shape

In [None]:
# Keep only the reviews that received at least one helpful vote.
# `.copy()` makes the result an independent DataFrame: the cells below add
# columns to it, and assigning into an un-copied filter result triggers
# pandas' SettingWithCopyWarning.
df_cleaned_and_filtered = df_dropna.query('helpful_votes > 0').copy()
df_cleaned_and_filtered.shape

In [None]:
# Preview the filtered rows.
df_cleaned_and_filtered.head(5)

In [None]:
# Fraction of a review's votes that were "helpful": helpful_votes / total_votes.
df_cleaned_and_filtered['pct_helpful_votes'] = df_cleaned_and_filtered['helpful_votes'] / df_cleaned_and_filtered['total_votes']
df_cleaned_and_filtered.shape

In [None]:
# Preview the rows with the new `pct_helpful_votes` column.
df_cleaned_and_filtered.head(5)

In [None]:
# Boolean flag: True when more than 75% of the review's votes were helpful.
df_cleaned_and_filtered['is_helpful'] = df_cleaned_and_filtered['pct_helpful_votes'] > 0.75
df_cleaned_and_filtered.head(5)

In [None]:
# Binary label: 1 when the star rating is 4 or higher, otherwise 0.
df_cleaned_and_filtered['is_positive_sentiment'] = (df_cleaned_and_filtered['star_rating'] >= 4).astype(int)
df_cleaned_and_filtered.head(5)

Create X (features) and y (labels)

In [None]:
# X stays a one-column DataFrame (double brackets) so pipeline steps can select
# the column by name; y is the binary sentiment label.
X = df_cleaned_and_filtered[['review_body']]
y = df_cleaned_and_filtered['is_positive_sentiment']

print('X.shape:  {}'.format(X.shape))
print('y.shape:  {}'.format(y.shape))

In [None]:
# TODO:  Balance the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. No random_state is passed, so the
# split (and every metric below) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [None]:
# Preview the training features.
X_train.head(5)

In [None]:
# Preview the training labels.
y_train.head(5)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one entry of the input by key.

    Parameters
    ----------
    field
        Key used to index the input in ``transform`` (``X[field]``).
    """

    def __init__(self, field):
        self.field = field

    def fit(self, X, y=None):
        """Learn nothing; return the selector itself."""
        return self

    def transform(self, X):
        """Return ``X[self.field]``."""
        selected = X[self.field]
        return selected
    
#class NumberSelector(BaseEstimator, TransformerMixin):
#    def __init__(self, field):
#        self.field = field
#    def fit(self, X, y=None):
#        return self
#    def transform(self, X):
#        return X[[self.field]]

In [None]:
import nltk
import re
# Built once instead of on every call: the vectorizer invokes the tokenizer for
# each document, so per-call stemmer construction and pattern lookup is wasted
# work.
_NON_TOKEN_CHARS = re.compile(r"[^A-Za-z0-9\-]")
_PORTER_STEMMER = nltk.PorterStemmer()


def Tokenizer(str_input):
    """Split a string into lower-cased, Porter-stemmed word tokens.

    Every character other than an ASCII letter, a digit or ``-`` is replaced
    by a space before the text is lower-cased and split on whitespace.

    Parameters
    ----------
    str_input : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The stemmed tokens in their original order (empty for blank input).
    """
    words = _NON_TOKEN_CHARS.sub(" ", str_input).lower().split()
    return [_PORTER_STEMMER.stem(word) for word in words]

### Create Scikit-Learn Pipeline with SGDClassifier

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
#from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

# Text-classification pipeline: select the `review_body` column, build TF-IDF
# features over stemmed 1- to 3-grams, reduce them to 300 SVD components and
# feed the result to an SGD-trained linear classifier.
classifier = Pipeline([
    ('features', FeatureUnion([
        ('body', Pipeline([
            ('body_text_selector', TextSelector('review_body')),
            # min_df / max_df are floats, i.e. document-frequency proportions.
            ('tfidf_vectorizer', TfidfVectorizer(tokenizer=Tokenizer, stop_words="english",
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
# Disabled: optional second feature branch using the scaled star rating.
#        ('star_rating', Pipeline([
#            ('star_rating_selector', NumberSelector('star_rating')),
#            ('standard_scaler', StandardScaler()),
#        ])),
    ])),
    ('classifier', SGDClassifier(learning_rate='optimal'))
# Disabled: alternative final estimators.
#    ('classifier', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
#    ('classifier', RandomForestClassifier()),
    ])

Fit the model

In [None]:
# Fit every pipeline step (feature extraction + classifier) on the training split.
classifier.fit(X_train, y_train)

Predict and calculate metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

# Predict labels for the held-out split.
preds = classifier.predict(X_test)

# average=None makes precision_score return one precision value per class
# rather than a single averaged number.
print('Accuracy: ', accuracy_score(y_test, preds))
print('Precision: ', precision_score(y_test, preds, average=None))

In [None]:
# Per-class precision / recall / F1 / support summary.
print(classification_report(y_test, preds))

In [None]:
# Confusion matrix of the test predictions: rows are the true labels, columns
# the predicted labels. Despite the `df_` prefix this is a NumPy array.
df_cm = confusion_matrix(y_test, preds)
df_cm

In [None]:
# TODO:  Explain the classifier

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Plot the confusion matrix as an annotated heatmap.
#plt.figure(figsize = (10,7))
sn.set(font_scale=1.4) # for label size
# fmt='d' prints the integer counts verbatim; seaborn's default ('.2g') falls
# back to scientific notation (e.g. 1.2e+02) once a count reaches three digits.
ax = sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}) # font size

# sklearn's confusion_matrix puts the true labels on the rows (y-axis) and the
# predicted labels on the columns (x-axis).
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

# TODO:  Balance the dataset before training
# TODO:  Add per-quadrant labels (TN / FP / FN / TP)

plt.show()

## TensorFlow

In [None]:
!pip3 uninstall -q -y tensorflow tensorflow-estimator tb-nightly tf-estimator-nightly tensorboard

In [None]:
!pip uninstall -q -y tensorflow tensorflow-estimator tb-nightly tf-estimator-nightly tensorboard

In [None]:
!pip install -q tensorflow==2.0.0b1 --upgrade --ignore-installed --no-cache --user tensorboard==1.14.0

In [None]:
!pip install tensorflow-hub

In [None]:
import tensorflow as tf

# Print the TensorFlow version that was actually imported.
print("Version: ", tf.__version__)

import os
# Set before `import keras` so the variable is already in the environment when
# Keras is loaded.
os.environ['KERAS_BACKEND'] = 'tensorflow'
import keras

# Print the backend Keras reports it is using.
print(keras.backend.backend())

In [None]:
# Rebuild X (one-column DataFrame of review text) and y (binary sentiment
# label) for the TensorFlow experiments.
X = df_cleaned_and_filtered[['review_body']]
y = df_cleaned_and_filtered['is_positive_sentiment']

print('X.shape:  {}'.format(X.shape))
print('y.shape:  {}'.format(y.shape))

Split X (features) and y (labels) into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. No random_state is passed, so this is a
# different split from the one used for the scikit-learn model above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [None]:
def one_hot_column(feature_name, vocab):
  """Build a one-hot (indicator) feature column for a categorical feature.

  Args:
    feature_name: Key of the feature in the input features dictionary.
    vocab: The distinct values the feature can take.

  Returns:
    An indicator column wrapping a vocabulary-list categorical column.
  """
  # Use the fully-qualified `tf.feature_column` module: no `fc` alias is
  # imported anywhere in this notebook.
  categorical = tf.feature_column.categorical_column_with_vocabulary_list(
      feature_name, vocab)
  return tf.feature_column.indicator_column(categorical)

In [None]:
import tensorflow_hub as hub

# Names of the input columns, grouped by how they are turned into feature
# columns. Only the review text is enabled at the moment.
CATEGORICAL_COLUMNS = [] # ['is_positive_sentiment']
NUMERIC_COLUMNS = [] # ['star_rating']
TEXT_COLUMNS = ['review_body']

feature_columns = []

for feature_name in CATEGORICAL_COLUMNS:
    # Need to one-hot encode categorical features.
    vocabulary = X_train[feature_name].unique()
    feature_columns.append(one_hot_column(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    # Fully-qualified module name: no `fc` alias is imported in this notebook.
    feature_columns.append(tf.feature_column.numeric_column(feature_name,
                                                            dtype=tf.float32))

for feature_name in TEXT_COLUMNS:
    # The key must be the column's own name: the input function builds the
    # features dict from the DataFrame, so its keys are the column names
    # (a hard-coded "sentence" key would never be found).
    # NOTE(review): confirm that BoostedTreesClassifier accepts a text
    # embedding column in the TensorFlow version installed above.
    feature_columns.append(hub.text_embedding_column(key=feature_name,
                                                     module_spec="https://tfhub.dev/google/nnlm-en-dim128/1"))

In [None]:
# Number of training rows. Used below as both the shuffle buffer size and the
# batch size, so one batch holds the entire training set (the dataset is small).
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
  """Return a zero-argument function that builds the tf.data input dataset.

  Args:
    X: DataFrame of features; converted to a dict of column name -> list.
    y: Labels aligned with the rows of X.
    n_epochs: Passed to `Dataset.repeat`; None cycles through the data for as
      long as training needs it.
    shuffle: Whether to shuffle with a buffer of NUM_EXAMPLES elements.

  Returns:
    A callable producing a dataset of (features, labels) batches of up to
    NUM_EXAMPLES elements.
  """
  def input_fn():
    features = X.to_dict(orient='list')
    ds = tf.data.Dataset.from_tensor_slices((features, y))
    if shuffle:
      ds = ds.shuffle(NUM_EXAMPLES)
    ds = ds.repeat(n_epochs)
    return ds.batch(NUM_EXAMPLES)
  return input_fn

# Training and evaluation input functions.
# Training: shuffled and repeated indefinitely (n_epochs defaults to None).
# Evaluation: a single unshuffled pass over the test split.
train_input_fn = make_input_fn(X_train, y_train)
eval_input_fn = make_input_fn(X_test, y_test, shuffle=False, n_epochs=1)

In [None]:
# Keyword arguments forwarded to the boosted-trees estimator.
params = {
  'n_trees': 50,
  'max_depth': 3,
  'n_batches_per_layer': 1,
  # You must enable center_bias = True to get DFCs. This will force the model to
  # make an initial prediction before using any features (e.g. use the mean of
  # the training labels for regression or log odds for classification when
  # using cross entropy loss).
  'center_bias': True
}

tf_gbt_classifier = tf.estimator.BoostedTreesClassifier(feature_columns, **params)

# Train model; training stops after at most 100 steps.
tf_gbt_classifier.train(train_input_fn, max_steps=100)

### Create Scikit-Learn Pipeline with XGBoostClassifier

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
#from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#from sklearn.linear_model import SGDClassifier

# Same feature extraction as the SGD pipeline (column select -> TF-IDF over
# stemmed 1- to 3-grams -> 300 SVD components), with an XGBoost classifier as
# the final estimator. Rebinds the `classifier` name.
classifier = Pipeline([
    ('features', FeatureUnion([
        ('body', Pipeline([
            ('body_text_selector', TextSelector('review_body')),
            # min_df / max_df are floats, i.e. document-frequency proportions.
            ('tfidf_vectorizer', TfidfVectorizer(tokenizer=Tokenizer, stop_words="english",
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
# Disabled: optional second feature branch using the scaled star rating.
#        ('star_rating', Pipeline([
#            ('star_rating_selector', NumberSelector('star_rating')),
#            ('standard_scaler', StandardScaler()),
#        ])),
    ])),
# Disabled: alternative final estimators.
#    ('classifier', SGDClassifier(learning_rate='optimal'))
    ('classifier', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
#    ('classifier', RandomForestClassifier()),
    ])

Fit the model

In [None]:
# Fit every pipeline step (feature extraction + XGBoost) on the training split.
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

# Predict labels for the held-out split.
preds = classifier.predict(X_test)

# average=None makes precision_score return one precision value per class
# rather than a single averaged number.
print('Accuracy: ', accuracy_score(y_test, preds))
print('Precision: ', precision_score(y_test, preds, average=None))

In [None]:
# Per-class precision / recall / F1 / support summary.
print(classification_report(y_test, preds))

In [None]:
# Confusion matrix of the test predictions: rows are the true labels, columns
# the predicted labels. Despite the `df_` prefix this is a NumPy array.
df_cm = confusion_matrix(y_test, preds)
df_cm

In [None]:
# TODO:  Explain the classifier

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Plot the confusion matrix as an annotated heatmap.
#plt.figure(figsize = (10,7))
sn.set(font_scale=1.4) # for label size
# fmt='d' prints the integer counts verbatim; seaborn's default ('.2g') falls
# back to scientific notation (e.g. 1.2e+02) once a count reaches three digits.
ax = sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16}) # font size

# sklearn's confusion_matrix puts the true labels on the rows (y-axis) and the
# predicted labels on the columns (x-axis).
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

# TODO:  Balance the dataset
# TODO:  Add labels to each quadrant (False, True / False, True)

plt.show()

### BERT
MultiLabel Classification:  https://towardsdatascience.com/multi-label-classification-using-bert-roberta-xlnet-xlm-and-distilbert-with-simple-transformers-b3e0cda12ce5

BinaryClassification:  https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3

Install `simpletransformers`, a library built on top of Hugging Face Transformers

In [None]:
!pip install simpletransformers==0.19.9

In [None]:
!pip install tensorboardx==2.0

In [None]:
!pip install torch==1.4.0

### Prepare the dataset for BERT.

By `simpletransformers` convention, the dataframe must have 2 columns:
* `text`
* `labels`

In [None]:
# Keep only the review text and the sentiment label, then rename the two
# columns to `text` and `labels`.
df_bert = df_cleaned_and_filtered[['review_body', 'is_positive_sentiment']]
df_bert.columns = ['text', 'labels']
df_bert.head(5)

### Split the data into train and test.  

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; no random_state, so the split
# changes on every run.
df_bert_train, df_bert_test = train_test_split(df_bert, test_size=0.10)

### Train the model

In [None]:
from simpletransformers.classification import ClassificationModel

# Create a TransformerModel

# Training / evaluation options handed to ClassificationModel via `args=`.
# The model type and name are passed to the constructor directly, so they are
# left disabled here.
args = {
    # 'model_type':  'bert',
    # 'model_name': 'bert-base-cased',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'fp16': False,
    # 'fp16_opt_level': 'O1',
    'max_seq_length': 128,
    'train_batch_size': 8,
    'eval_batch_size': 8,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 1,
    'weight_decay': 0,
    'learning_rate': 3e-5,
    'adam_epsilon': 1e-8,
    'warmup_ratio': 0.06,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,
    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,
    'use_tensorboard': True,
    # The comma after 'runs/' is required: without it Python joins the two
    # adjacent string literals and the dict literal becomes a SyntaxError.
    'tensorboard_dir': 'runs/',
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
}

# Build a BERT classifier from the pretrained `bert-base-cased` weights with
# the options above; use_cuda=False keeps it off the GPU.
bert_model = ClassificationModel(model_type='bert', #roberta, etc.
                                 model_name='bert-base-cased',
                                 args=args,
                                 use_cuda=False)

# Fine-tune on the training split. The test split is passed as eval_df, but
# 'evaluate_during_training' is False in `args`.
bert_model.train_model(train_df=df_bert_train,
                       eval_df=df_bert_test,
                       show_running_loss=True)

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is available as an attribute of the package.
import sklearn.metrics

# Evaluate on the held-out split. `acc` is passed as an extra keyword metric.
# NOTE(review): presumably eval_model reports it under the key 'acc' in
# `result` -- confirm against the simpletransformers docs.
result, model_outputs, wrong_predictions = bert_model.eval_model(eval_df=df_bert_test, acc=sklearn.metrics.accuracy_score)

In [None]:
# Classify a single hand-written review; predict() takes a list of texts.
predictions, raw_outputs = bert_model.predict(["""Very funny. A typical mid 50's comedy."""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))

In [None]:
# Same check with a short negative example.
predictions, raw_outputs = bert_model.predict(["""bad movie"""])
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))