# Using BERT with `Simple Transformers`

Install `simpletransformers`, a library built on top of the Transformers library by HuggingFace:  https://github.com/ThilinaRajapakse/simpletransformers/

**Binary Classification**:  https://towardsdatascience.com/simple-transformers-introducing-the-easiest-bert-roberta-xlnet-and-xlm-library-58bf8c59b2a3

In [None]:
!pip install -q boto3
!pip install -q scikit-learn==0.20.3
!pip install -q simpletransformers==0.22.1
!pip install -q tensorboardx==2.0
!pip install -q torch==1.4.0 torchvision==0.5.0

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the notebook's execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# One boto3 session supplies both the region name and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

# Download the Data Locally

In [None]:
!aws s3 cp 's3://{bucket}/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz' ./data/

In [None]:
import csv

# Local copy of the gzipped, tab-separated review file.
reviews_path = './data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'

# QUOTE_NONE: treat quote characters as ordinary text instead of field quoting.
df = pd.read_csv(
    reviews_path,
    sep='\t',
    compression='gzip',
    quoting=csv.QUOTE_NONE,
)
df.shape

In [None]:
df.head(5)

# Enrich the Data with `is_positive_sentiment` Column

In [None]:
# df['is_positive_sentiment'] = (df['star_rating'] >= 4).astype(int)
# df.head(5)

# Adapt the Dataset to Simple Transformers Convention.

By `simpletransformers` convention, the dataframe must have 2 columns:
* `text`
* `labels`

In [None]:
# Keep only the review text and the star rating, renamed to the `text` /
# `labels` column names.
# The explicit .copy() makes df_bert an independent DataFrame rather than a
# selection pandas still tracks as derived from `df`; without it, assigning
# to a column of df_bert later raises SettingWithCopyWarning.
df_bert = df[['review_body', 'star_rating']].copy()
df_bert.columns = ['text', 'labels']

# Start Labels at 0 instead of 1
SimpleTransformers requires that our labels start at 0, so we subtract 1 from every star rating:  https://medium.com/swlh/simple-transformers-multi-class-text-classification-with-bert-roberta-xlnet-xlm-and-8b585000ce3a

In [None]:
# Shift every label down by one, then preview the first 500 rows.
df_bert['labels'] = df_bert['labels'].sub(1)
df_bert.head(500)

# To Lower the Training Time, Use a Subset of the Dataset

In [None]:
# Keep only the first 2000 rows and confirm the reduced shape.
df_bert = df_bert.head(2000)
df_bert.shape

# Split the Data into `train`, `validation`, and `test`.

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split -- and every metric computed from it -- is
# reproducible across notebook runs instead of changing on each execution.
SPLIT_SEED = 42

# First split: 60% train, 40% holdout.
df_bert_train, df_bert_holdout = train_test_split(df_bert,
                                                  test_size=0.40,
                                                  random_state=SPLIT_SEED)
# Second split: halve the holdout into validation and test (20% each overall).
df_bert_validation, df_bert_test = train_test_split(df_bert_holdout,
                                                    test_size=0.50,
                                                    random_state=SPLIT_SEED)

print(df_bert_train.shape)
print(df_bert_validation.shape)
print(df_bert_test.shape)

# Train the Classification Model

In [None]:
from simpletransformers.classification import ClassificationModel

# args = {
#    'output_dir': 'outputs/',
#    'cache_dir': 'cache/',
#    'fp16': False,
#    'max_seq_length': 128,
#    'train_batch_size': 8,
#    'eval_batch_size': 8,
#    'gradient_accumulation_steps': 1,
#    'num_train_epochs': 1,
#    'weight_decay': 0,
#    'learning_rate': 3e-5,
#    'adam_epsilon': 1e-8,
#    'warmup_ratio': 0.06,
#    'warmup_steps': 0,
#    'max_grad_norm': 1.0,
#    'logging_steps': 50,
#    'evaluate_during_training': False,
#    'save_steps': 2000,
#    'eval_all_checkpoints': True,
#    'use_tensorboard': True,
#    'tensorboard_dir': 'tensorboard',
#    'overwrite_output_dir': True,
#    'reprocess_input_data': False,
# }

# Settings overridden for this run; everything else is left to the library.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    num_train_epochs=3,
)

# Cased DistilBERT checkpoint configured for five labels, with CUDA disabled.
# model_type may be bert, distilbert, etc.
bert_model = ClassificationModel(
    model_type='distilbert',
    model_name='distilbert-base-cased',
    num_labels=5,
    use_cuda=False,
    args=train_args,
)

# NOTE(review): the validation frame is passed in, but train_args does not set
# 'evaluate_during_training' -- confirm whether eval_df is actually used here.
bert_model.train_model(
    train_df=df_bert_train,
    eval_df=df_bert_validation,
    show_running_loss=True,
)

# Evaluate the Model Using the `test` Dataset

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded, so `sklearn.metrics.accuracy_score` would
# only resolve if some other cell had already imported an sklearn submodule.
import sklearn.metrics

# Evaluate on the held-out test split; the extra `acc` keyword supplies
# accuracy_score as an additional metric alongside the library's own results.
result, model_outputs, wrong_predictions = bert_model.eval_model(eval_df=df_bert_test, acc=sklearn.metrics.accuracy_score)

result

## Show the Bad Predictions

In [None]:
# Report how many test examples were misclassified, then list their text.
wrong_count = len(wrong_predictions)
print('Number of wrong predictions: {}'.format(wrong_count))
print('\n')

for wrong_example in wrong_predictions:
    review_text = wrong_example.text_a
    print(review_text)
    print('\n')

## Calculate the Accuracy and Precision

In [None]:
# Run the trained model over the raw test review texts.
test_texts = df_bert_test['text'].tolist()
preds_test, preds_raw_outputs = bert_model.predict(test_texts)
preds_test

In [None]:
# Ground-truth labels for the test split, to compare against preds_test.
y_test = df_bert_test.loc[:, 'labels']
y_test.shape

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

# Overall accuracy on the test split.
test_accuracy = accuracy_score(y_test, preds_test)
print('Test Accuracy: ', test_accuracy)

# average=None yields one precision value per class rather than a single mean.
per_class_precision = precision_score(y_test, preds_test, average=None)
print('Test Precision: ', per_class_precision)

# Perform Ad-Hoc Predictions

In [None]:
# Ad-hoc check with a clearly favourable review.
positive_review = """I really enjoyed this item.  I highly recommend it."""
predictions, raw_outputs = bert_model.predict([positive_review])

print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))

In [None]:
# Ad-hoc check with a clearly unfavourable review.
negative_review = """This item is awful and terrible."""
predictions, raw_outputs = bert_model.predict([negative_review])

print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))