In [33]:
# with hugging face wrapper for sanity check
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from ast import literal_eval
import re

In [34]:
#training hyperparameters
MAX_TOKENS = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 2
LEARNING_RATE = 1e-05

# change to true to run per review
EXPANDED = False

model_checkpoint = "distilbert/distilbert-base-uncased"

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [35]:
review_df = pd.read_csv('../data/split/train.csv', usecols=['Overall Compliance', 'reviews'])
test_df = pd.read_csv('../data/split/test.csv', usecols=['Overall Compliance', 'reviews'])
val_df = pd.read_csv('../data/split/val.csv', usecols=['Overall Compliance', 'reviews'])
review_df['reviews'] = review_df['reviews'].apply(literal_eval)
test_df['reviews'] = test_df['reviews'].apply(literal_eval)
val_df['reviews'] = val_df['reviews'].apply(literal_eval)

# test classifying at reivew level then resturant level
if EXPANDED:
    review_df = review_df.explode('reviews')
    review_df = review_df.reset_index().drop(columns=['index'])

    test_df = test_df.explode('reviews')
    test_df = test_df.reset_index().drop(columns=['index'])
    
    val_df = val_df.explode('reviews')
    val_df = val_df.reset_index().drop(columns=['index'])

train_ds = Dataset.from_pandas(review_df)
test_ds = Dataset.from_pandas(test_df)
val_ds = Dataset.from_pandas(val_df)

review_dataset = DatasetDict()

review_dataset['train'] = train_ds
review_dataset['test'] = test_ds
review_dataset['val'] = val_ds

In [36]:
def clean_reviews(reviews):
    cleaned = []
    for review in reviews:
        review = review.replace('\n', ' ')
        cleaned.append(re.sub(r"[^a-zA-Z0-9]", ' ', review).strip()) #may need to find a better way to do so

    return cleaned

def extract_text_features(data):
    output = {}
    reviews = clean_reviews(data['reviews'])
    output['text'] = " ".join(reviews)
    output['label'] = 0 if data['Overall Compliance'] == 'Yes' else 1

    return output


def extract_expanded_text_features(data):
    output = {}
    review = data['reviews']
    review = review.replace('\n', ' ')
    output['text'] = re.sub(r"[^a-zA-Z0-9]", ' ', review).strip()
    output['label'] = 0 if data['Overall Compliance'] == 'Yes' else 1

    return output

In [37]:
extract_function = extract_text_features

if EXPANDED:
    extract_function = extract_expanded_text_features

review_dataset = review_dataset.map(extract_function)

Map: 100%|██████████| 1754/1754 [00:00<00:00, 3867.16 examples/s]
Map: 100%|██████████| 195/195 [00:00<00:00, 3836.24 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 3969.22 examples/s]


In [39]:
review_dataset['train'][2]

{'Overall Compliance': 'No',
 'reviews': ["(3.5) ~ good overall food service.\n\nMENU:\n* Plain Pizza = (3.75) Good.  It's similar in style & texture to the original pan pizzas that you'd get at Pizza Hut.\n* French Fries = (3.25) pretty good",
  'Just had pizza from here for 1st time. It was delicious. Light crust lots of cheese. Definitely will reorder.',
  "Review for cheesesteak only. I've been searching the Main Line for good cheesesteaks and this is the place I'd go when in the Devon/Berwyn area (see my other reviews for where'd I'd go in Wayne). This place makes a proper cheesesteak.",
  'One of my favorite spots!! Food is amazing, staff is super friendly and I love choosing this place to go eat.',
  'Great greek-style pizza that has been serving the area for years.  Still family owned and they care about their quality.  I grew up on greek style pizza and I need my fix from time to time.  This is the place I go to for just that.  Beyond their pizza, their food is of great qualit

In [40]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize(examples):
    return tokenizer(examples["text"], max_length=MAX_TOKENS, truncation=True)

In [41]:
token_dataset = review_dataset.map(tokenize, remove_columns=['Overall Compliance', 'reviews'])

Map: 100%|██████████| 1754/1754 [00:04<00:00, 361.23 examples/s]
Map: 100%|██████████| 195/195 [00:00<00:00, 344.23 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 310.10 examples/s]


In [43]:
token_dataset['train'][2]

{'text': '3 5    good overall food service   MENU    Plain Pizza    3 75  Good   It s similar in style   texture to the original pan pizzas that you d get at Pizza Hut    French Fries    3 25  pretty good Just had pizza from here for 1st time  It was delicious  Light crust lots of cheese  Definitely will reorder Review for cheesesteak only  I ve been searching the Main Line for good cheesesteaks and this is the place I d go when in the Devon Berwyn area  see my other reviews for where d I d go in Wayne   This place makes a proper cheesesteak One of my favorite spots   Food is amazing  staff is super friendly and I love choosing this place to go eat Great greek style pizza that has been serving the area for years   Still family owned and they care about their quality   I grew up on greek style pizza and I need my fix from time to time   This is the place I go to for just that   Beyond their pizza  their food is of great quality and really tasty   They always have a smile on their face w

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [30]:
accuracy_metric = evaluate.load("accuracy")
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    recall = recall_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)

    return {'accuracy': accuracy, 'recall': recall, 'f1': f1}

Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 9.50MB/s]
Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 14.1MB/s]


In [32]:
f1_metric.compute(predictions=[0,1], references=[0,1])

{'f1': 1.0}

In [11]:
id2label = {0: "Pass", 1: "Fail"}
label2id = {"Pass": 0, "Fail": 1}

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

training_args = TrainingArguments(
    output_dir="base_bert_model",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=token_dataset["train"],
    eval_dataset=token_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


train_result = trainer.train()

trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Recall,F1
1,No log,0.472015,{'accuracy': 0.8205128205128205},{'recall': 0.0},{'f1': 0.0}
2,No log,0.465191,{'accuracy': 0.8205128205128205},{'recall': 0.0},{'f1': 0.0}
3,No log,0.458886,{'accuracy': 0.8205128205128205},{'recall': 0.0},{'f1': 0.0}
4,No log,0.458501,{'accuracy': 0.8205128205128205},{'recall': 0.0},{'f1': 0.0}


***** train metrics *****
  epoch                    =        4.0
  total_flos               =   865563GF
  train_loss               =     0.5201
  train_runtime            = 0:05:07.54
  train_samples_per_second =     22.813
  train_steps_per_second   =      1.431


In [19]:
import json

metrics = trainer.evaluate(token_dataset['val'])

file_path = "base_bert_model/val_results.json"

with open(file_path, "w") as json_file:
    json.dump(metrics, json_file)

In [None]:
!zip -r base_bert.zip base_bert_model