In [None]:
import json
import os
from ast import literal_eval

import pandas as pd
import torch
from sklearn import metrics
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer

from dataloaders import BERTReviewData
from shared_models import TextBERT

In [2]:
from torch import cuda

# Run on the GPU when torch reports one is available; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# training hyperparameters
MAX_TOKENS = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 1e-05

# Seed torch's global RNG so that DataLoader shuffling and the initialisation
# of any newly created model weights are repeatable across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# change to True to run per review
EXPANDED = False

# Directory that receives the JSON result files written later in the notebook.
save_dir = 'bert_compact_text_nosig'
os.makedirs(save_dir, exist_ok=True)

In [None]:
def _load_split(csv_path, expanded=False):
    """
    Read one split CSV and shape it for BERTReviewData.

    The 'reviews' column is parsed from its string form with literal_eval,
    then 'reviews' and 'Overall Compliance' are renamed to 'text' and 'label'.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    expanded : bool
        When true, explode the 'text' column into one row per list element
        and renumber the index from 0.

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_csv(csv_path)
    frame['reviews'] = frame['reviews'].apply(literal_eval)
    frame = frame.rename(columns={'reviews': 'text', 'Overall Compliance': 'label'})

    if expanded:
        # The column has already been renamed, so it must be exploded as
        # 'text'; exploding 'reviews' here raises KeyError.
        frame = frame.explode('text').reset_index(drop=True)

    return frame


# test classifying at review level, then restaurant level (toggle EXPANDED)
review_df = _load_split('../data/split/train.csv', expanded=EXPANDED)
test_df = _load_split('../data/split/test.csv', expanded=EXPANDED)
val_df = _load_split('../data/split/val.csv', expanded=EXPANDED)

label_dict = {0 : 'Yes', 1 : 'No'}

# NOTE(review): 'distilbert-base-uncased' is an uncased checkpoint, yet
# do_lower_case=False is passed here — confirm this is intentional.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=False)

# All three datasets are built with the same keyword arguments (the validation
# set previously passed `label=` instead of `labels=`).
train_data = BERTReviewData(review_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED, labels=label_dict)
test_data = BERTReviewData(test_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED, labels=label_dict)
val_data = BERTReviewData(val_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED, labels=label_dict)

In [7]:
# load dataloaders
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

# Evaluation batches keep dataset order, so the predictions and labels
# collected from these loaders line up with the dataset's own ordering.
# Shuffling does not change accuracy, only the order of the collected lists.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                }

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)
val_loader = DataLoader(val_data, **test_params)

In [10]:
# Set up the three training components: loss, model, optimizer.

# Binary cross-entropy computed directly on the model's raw outputs.
loss_fn = torch.nn.BCEWithLogitsLoss()

# Instantiate the classifier and move it to the selected device.
model = TextBERT()
model.to(device)

# Adam over every model parameter at the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def validate(data_loader):
    """
    Evaluate the model during training.

    Puts the global `model` into eval mode and, with gradient tracking
    disabled, runs it over every batch produced by `data_loader`. Each batch
    is read through its 'ids', 'mask' and 'targets' entries.

    Returns a pair of lists `(predictions, labels)`: the index of the largest
    value along dim 1 of the model outputs and of the batch targets.
    """
    model.eval()
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in data_loader:
            id_batch = batch['ids'].to(device, dtype=torch.long)
            mask_batch = batch['mask'].to(device, dtype=torch.long)
            target_batch = batch['targets'].to(device, dtype=torch.float)

            scores = model(id_batch, mask_batch)

            # torch.max over dim 1 yields (values, indices); keep the indices
            pred_idx = torch.max(scores, 1)[1]
            label_idx = torch.max(target_batch, 1)[1]

            expected.extend(label_idx.cpu().tolist())
            predicted.extend(pred_idx.cpu().tolist())

    return predicted, expected

In [12]:
# train the model
# train_results maps epoch index -> {'preds', 'labels', 'losses'} for that epoch.
train_results = {}

for epoch in range(EPOCHS):
    results = {}
    losses = []
    # validate() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    for idx, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        batch_targets = data['targets'].to(device, dtype = torch.float)

        # Clear gradients once per step, before the new backward pass.
        optimizer.zero_grad()

        outputs = model(ids, mask)
        loss = loss_fn(outputs, batch_targets)

        # Read the scalar loss once and reuse it for both the record and the log.
        batch_loss = loss.item()
        losses.append(batch_loss)

        if idx % 100 == 0:
            print(f'Epoch: {epoch}, Loss:  {batch_loss}')

        loss.backward()
        optimizer.step()

    # NOTE(review): per-epoch monitoring runs on testing_loader, while the
    # final evaluation cell uses val_loader — confirm the splits are not swapped.
    preds, targets = validate(testing_loader)

    results['preds'] = preds
    results['labels'] = targets
    results['losses'] = losses
    train_results[epoch] = results

    # TODO: play with a softmax activation function in the classifier
    accuracy = metrics.accuracy_score(targets, preds)
    print(f"Epoch {epoch}: Accuracy Score = {accuracy}")

Epoch: 0, Loss:  0.6662896275520325
Epoch: 0, Loss:  0.4127616882324219
Epoch 0: Accuracy Score = 0.8205128205128205
Epoch: 1, Loss:  0.561906635761261
Epoch: 1, Loss:  0.481205552816391
Epoch 1: Accuracy Score = 0.8205128205128205
Epoch: 2, Loss:  0.5565972328186035
Epoch: 2, Loss:  0.40803948044776917
Epoch 2: Accuracy Score = 0.8205128205128205
Epoch: 3, Loss:  0.8382182121276855
Epoch: 3, Loss:  0.261642187833786
Epoch 3: Accuracy Score = 0.8205128205128205


In [14]:
# Write the per-epoch training record (train_results) to disk as JSON.
train_json_path = f"{save_dir}/train_data.json"
with open(train_json_path, "w") as handle:
    json.dump(train_results, handle)

In [13]:
# Final evaluation: run the trained model over val_loader and report accuracy.
preds, targets = validate(val_loader)
accuracy = metrics.accuracy_score(targets, preds)
print(f"Validation Acc = {accuracy}")

# Keep predictions and labels together so they can be written out afterwards.
test_results = {'preds': preds, 'labels': targets}

Validation Acc = 0.8055555555555556


In [15]:
# Write the final evaluation predictions and labels (test_results) to disk as JSON.
eval_json_path = f"{save_dir}/evaluate_data.json"
with open(eval_json_path, "w") as handle:
    json.dump(test_results, handle)