In [None]:
from dataloaders import BERTReviewData
from transformers import DistilBertTokenizer
from torch.utils.data import DataLoader
from shared_models import TextBERT
import pandas as pd
import torch
from sklearn import metrics

In [None]:
# Run configuration: everything a reader might want to tune lives here.
MAX_TOKENS = 512        # passed as max_tokens to every BERTReviewData dataset
TRAIN_BATCH_SIZE = 16   # batch size of the training DataLoader
VALID_BATCH_SIZE = 8    # batch size shared by the validation and test DataLoaders
EPOCHS = 4              # number of passes over the training loader
LEARNING_RATE = 1e-5    # learning rate given to the Adam optimizer

In [None]:
from torch import cuda

# Use the GPU when PyTorch reports CUDA as available, otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Import review data: one DataFrame per split.
# NOTE(review): all three splits currently read the same CSV, so the validation
# and test frames are identical to the training frame until the paths are updated.
train_data = pd.read_csv('../data/yelp/yelp_verified_slim.csv')
val_data = pd.read_csv('../data/yelp/yelp_verified_slim.csv')  # update these!!
test_data = pd.read_csv('../data/yelp/yelp_verified_slim.csv')

## TODO:
# Rename labels column
# Rename text column

# Maps the integer labels to readable names; handed to every dataset below.
label_dict = {0: "GPT", 1: "Human"}

tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    truncation=True,
    do_lower_case=True,
)

# The three splits share identical dataset settings; only the DataFrame differs.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    max_tokens=MAX_TOKENS,
    expanded=True,
    labels=label_dict,
)
train_dataset = BERTReviewData(train_data, **dataset_kwargs)
test_dataset = BERTReviewData(test_data, **dataset_kwargs)
val_dataset = BERTReviewData(val_data, **dataset_kwargs)

In [None]:
# Build the DataLoaders for the three splits.

# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

# Accuracy does not depend on batch order, so evaluation loaders are not
# shuffled: the preds/labels lists returned for them then come back in the same
# order on every call and can be compared element-wise across epochs.
test_val_params = {'batch_size': VALID_BATCH_SIZE,
                   'shuffle': False,
                   }

train_loader = DataLoader(train_dataset, **train_params)
test_loader = DataLoader(test_dataset, **test_val_params)
val_loader = DataLoader(val_dataset, **test_val_params)

In [None]:
# Loss applied to the model outputs against the float targets in the training loop.
loss_fn = torch.nn.BCELoss()

# Instantiate the classifier and move it to the selected device.
model = TextBERT()
model.to(device)

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
## can also probably get moved to a py file
def validate(data_loader):
    """
    Run the notebook-level ``model`` over ``data_loader`` without gradients.

    The model is switched to eval mode. For each batch, the index of the
    maximum along dim 1 is taken from both the model outputs and the targets.

    Returns a tuple ``(predictions, labels)`` of flat Python lists, in the
    order the loader yielded the samples.
    """
    model.eval()
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in data_loader:
            batch_ids = batch['ids'].to(device, dtype=torch.long)
            batch_mask = batch['mask'].to(device, dtype=torch.long)
            batch_targets = batch['targets'].to(device, dtype=torch.float)
            batch_outputs = model(batch_ids, batch_mask)

            # torch.max along a dim yields (values, indices); only the indices are needed
            pred_idx = torch.max(batch_outputs, 1).indices
            label_idx = torch.max(batch_targets, 1).indices

            expected.extend(label_idx.cpu().numpy().tolist())
            predicted.extend(pred_idx.cpu().detach().numpy().tolist())

    return predicted, expected

In [None]:
## train
# Train the model for EPOCHS passes over train_loader and validate after each one.
# train_results[epoch] stores that epoch's validation 'preds' and 'labels' plus
# the per-batch training 'losses'.
train_results = {}

for epoch in range(EPOCHS):
    results = {}
    losses = []
    model.train()  # validate() switches the model to eval mode, so re-enable training each epoch
    for idx, data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        # Named separately from the validation labels bound to `targets` below.
        batch_targets = data['targets'].to(device, dtype = torch.float)

        # Clear the gradients of the previous step once, before the new backward pass.
        optimizer.zero_grad()

        outputs = model(ids, mask)
        loss = loss_fn(outputs, batch_targets)

        # Read the scalar loss a single time and reuse it for logging.
        batch_loss = loss.item()
        losses.append(batch_loss)

        if idx%100==0:
            print(f'Epoch: {epoch}, Loss:  {batch_loss}')

        loss.backward()
        optimizer.step()

    preds, targets = validate(val_loader)

    results['preds'] = preds
    results['labels'] = targets
    results['losses'] = losses
    train_results[epoch] = results

    # play with a softmax activation function in the classifier
    accuracy = metrics.accuracy_score(targets, preds)
    print(f"Epoch {epoch}: Accuracy Score = {accuracy}")

In [None]:
# Evaluate the trained model on test_loader and keep its predictions and labels.
test_results = {}
preds, targets = validate(test_loader)

accuracy = metrics.accuracy_score(targets, preds)
# This cell scores test_loader, so the metric is reported as test accuracy.
print(f"Test Acc = {accuracy}")

test_results['preds'] = preds
test_results['labels'] = targets