In [None]:
import os
import re
from ast import literal_eval

import pandas as pd
import torch
import torch.optim as optim
import torchtext.functional as tf
from sklearn import metrics
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from torchtext.vocab import build_vocab_from_iterator

from dataloaders import FakeReviewData
from shared_models import RNNLM

In [None]:
# Training hyperparameters and paths; everything a reader might tune lives here.
MAX_TOKENS = 150        # handed to FakeReviewData (max_tokens) and RNNLM (max_token)
EMBEDDING_SIZE = 300    # handed to RNNLM as embedding_dim
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32
EPOCHS = 10
GRAD_CLIP = 1.          # max gradient norm given to clip_grad_norm_
LEARNING_RATE = 1e-05
# Directory where the GloVe vectors are cached. The default is the original
# author's local path; set the GLOVE_CACHE_DIR environment variable to run the
# notebook on another machine without editing this cell.
VECTOR_CACHE_DIR = os.environ.get(
    'GLOVE_CACHE_DIR',
    '/Users/jackgibson/Documents/advanced_ml/how_the_bear_got_a_C/glove/',
)

In [None]:
from torch import cuda

# Use the GPU when CUDA reports it is available; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def _load_split(csv_path, explode_reviews):
    """Read one split CSV and normalise it to `text` / `label` columns.

    Args:
        csv_path: Path of the CSV file to read.
        explode_reviews: When True, expand the parsed `reviews` column so that
            each element gets its own row (review-level rows) and renumber the
            index from 0.

    Returns:
        A DataFrame in which `reviews` has been renamed to `text` and
        `Overall Compliance` to `label`.
    """
    frame = pd.read_csv(csv_path)
    # `reviews` is parsed with literal_eval; presumably each cell holds the
    # string form of a Python list of review texts.
    frame['reviews'] = frame['reviews'].apply(literal_eval)
    if explode_reviews:
        frame = frame.explode('reviews').reset_index(drop=True)
    return frame.rename(columns={'reviews': 'text', 'Overall Compliance': 'label'})


# Test classifying at review level, then at restaurant level:
# train and validation are expanded to one row per review.
review_df = _load_split('../data/split/train.csv', explode_reviews=True)
val_df = _load_split('../data/split/val.csv', explode_reviews=True)

# The test frame is left un-exploded, exactly as before.
# Fixed: this used to re-read val.csv, so the "test" data was the validation
# file. NOTE(review): assumes ../data/split/test.csv exists next to the others.
test_df = _load_split('../data/split/test.csv', explode_reviews=False)

In [None]:
# Mapping handed to FakeReviewData through its `labels` argument.
label_dict = {0: "Yes", 1 : "No"}

# Pre-trained GloVe vectors; the dimension is tied to EMBEDDING_SIZE so the
# embeddings and the model's embedding_dim cannot drift apart.
glove = GloVe('6B', dim=EMBEDDING_SIZE, cache=VECTOR_CACHE_DIR)

train_dataset = FakeReviewData(review_df, embedding=glove, max_tokens=MAX_TOKENS, labels=label_dict)
# Fixed: the validation dataset was previously built from test_df, which made
# it a duplicate of the test dataset and left val_df unused.
val_dataset = FakeReviewData(val_df, embedding=glove, max_tokens=MAX_TOKENS, labels=label_dict)
test_dataset = FakeReviewData(test_df, embedding=glove, max_tokens=MAX_TOKENS, labels=label_dict)

In [None]:
# Build the data loaders for the three splits.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                }

# drop_last=True discards a trailing partial batch. The training loop and
# validate() feed the hidden state of one batch into the next, so this
# presumably exists to keep the batch dimension constant.
# Fixed: the training loader used to wrap test_dataset, so the model was
# trained on the evaluation data and train_dataset was never used.
training_loader = DataLoader(train_dataset, **train_params, drop_last=True)
test_loader = DataLoader(test_dataset, **test_params, drop_last=True)
val_loader = DataLoader(val_dataset, **test_params, drop_last=True)

In [None]:
def validate(model, data_loader):
    """
    Evaluate the model during training.

    Switches `model` to eval mode (it is not switched back here) and runs it
    over every batch of `data_loader` with gradients disabled. The hidden
    state starts as None and the value returned for one batch is passed in
    with the next. Model outputs and labels are both reduced to the index of
    their largest entry along dimension 1.

    Returns a pair of flat Python lists: (predicted indices, target indices).
    """
    model.eval()
    predicted, expected = [], []
    carried = None

    with torch.no_grad():
        for batch in data_loader:
            text = batch['text'].to(device, dtype=torch.float)
            labels = batch['label'].to(device, dtype=torch.float)

            outputs, carried = model(text, carried)

            # Index of the maximum along dim 1 for both predictions and labels.
            pred_idx = torch.max(outputs, 1)[1]
            target_idx = torch.max(labels, 1)[1]

            predicted.extend(pred_idx.cpu().tolist())
            expected.extend(target_idx.cpu().tolist())

    return predicted, expected

In [None]:
def repackage_hidden(h):
    """Detach hidden state from the graph that produced it.

    A tensor is returned detached, None is passed through unchanged, and any
    other value is treated as an iterable whose items are processed
    recursively and collected into a tuple.
    """
    if isinstance(h, torch.Tensor):
        return h.detach()
    if h is None:
        return None
    return tuple(map(repackage_hidden, h))

In [None]:
model = RNNLM(max_token=MAX_TOKENS, embedding_dim=EMBEDDING_SIZE, hidden_dim=256, num_layers=2, num_labels=2, dropout=0.5)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = torch.nn.BCELoss()
losses = []  # loss of every training batch, across all epochs

for epoch in range(EPOCHS):
    # validate() leaves the model in eval mode, so switch back every epoch.
    model.train()
    hidden = None
    for idx, data in enumerate(training_loader):
        labels = data['label'].to(device, dtype = torch.float)
        text = data['text'].to(device, dtype = torch.float)

        # Detach the hidden state carried over from the previous batch so that
        # backward() does not reach back into earlier batches.
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        output, hidden = model(text, hidden)

        loss = loss_fn(output, labels)
        losses.append(loss.item())
        loss.backward()

        # Clip the gradient norm to GRAD_CLIP before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if idx % 1000 == 0:
            print("epoch", epoch, "iter", idx, "loss", loss.item())

    # Evaluate after every epoch.
    # Fixed: this used test_loader, which spends the test data on monitoring
    # training and left val_loader unused; the validation loader is used here.
    preds, labels = validate(model, val_loader)
    print(f'epoch {epoch}: accuracy {metrics.accuracy_score(labels, preds)}')

In [None]:
import matplotlib.pyplot as plt

def plot_metrics(metrics):
    """Plot f1, accuracy and recall histories and print their last values.

    Args:
        metrics: A 3-item sequence `(f1s, accuracies, recalls)`; each item is
            a sequence of numbers (presumably one value per epoch). Note that
            inside this function the name shadows any module called `metrics`.

    Each series is drawn against 1..len(series) on the current pyplot figure.
    If any series is empty there is no "final" value, so a short notice is
    printed instead of the summary line (previously this raised IndexError).
    """
    f1s, accuracies, recalls = metrics
    series = (('f1', f1s), ('accuracy', accuracies), ('recall', recalls))

    for label, values in series:
        # x positions are derived from each series' own length so that a
        # shorter or longer series cannot cause a shape mismatch.
        plt.plot(range(1, len(values) + 1), values, label=label)
    plt.legend()

    if min(len(values) for _, values in series) == 0:
        print("No metrics recorded yet; nothing to summarise.")
        return

    print(f"Final values: f1 {f1s[-1]:.3f}, accuracy {accuracies[-1]:.3f}, recall {recalls[-1]:.3f}")