In [None]:
import json
import os

import pandas as pd
import torch
from sklearn import metrics
from torch.utils.data import DataLoader
from torchtext.vocab import GloVe

from dataloaders import FakeReviewData
from shared_models import RNNLM

In [None]:
# Training hyperparameters -- everything a reader might want to tune lives here.
MAX_TOKENS = 150         # handed to FakeReviewData as max_tokens
EMBEDDING_SIZE = 300     # handed to RNNLM as embedding_dim
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32    # shared by the validation and the test loader
EPOCHS = 10
GRAD_CLIP = 1.0          # max gradient norm passed to clip_grad_norm_
LEARNING_RATE = 1e-5     # Adam learning rate

# Placeholder: replace with the directory used to cache the GloVe vectors.
VECTOR_CACHE_DIR = 'ADD FILE PATH HERE'

# Directory the result JSON files are written to.
save_dir = 'fake_reviews_rnn'

In [None]:
from torch import cuda

# Run on the GPU when CUDA reports one as available, otherwise fall back to CPU.
# `device` stays a plain string; it is what the `.to(device)` calls receive.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the train / validation / test splits of the fake-review data.
# NOTE: the paths are relative to the notebook's working directory; adjust
# them if the data lives elsewhere.
train_data = pd.read_csv('../data/fake_reviews/train.csv')
val_data = pd.read_csv('../data/fake_reviews/val.csv')
test_data = pd.read_csv('../data/fake_reviews/test.csv')

# Class index -> human-readable label name, handed to every dataset below.
label_dict = {0: "FAKE", 1: "HUMAN"}

# Request the vector width explicitly instead of relying on GloVe's default,
# so the embeddings always match the embedding_dim the model is built with
# (both are driven by EMBEDDING_SIZE).
glove = GloVe('6B', dim=EMBEDDING_SIZE, cache=VECTOR_CACHE_DIR)

# One dataset per split, all sharing the same embedding, token cap and label map.
train_dataset = FakeReviewData(train_data, embedding=glove, max_tokens=MAX_TOKENS, labels=label_dict)
test_dataset = FakeReviewData(test_data, embedding=glove, max_tokens=MAX_TOKENS, labels=label_dict)
val_dataset = FakeReviewData(val_data, embedding=glove, max_tokens=MAX_TOKENS, labels=label_dict)

In [None]:
# Build the data loaders.
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

# Evaluation batches are NOT shuffled: combined with drop_last=True, shuffling
# would drop a different random set of samples on every pass, making the
# reported accuracy vary between runs and the saved preds/labels impossible to
# map back to rows of the source data.
test_val_params = {'batch_size': VALID_BATCH_SIZE,
                   'shuffle': False,
                   }

# drop_last=True is kept on every loader: the training loop and validate() pass
# the hidden state from one batch into the next, which presumably requires a
# constant batch size -- TODO confirm against RNNLM. Note that it means the
# final incomplete batch of each split is never trained on / evaluated.
train_loader = DataLoader(train_dataset, drop_last=True, **train_params)
test_loader = DataLoader(test_dataset, drop_last=True, **test_val_params)
val_loader = DataLoader(val_dataset, drop_last=True, **test_val_params)

In [None]:
def validate(model, data_loader):
    """
    Evaluate the model on a data loader and collect class predictions.

    The model is switched to eval mode (and left in it) and every batch is run
    under ``torch.no_grad()``. The hidden state returned for one batch is fed
    into the next batch; it starts as ``None``.

    Args:
        model: object with an ``eval()`` method that is callable as
            ``model(text, hidden)`` and returns ``(outputs, hidden)``.
        data_loader: iterable of batches, each a mapping with ``'text'`` and
            ``'label'`` tensors.

    Returns:
        ``(predictions, targets)``: two flat lists of class indices, obtained
        by taking the argmax along dim 1 of the model outputs and of the
        label tensors respectively.
    """
    model.eval()
    predictions, targets = [], []
    state = None

    with torch.no_grad():
        for batch in data_loader:
            features = batch['text'].to(device, dtype=torch.float)
            label_vecs = batch['label'].to(device, dtype=torch.float)

            # NOTE(review): the state from the previous batch is reused here;
            # confirm this carry-over is intended for independent reviews.
            scores, state = model(features, state)

            # Reduce scores and label vectors to class indices.
            predictions += scores.max(dim=1).indices.tolist()
            targets += label_vecs.max(dim=1).indices.tolist()

    return predictions, targets

In [None]:
def repackage_hidden(h):
    """Return a copy of the hidden state that is cut off from autograd history.

    ``None`` is passed through unchanged, a tensor is detached, and any other
    value is treated as an iterable whose elements are processed recursively
    and returned as a tuple.
    """
    if h is None:
        return None
    if isinstance(h, torch.Tensor):
        return h.detach()
    # Nested container of states: rebuild it as a tuple, element by element.
    return tuple(map(repackage_hidden, h))

In [None]:
# Build the model and move it to the selected device.
model = RNNLM(embedding_dim=EMBEDDING_SIZE, hidden_dim=256, num_layers=2, num_labels=2, dropout=0.5)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# NOTE(review): BCELoss expects inputs in [0, 1] -- confirm RNNLM's output is
# already a probability.
loss_fn = torch.nn.BCELoss()

# epoch index -> {'preds', 'labels', 'losses'} collected during training.
train_results = {}

for epoch in range(EPOCHS):
    losses = []

    model.train()
    hidden = None  # the recurrent state starts fresh at the top of every epoch
    for step, batch in enumerate(train_loader):
        targets = batch['label'].to(device, dtype=torch.float)
        features = batch['text'].to(device, dtype=torch.float)

        # Detach the carried-over state so backprop stops at the batch boundary.
        # NOTE(review): the state is carried between shuffled batches; confirm
        # that this is intended.
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        scores, hidden = model(features, hidden)

        loss = loss_fn(scores, targets)
        batch_loss = loss.item()
        losses.append(batch_loss)
        loss.backward()

        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if step % 1000 == 0:
            print("epoch", epoch, "iter", step, "loss", batch_loss)

    # Evaluate on the validation set once per epoch and record the outcome
    # together with this epoch's per-batch training losses.
    preds, labels = validate(model, val_loader)
    results = {'preds': preds, 'labels': labels, 'losses': losses}
    train_results[epoch] = results

    print(f'epoch {epoch}: accuracy {metrics.accuracy_score(labels, preds)}')

In [None]:
# Save the per-epoch training record (validation preds/labels and batch losses).
# open() does not create missing parent directories, so make sure the output
# directory exists first -- otherwise the results of the whole training run
# would be lost to a FileNotFoundError.
os.makedirs(save_dir, exist_ok=True)
with open(f"{save_dir}/train_data.json", "w") as out:
    # The integer epoch keys are written as strings in the JSON output.
    json.dump(train_results, out)

In [None]:
# Evaluate the trained model on the held-out test set.
test_results = {}
# validate() takes the model as its first argument; calling it with only the
# loader raises a TypeError.
preds, targets = validate(model, test_loader)

accuracy = metrics.accuracy_score(targets, preds)
# This is the test split, so report it as test accuracy.
print(f"Test Acc = {accuracy}")

test_results['preds'] = preds
test_results['labels'] = targets

In [None]:
# Save the test-set predictions and labels.
# open() does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs(save_dir, exist_ok=True)
with open(f"{save_dir}/evaluate_data.json", "w") as out:
    json.dump(test_results, out)