In [1]:
import torch
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.functional as tf
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import pandas as pd
from itertools import combinations
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import GloVe
import re
from ast import literal_eval
from sklearn import metrics

In [3]:
#training hyperparameters
MAX_TOKENS = 150
EMBEDDING_SIZE = 300
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32
EPOCHS = 10
GRAD_CLIP = 1.
LEARNING_RATE = 1e-05
VECTOR_CACHE_DIR = '/Users/jackgibson/Documents/advanced_ml/how_the_bear_got_a_C/glove/'

In [4]:
# constants
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
review_df = pd.read_csv('../data/split/train.csv')
test_df = pd.read_csv('../data/split/test.csv')
review_df['reviews'] = review_df['reviews'].apply(literal_eval)
test_df['reviews'] = test_df['reviews'].apply(literal_eval)

# test classifying at reivew level then resturant level
review_df = review_df.explode('reviews')
review_df = review_df.reset_index().drop(columns=['index'])

test_df = test_df.explode('reviews')
test_df = test_df.reset_index().drop(columns=['index'])
    
review_df = review_df.rename(columns={'reviews' : 'text'})
test_df = test_df.rename(columns={'reviews' : 'text'})

In [176]:
# It is best to save GloVe data in a cache to reuse across projects.
VECTOR_CACHE_DIR = '//Users/jackgibson/Documents/advanced_ml/how_the_bear_got_a_C'

glove = GloVe(name='6B', cache=VECTOR_CACHE_DIR)

In [None]:
sample_text = 'The dog is black and likes to chase tennis balls.'
tokenize = get_tokenizer('basic_english')
test = glove.get_vecs_by_tokens(tokenize(sample_text)).size()
tf.to_tensor(test, padding_value=30)

In [177]:
class FakeReviewData(Dataset):

    def __init__(self, df: pd.DataFrame, max_tokens: int):
        self.embedding = glove = GloVe(name='6B', cache=VECTOR_CACHE_DIR, dim=EMBEDDING_SIZE)
        self.tokenizer = get_tokenizer('basic_english')
        self.df = df
        self.max_tokens = max_tokens
        self.review_text = self.clean_text(self.df)
        self.target_cat = self.df['Overall Compliance']


    
    def clean_text(self, df: pd.DataFrame) -> pd.Series:

        df['text'] = df['text'].str.strip()
        df['text'] = df['text'].str.replace('\n', ' ')
        df['text'] = df['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
        
        return df['text']

    def __len__(self):
        return len(self.review_text)

    def __getitem__(self, index):
        review_text = str(self.review_text[index])
        tokens = self.tokenizer(review_text)[:self.max_tokens]
        target_cat = self.target_cat[index]
        inputs = self.embedding.get_vecs_by_tokens(tokens)
    
        # [0, 1] = real, [1, 0] = gpt
        target = []
        if target_cat == 'Yes':
            target = [1, 0]
        else:
            target = [0, 1]
        
        padding = torch.nn.ZeroPad2d((0, 0, 0, self.max_tokens-len(inputs)))

        return {
            'text': padding(inputs),
            'label': torch.tensor(target, dtype=torch.float),
        }

In [185]:
train_dataset = FakeReviewData(df=review_df, max_tokens=MAX_TOKENS)
test_dataset = FakeReviewData(df=test_df, max_tokens=MAX_TOKENS)

TypeError: FakeReviewData.__init__() got an unexpected keyword argument 'drop_last'

In [186]:
#load dataloaders
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                }

training_loader = DataLoader(test_dataset, **train_params, drop_last=True)
test_loader = DataLoader(test_dataset, **test_params, drop_last=True)

In [187]:
## RNN-LSTM

class RNNLM(torch.nn.Module):
    """ Container module with an embedding module, an LSTM module,
        and a final linear layer to map the LSTM output to the
        vocabulary.
    """

    def __init__(self, max_token, embedding_dim, hidden_dim, num_layers, num_labels,
                 dropout=0.5):
        super(RNNLM, self).__init__()
 
        self.dropout = torch.nn.Dropout(dropout)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.llayer = torch.nn.Linear(hidden_dim, num_labels)
        self.activation = torch.nn.Sigmoid()
      


    def forward(self, input, hidden0):
        '''
        Run forward propagation for a given minibatch of inputs using
        hidden0 as the initial hidden state"
        '''
        embeds = self.dropout(input)
        lstm_out, hiddenn = self.lstm(embeds, hidden0)
        l_out = self.dropout(lstm_out)
        output = self.llayer(l_out[:,-1,:])

        return self.activation(output), hiddenn

In [188]:
def validate(model, data_loader):
    """
    Evaluate model during trainging.
    """
    model.eval()
    fin_targets=[]
    fin_outputs=[]

    with torch.no_grad():

        hidden = None
        for _, data in enumerate(data_loader, 0):
            text = data['text'].to(device, dtype = torch.float)
            labels = data['label'].to(device, dtype = torch.float)
        

            outputs, hidden = model(text, hidden)

            #compute argmax
            _, preds = torch.max(outputs, 1)
            _, labels = torch.max(labels, 1)

            fin_targets.extend(labels.cpu().numpy().tolist())
            fin_outputs.extend(preds.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [189]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if h is None:
        return None
    elif isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [190]:
model = RNNLM(max_token=MAX_TOKENS, embedding_dim=EMBEDDING_SIZE, hidden_dim=256, num_layers=2, num_labels=2, dropout=0.5)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = torch.nn.BCELoss()
losses = []

for epoch in range(EPOCHS):
    model.train()
    hidden = None
    for idx, data in enumerate(training_loader):
        labels = data['label'].to(device, dtype = torch.float)
        text = data['text'].to(device, dtype = torch.float)

        hidden = repackage_hidden(hidden)
        model.zero_grad()

        output, hidden = model(text, hidden)
        
        loss = loss_fn(output, labels)
        losses.append(loss.item())
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if idx % 1000 == 0:
            print("epoch", epoch, "iter", idx, "loss", loss.item())
    
    # evaluate at every step
    preds, labels = validate(model, test_loader)

epoch 0 iter 0 loss 0.7112951278686523


In [191]:
accuracy = metrics.accuracy_score(targets, preds)

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
