In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import log_softmax
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
from ast import literal_eval
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [28]:
review_df = pd.read_csv('../data/split/train.csv', usecols=['Overall Compliance', 'reviews'])
test_df = pd.read_csv('../data/split/test.csv', usecols=['Overall Compliance', 'reviews'])
review_df['reviews'] = review_df['reviews'].apply(literal_eval)
test_df['reviews'] = test_df['reviews'].apply(literal_eval)

# test classifying at reivew level then resturant level
review_df = review_df.explode('reviews')
review_df = review_df.reset_index().drop(columns=['index'])

test_df = test_df.explode('reviews')
test_df = test_df.reset_index().drop(columns=['index'])
review_df

Unnamed: 0,Overall Compliance,reviews
0,Yes,water ice and pretzels were a staple evening t...
1,Yes,Love this water ice/ ice cream shop. Used to g...
2,Yes,"""I'm bored. It's a gorgeous night. You want to..."
3,Yes,We came here as I have heard that the food is ...
4,Yes,Wow! Great venue with excellent food. I went t...
...,...,...
13587,Yes,"Very good food, terrible service. Waited for ..."
13588,Yes,"We went here on a Tuesday night around 7pm, no..."
13589,Yes,Wholesale Granite did an amazing job for my of...
13590,Yes,From start to finish Wholesale Granite provide...


In [68]:
review_df[review_df['Overall Compliance'] == 'No']

Unnamed: 0,Overall Compliance,reviews
51,No,3 5 good overall food service MENU Pla...
52,No,Just had pizza from here for 1st time It was ...
53,No,Review for cheesesteak only I ve been searchi...
54,No,One of my favorite spots Food is amazing st...
55,No,Great greek style pizza that has been serving ...
...,...,...
13575,No,What a gem what a find during my week of work...
13576,No,I was pleasantly surprised to find this place ...
13578,No,Horrible customer Service Arrogant self agg...
13579,No,Finally hit my limit with Arrow Customer serv...


In [4]:
#mode info
model_checkpoint = "distilbert/distilbert-base-uncased"

In [5]:
#training hyperparameters
MAX_TOKENS = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05

In [69]:
class ReviewData(Dataset):

    def __init__(self, df: pd.DataFrame, tokenizer : DistilBertTokenizer, max_tokens: int, expanded: bool = False):
        self.tokenizer = tokenizer
        self.df = df
        self.max_tokens = max_tokens
        self.expanded = expanded
        self.review_text = self.clean_text(self.df)
        self.target_cat = self.df['Overall Compliance']
     
    
    def clean_text(self, df: pd.DataFrame) -> pd.Series:

        def clean_reviews(reviews):
            cleaned = []
            for review in reviews:
                review = review.replace('\n', ' ')
                cleaned.append(re.sub(r"[^a-zA-Z0-9]", ' ', review).strip()) #may need to find a better way to do so

            return cleaned

        if self.expanded:
            df['reviews'] = df['reviews'].str.strip()
            df['reviews'] = df['reviews'].str.replace('\n', ' ')
            df['reviews'] = df['reviews'].str.replace(r"[^a-zA-Z0-9]", ' ', regex=True)

            return df['reviews']

        return df['reviews'].apply(clean_reviews)


    def __len__(self):
        return len(self.review_text)

    def __getitem__(self, index):
        review_text = str(self.review_text[index])
        target_cat = self.target_cat[index]
        
        if not self.expanded: 
            # combine all reviews into one string
            review_text = " ".join(self.review_text[index])

        inputs = self.tokenizer.encode_plus(
            review_text,
            None,
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding='max_length',
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # [0, 1] = pass, [1, 0] = fail
        target = []
        if target_cat == 'No':
            target = [1, 0]
        else:
            target = [0, 1]
     

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.float)
        }

In [70]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
train_data = ReviewData(review_df, tokenizer, max_tokens=MAX_TOKENS, expanded=True)
test_data = ReviewData(test_df, tokenizer, max_tokens=MAX_TOKENS, expanded=True)

In [71]:
#load dataloaders 
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                }

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)

In [72]:
# BERT-based model (only text)

class BERTAndErnie(torch.nn.Module):
    def __init__(self):
        super(BERTAndErnie, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")

        ## TODO: add other linear classifer layer
        self.l2 = torch.nn.Linear(768, 2)

    ## TODO: args for cat features 
    def forward(self, ids, mask):
        out1 = self.l1(ids, attention_mask=mask, return_dict=False)
        # pull out tensor and reshape
        hidden_layer = out1[0]
        bert_out = hidden_layer[:, 0]
        output = self.l2(bert_out)
   
        return output

In [73]:
# check with Claire to standardize 
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [74]:
# initialize model and optimizing function
model = BERTAndErnie()
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [75]:
def validate():
    """
    Evaluate model during trainging. 
    """
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    
    with torch.no_grad():

        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask)
            
            #compute argmax 
            _, preds = torch.max(outputs, 1)
            _, labels = torch.max(targets, 1)

            fin_targets.extend(labels.cpu().numpy().tolist())
            fin_outputs.extend(preds.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [76]:
# train the model
saved_outputs = []
saved_accuracy = []

for epoch in range(EPOCHS):

    model.train()
    for idx, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        if idx%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    preds, targets = validate()

    # play with a softmax activation function in the classifier
    accuracy = metrics.accuracy_score(targets, preds)
    saved_accuracy.append(accuracy)
    print(f"Accuracy Score = {accuracy}")

Epoch: 0, Loss:  0.7016710042953491
Accuracy Score = 0.8329113924050633
