In [54]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import log_softmax
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from ast import literal_eval
import re

In [55]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [56]:
#training hyperparameters
MAX_TOKENS = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05

# change to true to run per review
EXPANDED = True

In [57]:
review_df = pd.read_csv('../data/split/train.csv')
test_df = pd.read_csv('../data/split/test.csv')
review_df['reviews'] = review_df['reviews'].apply(literal_eval)
test_df['reviews'] = test_df['reviews'].apply(literal_eval)

# test classifying at reivew level then resturant level
if EXPANDED:
    review_df = review_df.explode('reviews')
    review_df = review_df.reset_index().drop(columns=['index'])

    test_df = test_df.explode('reviews')
    test_df = test_df.reset_index().drop(columns=['index'])
    
review_df

Unnamed: 0,Overall Compliance,name,stars,review_count,is_open,reviews,ratings,n_reviews,avg_rating,IR_regular,...,Cheesesteaks,Middle Eastern,Wineries,Indian,Halal,Vegan,Vegetarian,Beer Bar,Soup,Sushi Bars
0,Yes,Pop's Homemade Water Ice - Havertown,4.0,16.0,1.0,water ice and pretzels were a staple evening t...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
1,Yes,Pop's Homemade Water Ice - Havertown,4.0,16.0,1.0,Love this water ice/ ice cream shop. Used to g...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
2,Yes,Pop's Homemade Water Ice - Havertown,4.0,16.0,1.0,"""I'm bored. It's a gorgeous night. You want to...","[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
3,Yes,South,4.0,501.0,1.0,We came here as I have heard that the food is ...,"[4.0, 5.0, 5.0, 4.0, 5.0, 1.0, 3.0, 5.0, 4.0, ...",48,4.229167,1,...,0,0,0,0,0,0,0,0,0,0
4,Yes,South,4.0,501.0,1.0,Wow! Great venue with excellent food. I went t...,"[4.0, 5.0, 5.0, 4.0, 5.0, 1.0, 3.0, 5.0, 4.0, ...",48,4.229167,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13587,Yes,Core De Roma,4.0,96.0,1.0,"Very good food, terrible service. Waited for ...","[1.0, 3.0]",2,2.000000,1,...,0,0,0,0,0,0,0,0,0,0
13588,Yes,Core De Roma,4.0,96.0,1.0,"We went here on a Tuesday night around 7pm, no...","[1.0, 3.0]",2,2.000000,1,...,0,0,0,0,0,0,0,0,0,0
13589,Yes,Wholesale Granite Marble & Tile,5.0,27.0,1.0,Wholesale Granite did an amazing job for my of...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0
13590,Yes,Wholesale Granite Marble & Tile,5.0,27.0,1.0,From start to finish Wholesale Granite provide...,"[5.0, 5.0, 5.0]",3,5.000000,1,...,0,0,0,0,0,0,0,0,0,0


In [58]:
#mode info
model_checkpoint = "distilbert/distilbert-base-uncased"

In [59]:
class ReviewData(Dataset):

    def __init__(self, df: pd.DataFrame, tokenizer : DistilBertTokenizer, max_tokens: int, expanded: bool = False):
        self.tokenizer = tokenizer
        self.df = df
        self.max_tokens = max_tokens
        self.expanded = expanded
        self.review_text = self.clean_text(self.df)
        self.target_cat = self.df['Overall Compliance']


    def clean_text(self, df: pd.DataFrame) -> pd.Series:

        def clean_reviews(reviews):
            cleaned = []
            for review in reviews:
                review = review.replace('\n', ' ')
                cleaned.append(re.sub(r"[^a-zA-Z0-9]", ' ', review).strip()) #may need to find a better way to do so

            return cleaned

        if self.expanded:
            df['reviews'] = df['reviews'].str.strip()
            df['reviews'] = df['reviews'].str.replace('\n', ' ')
            df['reviews'] = df['reviews'].str.replace(r"[^a-zA-Z0-9]", ' ', regex=True)

            return df['reviews']

        return df['reviews'].apply(clean_reviews)


    def __len__(self):
        return len(self.review_text)

    def __getitem__(self, index):
        review_text = str(self.review_text[index])
        target_cat = self.target_cat[index]

        if not self.expanded:
            # combine all reviews into one string
            review_text = " ".join(self.review_text[index])

        inputs = self.tokenizer.encode_plus(
            review_text,
            None,
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding='max_length',
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # [0, 1] = pass, [1, 0] = fail
        target = []
        if target_cat == 'No':
            target = [1, 0]
        else:
            target = [0, 1]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.float)
        }

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
train_data = ReviewData(review_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED)
test_data = ReviewData(test_df, tokenizer, max_tokens=MAX_TOKENS, expanded=EXPANDED)

In [9]:
#load dataloaders
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                }

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)

In [68]:
# BERT-based model (only text)

class BERTAndErnie(torch.nn.Module):
    def __init__(self):
        super(BERTAndErnie, self).__init__()
        self.l1 = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
        # could also use SoftMax?
        self.l2 = torch.nn.Sigmoid()
       

    ## TODO: args for cat features
    def forward(self, ids, mask):
        out = self.l1(ids, attention_mask=mask)
        output = self.l2(out.logits)

        return output

In [69]:
# check with Claire to standardize
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [73]:
test = next(iter(training_loader))

In [78]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

outputs = model(test['ids'], test['mask'])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
def validate():
    """
    Evaluate model during trainging.
    """
    model.eval()
    fin_targets=[]
    fin_outputs=[]

    with torch.no_grad():

        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask)

            #compute argmax
            _, preds = torch.max(outputs, 1)
            _, labels = torch.max(targets, 1)

            fin_targets.extend(labels.cpu().numpy().tolist())
            fin_outputs.extend(preds.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [72]:
# train the model
losses = []
saved_preds = []
saved_labels = []

for epoch in range(EPOCHS):

    model.train()
    for idx, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        

        if idx%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    preds, targets = validate()

    # play with a softmax activation function in the classifier
    accuracy = metrics.accuracy_score(targets, preds)
    print(f"Accuracy Score = {accuracy}")

torch.Size([16, 2])
Epoch: 0, Loss:  0.7223564982414246
torch.Size([16, 2])
torch.Size([16, 2])


KeyboardInterrupt: 

In [26]:
labels = torch.tensor([1])
labels

tensor([1])

In [19]:
from collections import defaultdict

epoch_results = defaultdict(list)

epoch_results['labels'].extend([1,2,3])

In [20]:
epoch_results

defaultdict(list, {'labels': [1, 2, 3]})