# Testing Finetuned Bert on Yelp Data

In [1]:
from transformers import AutoModelForSequenceClassification

path = './finetuned_bert'
model = AutoModelForSequenceClassification.from_pretrained(path, local_files_only=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [3]:
raw_inputs = [
    "I am pretty indifferent about the movie. I didn't hate it and I didn't love it.",
    "Joe's Pizza is the best!!! You have to check it out if you're in town.",
    "This supermarket is the absolute WORST. They didn't have any oreos!"
]
inputs = tokenizer(raw_inputs, padding=True,
                   truncation=True, return_tensors="pt")


In [4]:
model(**inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.8890,  3.0487,  1.8772, -1.4140, -2.4633],
        [-2.9113, -2.4303, -0.6465,  2.5758,  3.4966],
        [ 4.6360,  0.5701, -1.4783, -1.8714, -0.9393]],
       grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [5]:
import pandas as pd

path = '../data/yelp_data/yelp_academic_dataset_review_50k.json'
df = pd.read_json(path, lines=True)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,oUVfM9ua2UtJ68sHKgCvNA,-YzMXeOVQfWAVXNAtMSbyw,dnQMntrmickWGYLB30KBEQ,4,0,0,0,Coffee is VERY good. My breakfast was a welcom...,2014-07-16 13:01:33
1,E7QcmW1jmB6T3HkSMdLGDA,jLNR8Tsvi47ENvoNfVYKiQ,4GGhj7Z99E5IYWdEqOsLUQ,5,0,0,0,"I've been coming to this place for 18 years, a...",2019-04-16 20:17:17
2,GgGLzyl408biArY9oLGbRQ,392lRckiPvP-xTZ10E5RPw,c3QxX3toWdqJnKQmmIliRQ,2,0,0,0,This place is a bit overrated. It is very tren...,2021-05-02 23:53:15
3,B-EtTJZH45iCGWDNU36-1Q,OIa6ptM1qUts5arovQUAFQ,-QI8Qi8XWH3D8y8ethnajA,2,6,1,2,"This is an older airport, and it reminded me a...",2018-04-13 15:51:03
4,RJb-x897_abr1CZDYiB1Xw,fwOETgbWmBAhdO9058e4Zg,C5ZOzlslhMxRJDjBDV3KoQ,5,0,0,0,Awesome. One of my favorites. They have less ...,2016-03-27 18:29:16


In [6]:
import random

subset = random.sample(range(0, 50_000), 500)

In [7]:
X = df['text'].loc[subset]
X

1956     Such good food!!!! Love this place for breakfa...
37823    Sunday. Gravy. Little Nonna's makes this amazi...
40292    Thai Star sushi is a great addition to Madeira...
20525    I had the most positively horrible experience ...
27741    Simple breakfast with a few unique offerings. ...
                               ...                        
16233    For all of those people that reviewed this pla...
6625     We love Buffalo Wild Wings and were so excited...
42732    We were absolutely disappointed.  It was our f...
749      I'm in pretty good shape, and I consider mysel...
14829    A very unfortunate experience.  We were delaye...
Name: text, Length: 500, dtype: object

In [8]:
y = df['stars'].loc[subset]
y

1956     2
37823    5
40292    4
20525    1
27741    3
        ..
16233    5
6625     1
42732    1
749      5
14829    1
Name: stars, Length: 500, dtype: int64

In [9]:
import numpy as np

def predict(model, reviews):
    predictions = []
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    for review in reviews.tolist():
        input_tokenized = tokenizer(review, padding=True,
                       truncation=True, return_tensors="pt")
    
        probabilities = model(**input_tokenized)
        max_review = -1
        max_review_id = -1
        for i, s in enumerate(probabilities[0].tolist()[0]):
            if s > max_review:
                max_review = s
                max_review_id = i
        predictions.append(max_review_id+1)
        
    return pd.Series(predictions)

In [10]:
preds = predict(model, reviews=X)

In [11]:
def score(true, pred):
    if len(true) != len(pred):
        raise Error('True and prediction lists must be equal length.')
    right = 0

    for t, p in zip(true, pred):
        if t == p:
            right += 1
    
    return right/len(true)

In [12]:
score(y, preds)

0.644