In [23]:
import json

import numpy as np
import pandas as pd

# nltk pos tagging 
import nltk
from nltk.tokenize import word_tokenize

# transformers-based pos tagging
from transformers import AutoModelForTokenClassification, AutoTokenizer
import tensorflow as tf
import torch

#### import data

In [24]:
# read json data (JSON Lines layout: one review object per line)
# NOTE(review): machine-specific absolute path; the notebook will not run on
# another machine until this is pointed at a local copy of the file.
data_path = 'C:\\Users\\tanch\\Documents\\NTU\\NTU Year 3\\Sem 1\\CZ4045 Natural Language Processing\\Assignment 1\\local\\data\\reviewSelected100\\reviewSelected100.json'
reviews = []
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale codec, which can garble non-ASCII review text.
with open(data_path, "r", encoding="utf-8") as f:
    # Stream line by line instead of loading every line into memory first.
    for line in f:
        line = line.strip()
        if line:  # tolerate blank / trailing empty lines
            reviews.append(json.loads(line))

# convert to dataframe
reviews_df = pd.DataFrame(reviews)

## Load Transformers-based POS Tagger

In [25]:
# Name of the pretrained checkpoint handed to both `from_pretrained` calls below,
# so the tokenizer and the token-classification model come from the same source.
model_checkpoint = "vblagoje/bert-english-uncased-finetuned-pos"
# Tokenizer used by bert_generate_tags to turn raw text into input ids.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Token-classification model; bert_generate_tags reads its logits and `config.id2label`.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

In [26]:
def bert_generate_tags(text):
    """Tag ``text`` with the transformers token-classification model.

    The text is tokenized by the module-level ``tokenizer`` and passed through
    the module-level ``model``; the highest-scoring label is taken for every
    token position.

    Args:
        text: The raw string to tag.

    Returns:
        A list of ``(token, label)`` tuples, one per token produced by the
        tokenizer, with the first and last positions (the special tokens the
        tokenizer adds) removed. Tokens are sub-word pieces, so one word may
        span several entries (e.g. ``'chet'``, ``'##tina'``, ``'##d'``).

    Note:
        Input longer than ``model.config.max_position_embeddings`` tokens is
        truncated to that length instead of failing inside the forward pass,
        so very long texts only receive tags for their leading tokens.
    """
    encoded = tokenizer(
        text,
        return_attention_mask=False,
        add_special_tokens=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
        return_tensors='pt',
    )                                                   # wordpiece tokenization
    input_ids = encoded['input_ids']

    # Inference only: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        logits = model(input_ids).logits                # model forward pass

    # Highest-scoring label id for each token of the single sequence in the batch.
    label_ids = logits[0].argmax(dim=-1).tolist()
    labels = [model.config.id2label[i] for i in label_ids]
    tokens = [tokenizer.decode(t) for t in input_ids[0]]

    # Drop the first and last entries, which belong to the added special tokens.
    return list(zip(tokens, labels))[1:-1]

## NLTK POS Tagger

In [27]:
def nltk_generate_tags(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and POS-tag the tokens.

    Args:
        text: The raw string to tag.

    Returns:
        The result of ``nltk.pos_tag`` applied to the tokenized text.
    """
    return nltk.pos_tag(word_tokenize(text))

## Randomly select 5 Reviews

In [28]:
# Fixed seed so the same reviews are drawn on every run.
seed = 85
sample_size = 5
# Draw the random rows, keep only the id and text columns, and renumber from 0
# so the review cells below can address the samples by position.
samples = (
    reviews_df
    .sample(n=sample_size, random_state=seed)
    .loc[:, ["review_id", "text"]]
    .reset_index(drop=True)
)
# Character length of each sampled review.
list(map(len, samples.text))

[129, 146, 227, 2189, 1647]

# Apply POS Tagging

### Review 1

In [29]:
# Select the first sampled review and show its raw text.
i = 0
review = samples.loc[i, "text"]
print(review)

Love this place! Great place to grab a delicious sandwich and soup without spending a fortune!! My favourite is the Farmers Club!


In [30]:
# Run both taggers on the current review for the side-by-side comparison.
bert_tags, nltk_tags = bert_generate_tags(review), nltk_generate_tags(review)

In [31]:
print("BERT POS Tagging:             NLTK POS Tagging:")
# Walk the longer of the two tag lists so neither tagger's trailing tokens are
# dropped (the two tokenizers split text differently, so lengths can differ).
# A dedicated loop name keeps the review index `i` from being overwritten.
for row in range(max(len(bert_tags), len(nltk_tags))):
    bert_cell = str(bert_tags[row]) if row < len(bert_tags) else ""
    nltk_cell = str(nltk_tags[row]) if row < len(nltk_tags) else ""
    # Pad index + BERT tag together to the 30-character width of the header's
    # first column so the NLTK column stays aligned for multi-digit indices.
    left = f"{row} {bert_cell}"
    print(f"{left:30}{nltk_cell}".rstrip())

BERT POS Tagging:             NLTK POS Tagging:
0 ('love', 'VERB')            ('Love', 'VB')
1 ('this', 'DET')             ('this', 'DT')
2 ('place', 'NOUN')           ('place', 'NN')
3 ('!', 'PUNCT')              ('!', '.')
4 ('great', 'ADJ')            ('Great', 'JJ')
5 ('place', 'NOUN')           ('place', 'NN')
6 ('to', 'PART')              ('to', 'TO')
7 ('grab', 'VERB')            ('grab', 'VB')
8 ('a', 'DET')                ('a', 'DT')
9 ('delicious', 'ADJ')        ('delicious', 'JJ')
10 ('sandwich', 'NOUN')        ('sandwich', 'NN')
11 ('and', 'CCONJ')            ('and', 'CC')
12 ('soup', 'NOUN')            ('soup', 'NN')
13 ('without', 'SCONJ')        ('without', 'IN')
14 ('spending', 'VERB')        ('spending', 'VBG')
15 ('a', 'DET')                ('a', 'DT')
16 ('fortune', 'NOUN')         ('fortune', 'NN')
17 ('!', 'PUNCT')              ('!', '.')
18 ('!', 'PUNCT')              ('!', '.')
19 ('my', 'PRON')              ('My', 'PRP$')
20 ('favourite', 'NOUN')       ('favouri

### Review 2

In [32]:
# Select the second sampled review and show its raw text.
i = 1
review = samples.loc[i, "text"]
print(review)

Large space for group events and great service. Food is decent as well - a good go to if you're in the area and just craving drinks and basic food


In [33]:
# Run both taggers on the current review for the side-by-side comparison.
bert_tags, nltk_tags = bert_generate_tags(review), nltk_generate_tags(review)

In [34]:
print("BERT POS Tagging:             NLTK POS Tagging:")
# Walk the longer of the two tag lists so neither tagger's trailing tokens are
# dropped (the two tokenizers split text differently, so lengths can differ).
# A dedicated loop name keeps the review index `i` from being overwritten.
for row in range(max(len(bert_tags), len(nltk_tags))):
    bert_cell = str(bert_tags[row]) if row < len(bert_tags) else ""
    nltk_cell = str(nltk_tags[row]) if row < len(nltk_tags) else ""
    # Pad index + BERT tag together to the 30-character width of the header's
    # first column so the NLTK column stays aligned for multi-digit indices.
    left = f"{row} {bert_cell}"
    print(f"{left:30}{nltk_cell}".rstrip())

BERT POS Tagging:             NLTK POS Tagging:
0 ('large', 'ADJ')            ('Large', 'JJ')
1 ('space', 'NOUN')           ('space', 'NN')
2 ('for', 'ADP')              ('for', 'IN')
3 ('group', 'NOUN')           ('group', 'NN')
4 ('events', 'NOUN')          ('events', 'NNS')
5 ('and', 'CCONJ')            ('and', 'CC')
6 ('great', 'ADJ')            ('great', 'JJ')
7 ('service', 'NOUN')         ('service', 'NN')
8 ('.', 'PUNCT')              ('.', '.')
9 ('food', 'NOUN')            ('Food', 'NNP')
10 ('is', 'AUX')               ('is', 'VBZ')
11 ('decent', 'ADJ')           ('decent', 'JJ')
12 ('as', 'ADV')               ('as', 'RB')
13 ('well', 'ADV')             ('well', 'RB')
14 ('-', 'PUNCT')              ('-', ':')
15 ('a', 'DET')                ('a', 'DT')
16 ('good', 'ADJ')             ('good', 'JJ')
17 ('go', 'NOUN')              ('go', 'NN')
18 ('to', 'ADP')               ('to', 'TO')
19 ('if', 'SCONJ')             ('if', 'IN')
20 ('you', 'PRON')             ('you', 'PRP')
21 ("

### Review 3

In [35]:
# Select the third sampled review and show its raw text.
i = 2
review = samples.loc[i, "text"]
print(review)

Chettinad Chicken masala the sauce is good but chicken little pieces bones they cut the chicken tight into 4 not good for the value $12.99  rate poor
Boneless chicken biriyani was excellent. 

Fish Fry was not fresh very fishy.


In [36]:
# Run both taggers on the current review for the side-by-side comparison.
bert_tags, nltk_tags = bert_generate_tags(review), nltk_generate_tags(review)

In [37]:
print("BERT POS Tagging:             NLTK POS Tagging:")
# Walk the longer of the two tag lists so neither tagger's trailing tokens are
# dropped (the two tokenizers split text differently, so lengths can differ).
# A dedicated loop name keeps the review index `i` from being overwritten.
for row in range(max(len(bert_tags), len(nltk_tags))):
    bert_cell = str(bert_tags[row]) if row < len(bert_tags) else ""
    nltk_cell = str(nltk_tags[row]) if row < len(nltk_tags) else ""
    # Pad index + BERT tag together to the 30-character width of the header's
    # first column so the NLTK column stays aligned for multi-digit indices.
    left = f"{row} {bert_cell}"
    print(f"{left:30}{nltk_cell}".rstrip())

BERT POS Tagging:             NLTK POS Tagging:
0 ('chet', 'NOUN')            ('Chettinad', 'NNP')
1 ('##tina', 'NOUN')          ('Chicken', 'NNP')
2 ('##d', 'PROPN')            ('masala', 'VBD')
3 ('chicken', 'NOUN')         ('the', 'DT')
4 ('mas', 'NOUN')             ('sauce', 'NN')
5 ('##ala', 'NOUN')           ('is', 'VBZ')
6 ('the', 'DET')              ('good', 'JJ')
7 ('sauce', 'NOUN')           ('but', 'CC')
8 ('is', 'AUX')               ('chicken', 'JJ')
9 ('good', 'ADJ')             ('little', 'JJ')
10 ('but', 'CCONJ')            ('pieces', 'NNS')
11 ('chicken', 'NOUN')         ('bones', 'NNS')
12 ('little', 'ADJ')           ('they', 'PRP')
13 ('pieces', 'NOUN')          ('cut', 'VBD')
14 ('bones', 'VERB')           ('the', 'DT')
15 ('they', 'PRON')            ('chicken', 'NN')
16 ('cut', 'VERB')             ('tight', 'VBD')
17 ('the', 'DET')              ('into', 'IN')
18 ('chicken', 'NOUN')         ('4', 'CD')
19 ('tight', 'ADV')            ('not', 'RB')
20 ('into', 'ADP')  

### Review 4

In [38]:
# Select the fourth sampled review and show its raw text.
i = 3
review = samples.loc[i, "text"]
print(review)

For years I have been going to the strip mall aka food court mall near the north west corner of Eglington and Dixie.  I've seen some businesses come and go through that time and because of that you get the new place every once in a while.  The Burger Factory has been here for a while now and although I hadn't been, I knew it was there.  The fact is this area of Dixie road between Eglinton and Matheson has MANY places to eat.  If I were a new business in this area with this much competition I'd probably be like, yikes!  Still it's a busy enough area to warrant the kind of dense concentration and saturation of restaurants I suppose.  

The Burger Factory is part of the north end of the strip mall, opposite the long time mainstays like Bamiyan Kabob and Chang & Huang's which are on the south side.  My family just had an early dinner and Chang's and my brother was telling me that the Burger Factory has funnel cakes and that we should go for it as a group dessert.  I was like OK, sure.  Tha

In [39]:
# Run both taggers on the current review for the side-by-side comparison.
bert_tags, nltk_tags = bert_generate_tags(review), nltk_generate_tags(review)

In [40]:
print("BERT POS Tagging:             NLTK POS Tagging:")
# Walk the longer of the two tag lists so neither tagger's trailing tokens are
# dropped (the two tokenizers split text differently, so lengths can differ).
# A dedicated loop name keeps the review index `i` from being overwritten.
for row in range(max(len(bert_tags), len(nltk_tags))):
    bert_cell = str(bert_tags[row]) if row < len(bert_tags) else ""
    nltk_cell = str(nltk_tags[row]) if row < len(nltk_tags) else ""
    # Pad index + BERT tag together to the 30-character width of the header's
    # first column so the NLTK column stays aligned for multi-digit indices.
    left = f"{row} {bert_cell}"
    print(f"{left:30}{nltk_cell}".rstrip())

BERT POS Tagging:             NLTK POS Tagging:
0 ('for', 'ADP')              ('For', 'IN')
1 ('years', 'NOUN')           ('years', 'NNS')
2 ('i', 'PRON')               ('I', 'PRP')
3 ('have', 'AUX')             ('have', 'VBP')
4 ('been', 'AUX')             ('been', 'VBN')
5 ('going', 'VERB')           ('going', 'VBG')
6 ('to', 'ADP')               ('to', 'TO')
7 ('the', 'DET')              ('the', 'DT')
8 ('strip', 'NOUN')           ('strip', 'NN')
9 ('mall', 'NOUN')            ('mall', 'NN')
10 ('aka', 'X')                ('aka', 'VBD')
11 ('food', 'PROPN')           ('food', 'NN')
12 ('court', 'PROPN')          ('court', 'NN')
13 ('mall', 'PROPN')           ('mall', 'NN')
14 ('near', 'ADP')             ('near', 'IN')
15 ('the', 'DET')              ('the', 'DT')
16 ('north', 'ADJ')            ('north', 'JJ')
17 ('west', 'ADJ')             ('west', 'JJ')
18 ('corner', 'NOUN')          ('corner', 'NN')
19 ('of', 'ADP')               ('of', 'IN')
20 ('e', 'PROPN')              ('Eglingt

### Review 5

In [41]:
# Select the fifth sampled review and show its raw text.
i = 4
review = samples.loc[i, "text"]
print(review)

I had no idea a Waffles INCaffeinated was coming to the Wexford area. I've never been to one before but I have heard of it. I was on the NoWait app the other day going somewhere else for breakfast and Waffles INC popped up (Yes, they are on NoWait!). I didn't go that day but told myself to try it soon. 

I tried it today and I was pleasantly surprised. It's located in the plaza right in front of North Allegheny Senior High School. If you drive too fast, you might miss it. The sign outside is a little small. The environment itself is pretty small too but quaint enough to be a nice-sized diner. It wasn't too busy so we sat down right away. 

They have  a large menu filled with unique breakfast items like chicken and waffles, Eggs Benedict called "Benny" with a waffle substituted for the english muffin and topped with crab, and the "Breakfast Magic," which is what I got. It's a waffle with bacon, cheddar cheese, and green onions cooked into the waffle topped with more bacon and cheese as 

In [42]:
# Run both taggers on the current review for the side-by-side comparison.
bert_tags, nltk_tags = bert_generate_tags(review), nltk_generate_tags(review)

In [43]:
print("BERT POS Tagging:             NLTK POS Tagging:")
# Walk the longer of the two tag lists so neither tagger's trailing tokens are
# dropped (the two tokenizers split text differently, so lengths can differ).
# A dedicated loop name keeps the review index `i` from being overwritten.
for row in range(max(len(bert_tags), len(nltk_tags))):
    bert_cell = str(bert_tags[row]) if row < len(bert_tags) else ""
    nltk_cell = str(nltk_tags[row]) if row < len(nltk_tags) else ""
    # Pad index + BERT tag together to the 30-character width of the header's
    # first column so the NLTK column stays aligned for multi-digit indices.
    left = f"{row} {bert_cell}"
    print(f"{left:30}{nltk_cell}".rstrip())

BERT POS Tagging:             NLTK POS Tagging:
0 ('i', 'PRON')               ('I', 'PRP')
1 ('had', 'VERB')             ('had', 'VBD')
2 ('no', 'DET')               ('no', 'DT')
3 ('idea', 'NOUN')            ('idea', 'NN')
4 ('a', 'DET')                ('a', 'DT')
5 ('wa', 'NOUN')              ('Waffles', 'NNP')
6 ('##ffle', 'NOUN')          ('INCaffeinated', 'NNP')
7 ('##s', 'PROPN')            ('was', 'VBD')
8 ('inca', 'ADJ')             ('coming', 'VBG')
9 ('##ffe', 'ADJ')            ('to', 'TO')
10 ('##inated', 'ADJ')         ('the', 'DT')
11 ('was', 'AUX')              ('Wexford', 'NNP')
12 ('coming', 'VERB')          ('area', 'NN')
13 ('to', 'ADP')               ('.', '.')
14 ('the', 'DET')              ('I', 'PRP')
15 ('wexford', 'PROPN')        ("'ve", 'VBP')
16 ('area', 'NOUN')            ('never', 'RB')
17 ('.', 'PUNCT')              ('been', 'VBN')
18 ('i', 'PRON')               ('to', 'TO')
19 ("'", 'AUX')                ('one', 'CD')
20 ('ve', 'AUX')               ('befor