# POS Tagging

In [60]:
# pandas
import pandas as pd

# random
import random

# NLTK packages
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag as pos_tagger
from nltk import UnigramTagger
from nltk.tag import BigramTagger

# spaCy lib
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
#import the json-file and put it in a dataframe
df = pd.read_json('reviewSelected100.json', lines=True, encoding = "ISO-8859-1")

#dataframe
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11
...,...,...,...,...,...,...,...,...,...
15295,qknwFVEh_0KSuexigYBI_A,YMGmyPOU65SMs4H60ltYiw,shIPnFoXrL3dFo5HLH1_HA,1,2,0,0,This was the worst experience ever. So much so...,2014-07-12 21:58:15
15296,Y-ZRoyAXCukBK1uK1ZcZCA,JLhOWQiWtGbr14K_KmoWxA,zPEYgVqJ2QNKi45FJi2jvg,5,0,0,0,We come here every time we hit Vegas! A giant ...,2018-11-10 21:38:49
15297,A8HdjBfhj3pgQuSbwNtDEw,6CoiKFDFXIACJZvv_I_8mQ,zPEYgVqJ2QNKi45FJi2jvg,1,0,1,0,As locals we used to the this place when it w...,2018-10-13 22:11:22
15298,2n1QdrYBRAAe6GKaxEV0jA,_fH4s3ls08eSl_PfX38KIA,etzDsNjkCyQBoJcU2a3U-g,5,0,0,0,The food was delicious. We were seated in 15 m...,2015-02-15 08:43:46


### Clean text data

In [3]:
#select all the text
text = df['text'].astype('str') 

#join all the text into one string
all_text = ''.join(map(str, text))

In [4]:
#separating the string into sentences
sentences = sent_tokenize(all_text)

#sentences

#### Select 5 random sentences

In [5]:
#select 5 random sentences
random_5_sentences = random.choices(sentences, k=5)

pd.options.display.max_colwidth = 300
pd.DataFrame(random_5_sentences, columns=['sentence'])

Unnamed: 0,sentence
0,"No judgement here, as I like to lick my plates clean too!"
1,It was a blast.
2,The night manager started shouting at me and even told me that my room was not his issue.
3,calls were extra charge.
4,"We dropped in during show time and since the box office was closed, we decided to sneak into the main show."


#### NLTK

In [78]:
nltk_pos_tag_li = []
for i in range(len(random_5_sentences)):
    tagged_sentence = nltk.pos_tag(word_tokenize(random_5_sentences[i]))
    nltk_pos_tag_li.append(tagged_sentence)

#### spaCy

In [74]:
spacy_pos_tag_li = []
for i in range(len(random_5_sentences)):
    sentence_to_tag = nlp(random_5_sentences[i])
    
    sentence_pos_tag_li = []
    for token in sentence_to_tag:
        sentence_pos_tag_li.append(tuple([token.text, token.pos_]))
    
    spacy_pos_tag_li.append(sentence_pos_tag_li)

### Results

In [83]:
from IPython.core import display as ICD

for i in range(len(random_5_sentences)):
    table_nltk_pos_tag = pd.DataFrame(nltk_pos_tag_li[i], columns=['word', 'POS_Tag_NLTK'])
    table_spacy_pos_tag = pd.DataFrame(spacy_pos_tag_li[i], columns=['word', 'POS_Tag_spaCy'])
    print("-------------- Sentence "+ str(i+1) + " --------------")
    combined_table = table_nltk_pos_tag.merge(table_spacy_pos_tag, left_index=True, right_index=True, how='left')
    ICD.display(combined_table.drop(combined_table.columns[2], axis=1))

-------------- Sentence 1 --------------


Unnamed: 0,word_x,POS_Tag_NLTK,POS_Tag_spaCy
0,No,DT,DET
1,judgement,NN,NOUN
2,here,RB,ADV
3,",",",",PUNCT
4,as,IN,ADP
5,I,PRP,PRON
6,like,VBP,VERB
7,to,TO,PART
8,lick,VB,VERB
9,my,PRP$,PRON


-------------- Sentence 2 --------------


Unnamed: 0,word_x,POS_Tag_NLTK,POS_Tag_spaCy
0,It,PRP,PRON
1,was,VBD,AUX
2,a,DT,DET
3,blast,NN,NOUN
4,.,.,PUNCT


-------------- Sentence 3 --------------


Unnamed: 0,word_x,POS_Tag_NLTK,POS_Tag_spaCy
0,The,DT,DET
1,night,NN,NOUN
2,manager,NN,NOUN
3,started,VBD,VERB
4,shouting,VBG,VERB
5,at,IN,ADP
6,me,PRP,PRON
7,and,CC,CCONJ
8,even,RB,ADV
9,told,VBD,VERB


-------------- Sentence 4 --------------


Unnamed: 0,word_x,POS_Tag_NLTK,POS_Tag_spaCy
0,calls,NNS,NOUN
1,were,VBD,AUX
2,extra,JJ,ADJ
3,charge,NN,NOUN
4,.,.,PUNCT


-------------- Sentence 5 --------------


Unnamed: 0,word_x,POS_Tag_NLTK,POS_Tag_spaCy
0,We,PRP,PRON
1,dropped,VBD,VERB
2,in,IN,ADP
3,during,IN,ADP
4,show,NN,NOUN
5,time,NN,NOUN
6,and,CC,CCONJ
7,since,IN,SCONJ
8,the,DT,DET
9,box,NN,PROPN


## Research: Unigram vs Bigram

#### Split the remaining unselected data to be used as training data

In [6]:
#Split the data into train and test
for i in range(len(random_5_sentences)):
    sentences.remove(random_5_sentences[i]) #Remove the 5 random chosen sentences for testing; rest of the sentences will be used for training

### Unigram and Bigram Toolkit

In [7]:
#Tag each sentences and add to new list
def tagged_sentences(sentences):
    list_sentences = []
    for i in range(len(sentences)):
        tagged_sentence_i = nltk.pos_tag(word_tokenize(sentences[i]))
        list_sentences.append(tagged_sentence_i)
    
    return list_sentences

In [8]:
#Training Unigram and storing results
def unigram_result(sentences_train, sentences_test, tagged_sentences_test):
    #Training the NgramTagger
    unigram_tagger = nltk.UnigramTagger(sentences_train)
    
    #Storing results of test
    list_results = []
    for i in range(len(sentences_test)):
        result_sentence_i = unigram_tagger.tag(word_tokenize(sentences_test[i]))
        list_results.append(result_sentence_i)
        
    #evaluate results
    accuracy = unigram_tagger.evaluate(tagged_sentences_test)
    
    return list_results, accuracy

In [10]:
sentences_train = tagged_sentences(sentences)
tagged_sentences_test = tagged_sentences(random_5_sentences)

##### train bigram manually

In [11]:
#Training the BigramTagger
bigram_tagger = nltk.BigramTagger(sentences_train)

#Storing results of test
list_bigram = []
for i in range(len(random_5_sentences)):
    result_sentence_i = bigram_tagger.tag(word_tokenize(random_5_sentences[i]))
    list_bigram.append(result_sentence_i)

#evaluate results
acc_bigram = bigram_tagger.evaluate(tagged_sentences_test)

In [12]:
#Results of unigram
list_unigram, acc_unigram = unigram_result(sentences_train, random_5_sentences, tagged_sentences_test)
#list_bigram, acc_bigram = bigram_result(sentences_train, random_5_sentences, tagged_sentences_test)

## Results

### Accuracy

In [13]:
print ('accuracy of unigram:', acc_unigram)
print ('accuracy of bigram :', acc_bigram)

accuracy of unigram: 0.9696969696969697
accuracy of bigram : 0.5


In [14]:
from IPython.core import display as ICD

for i in range(len(random_5_sentences)):
    table_unigram = pd.DataFrame(list_unigram[i], columns=['word', 'POS_Tag_Unigram'])
    table_bigram = pd.DataFrame(list_bigram[i], columns=['word', 'POS_Tag_Bigram'])
    print("-------------- Sentence "+ str(i+1) + " --------------")
    combined_table = table_unigram.merge(table_bigram, left_index=True, right_index=True, how='left')
    ICD.display(combined_table.drop(combined_table.columns[2], axis=1))

-------------- Sentence 1 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,No,DT,DT
1,judgement,NN,
2,here,RB,
3,",",",",
4,as,IN,
5,I,PRP,
6,like,IN,
7,to,TO,
8,lick,VB,
9,my,PRP$,


-------------- Sentence 2 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,It,PRP,PRP
1,was,VBD,VBD
2,a,DT,DT
3,blast,NN,NN
4,.,.,.


-------------- Sentence 3 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,The,DT,DT
1,night,NN,NN
2,manager,NN,NN
3,started,VBD,VBD
4,shouting,VBG,
5,at,IN,
6,me,PRP,
7,and,CC,
8,even,RB,
9,told,VBD,


-------------- Sentence 4 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,calls,NNS,
1,were,VBD,
2,extra,JJ,
3,charge,NN,
4,.,.,


-------------- Sentence 5 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,We,PRP,PRP
1,dropped,VBD,VBD
2,in,IN,IN
3,during,IN,IN
4,show,NN,NN
5,time,NN,NN
6,and,CC,CC
7,since,IN,IN
8,the,DT,DT
9,box,NN,NN


### Combining Taggers

#### Find the most commonly used tagger

In [40]:
tags = []
for li in range(len(sentences_train)):
    for li_item in range(len(sentences_train[li])):
        tags.append(sentences_train[li][li_item][1])

In [41]:
nltk.FreqDist(tags).max()

'NN'

In [42]:
default_tagger = nltk.DefaultTagger('NN')

#### Adding Backoff tagger

In [43]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(sentences_train, backoff=t0)
t2 = nltk.BigramTagger(sentences_train, backoff=t1)

In [44]:
#Result of combining taggers
list_combine = []
for i in range(len(random_5_sentences)):
    result_sentence_i = t2.tag(word_tokenize(random_5_sentences[i]))
    list_combine.append(result_sentence_i)

#evaluate results
acc_combine = t2.evaluate(tagged_sentences_test)

### Accuracy

In [45]:
print ('accuracy of unigram  :', acc_unigram)
print ('accuracy of bigram   :', acc_bigram)
print ('accuracy of combining:', acc_combine)

accuracy of unigram  : 0.9696969696969697
accuracy of bigram   : 0.5
accuracy of combining: 1.0


In [46]:
from IPython.core import display as ICD

for i in range(len(random_5_sentences)):
    table_unigram = pd.DataFrame(list_unigram[i], columns=['word', 'POS_Tag_Unigram'])
    table_combine = pd.DataFrame(list_combine[i], columns=['word', 'POS_Tag_Combine'])
    print("-------------- Sentence "+ str(i+1) + " --------------")
    merged_table = table_unigram.merge(table_combine, left_index=True, right_index=True, how='left')
    ICD.display(merged_table.drop(merged_table.columns[2], axis=1))

-------------- Sentence 1 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Combine
0,No,DT,DT
1,judgement,NN,NN
2,here,RB,RB
3,",",",",","
4,as,IN,IN
5,I,PRP,PRP
6,like,IN,VBP
7,to,TO,TO
8,lick,VB,VB
9,my,PRP$,PRP$


-------------- Sentence 2 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Combine
0,It,PRP,PRP
1,was,VBD,VBD
2,a,DT,DT
3,blast,NN,NN
4,.,.,.


-------------- Sentence 3 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Combine
0,The,DT,DT
1,night,NN,NN
2,manager,NN,NN
3,started,VBD,VBD
4,shouting,VBG,VBG
5,at,IN,IN
6,me,PRP,PRP
7,and,CC,CC
8,even,RB,RB
9,told,VBD,VBD


-------------- Sentence 4 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Combine
0,calls,NNS,NNS
1,were,VBD,VBD
2,extra,JJ,JJ
3,charge,NN,NN
4,.,.,.


-------------- Sentence 5 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Combine
0,We,PRP,PRP
1,dropped,VBD,VBD
2,in,IN,IN
3,during,IN,IN
4,show,NN,NN
5,time,NN,NN
6,and,CC,CC
7,since,IN,IN
8,the,DT,DT
9,box,NN,NN
