# POS Tagging

In [1]:
# pandas
import pandas as pd

# random
import random

# NLTK packages
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag as pos_tagger
from nltk import UnigramTagger
from nltk.tag import BigramTagger

In [2]:
#import the json-file and put it in a dataframe
df = pd.read_json('reviewSelected100.json', lines=True, encoding = "ISO-8859-1")

#dataframe
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11
...,...,...,...,...,...,...,...,...,...
15295,qknwFVEh_0KSuexigYBI_A,YMGmyPOU65SMs4H60ltYiw,shIPnFoXrL3dFo5HLH1_HA,1,2,0,0,This was the worst experience ever. So much so...,2014-07-12 21:58:15
15296,Y-ZRoyAXCukBK1uK1ZcZCA,JLhOWQiWtGbr14K_KmoWxA,zPEYgVqJ2QNKi45FJi2jvg,5,0,0,0,We come here every time we hit Vegas! A giant ...,2018-11-10 21:38:49
15297,A8HdjBfhj3pgQuSbwNtDEw,6CoiKFDFXIACJZvv_I_8mQ,zPEYgVqJ2QNKi45FJi2jvg,1,0,1,0,As locals we used to the this place when it w...,2018-10-13 22:11:22
15298,2n1QdrYBRAAe6GKaxEV0jA,_fH4s3ls08eSl_PfX38KIA,etzDsNjkCyQBoJcU2a3U-g,5,0,0,0,The food was delicious. We were seated in 15 m...,2015-02-15 08:43:46


### Clean text data

In [3]:
#select all the text
text = df['text'].astype('str') 

#join all the text into one string
all_text = ''.join(map(str, text))

In [4]:
#separating the string into sentences
sentences = sent_tokenize(all_text)

#sentences

#### Select 5 random sentences

In [5]:
#select 5 random sentences
random_5_sentences = random.choices(sentences, k=5)

pd.options.display.max_colwidth = 300
pd.DataFrame(random_5_sentences, columns=['sentence'])

Unnamed: 0,sentence
0,Bartenders here are great.
1,THe pets even sit next to you at the bar.
2,"Peanuts and bread are served before every meal and yes, you can throw the spent peanut shells on the floor."
3,!This is by far my favorite new restaurant in Madison.
4,We both ordered the sirloin meal with Cesar salad and steak fries.


#### Split the remaining unselected data to be used as training data

In [6]:
#Split the data into train and test
for i in range(len(random_5_sentences)):
    sentences.remove(random_5_sentences[i]) #Remove the 5 random chosen sentences for testing; rest of the sentences will be used for training

### Unigram and Bigram Toolkit

In [7]:
#Tag each sentences and add to new list
def tagged_sentences(sentences):
    list_sentences = []
    for i in range(len(sentences)):
        tagged_sentence_i = nltk.pos_tag(word_tokenize(sentences[i]))
        list_sentences.append(tagged_sentence_i)
    
    return list_sentences

In [8]:
#Training Unigram and storing results
def unigram_result(sentences_train, sentences_test, tagged_sentences_test):
    #Training the NgramTagger
    unigram_tagger = nltk.UnigramTagger(sentences_train)
    
    #Storing results of test
    list_results = []
    for i in range(len(sentences_test)):
        result_sentence_i = unigram_tagger.tag(word_tokenize(sentences_test[i]))
        list_results.append(result_sentence_i)
        
    #evaluate results
    accuracy = unigram_tagger.evaluate(tagged_sentences_test)
    
    return list_results, accuracy

In [9]:
#Training Bigram and storing results
def bigram_result(sentences_train, sentences_test, tagged_sentences_test):
    #Training the NgramTagger
    bigram_tagger = nltk.BigramTagger(sentences_train)
    
    #Storing results of test
    list_results = []
    for i in range(len(sentences_test)):
        result_sentence_i = bigram_tagger.tag(word_tokenize(sentences_test[i]))
        list_results.append(result_sentence_i)
    
    #evaluate results
    accuracy = bigram_tagger.evaluate(tagged_sentences_test)
    
    return list_results

###### ^ above code results in ValueError: too many values to unpack (expected 2)

In [12]:
sentences_train = tagged_sentences(sentences)
tagged_sentences_test = tagged_sentences(random_5_sentences)

##### train bigram manually

In [13]:
#Training the BigramTagger
bigram_tagger = nltk.BigramTagger(sentences_train)

#Storing results of test
list_bigram = []
for i in range(len(random_5_sentences)):
    result_sentence_i = bigram_tagger.tag(word_tokenize(random_5_sentences[i]))
    list_bigram.append(result_sentence_i)

#evaluate results
acc_bigram = bigram_tagger.evaluate(tagged_sentences_test)

In [14]:
#Results of unigram
list_unigram, acc_unigram = unigram_result(sentences_train, random_5_sentences, tagged_sentences_test)
#list_bigram, acc_bigram = bigram_result(sentences_train, random_5_sentences, tagged_sentences_test)

## Results

### Accuracy

In [15]:
print ('accuracy of unigram:', acc_unigram)
print ('accuracy of bigram :', acc_bigram)

accuracy of unigram: 0.9365079365079365
accuracy of bigram : 0.5396825396825397


In [16]:
from IPython.core import display as ICD

for i in range(len(random_5_sentences)):
    table_unigram = pd.DataFrame(list_unigram[i], columns=['word', 'POS_Tag_Unigram'])
    table_bigram = pd.DataFrame(list_bigram[i], columns=['word', 'POS_Tag_Bigram'])
    print("-------------- Sentence "+ str(i+1) + " --------------")
    combined_table = table_unigram.merge(table_bigram, left_index=True, right_index=True, how='left')
    ICD.display(combined_table.drop(combined_table.columns[2], axis=1))

-------------- Sentence 1 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,Bartenders,NNS,NNS
1,here,RB,RB
2,are,VBP,VBP
3,great,JJ,JJ
4,.,.,.


-------------- Sentence 2 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,THe,DT,DT
1,pets,NNS,NNS
2,even,RB,RB
3,sit,VB,VB
4,next,JJ,JJ
5,to,TO,TO
6,you,PRP,PRP
7,at,IN,IN
8,the,DT,DT
9,bar,NN,NN


-------------- Sentence 3 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,Peanuts,,
1,and,CC,
2,bread,NN,
3,are,VBP,
4,served,VBN,
5,before,IN,
6,every,DT,
7,meal,NN,
8,and,CC,
9,yes,UH,


-------------- Sentence 4 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,!,.,.
1,This,DT,DT
2,is,VBZ,VBZ
3,by,IN,IN
4,far,RB,RB
5,my,PRP$,PRP$
6,favorite,JJ,JJ
7,new,JJ,JJ
8,restaurant,NN,NN
9,in,IN,IN


-------------- Sentence 5 --------------


Unnamed: 0,word_x,POS_Tag_Unigram,POS_Tag_Bigram
0,We,PRP,PRP
1,both,DT,DT
2,ordered,VBD,VBD
3,the,DT,DT
4,sirloin,NN,NN
5,meal,NN,NN
6,with,IN,IN
7,Cesar,NNP,
8,salad,NN,
9,and,CC,
