Load all libraries

In [317]:
import pandas as pd
import itertools, nltk, string 
import requests, re
from nltk import Tree
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
import os
import spacy
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import matplotlib as plt
from sklearn.metrics import average_precision_score
from collections import OrderedDict
import numpy as np

wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")


Read Dataset

In [298]:
res_single_df = pd.read_csv('../restaurant-single-categories.csv')
res_multiple_df = pd.read_csv('../restaurant-multiple-categories.csv')

lap_single_df = pd.read_csv('../laptop-single-categories.csv')
lap_multiple_df = pd.read_csv('../laptop-multiple-categories.csv')

Dataset Exploration

In [299]:
res_single_df.head()

Unnamed: 0,reviewID,sentenceID,review,category,polarity
0,1004293,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,negative
1,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,negative
2,1004293,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,negative
3,1004293,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,negative
4,1004293,1004293:4,"After all that, they complained to me about th...",SERVICE#GENERAL,negative


restaurant categories

In [300]:
res_single_df.category.value_counts()

FOOD#QUALITY                383
RESTAURANT#GENERAL          245
SERVICE#GENERAL             183
AMBIENCE#GENERAL            111
FOOD#STYLE_OPTIONS           51
RESTAURANT#MISCELLANEOUS     50
RESTAURANT#PRICES            24
DRINKS#QUALITY               21
FOOD#PRICES                  17
DRINKS#STYLE_OPTIONS         16
LOCATION#GENERAL             16
DRINKS#PRICES                 3
Name: category, dtype: int64

load lexicon

In [301]:
positive_lexicon = []
negative_lexicon = []

def read_lexicon():
    global positive_lexicon;
    global negative_lexicon;
    
    with open(os.path.join(os.path.abspath('../opinion-lexicon-English/') , 'positive-words.txt'), 'r') as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
         
        positive_lexicon = file.readlines()
    
    with open(os.path.join(os.path.abspath('../opinion-lexicon-English/') , 'negative-words.txt'), 'r', encoding = "ISO-8859-1") as file:
        line = file.readline();
        while ";" in line:
            line = file.readline();
        
        negative_lexicon = file.readlines()
        
    positive_lexicon = list(map(lambda word: word.rstrip("\n\r"), positive_lexicon))
    negative_lexicon = list(map(lambda word: word.rstrip("\n\r"), negative_lexicon))

read_lexicon()

Preprocessing Function

In [363]:
linking_verbs_be = [
    'be',
    'is',
    'are',
    'am',
    'was',
    'were',
    'can be',
    'could be',
    'will be',
    'would be',
    'shall be',
    'should be',
    'may be',
    'might be',
    'must be',
    'has been',
    'have been',
    'had been'
];

linking_verbs_v = [
    'feel',
    'look',
    'smell',
    'sound',
    'taste',
    'act',
    'appear',
    'become',
    'get',
    'grow',
    'prove',
    'remain', 
    'seem',
    'stay',
    'turn'
];

def check_is_noun(pos):
    return re.match('NN.*', pos)

def check_is_verb(pos):
    return re.match('VB.*', pos)

def check_is_adjective(pos):
    return re.match('JJ.*', pos)

def check_is_adverb(pos):
    return re.match('RB.*', pos)

def lemmatize(word, pos):
    tag = wn.NOUN
    if(check_is_noun(pos)):
        tag = wn.NOUN
    elif(check_is_verb(pos)):
        tag = wn.VERB
    elif(check_is_adjective(pos)):
        tag = wn.ADJ
    elif(check_is_adverb(pos)):
        tag = wn.ADV
            
    lemma = wordnet_lemmatizer.lemmatize(word, tag)
    return lemma

def preprocessing(sentence):
    #res = re.sub(' +', ' ', re.sub(r'[^\w\s]','',sentence.replace("'m", "am").replace("n't", "not").replace("'s", ''))).lower()
    res = sentence.replace("'m", "am").replace("'t", " not").replace("'s", '').replace("'ve", " have").lower()
    res = re.sub(r'\b\d+\b', 'NUM', res)
    res = re.sub(r'(.)\1{3,}', '', res)
    return res

def preprocessing_with_lemma(sentence):
    res = re.sub(r'[^\w\s]','', sentence.replace("'m", "am").replace("n't", "not").replace("'ve", " have").replace("'s", '')).lower()
    res = re.sub(r'(.)\1{3,}', '', res)
   
    #checking for parallel clauses
    #splitted = res.split(', and but')
    res = re.sub(r'\b\d+\b', 'NUM', res)
    tagged_words = pos_tag(res)
    lemma = []
    for word, pos in tagged_words:
        lemma.append(lemmatize(word, pos))
        
    return " ".join(lemma)
    
def pos_tag(sentence):
    url = "http://localhost:9000"
    request_params = {"annotators": "pos"}
    r = requests.post(url, data=sentence, params=request_params, timeout=120)
    try:
        results = r.json()['sentences'][0]['tokens']
        res = []
        for pos in results:
            res.append((pos['word'], pos['pos']))
        return res
    except Exception as e:
        print(e)
        return []

def get_tregex(text, tregex):
    url = "http://localhost:9000/tregex"
    request_params = {"pattern": tregex}
    r = requests.post(url, data=text, params=request_params, timeout=120)
    try:
        return r.json()['sentences'][0]
    except:
        return []

def sentence_from_tree(s):
    pattern = r'(?<= )[a-zA-Z].*?(?=\))'
    replaced = s.replace('\r\n', '')
    res = ' '.join(re.findall(pattern, replaced))
    return res
        
def sentence_type(clauses):
    IC = 0
    DC = 0
    for clause in clauses:
        if(clause[1] == 'IC'):
            IC += 1
        elif(clause[1] == 'DC'):
            DC += 1

    if IC == 1 and DC == 0:
        return 'simple_sentence'
    elif IC >= 2 and DC == 0:
        return 'compound_sentence'
    elif IC ==1 and DC >= 1:
        return 'complex_sentence'
    elif IC > 1 and DC >= 1:
        return 'compound_complex_sentence'
    else:
        return 'phrase'
    

phrase extraction

In [364]:
def get_phrases(sentence):
    np_tree = get_tregex(sentence, 'NP < NN | < NNS')
    np_temp = []
    
    if np_tree:
        for x in range(0, len(np_tree)):
            phrase = " ".join(Tree.fromstring(np_tree[str(x)]['match']).leaves())
            if not any(phrase in s for s in np_temp):
                np_temp.append(phrase)
     
    advp_tree = get_tregex(sentence, 'ADVP')
    advp_temp = []
    
    if advp_tree:
         for x in range(0, len(advp_tree)):
            phrase = " ".join(Tree.fromstring(advp_tree[str(x)]['match']).leaves())
            if not any(phrase in s for s in advp_temp):
                advp_temp.append(phrase)
     
        
    adjp_tree = get_tregex(sentence, 'ADJP')
    adjp_temp = []
    
    if adjp_tree:
        for x in range(0, len(adjp_tree)):
            phrase = " ".join(Tree.fromstring(adjp_tree[str(x)]['match']).leaves())
            if not any(phrase in s for s in adjp_temp):
                adjp_temp.append(phrase)
    
    pp_tree = get_tregex(sentence, 'PP')
    pp_temp = []
    
    if pp_tree:
        for x in range(0, len(pp_tree)):
            phrase = " ".join(Tree.fromstring(pp_tree[str(x)]['match']).leaves())
            if not any(phrase in s for s in pp_temp):
                pp_temp.append(phrase)
    
    sent_tagged = pos_tag(sentence)
    chunking = []
    
    finish = False
    index = 0
    concat_word = ''
    concat_pos = ''
    while not finish:
        word_tagged = sent_tagged[index]
        concat_word = (concat_word + ' ' + word_tagged[0]).strip()
        concat_pos = (concat_pos + ' ' + word_tagged[1]).strip()
        
        if not check_is_verb(word_tagged[1]) and not word_tagged[1] == 'MD':
            if concat_word in np_temp:
                chunking.append((concat_word.strip(), 'NP', concat_pos))
                concat_word = ''
                concat_pos = ''
           
            if concat_word in pp_temp:
                chunking.append((concat_word.strip(), 'PP', concat_pos))
                concat_word = ''
                concat_pos = ''
                
            if concat_word in adjp_temp:
                chunking.append((concat_word.strip(), 'ADJP', concat_pos))
                concat_word = ''
                concat_pos = ''
            
            if concat_word in advp_temp:
                chunking.append((concat_word.strip(), 'ADVP', concat_pos))
                concat_word = ''
                concat_pos = ''
                
            if word_tagged[1] == 'PRP' or word_tagged[1] == 'FW' and len(concat_word.split()) == 1:
                chunking.append((concat_word.strip(), word_tagged[1], concat_pos))
                concat_word = ''
                concat_pos = ''
                
        else:
            if len(concat_word.split()) == 1:
                next_word = sent_tagged[index + 1] if index + 1 < len(sent_tagged) else ('.', 'END')
                next_next_word = sent_tagged[index + 2] if index + 2 < len(sent_tagged) else ('.', 'END')
                if (check_is_verb(next_word[1]) and check_is_verb(next_next_word[1])) or (check_is_verb(next_next_word[1]) and next_word[1] == 'TO'):
                    chunking.append( (concat_word + ' ' + next_word[0] + ' ' + next_next_word[0], 'VP', 
                                     concat_pos + " " + next_word[1] + ' ' + next_next_word[1]) )
                    concat_word = ''
                    concat_pos = ''
                    index += 2
                elif check_is_verb(next_word[1]) or next_word[1] == 'RP':
                    chunking.append( (concat_word + ' ' + next_word[0], 'VP', concat_pos + ' ' + next_word[1]))
                    concat_word = ''
                    concat_pos = ''
                    index += 1
                else:
                    chunking.append((concat_word.strip(), 'VP', concat_pos))
                    concat_word = ''
                    concat_pos = ''
        
        index += 1
       
        if index >= len(sent_tagged):
            if concat_word != '' and concat_word != '.':
                fail_words = concat_word.split()
                index = index - len(fail_words)
                index = index + 1 if sent_tagged[index][0].strip() != fail_words[0].strip() else index
                chunking.append((sent_tagged[index][0], sent_tagged[index][1], sent_tagged[index][1]))
                concat_word = ''
                concat_pos = ''
                index += 1
            
            if index >= len(sent_tagged):
                finish = True 

    return chunking

get clause function

In [365]:
def get_clauses(sentence):
    temp = []
    clauses = []
    
    res_all_clauses = get_tregex(sentence, 'S < (NP $ VP)') 
    res_sbar_clause = get_tregex(sentence, 'SBAR < S')
    #filter clauses with dependency clauses
    for x in range(0, len(res_all_clauses)):
        s = sentence_from_tree(res_all_clauses[str(x)]['match'])
        ic = True    
        for y in range(0, len(res_sbar_clause)):
            sbar = sentence_from_tree(res_sbar_clause[str(y)]['match'])            
            if sbar in s and sbar != s:
                s = s.replace(sbar, '')
                if(len(res_sbar_clause) == 1 and sbar != ''):
                    temp.append([sbar.strip(), 'DC'])
            elif s in sbar and sbar != '':
                ic = False
                temp.append( [sbar.strip(), 'DC'])
        if ic:
            temp.append( [s.strip(), 'IC'] )

    #overwrite sentence that already exist in list
    len_clause = len(temp)
    for x in range(0, len_clause):
        for y in range(x + 1, len_clause):
            temp[x][0] = temp[x][0].replace(temp[y][0], '').strip()
        
        temp[x][0] = re.sub(r"  ", " ", temp[x][0])
        if(temp[x][0] != ''):
            clauses.append( tuple(temp[x]) )
    #sorted by index sentence
    
    if(len(clauses) == 0):
        clauses.append((sentence, 'Phrase'))
    
    return sorted(clauses, key=lambda clause: 999 if sentence.find(clause[0]) == -1 else sentence.find(clause[0]))

aspect extraction function

In [366]:
 def aspect_extraction(r):
    clauses = get_clauses(r)
    stype = sentence_type(clauses)
    candidate_aspect_per_sentence = []
    candidate_opinion_per_sentence = []
    for c,t in clauses:
        candidate_aspect_per_clause = []
        candidate_opinion_per_clause = []
            
        if stype == 'phrase':
            c_tagged = pos_tag(c)
            
            #Find all noun and append to aspect term
            #for opinion find adjective, verb, and adverb and append to opinion term
            for word, pos in c_tagged:
                if check_is_noun(pos) and word != 'NUM':
                    candidate_aspect_per_clause.append(lemmatize(word, pos))
                elif (check_is_verb(pos) or check_is_adverb(pos) or check_is_adjective(pos)) and (word in positive_lexicon or word in negative_lexicon):
                    candidate_opinion_per_clause.append(lemmatize(word, pos))
        else:
            phrases = get_phrases(c)
                
            is_finish = False;
            index_word = 0
            while not is_finish:
                phrase = phrases[index_word]
                next_phrase = phrases[index_word + 1] if (index_word + 1) != len(phrases) else ('.', 'END', '.')
                #checking verb
                   
                if phrase[1] == 'VP':
                    #aspect always in independet clause:
                    if t == 'IC':
                        if phrase[0] in linking_verbs_be or (phrase[0] in linking_verbs_v and next_phrase[1] != 'NP'):
                            #linking verb condition
                            #find aspect in subject
                            for i in range(0, index_word):
                                p = phrases[i]
                                if p[1] == 'NP':
                                    pp_words = p[0].split()
                                    pp_pos = p[2].split()
                                    
                                    for i, x_p in enumerate(pp_words):
                                        if x_p not in stopWords and check_is_noun(pp_pos[i]) and x_p != 'NUM':
                                            candidate_aspect_per_clause.append(lemmatize(x_p, pp_pos[i]))
                        else:
                            #action verb
                            #checking verb is opinion or not
                            if phrase[0] not in stopWords and (phrase[0] in positive_lexicon or phrase[0] in negative_lexicon):
                                candidate_opinion_per_clause.append(lemmatize(phrase[0], 'VB'))

                            #find aspect in object
                            for i in range(index_word+1, len(phrases)):
                                p = phrases[i]
                                if p[1] == 'NP':
                                    pp_pos = p[2].split()
                                    pp_words = p[0].split()
                                   
                                    for i, x_p in enumerate(pp_words):
                                        if x_p not in stopWords and check_is_noun(pp_pos[i]) and x_p != 'NUM':
                                             candidate_aspect_per_clause.append(lemmatize(x_p, pp_pos[i]))
                                    
                            #if subject preposition find aspect in preposition
                            if len(candidate_aspect_per_clause) == 0:
                                #find in pp after verb
                                for i in range(index_word+1, len(phrases)):
                                    p = phrases[i]
                                    words = nltk.word_tokenize(p[0])
                                    word_taggeds = nltk.word_tokenize(p[2])

                                    if p[1] == 'PP':
                                        for i,w in enumerate(words):
                                            if w not in stopWords and check_is_noun(word_taggeds[i]) and w != 'NUM':
                                                candidate_aspect_per_clause.append(lemmatize(w, word_taggeds[i]))

                                #find in pp before verb
                                for i in range(0, index_word):
                                    p = phrases[i]
                                    words = nltk.word_tokenize(p[0])
                                    word_taggeds = nltk.word_tokenize(p[2])

                                    if p[1] == 'PP':
                                        for i,w in enumerate(words):
                                            if w not in stopWords and check_is_noun(word_taggeds[i]) and w != 'NUM':
                                                candidate_aspect_per_clause.append(lemmatize(w, word_taggeds[i]))

                    #find opinion both in IC and DC
                    for i in range(index_word+1, len(phrases)):
                        p = phrases[i]
                        words = nltk.word_tokenize(p[0])
                        word_taggeds = nltk.word_tokenize(p[2])

                        #check opinion in adjective
                        if p[1] == 'ADJP' or p[1] == 'JJ':
                            for i,w in enumerate(words):
                                if w not in stopWords:
                                    candidate_opinion_per_clause.append(lemmatize(w, word_taggeds[i]))          
                        #check opinion in adverb
                        elif p[1] == 'ADVP' or p[1] == 'RB':
                            for i,w in enumerate(words):
                                if w not in stopWords and (w in positive_lexicon or w in negative_lexicon):
                                    candidate_opinion_per_clause.append(lemmatize(w, word_taggeds[i])) 
                        elif p[1] == 'PP':
                            for i,w in enumerate(words):
                                if w not in stopWords and (w in positive_lexicon or w in negative_lexicon):
                                    candidate_opinion_per_clause.append(lemmatize(w, word_taggeds[i]))
                        elif p[1] == 'VP':
                            for i,w in enumerate(words):
                                if w not in stopWords and (w in positive_lexicon or w in negative_lexicon):
                                    candidate_opinion_per_clause.append(lemmatize(w, word_taggeds[i]))
                        elif p[1] == 'NP':
                            for i,w in enumerate(words):
                                if w not in stopWords and (w in positive_lexicon or w in negative_lexicon):
                                    candidate_opinion_per_clause.append(lemmatize(w, word_taggeds[i]))
                                elif w not in stopWords and check_is_noun(word_taggeds[i]) and t == 'IC' and w != 'NUM':
                                    candidate_aspect_per_clause.append(lemmatize(w, word_taggeds[i]))
                        
                    is_finish = True
                else:
                    index_word += 1
                    if index_word >= len(phrases):
                        is_finish = True
        if len(candidate_aspect_per_clause) > 0 or len(candidate_opinion_per_clause) > 0:
            candidate_aspect_per_sentence.append(candidate_aspect_per_clause)
            candidate_opinion_per_sentence.append(candidate_opinion_per_clause)
            
    return candidate_aspect_per_sentence, candidate_opinion_per_sentence
   

Sentence Preprocessing

In [367]:
#preprocessed sentence and append to new coloum df
arr = []
for r in res_single_df['review']:
    preprocessed_sent = preprocessing(r)
  
    arr.append(preprocessed_sent)
    
preprocess_sent_series = pd.Series(arr)
res_single_df['preprocessed_sentence'] = preprocess_sent_series

aspect and opinion extraction using grammartical rule

In [368]:
aspect_term = []
opinion_term = []

for r in res_single_df['preprocessed_sentence']:
    candidate_aspect_per_sentence, candidate_opinion_per_sentence = aspect_extraction(r);
    aspect_term.append(candidate_aspect_per_sentence)
    opinion_term.append(candidate_opinion_per_sentence)

In [369]:
aspect_term_series = pd.Series(aspect_term)
opinion_term_series = pd.Series(opinion_term)
res_single_df['aspect_term'] = aspect_term_series
res_single_df['opinion_term'] = opinion_term_series

In [370]:
res_single_df.head(10)

Unnamed: 0,reviewID,sentenceID,review,category,polarity,preprocessed_sentence,aspect_term,opinion_term,entity,prediction_entities
0,1004293,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,negative,judging from previous posts this used to be a ...,"[[post, place]]",[[good]],RESTAURANT,RESTAURANT
1,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,negative,"we, there were four of us, arrived at noon - t...","[[place], [], [noon]]","[[empty], [rude], []]",SERVICE,RESTAURANT
2,1004293,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,negative,"they never brought us complimentary noodles, i...","[[noodle, request, dish, noodle, request]]",[[complimentary]],SERVICE,FOOD
3,1004293,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,negative,the food was lousy - too sweet or too salty an...,[[food]],"[[lousy, sweet, salty]]",FOOD,FOOD
4,1004293,1004293:4,"After all that, they complained to me about th...",SERVICE#GENERAL,negative,"after all that, they complained to me about th...",[[tip]],[[complain]],SERVICE,RESTAURANT
5,1004293,1004293:5,Avoid this place!,RESTAURANT#GENERAL,negative,avoid this place!,[[place]],[[]],RESTAURANT,RESTAURANT
6,1014458,1014458:0,"I have eaten at Saul, many times, the food is ...",FOOD#QUALITY,positive,"i have eaten at saul, many times, the food is ...","[[food], [saul]]","[[consistently, outrageously, good], [many]]",FOOD,RESTAURANT
7,1014458,1014458:1,Saul is the best restaurant on Smith Street an...,RESTAURANT#GENERAL,positive,saul is the best restaurant on smith street an...,"[[saul, restaurant]]",[[best]],RESTAURANT,RESTAURANT
8,1014458,1014458:2,The duck confit is always amazing and the foie...,FOOD#QUALITY,positive,the duck confit is always amazing and the foie...,"[[duck, confit]]",[[amazing]],FOOD,FOOD
9,1014458,1014458:3,The wine list is interesting and has many good...,DRINKS#STYLE_OPTIONS,positive,the wine list is interesting and has many good...,"[[wine, list, value]]","[[interesting, good]]",DRINKS,DRINKS


aspect categorization

TextRank Algorithm

In [371]:
class TextRank4Keyword():
    """Extract keywords from text"""
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight
        self.my_stopwords = list(stopWords)
        
    def set_stopwords(self, stopwords):
        for word in stopwords:
            self.my_stopwords.append(word)
        
    def sentence_segment(self, text, candidate_pos):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in nltk.sent_tokenize(text):
            selected_words = []
            tagged_words = pos_tag(sent)
            for token in tagged_words:
                # Store words only with cadidate POS tag
                if token[1] in candidate_pos and token[0] not in self.my_stopwords:
                    selected_words.append(token)
            sentences.append(selected_words)
        return sentences
    
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
    
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm
    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        keywords = []
        for i, (key, value) in enumerate(node_weight.items()):
            print(key[0] + ' - ' + str(value))
            keywords.append(key[0])
            if i > number:
                break
        
        return keywords
                
    def analyze(self, text, 
                candidate_pos=['NN', 'PRP'], 
                window_size=4, stopwords=[]):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        #doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(text, candidate_pos) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight
    

fast text training

In [358]:
sentences_ted = []

#review
for r in res_single_df['review']:
    rr = preprocessing_with_lemma(r)
    sentences_ted.append([w for w in nltk.word_tokenize(rr)])

model_ftext = FastText(size=100, window=5, min_count=1)
model_ftext.build_vocab(sentences=sentences_ted)
model_ftext.train(sentences=sentences_ted, total_examples=len(sentences_ted), epochs=100)

glove training

In [315]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = '../glove/glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [316]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model_gensim = KeyedVectors.load_word2vec_format(filename, binary=False)
# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)


[('queen', 0.7698541283607483)]


Aspect Categorization Function

implementation only using entity as category

In [372]:
category = ['FOOD', 'DRINKS', 'SERVICE', 'AMBIENCE', 'LOCATION', 'RESTAURANT']

In [373]:
entity = []
for cat in res_single_df['category']:
    e,a = cat.split('#')
    entity.append(e)

entity_sent_series = pd.Series(entity)
res_single_df['entity'] = entity_sent_series

In [374]:
res_single_df.entity.value_counts()

FOOD          451
RESTAURANT    319
SERVICE       183
AMBIENCE      111
DRINKS         40
LOCATION       16
Name: entity, dtype: int64

In [408]:
def generate_keyword_entity(entity, n_sample=0.2, n_keywords=10):
    
    pd_filtered = res_single_df[res_single_df['entity'] == entity]
    sent = []
    row, column = pd_filtered.shape
    review_res_gen = pd_filtered.sample(round(n_sample * row))['review']
    for r in review_res_gen:
        sent.append(preprocessing_with_lemma(r))

    tr4w = TextRank4Keyword()
    tr4w.analyze(" ".join(sent), candidate_pos = ['NN', 'JJ'], window_size=4, stopwords=['NUM', 'u', 'i'])
    return tr4w.get_keywords(n_keywords)

def calculate_similarity(aspects, keywords):
    sim = 0
    for w_aspect in aspects:
        sum_key = 0

        for w_key in keywords:
            try:
                sum_key += model_gensim.similarity(w_aspect, w_key)
            except:
                try:
                    sum_key += model_fastext.wv.similarity(re.sub(r'[^\w\s]', '', w_aspect), re.sub(r'[^\w\s]', '', w_key))
                except:
                    print(re.sub(r'[^\w\s]', '', w_aspect), re.sub(r'[^\w\s]', '', w_key), sum_key)
                #print(w_aspect, w_key, sum_key)

        sim += sum_key / len(keywords)

    return sim
   
def prediction_entity(sim_combined, entities):          
    index_max_sim = sim_combined.index(max(sim_combined))
    return entities[index_max_sim];

In [409]:
keywords = {}
for e in category:
    keywords[e] = generate_keyword_entity(e)

food - 8.068097547850828
good - 6.781147180295589
huge - 3.913316038303037
roll - 3.644101868355033
service - 3.420324482509246
great - 3.2132021829157864
sushi - 3.208543352465555
pizza - 2.9297061155306294
special - 2.824240641649669
delicious - 2.808780227062196
lobster - 2.311824031137946
selection - 2.131401048811175
list - 2.207658564814815
wine - 2.084571180555556
good - 1.943282986111111
bar - 1.2089826388888887
beer - 0.9055011574074073
interesting - 0.8955451388888888
sassy - 0.8790028935185185
fav - 0.8783929398148147
sake - 0.8781666666666665
price - 0.8713634259259259
extensive - 0.8635081018518518
purple - 0.8604189814814814
service - 8.437038472394121
good - 2.2610135154659234
great - 2.1768241308562124
staff - 2.140929904321542
friendly - 2.081327503370053
waiter - 1.9738737035207143
atmosphere - 1.8518236348789272
rude - 1.5268307598173765
complimentary - 1.4639541855755622
water - 1.438414536546293
lot - 1.4237147342995167
bad - 1.3500303644842142
great - 3.2148205377

In [410]:
prediction_cats = []
for r in res_single_df.itertuples():
    c = []
    for i in r[7]:
        for y in i:
            c.append(y)
    #for i in r[8]:
    #    for y in i:
    #        c.append(y)
            
    sim_combined = []
    for e in category:
        sim_combined.append(calculate_similarity(c, keywords[e]))
    
    prediction_cats.append(prediction_entity(sim_combined, category))

prediction_cat_series = pd.Series(prediction_cats)
res_single_df['prediction_entities'] = prediction_cat_series

foodawesome idea 1.9778012186288834
impecable idea 2.02499583363533
dhosas idea -0.4044108148664236
lonk idea 3.3977761268615723
everythig idea 0.7952360790222883
everythig idea 0.7952360790222883
wasabe idea 1.2964256554841995
excpetiona idea 2.4718629717826843
winebytheglass idea 1.556631838902831
decour idea 1.8439793065190315
cheeseboth idea 2.564781203866005
steake idea 1.3827521055936813
fritte idea 1.6997877731919289
raddichio idea 1.9468672573566437
myagi idea 2.15672505274415
wonderfull idea 1.7665812522172928
prixe idea -0.1546870693564415
essabagel idea 1.1861664615571499
shabushabu idea 1.5306688025593758
thius idea 1.080758336931467
taxan idea 1.9920408427715302
guaranteeed idea 1.7598702237010002
cheesesticks idea 2.6673357486724854
waitstaffs idea 0.6678534522652626
hollondaise idea 3.0669340565800667
hollondaise idea 3.0669340565800667
lloovve idea 3.3570109009742737
tramezzinis idea 1.7235907204449177
tramezzinis idea 1.7235907204449177
jsut idea 1.9083990380167961
mil

In [402]:
y_true=res_single_df['entity']
y_pred=res_single_df['prediction_entities']

In [403]:
precision_recall_fscore_support(y_true, y_pred, average="weighted")

(0.44908731494183496, 0.40982142857142856, 0.37917388313111206, None)

In [326]:
confusion_matrix(y_true, y_pred)

array([[ 33,   0,  33,   7,  11,  27],
       [  1,   6,  23,   0,   0,  10],
       [  9,   3, 354,  14,  14,  57],
       [  2,   0,   2,   6,   0,   6],
       [  9,   2, 156,  16,  72,  64],
       [  7,   1,  63,   8,   9,  95]])

with opinion

In [266]:
prediction_cats = []
for r in res_single_df.itertuples():
    c = []
    for i in r[7]:
        for y in i:
            c.append(y)
    for i in r[8]:
        for y in i:
            c.append(y)
            
    sim_combined = []
    for e in category:
        sim_combined.append(calculate_similarity(c, keywords[e]))
    
    prediction_cats.append(prediction_entity(sim_combined, category))

prediction_cat_series = pd.Series(prediction_cats)
res_single_df['prediction_entities'] = prediction_cat_series

post pumkin 5.314675189554691
place pumkin 7.0278750360012054
good pumkin 8.241787642240524
place pumkin 7.0278750360012054
noon pumkin 3.302600637078285
empty pumkin 5.038004755973816
rude pumkin 2.640731304883957
noodle pumkin 3.6824923008680344
request pumkin 4.352973103523254
dish pumkin 6.393249750137329
noodle pumkin 3.6824923008680344
request pumkin 4.352973103523254
complimentary pumkin 4.038971794769168
food pumkin 7.8201000690460205
lousy pumkin 4.021219253540039
sweet pumkin 6.336414501070976
salty pumkin 3.8421414121985435
tip pumkin 4.075289823114872
complain pumkin 3.8616940081119537
place pumkin 7.0278750360012054
food pumkin 7.8201000690460205
saul pumkin 0.05654187127947807
consistently pumkin 3.7781015932559967
outrageously pumkin 1.3890289720147848
good pumkin 8.241787642240524
many pumkin 6.576793521642685
saul pumkin 0.05654187127947807
restaurant pumkin 7.065643101930618
best pumkin 7.59813392162323
duck pumkin 4.313877277076244
confit pumkin 1.3512624828144908
am

self-respect pumkin -2.072664000093937
bad pumkin 6.325080513954163
ask pumkin 5.984184518456459
triviality pumkin -2.4648811102379113
like pumkin 7.643898606300354
water pumkin 5.974116697907448
check pumkin 5.959682136774063
vintage pumkin 4.2359368950128555
tee pumkin 1.8669504122808576
shirt pumkin 3.532737225294113
tee pumkin 1.8669504122808576
shirt pumkin 3.532737225294113
agenda pumkin 3.9665547981858253
iam pumkin -2.0116618419997394
company pumkin 5.7959947139024734
experience pumkin 6.535707741975784
busy pumkin 4.991213962435722
bore pumkin 2.6370670422911644
enjoy pumkin 6.751292675733566
pleasant pumkin 4.9475133419036865
everyone pumkin 7.404812783002853
everyone pumkin 7.404812783002853
pretty pumkin 6.551564425230026
empty pumkin 5.038004755973816
restaurant pumkin 7.065643101930618
restaurant pumkin 7.065643101930618
best pumkin 7.59813392162323
experience pumkin 6.535707741975784
experience pumkin 6.535707741975784
great pumkin 6.809209495782852
food pumkin 7.8201000

dissapointing nice 0
dissapointing excellent 0
dissapointing selection 0
dissapointing chicken 0
dissapointing fish 0
dhosas wine 0
dhosas good 0
dhosas beer 0
dhosas price 0
dhosas drink 0
dhosas service 0
dhosas selection 0
dhosas menu 0
dhosas sake 0
dhosas big 0
dhosas happy 0
dhosas husband 0
dhosas delicious 0
dhosas food 0
dhosas dish 0
dhosas guest 0
dhosas pumkin 0
dhal pumkin -3.820132691645995
kinda pumkin 1.9768438681494445
dissapointing wine 0
dissapointing good 0
dissapointing beer 0
dissapointing price 0
dissapointing drink 0
dissapointing service 0
dissapointing selection 0
dissapointing menu 0
dissapointing sake 0
dissapointing big 0
dissapointing happy 0
dissapointing husband 0
dissapointing delicious 0
dissapointing food 0
dissapointing dish 0
dissapointing guest 0
dissapointing pumkin 0
dhosas service 0
dhosas staff 0
dhosas rude 0
dhosas food 0
dhosas good 0
dhosas friendly 0
dhosas thing 0
dhosas place 0
dhosas great 0
dhosas restaurant 0
dhosas wait 0
dhosas orde

friendly pumkin 5.629759252071381
place pumkin 7.0278750360012054
restaurant pumkin 7.065643101930618
fun pumkin 6.986879050731659
pizza pumkin 6.081620410084724
yummy pumkin 0.7827956080436707
pizza pumkin 6.081620410084724
way pumkin 7.18255940079689
expensive pumkin 6.019999235868454
lunch pumkin 6.511722654104233
review pumkin 4.681250877678394
good pumkin 8.241787642240524
sauce pumkin 5.133048910647631
food pumkin 7.8201000690460205
flavor pumkin 6.0786275789141655
watery pumkin 2.2932066237553954
place pumkin 7.0278750360012054
great pumkin 6.809209495782852
group pumkin 4.616148851811886
group pumkin 4.616148851811886
care pumkin 5.848891958594322
well pumkin 7.941395968198776
waitress pumkin 3.768599701230414
food pumkin 7.8201000690460205
patient pumkin 4.3041678965091705
phenomenal pumkin 3.2049078047275543
service pumkin 5.628849819302559
prompt pumkin 2.982295108027756
friendly pumkin 5.629759252071381
great pumkin 6.809209495782852
side pumkin 5.521021157503128
pricey pum

overprice table 0
overprice waiter 0
overprice attentive 0
overprice time 0
overprice rice 0
overprice nice 0
overprice great 0
overprice good 0
overprice atmosphere 0
overprice place 0
overprice service 0
overprice vibe 0
overprice food 0
overprice ambiance 0
overprice seat 0
overprice restaurant 0
overprice table 0
overprice excellent 0
overprice entertainment 0
overprice bit 0
overprice night 0
overprice garden 0
overprice sidewalk 0
overprice location 0
overprice world 0
overprice house 0
overprice foot 0
overprice chart 0
overprice 6th 0
overprice view 0
overprice top 0
overprice avenue 0
overprice atmosphere 0
overprice ambience 0
overprice cool 0
overprice idea 0
overprice evening 0
overprice wonderful 0
overprice great 0
overprice restaurant 0
overprice place 0
overprice time 0
overprice good 0
overprice great 0
overprice price 0
overprice spot 0
overprice food 0
overprice friend 0
overprice gem 0
overprice dinner 0
overprice favorite 0
overprice date 0
overprice city 0
overpri

price pumkin 6.434644788503647
pizza pumkin 6.081620410084724
mozzarella pumkin 2.3585558142513037
mozzarella pumkin 2.3585558142513037
delicious pumkin 6.593433536589146
fresh pumkin 6.40311399102211
cheap pumkin 5.7607109397649765
frozen pumkin 4.755580589175224
stink pumkin 2.205737722106278
nice pumkin 6.757991969585419
thin pumkin 4.389829650521278
cold pumkin 5.863706991076469
staff pumkin 4.787265190854669
business pumkin 6.746610715985298
pie pumkin 5.105678930878639
uws pumkin -3.732981339097023
best pumkin 7.59813392162323
dining pumkin 6.230201065540314
restaurant pumkin 7.065643101930618
quality pumkin 6.833547681570053
fine pumkin 6.468930602073669
choice pumkin 7.54949489235878
relaxing pumkin 3.4608439207077026
accompaniment pumkin 3.035947822034359
perfect pumkin 6.581308573484421
quiet pumkin 5.1882356852293015
delicious pumkin 6.593433536589146
excpetiona food 0
excpetiona good 0
excpetiona great 0
excpetiona delicious 0
excpetiona dish 0
excpetiona pizza 0
excpetiona

NUM-NUM sidewalk 0
NUM-NUM location 0
NUM-NUM world 0
NUM-NUM house 0
NUM-NUM foot 0
NUM-NUM chart 0
NUM-NUM 6th 0
NUM-NUM view 0
NUM-NUM top 0
NUM-NUM avenue 0
NUM-NUM atmosphere 0
NUM-NUM ambience 0
NUM-NUM cool 0
NUM-NUM idea 0
NUM-NUM evening 0
NUM-NUM wonderful 0
NUM-NUM great 0
NUM-NUM restaurant 0
NUM-NUM place 0
NUM-NUM time 0
NUM-NUM good 0
NUM-NUM great 0
NUM-NUM price 0
NUM-NUM spot 0
NUM-NUM food 0
NUM-NUM friend 0
NUM-NUM gem 0
NUM-NUM dinner 0
NUM-NUM favorite 0
NUM-NUM date 0
NUM-NUM city 0
NUM-NUM big 0
NUM-NUM home 0
NUM-NUM special 0
ok pumkin 5.149627178907394
good pumkin 8.241787642240524
seating pumkin 3.094459928572178
prompt pumkin 2.982295108027756
food pumkin 7.8201000690460205
good pumkin 8.241787642240524
order pumkin 5.953308671712875
slow pumkin 5.135080128908157
particularly pumkin 6.188870489597321
true pumkin 6.290135055780411
problem pumkin 5.522333063185215
eye-pleasing food 0
eye-pleasing good 0
eye-pleasing great 0
eye-pleasing delicious 0
eye-pleasi

nice pumkin 6.757991969585419
quiet pumkin 5.1882356852293015
enough pumkin 7.204892843961716
restaurant pumkin 7.065643101930618
overhyped pumkin -0.8079002879094332
really pumkin 7.058807790279388
bland pumkin 3.8494930397719145
oily pumkin 1.6246611755341291
part pumkin 6.399925276637077
part pumkin 6.399925276637077
way pumkin 7.18255940079689
way pumkin 7.18255940079689
bland pumkin 3.8494930397719145
fry pumkin 4.3731647580862045
service pumkin 5.628849819302559
thing pumkin 7.309315741062164
good pumkin 8.241787642240524
restaurant pumkin 7.065643101930618
inside pumkin 5.679953992366791
sushi pumkin 4.439757427200675
tuna pumkin 4.193827152252197
flavor pumkin 6.0786275789141655
flavor pumkin 6.0786275789141655
bore pumkin 2.6370670422911644
pretty pumkin 6.551564425230026
average pumkin 4.671602860093117
soggy pumkin 1.5886713601648808
pad pumkin 2.3340742513537407
penang pumkin -0.04565257439389825
everything pumkin 7.342931792140007
delicious pumkin 6.593433536589146
fantast

steake place 0
steake great 0
steake restaurant 0
steake wait 0
steake order 0
steake table 0
steake waiter 0
steake attentive 0
steake time 0
steake rice 0
fritte service 0
fritte staff 0
fritte rude 0
fritte food 0
fritte good 0
fritte friendly 0
fritte thing 0
fritte place 0
fritte great 0
fritte restaurant 0
fritte wait 0
fritte order 0
fritte table 0
fritte waiter 0
fritte attentive 0
fritte time 0
fritte rice 0
steake nice 0
steake great 0
steake good 0
steake atmosphere 0
steake place 0
steake service 0
steake vibe 0
steake food 0
steake ambiance 0
steake seat 0
steake restaurant 0
steake table 0
steake excellent 0
steake entertainment 0
steake bit 0
steake night 0
steake garden 0
fritte nice 0
fritte great 0
fritte good 0
fritte atmosphere 0
fritte place 0
fritte service 0
fritte vibe 0
fritte food 0
fritte ambiance 0
fritte seat 0
fritte restaurant 0
fritte table 0
fritte excellent 0
fritte entertainment 0
fritte bit 0
fritte night 0
fritte garden 0
steake sidewalk 0
steake lo

grazie pumkin -0.47286291141062975
service pumkin 5.628849819302559
proper pumkin 5.2227270901203156
deficiency pumkin 1.0143093480728567
teodora pumkin -3.781752325594425
deficiency pumkin 1.0143093480728567
completely pumkin 4.783951260149479
fair pumkin 6.003097668290138
redeeming pumkin 0.9661615727236494
factor pumkin 4.3248665779829025
food pumkin 7.8201000690460205
rosemary pumkin 2.518714224919677
orange pumkin 4.741781570017338
flavoring pumkin 2.4050382734276354
edible pumkin 3.7017644932493567
weird pumkin 4.026245128363371
dessert pumkin 6.354553788900375
dessert pumkin 6.354553788900375
recommend pumkin 4.580437943339348
expresso pumkin -0.6200001584365964
expresso pumkin -0.6200001584365964
like pumkin 7.643898606300354
restaurant pumkin 7.065643101930618
disappointment pumkin 3.8470276296138763
myagi food 0
myagi good 0
myagi great 0
myagi delicious 0
myagi dish 0
myagi pizza 0
myagi service 0
myagi rice 0
myagi sushi 0
myagi price 0
myagi appetizer 0
myagi restaurant 0


excuse pumkin 4.405640102922916
wks pumkin -2.8095447747036815
wks pumkin -2.8095447747036815
excuse pumkin 4.405640102922916
food pumkin 7.8201000690460205
rant pumkin 0.9815348568372428
rave pumkin 2.699318403378129
food pumkin 7.8201000690460205
like pumkin 7.643898606300354
offensive pumkin 3.218962736427784
class pumkin 4.651145055890083
service pumkin 5.628849819302559
food pumkin 7.8201000690460205
terrific pumkin 5.83663547039032
portion pumkin 4.2801162376999855
large pumkin 6.18683959543705
wine pumkin 7.441040739417076
list pumkin 5.44097039103508
excellent pumkin 6.58218489587307
chef pumkin 5.582213118672371
tasting pumkin 6.431734561920166
menu pumkin 6.796193040907383
bottle pumkin 5.907950468361378
place pumkin 7.0278750360012054
service pumkin 5.628849819302559
small pumkin 6.558736085891724
intimate pumkin 4.293176665902138
little pumkin 7.335524559020996
crowded pumkin 3.7630600258708
excellent pumkin 6.58218489587307
prixe food 0
prixe good 0
prixe great 0
prixe del

resturant drink 0
resturant service 0
resturant selection 0
resturant menu 0
resturant sake 0
resturant big 0
resturant happy 0
resturant husband 0
resturant delicious 0
resturant food 0
resturant dish 0
resturant guest 0
resturant pumkin 0
cramped pumkin 2.0705110798589885
unappealing pumkin 1.0736760233994573
resturant service 0
resturant staff 0
resturant rude 0
resturant food 0
resturant good 0
resturant friendly 0
resturant thing 0
resturant place 0
resturant great 0
resturant restaurant 0
resturant wait 0
resturant order 0
resturant table 0
resturant waiter 0
resturant attentive 0
resturant time 0
resturant rice 0
resturant nice 0
resturant great 0
resturant good 0
resturant atmosphere 0
resturant place 0
resturant service 0
resturant vibe 0
resturant food 0
resturant ambiance 0
resturant seat 0
resturant restaurant 0
resturant table 0
resturant excellent 0
resturant entertainment 0
resturant bit 0
resturant night 0
resturant garden 0
resturant sidewalk 0
resturant location 0
res

menu-fare wait 0
menu-fare order 0
menu-fare table 0
menu-fare waiter 0
menu-fare attentive 0
menu-fare time 0
menu-fare rice 0
menu-fare nice 0
menu-fare great 0
menu-fare good 0
menu-fare atmosphere 0
menu-fare place 0
menu-fare service 0
menu-fare vibe 0
menu-fare food 0
menu-fare ambiance 0
menu-fare seat 0
menu-fare restaurant 0
menu-fare table 0
menu-fare excellent 0
menu-fare entertainment 0
menu-fare bit 0
menu-fare night 0
menu-fare garden 0
menu-fare sidewalk 0
menu-fare location 0
menu-fare world 0
menu-fare house 0
menu-fare foot 0
menu-fare chart 0
menu-fare 6th 0
menu-fare view 0
menu-fare top 0
menu-fare avenue 0
menu-fare atmosphere 0
menu-fare ambience 0
menu-fare cool 0
menu-fare idea 0
menu-fare evening 0
menu-fare wonderful 0
menu-fare great 0
menu-fare restaurant 0
menu-fare place 0
menu-fare time 0
menu-fare good 0
menu-fare great 0
menu-fare price 0
menu-fare spot 0
menu-fare food 0
menu-fare friend 0
menu-fare gem 0
menu-fare dinner 0
menu-fare favorite 0
menu-f

non-veg sidewalk 0
non-veg location 0
non-veg world 0
non-veg house 0
non-veg foot 0
non-veg chart 0
non-veg 6th 0
non-veg view 0
non-veg top 0
non-veg avenue 0
non-veg atmosphere 0
non-veg ambience 0
non-veg cool 0
non-veg idea 0
non-veg evening 0
non-veg wonderful 0
non-veg great 0
non-veg restaurant 0
non-veg place 0
non-veg time 0
non-veg good 0
non-veg great 0
non-veg price 0
non-veg spot 0
non-veg food 0
non-veg friend 0
non-veg gem 0
non-veg dinner 0
non-veg favorite 0
non-veg date 0
non-veg city 0
non-veg big 0
non-veg home 0
non-veg special 0
decor pumkin 4.030533578246832
nice pumkin 6.757991969585419
spotty pumkin 0.6980658266693354
place pumkin 7.0278750360012054
food pumkin 7.8201000690460205
excellent pumkin 6.58218489587307
sea pumkin 4.113268226385117
bass pumkin 3.6368344705551863
recommend pumkin 4.580437943339348
bukhara pumkin -1.489341914653778
rock pumkin 4.584118060767651
awsome food 0
awsome good 0
awsome great 0
awsome delicious 0
awsome dish 0
awsome pizza 0
a

hollondaise order 0
hollondaise table 0
hollondaise waiter 0
hollondaise attentive 0
hollondaise time 0
hollondaise rice 0
hollondaise service 0
hollondaise staff 0
hollondaise rude 0
hollondaise food 0
hollondaise good 0
hollondaise friendly 0
hollondaise thing 0
hollondaise place 0
hollondaise great 0
hollondaise restaurant 0
hollondaise wait 0
hollondaise order 0
hollondaise table 0
hollondaise waiter 0
hollondaise attentive 0
hollondaise time 0
hollondaise rice 0
hollondaise nice 0
hollondaise great 0
hollondaise good 0
hollondaise atmosphere 0
hollondaise place 0
hollondaise service 0
hollondaise vibe 0
hollondaise food 0
hollondaise ambiance 0
hollondaise seat 0
hollondaise restaurant 0
hollondaise table 0
hollondaise excellent 0
hollondaise entertainment 0
hollondaise bit 0
hollondaise night 0
hollondaise garden 0
hollondaise nice 0
hollondaise great 0
hollondaise good 0
hollondaise atmosphere 0
hollondaise place 0
hollondaise service 0
hollondaise vibe 0
hollondaise food 0
holl

tramezzinis nice 0
tramezzinis excellent 0
tramezzinis selection 0
tramezzinis chicken 0
tramezzinis fish 0
tramezzinis wine 0
tramezzinis good 0
tramezzinis beer 0
tramezzinis price 0
tramezzinis drink 0
tramezzinis service 0
tramezzinis selection 0
tramezzinis menu 0
tramezzinis sake 0
tramezzinis big 0
tramezzinis happy 0
tramezzinis husband 0
tramezzinis delicious 0
tramezzinis food 0
tramezzinis dish 0
tramezzinis guest 0
tramezzinis pumkin 0
palate pumkin 2.633838728070259
tramezzinis wine 0
tramezzinis good 0
tramezzinis beer 0
tramezzinis price 0
tramezzinis drink 0
tramezzinis service 0
tramezzinis selection 0
tramezzinis menu 0
tramezzinis sake 0
tramezzinis big 0
tramezzinis happy 0
tramezzinis husband 0
tramezzinis delicious 0
tramezzinis food 0
tramezzinis dish 0
tramezzinis guest 0
tramezzinis pumkin 0
palate pumkin 2.633838728070259
tramezzinis service 0
tramezzinis staff 0
tramezzinis rude 0
tramezzinis food 0
tramezzinis good 0
tramezzinis friendly 0
tramezzinis thing 

jsut dish 0
jsut pizza 0
jsut service 0
jsut rice 0
jsut sushi 0
jsut price 0
jsut appetizer 0
jsut restaurant 0
jsut nice 0
jsut excellent 0
jsut selection 0
jsut chicken 0
jsut fish 0
jsut wine 0
jsut good 0
jsut beer 0
jsut price 0
jsut drink 0
jsut service 0
jsut selection 0
jsut menu 0
jsut sake 0
jsut big 0
jsut happy 0
jsut husband 0
jsut delicious 0
jsut food 0
jsut dish 0
jsut guest 0
jsut pumkin 0
wrong pumkin 5.443862244486809
evil pumkin 2.922521867789328
incompetent pumkin 1.2844831570982933
jsut service 0
jsut staff 0
jsut rude 0
jsut food 0
jsut good 0
jsut friendly 0
jsut thing 0
jsut place 0
jsut great 0
jsut restaurant 0
jsut wait 0
jsut order 0
jsut table 0
jsut waiter 0
jsut attentive 0
jsut time 0
jsut rice 0
jsut nice 0
jsut great 0
jsut good 0
jsut atmosphere 0
jsut place 0
jsut service 0
jsut vibe 0
jsut food 0
jsut ambiance 0
jsut seat 0
jsut restaurant 0
jsut table 0
jsut excellent 0
jsut entertainment 0
jsut bit 0
jsut night 0
jsut garden 0
jsut sidewalk 0
js

outstanding pumkin 4.370920333079994
particulary pumkin -1.5451452657580376
change pumkin 6.359005734324455
mojito pumkin 0.2537422850728035
change pumkin 6.359005734324455
mojito pumkin 0.2537422850728035
rice pumkin 5.848369300365448
dish pumkin 6.393249750137329
day pumkin 7.129663944244385
sum pumkin 3.349520705640316
day pumkin 7.129663944244385
sum pumkin 3.349520705640316
dim pumkin 2.4088463466614485
good pumkin 8.241787642240524
outstanding pumkin 4.370920333079994
good pumkin 8.241787642240524
patio pumkin 3.252232006751001
well pumkin 7.941395968198776
restaurant pumkin 7.065643101930618
excellent pumkin 6.58218489587307
food pumkin 7.8201000690460205
good pumkin 8.241787642240524
place pumkin 7.0278750360012054
nice pumkin 6.757991969585419
calm pumkin 3.824478767812252
service pumkin 5.628849819302559
bit pumkin 5.888781860470772
slow pumkin 5.135080128908157
selection pumkin 6.073234647512436
selection pumkin 6.073234647512436
nice pumkin 6.757991969585419
food pumkin 7.8

particular pumkin 6.308066368103027
sushi pumkin 4.439757427200675
spot pumkin 6.078522592782974
occasion pumkin 5.564333036541939
date pumkin 5.104424923658371
great pumkin 6.809209495782852
nice pumkin 6.757991969585419
food pumkin 7.8201000690460205
delivery pumkin 4.513613998889923
good pumkin 8.241787642240524
terrible pumkin 3.850486034527421
hour pumkin 5.510664641857147
food pumkin 7.8201000690460205
abrupt pumkin 1.5285855025285855
late pumkin 4.806421183049679
restaurant pumkin 7.065643101930618
rice pumkin 5.848369300365448
dinner pumkin 7.092815935611725
minute pumkin 4.497881323099136
minute pumkin 4.497881323099136
cold pumkin 5.863706991076469
place pumkin 7.0278750360012054
wonderful pumkin 7.0069510489702225
service pumkin 5.628849819302559
excellent pumkin 6.58218489587307
waiter pumkin 3.957614801824093
promptly pumkin 2.730367489159107
nice pumkin 6.757991969585419
cordial pumkin 2.006953180185519
drink pumkin 7.497124150395393
good pumkin 8.241787642240524
appetizi

daiquiries seat 0
daiquiries restaurant 0
daiquiries table 0
daiquiries excellent 0
daiquiries entertainment 0
daiquiries bit 0
daiquiries night 0
daiquiries garden 0
daiquiries sidewalk 0
daiquiries location 0
daiquiries world 0
daiquiries house 0
daiquiries foot 0
daiquiries chart 0
daiquiries 6th 0
daiquiries view 0
daiquiries top 0
daiquiries avenue 0
daiquiries atmosphere 0
daiquiries ambience 0
daiquiries cool 0
daiquiries idea 0
daiquiries evening 0
daiquiries wonderful 0
daiquiries great 0
daiquiries restaurant 0
daiquiries place 0
daiquiries time 0
daiquiries good 0
daiquiries great 0
daiquiries price 0
daiquiries spot 0
daiquiries food 0
daiquiries friend 0
daiquiries gem 0
daiquiries dinner 0
daiquiries favorite 0
daiquiries date 0
daiquiries city 0
daiquiries big 0
daiquiries home 0
daiquiries special 0
trip pumkin 5.343074753880501
battery pumkin 2.7299019787460566
park pumkin 4.143937200307846
city pumkin 4.571762178093195
place pumkin 7.0278750360012054
manhattan pumkin 

murray pumkin 3.0548511878587306
anything pumkin 6.831951677799225
service pumkin 5.628849819302559
good pumkin 8.241787642240524
enough pumkin 7.204892843961716
crowd pumkin 4.43070550262928
people pumkin 6.177103966474533
fool pumkin 4.258128326386213
service pumkin 5.628849819302559
awful pumkin 4.410741340368986
place pumkin 7.0278750360012054
worth pumkin 5.551965698599815
love pumkin 6.909896820783615
pizza pumkin 6.081620410084724
thing pumkin 7.309315741062164
pizza pumkin 6.081620410084724
thing pumkin 7.309315741062164
hungry pumkin 4.867042817175388
spot pumkin 6.078522592782974
spot pumkin 6.078522592782974
williamsburg pumkin 0.8178862556815147
spot pumkin 6.078522592782974
pleasantly pumkin 2.461444290704094
surprising pumkin 4.756657615303993
pizza pumkin 6.081620410084724
proprietor pumkin 3.2888258025050163
delicious pumkin 6.593433536589146
nice pumkin 6.757991969585419
time pumkin 7.126186475157738
disappoint pumkin 1.7988803063053638
nice pumkin 6.757991969585419
cr

iam pumkin -2.0116618419997394
visit pumkin 4.762818276882172
place pumkin 7.0278750360012054
quiet pumkin 5.1882356852293015
delightful pumkin 4.519541881978512
service pumkin 5.628849819302559
food pumkin 7.8201000690460205
good pumkin 8.241787642240524
wonderful pumkin 7.0069510489702225
caviar pumkin 3.438278805464506
caviar pumkin 3.438278805464506
good pumkin 8.241787642240524
spot pumkin 6.078522592782974
good pumkin 8.241787642240524
best pumkin 7.59813392162323
wait pumkin 5.893356576561928
staff pumkin 4.787265190854669
pleasant pumkin 4.9475133419036865
fun pumkin 6.986879050731659
gorgeous pumkin 3.733355793636292
wonderful pumkin 7.0069510489702225
food pumkin 7.8201000690460205
yummy pumkin 0.7827956080436707
part pumkin 6.399925276637077
night pumkin 6.15258215367794
atmosphere pumkin 4.2735239416360855
delightfully pumkin 1.4127324963919818
free pumkin 6.492040723562241
martini pumkin 3.228912577033043
martini pumkin 3.228912577033043
vanilla pumkin 3.7855130238458514
m

suggestion pumkin 5.010215312242508
suggestion pumkin 5.010215312242508
poor pumkin 5.2633112370967865
ok pumkin 5.149627178907394
serving pumkin 5.69103516638279
salmon pumkin 4.4820797592401505
wasnt pumkin -0.18424495682120323
salmon pumkin 4.4820797592401505
wasnt pumkin -0.18424495682120323
good pumkin 8.241787642240524
impressed pumkin 4.233211800456047
dessert pumkin 6.354553788900375
joke pumkin 5.593205839395523
bother pumkin 3.3622069880366325
friendly pumkin 5.629759252071381
world pumkin 5.672976776957512
famine pumkin 1.572113474830985
world pumkin 5.672976776957512
family pumkin 5.685057386755943
family pumkin 5.685057386755943
enormous pumkin 4.848863676190376
famine pumkin 1.572113474830985
least pumkin 5.879221498966217
regard pumkin 4.99603033810854
anti-pasta food 0
anti-pasta good 0
anti-pasta great 0
anti-pasta delicious 0
anti-pasta dish 0
anti-pasta pizza 0
anti-pasta service 0
anti-pasta rice 0
anti-pasta sushi 0
anti-pasta price 0
anti-pasta appetizer 0
anti-pa

expensive pumkin 6.019999235868454
well pumkin 7.941395968198776
worth pumkin 5.551965698599815
island pumkin 3.894666016101837
manhattan pumkin 4.033198565244675
restaurant pumkin 7.065643101930618
drive pumkin 5.086357489228249
corona pumkin 0.5772185111418366
great pumkin 6.809209495782852
exceptional pumkin 4.4328718110919
holiday pumkin 6.37392646074295
time pumkin 7.126186475157738
wonderful pumkin 7.0069510489702225
atomosphere food 0
atomosphere good 0
atomosphere great 0
atomosphere delicious 0
atomosphere dish 0
atomosphere pizza 0
atomosphere service 0
atomosphere rice 0
atomosphere sushi 0
atomosphere price 0
atomosphere appetizer 0
atomosphere restaurant 0
atomosphere nice 0
atomosphere excellent 0
atomosphere selection 0
atomosphere chicken 0
atomosphere fish 0
atomosphere wine 0
atomosphere good 0
atomosphere beer 0
atomosphere price 0
atomosphere drink 0
atomosphere service 0
atomosphere selection 0
atomosphere menu 0
atomosphere sake 0
atomosphere big 0
atomosphere hap

selecion food 0
selecion good 0
selecion great 0
selecion delicious 0
selecion dish 0
selecion pizza 0
selecion service 0
selecion rice 0
selecion sushi 0
selecion price 0
selecion appetizer 0
selecion restaurant 0
selecion nice 0
selecion excellent 0
selecion selection 0
selecion chicken 0
selecion fish 0
selecion food 0
selecion good 0
selecion great 0
selecion delicious 0
selecion dish 0
selecion pizza 0
selecion service 0
selecion rice 0
selecion sushi 0
selecion price 0
selecion appetizer 0
selecion restaurant 0
selecion nice 0
selecion excellent 0
selecion selection 0
selecion chicken 0
selecion fish 0
food pumkin 7.8201000690460205
selecion wine 0
selecion good 0
selecion beer 0
selecion price 0
selecion drink 0
selecion service 0
selecion selection 0
selecion menu 0
selecion sake 0
selecion big 0
selecion happy 0
selecion husband 0
selecion delicious 0
selecion food 0
selecion dish 0
selecion guest 0
selecion pumkin 0
selecion wine 0
selecion good 0
selecion beer 0
selecion pri

setting/atmosphere table 0
setting/atmosphere waiter 0
setting/atmosphere attentive 0
setting/atmosphere time 0
setting/atmosphere rice 0
setting/atmosphere service 0
setting/atmosphere staff 0
setting/atmosphere rude 0
setting/atmosphere food 0
setting/atmosphere good 0
setting/atmosphere friendly 0
setting/atmosphere thing 0
setting/atmosphere place 0
setting/atmosphere great 0
setting/atmosphere restaurant 0
setting/atmosphere wait 0
setting/atmosphere order 0
setting/atmosphere table 0
setting/atmosphere waiter 0
setting/atmosphere attentive 0
setting/atmosphere time 0
setting/atmosphere rice 0
setting/atmosphere nice 0
setting/atmosphere great 0
setting/atmosphere good 0
setting/atmosphere atmosphere 0
setting/atmosphere place 0
setting/atmosphere service 0
setting/atmosphere vibe 0
setting/atmosphere food 0
setting/atmosphere ambiance 0
setting/atmosphere seat 0
setting/atmosphere restaurant 0
setting/atmosphere table 0
setting/atmosphere excellent 0
setting/atmosphere entertainm

warm pumkin 6.182961359620094
super pumkin 4.516464956104755
fast pumkin 6.260908901691437
mint pumkin 3.53104244871065
mint pumkin 3.53104244871065
drink pumkin 7.497124150395393
drink pumkin 7.497124150395393
amazing pumkin 5.086354769766331
usha pumkin -2.0540526770055294
bartender pumkin 3.378168866969645
manhattan pumkin 4.033198565244675
nice pumkin 6.757991969585419
fav pumkin -3.2235145270824432
lassi pumkin -1.6153446901589632
place pumkin 7.0278750360012054
roti pumkin -0.3827094519510865
roll pumkin 5.543170049786568
amazing pumkin 5.086354769766331
unda pumkin -4.686683803796768
egg pumkin 4.761530011892319
roll pumkin 5.543170049786568
unda pumkin -4.686683803796768
egg pumkin 4.761530011892319
roll pumkin 5.543170049786568
recommend pumkin 4.580437943339348
spice pumkin 4.752017043530941
onion pumkin 3.6324004158377647
egg pumkin 4.761530011892319
roti pumkin -0.3827094519510865
delicate pumkin 4.148470144718885
amazing pumkin 5.086354769766331
food pumkin 7.8201000690460

tasting pumkin 6.431734561920166
food pumkin 7.8201000690460205
price pumkin 6.434644788503647
good pumkin 8.241787642240524
incredible pumkin 4.6442685425281525
service pumkin 5.628849819302559
wong pumkin 1.396270698402077
place pumkin 7.0278750360012054
great pumkin 6.809209495782852
music pumkin 5.579688012599945
food pumkin 7.8201000690460205
service pumkin 5.628849819302559
price pumkin 6.434644788503647
good pumkin 8.241787642240524
great pumkin 6.809209495782852
speedy pumkin 2.3596483934670687
affordable pumkin 4.394409731030464
wine pumkin 7.441040739417076
good pumkin 8.241787642240524
neighborhood pumkin 3.8649308010935783
trip pumkin 5.343074753880501
tapa pumkin -1.5644035642035306
consistently pumkin 3.7781015932559967
good pumkin 8.241787642240524
drink pumkin 7.497124150395393
service pumkin 5.628849819302559
pretty pumkin 6.551564425230026
good pumkin 8.241787642240524
atmosphere pumkin 4.2735239416360855
nice pumkin 6.757991969585419
relax pumkin 4.090528063476086
fo

In [396]:
y_true=res_single_df['entity']
y_pred=res_single_df['prediction_entities']
precision_recall_fscore_support(y_true, y_pred, average="weighted")

(0.5133224949625554, 0.4857142857142857, 0.4668674903700146, None)

In [287]:
model.similarity('disappointing', 'atmosphere')

0.22963682

In [295]:
res_single_df[res_single_df['aspect_term']=='NUM-piece']

Unnamed: 0,reviewID,sentenceID,review,category,polarity,preprocessed_sentence,aspect_term,opinion_term,prediction_category,entity,prediction_entities


In [362]:
re.sub(r'(.)\1{3,}','', preprocessing_with_lemma('If you go, try the marinara/arrabiatta sauce, the mozzarella en Carozza is mmmmmmmm..... everything is just delicious.'))

'if you go try the marinaraarrabiatta sauce the mozzarella en carozza be  everything be just delicious'

In [391]:
preprocessing_with_lemma('The wine list is interesting and has many good values')

'the wine list be interesting and have many good value'

In [411]:
aspect_extraction('They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.')

KeyboardInterrupt: 

In [412]:
res_single_df

Unnamed: 0,reviewID,sentenceID,review,category,polarity,preprocessed_sentence,aspect_term,opinion_term,entity,prediction_entities
0,1004293,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT#GENERAL,negative,judging from previous posts this used to be a ...,"[[post, place]]",[[good]],RESTAURANT,RESTAURANT
1,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE#GENERAL,negative,"we, there were four of us, arrived at noon - t...","[[place], [], [noon]]","[[empty], [rude], []]",SERVICE,RESTAURANT
2,1004293,1004293:2,"They never brought us complimentary noodles, i...",SERVICE#GENERAL,negative,"they never brought us complimentary noodles, i...","[[noodle, request, dish, noodle, request]]",[[complimentary]],SERVICE,FOOD
3,1004293,1004293:3,The food was lousy - too sweet or too salty an...,FOOD#QUALITY,negative,the food was lousy - too sweet or too salty an...,[[food]],"[[lousy, sweet, salty]]",FOOD,FOOD
4,1004293,1004293:4,"After all that, they complained to me about th...",SERVICE#GENERAL,negative,"after all that, they complained to me about th...",[[tip]],[[complain]],SERVICE,RESTAURANT
5,1004293,1004293:5,Avoid this place!,RESTAURANT#GENERAL,negative,avoid this place!,[[place]],[[]],RESTAURANT,RESTAURANT
6,1014458,1014458:0,"I have eaten at Saul, many times, the food is ...",FOOD#QUALITY,positive,"i have eaten at saul, many times, the food is ...","[[food], [saul]]","[[consistently, outrageously, good], [many]]",FOOD,AMBIENCE
7,1014458,1014458:1,Saul is the best restaurant on Smith Street an...,RESTAURANT#GENERAL,positive,saul is the best restaurant on smith street an...,"[[saul, restaurant]]",[[best]],RESTAURANT,RESTAURANT
8,1014458,1014458:2,The duck confit is always amazing and the foie...,FOOD#QUALITY,positive,the duck confit is always amazing and the foie...,"[[duck, confit]]",[[amazing]],FOOD,FOOD
9,1014458,1014458:3,The wine list is interesting and has many good...,DRINKS#STYLE_OPTIONS,positive,the wine list is interesting and has many good...,"[[wine, list, value]]","[[interesting, good]]",DRINKS,RESTAURANT
