In [1]:
import pandas as pd
import numpy as np
import spacy
import pickle
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nlp = spacy.load("en_core_web_sm")

from absa_functions import *

In [2]:
train_path = 'train.xml'
corpus_df, categories = create_corpus(train_path)

In [3]:
reviews = corpus_df.text.to_list()
reviews[:5]

["It might be the best sit down food I've had in the area, so if you are going to the upright citizen brigade, or the garden, it could be just the place for you.",
 'Hostess was extremely accommodating when we arrived an hour early for our reservation.',
 "We were a couple of minutes late for our reservation and minus one guest, but we didn't think we deserved the attitude we got from the hostess.",
 'Though the service might be a little slow, the waitresses are very friendly.',
 'Although we arrived at the restaurant 10 min late, the hostess did not have a table for us.']

In [4]:
#get syntactic dependecy relation, syntactic parent, coarse-grained part-of-speech tag for each token
count = 0
for review in reviews[:3]:
  count = 0
  doc = nlp(review)
  for token in doc:
    count += 1
    if count <= 10:
      print(token.text, token.dep_, token.head.text, token.head.pos_, token.pos_,[child for child in token.children])

It nsubj be VERB PRON []
might aux be VERB AUX []
be ROOT be VERB VERB [It, might, sit, ,, so, be]
the det best ADJ DET []
best nsubj sit VERB ADJ [the]
sit dep be VERB VERB [best, down, food]
down prt sit VERB ADP []
food dobj sit VERB NOUN [had]
I nsubj had VERB PRON []
've aux had VERB AUX []
Hostess nsubj accommodating VERB PROPN []
was aux accommodating VERB AUX []
extremely advmod accommodating VERB ADV []
accommodating ROOT accommodating VERB VERB [Hostess, was, extremely, arrived, .]
when advmod arrived VERB ADV []
we nsubj arrived VERB PRON []
arrived advcl accommodating VERB VERB [when, we, early]
an det hour NOUN DET []
hour npadvmod early ADV NOUN [an]
early advmod arrived VERB ADV [hour, for]
We nsubj were AUX PRON []
were ROOT were AUX AUX [We, couple, late, ,, but, think]
a det couple NOUN DET []
couple attr were AUX NOUN [a, of]
of prep couple NOUN ADP [minutes]
minutes pobj of ADP NOUN []
late advmod were AUX ADV [for]
for prep late ADV ADP [reservation]
our poss reser

In [5]:
#parse sentences for adjectives using POS tags
count = 0
for review in reviews:
    count += 1
    if count < 5:
        doc = nlp(review)
        adjectives = []
        for token in doc:
            if token.pos_ == 'ADJ':
                adjectives.append(token)
        print(review)
        print(adjectives)

It might be the best sit down food I've had in the area, so if you are going to the upright citizen brigade, or the garden, it could be just the place for you.
[best, upright]
Hostess was extremely accommodating when we arrived an hour early for our reservation.
[]
We were a couple of minutes late for our reservation and minus one guest, but we didn't think we deserved the attitude we got from the hostess.
[]
Though the service might be a little slow, the waitresses are very friendly.
[little, slow, friendly]


In [6]:
#parse sentences for adverbs & adjectives using POS tags
count = 0
for review in reviews:
  count += 1
  if count <= 5:
    doc = nlp(review)
    adjectives = []
    for token in doc:
      if token.pos_ == 'ADJ':
        prepend = ''
        for child in token.children:
          if child.pos_ != 'ADV':
            continue
          prepend += child.text + ' '
        adjectives.append(prepend + token.text)
    print(review)
    print(adjectives)

It might be the best sit down food I've had in the area, so if you are going to the upright citizen brigade, or the garden, it could be just the place for you.
['best', 'upright']
Hostess was extremely accommodating when we arrived an hour early for our reservation.
[]
We were a couple of minutes late for our reservation and minus one guest, but we didn't think we deserved the attitude we got from the hostess.
[]
Though the service might be a little slow, the waitresses are very friendly.
['little', 'slow', 'very friendly']
Although we arrived at the restaurant 10 min late, the hostess did not have a table for us.
[]


In [7]:
#check for subject nouns, assign to nearest adverbs & adjectives
count = 0
aspects = []
for review in reviews:
    count += 1
    if count <= 5:
        doc = nlp(review)
        descriptive_term = ''
        target = ''
        for token in doc:
            if (token.dep_ == 'nsubj' or token.dep_ == 'dobj') and token.pos_ == 'NOUN':
                target = token.text
            if token.pos_ == 'ADJ':
                prepend = ''
                for child in token.children:
                    if child.pos_ != 'ADV':
                        continue
                    prepend += child.text + ' '
                descriptive_term = prepend + token.text
        if target != '' and descriptive_term != '':
            aspects.append({'aspect': target,'description': descriptive_term})
print(aspects)

[{'aspect': 'food', 'description': 'upright'}, {'aspect': 'waitresses', 'description': 'very friendly'}]


In [8]:
#check for subject nouns, assign to nearest adverbs & adjectives
count = 0
aspects = []
adjectives = []
targets = []
for review in reviews[:5]:
    doc = nlp(review)
    target = ''
    adjective = ''
    for token in doc:
        if (token.dep_ == 'nsubj' or token.dep_ == 'dobj') and token.pos_ == 'NOUN':
            target = token.text
            targets.append(target)
        if token.pos_ == 'ADJ':
            prepend = ''
            for child in token.children:
                if child.pos_ != 'ADV':
                    continue
                prepend += child.text + ' '
            adjective = prepend + token.text
            adjectives.append(adjective)
        if target != '' and adjective != '':
            aspects.append({'aspect': target,'description': adjective})
print(aspects)

[{'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'best'}, {'aspect': 'food', 'description': 'upright'}, {'aspect': 'food', 'description': 'upright'}, {'aspect': 'food', 'description': 'upright'}, {'aspect': 'food', 'description': 'upright'}, {'aspect': 'food', 'description': 'upright'}, {'aspect': 'food', 'description': 'upright'}, {'aspect': 'food', 'description': 'upright'}, {'aspect': 'food', 'description'

In [9]:
targets, adjectives

(['food', 'attitude', 'service', 'waitresses', 'hostess', 'table'],
 ['best', 'upright', 'little', 'slow', 'very friendly'])

In [10]:
def parse_targets(nlp, review):
    doc = nlp(review)
    targets = []
    target = ''

    for token in doc:
        if (token.dep_ in ['nsubj','dobj']) and (token.pos_ =='NOUN' or token.pos_ == 'PROPN'):
            target = token.text
            targets.append(target)

    return targets

In [11]:
def parse_adjectives(nlp, review):
    doc = nlp(review)
    adjectives = []
    adjective = ''

    for token in doc:
        if token.pos_ == 'ADJ' and token.dep_ != 'amod':
            prepend = ''
            for child in token.children:
                if child.pos_ != 'ADV':
                    continue
                prepend += child.text + ' '
            adjective = prepend + token.text
            adjectives.append(adjective)

    return adjectives

In [12]:
reviews[1]

'Hostess was extremely accommodating when we arrived an hour early for our reservation.'

In [13]:
parse_adjectives(nlp, reviews[2])

[]

In [14]:
corpus_df.columns

Index(['text', 'sentiment', 'food', 'place', 'staff', 'miscellaneous',
       'service', 'price', 'menu', 'ambience'],
      dtype='object')

In [15]:
corpus_df['aspects'] = corpus_df['text'].apply(lambda x: parse_targets(nlp, x))
corpus_df['descriptions'] = corpus_df['text'].apply(lambda x: parse_adjectives(nlp, x))

In [16]:
corpus_df.head(15)

Unnamed: 0,text,sentiment,food,place,staff,miscellaneous,service,price,menu,ambience,aspects,descriptions
0,It might be the best sit down food I've had in...,"{'food': 'positive', 'place': 'neutral'}",positive,neutral,,,,,,,[food],[best]
1,Hostess was extremely accommodating when we ar...,"{'staff': 'positive', 'miscellaneous': 'neutral'}",,,positive,neutral,,,,,[Hostess],[]
2,We were a couple of minutes late for our reser...,"{'miscellaneous': 'neutral', 'staff': 'negative'}",,,negative,neutral,,,,,[attitude],[]
3,"Though the service might be a little slow, the...","{'service': 'negative', 'staff': 'positive'}",,,positive,,negative,,,,"[service, waitresses]","[slow, very friendly]"
4,Although we arrived at the restaurant 10 min l...,"{'staff': 'negative', 'miscellaneous': 'neutral'}",,,negative,neutral,,,,,"[hostess, table]",[]
5,I like the smaller portion size for dinner.,"{'miscellaneous': 'negative', 'food': 'neutral'}",neutral,,,negative,,,,,[size],[]
6,The bill was surprisingly inexpensive consider...,"{'food': 'neutral', 'price': 'positive', 'misc...",neutral,,,neutral,,positive,,,"[bill, appetizers, rounds]","[surprisingly inexpensive, alcoholic, non]"
7,") other food is served in too-small portions, ...","{'miscellaneous': 'negative', 'food': 'neutral'}",neutral,,,negative,,,,,[room],[least]
8,"It was very loud, I felt too crowded, the man ...","{'miscellaneous': 'neutral', 'staff': 'negative'}",,,negative,neutral,,,,,"[chair, waiters]","[very loud, too crowded, next, impossible]"
9,"After ordering drinks, we both decided on the ...","{'food': 'neutral', 'miscellaneous': 'positive'}",neutral,,,positive,,,,,[drinks],[]


In [17]:
corpus = pickle.load(open('corpus.pkl', 'rb'))
dictionary = corpora.Dictionary.load('dictionary.gensim')
lda_model = LdaModel.load('best_lda_model.gensim')

In [18]:
corpus_df['aspects'][0]

['food']

In [19]:
#numerical mapping for topics in LDA model
topic_map = {0: 'menu', 1: 'service', 2: 'miscellaneous', 3: 'place', 4: 'price', 5: 'food', 6: 'staff'}

In [20]:
def get_topic_from_word(word, lda_model, topic_map):
    topics_raw = lda_model.get_term_topics(word, minimum_probability=0.0000001)
    topic_dict = {topic_map[tup[0]]: tup[1] for tup in topics_raw}
    best_topic = max(topic_dict, key=topic_dict.get)

    # best_topic_item = {best_topic: topic_dict[best_topic]}
    return best_topic

In [21]:
get_topic_from_word('sushi', lda_model, topic_map)

'menu'

In [22]:
corpus_df.aspects[3]

['service', 'waitresses']

In [23]:
corpus_df['pred_topic'] = ''

In [24]:
for i in range(0, len(corpus_df)):
    topics = []
    for word in corpus_df.aspects[i]:
        try:
            topics.append(get_topic_from_word(prepare_text_for_lda(word)[0], lda_model, topic_map))
        except:
            topics.append(None)
    corpus_df['pred_topic'][i] = topics

In [25]:
# corpus_df['pred_topic'] = corpus_df['aspects'].apply(lambda x: [get_topic_from_word(prepare_text_for_lda(word)[0], 
#                                                                                         lda_model, 
#                                                                                         topic_map) for word in x])
corpus_df.head()

Unnamed: 0,text,sentiment,food,place,staff,miscellaneous,service,price,menu,ambience,aspects,descriptions,pred_topic
0,It might be the best sit down food I've had in...,"{'food': 'positive', 'place': 'neutral'}",positive,neutral,,,,,,,[food],[best],[food]
1,Hostess was extremely accommodating when we ar...,"{'staff': 'positive', 'miscellaneous': 'neutral'}",,,positive,neutral,,,,,[Hostess],[],[miscellaneous]
2,We were a couple of minutes late for our reser...,"{'miscellaneous': 'neutral', 'staff': 'negative'}",,,negative,neutral,,,,,[attitude],[],[price]
3,"Though the service might be a little slow, the...","{'service': 'negative', 'staff': 'positive'}",,,positive,,negative,,,,"[service, waitresses]","[slow, very friendly]","[service, service]"
4,Although we arrived at the restaurant 10 min l...,"{'staff': 'negative', 'miscellaneous': 'neutral'}",,,negative,neutral,,,,,"[hostess, table]",[],"[miscellaneous, service]"


In [26]:
sid = SentimentIntensityAnalyzer()

In [27]:
sid.polarity_scores('neutral')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [57]:
corpus_df['pred_sentiment'] = ''

for i in range(0, len(corpus_df)):
    sentiments = []
    if len(corpus_df.descriptions[i]) > 0:
        for word in corpus_df.descriptions[i]:
            score = sid.polarity_scores(word)['compound']
            sentiments.append('positive' if score > 0 else ('neutral' if score == 0 else 'negative'))
    else:
        sentiments.append(0)
    corpus_df['pred_sentiment'][i] = sentiments

In [58]:
corpus_df.head()

Unnamed: 0,text,sentiment,food,place,staff,miscellaneous,service,price,menu,ambience,aspects,descriptions,pred_topic,pred_sentiment
0,It might be the best sit down food I've had in...,"{'food': 'positive', 'place': 'neutral'}",positive,neutral,,,,,,,[food],[best],[food],[positive]
1,Hostess was extremely accommodating when we ar...,"{'staff': 'positive', 'miscellaneous': 'neutral'}",,,positive,neutral,,,,,[Hostess],[],[miscellaneous],[0]
2,We were a couple of minutes late for our reser...,"{'miscellaneous': 'neutral', 'staff': 'negative'}",,,negative,neutral,,,,,[attitude],[],[price],[0]
3,"Though the service might be a little slow, the...","{'service': 'negative', 'staff': 'positive'}",,,positive,,negative,,,,"[service, waitresses]","[slow, very friendly]","[service, service]","[neutral, positive]"
4,Although we arrived at the restaurant 10 min l...,"{'staff': 'negative', 'miscellaneous': 'neutral'}",,,negative,neutral,,,,,"[hostess, table]",[],"[miscellaneous, service]",[0]


In [59]:
pred_df = corpus_df[['text', 'aspects', 'descriptions', 'pred_topic', 'pred_sentiment']]
pred_df.head()

Unnamed: 0,text,aspects,descriptions,pred_topic,pred_sentiment
0,It might be the best sit down food I've had in...,[food],[best],[food],[positive]
1,Hostess was extremely accommodating when we ar...,[Hostess],[],[miscellaneous],[0]
2,We were a couple of minutes late for our reser...,[attitude],[],[price],[0]
3,"Though the service might be a little slow, the...","[service, waitresses]","[slow, very friendly]","[service, service]","[neutral, positive]"
4,Although we arrived at the restaurant 10 min l...,"[hostess, table]",[],"[miscellaneous, service]",[0]


In [60]:
for i in range(0, len(pred_df)):
    if len(pred_df.pred_sentiment[i]) == 1 and pred_df.pred_sentiment[i][0] == 0 and len(pred_df.aspects[i]) > 1:
        pred_df.pred_sentiment[i] = [0] * len(pred_df.aspects[i])
pred_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,text,aspects,descriptions,pred_topic,pred_sentiment
0,It might be the best sit down food I've had in...,[food],[best],[food],[positive]
1,Hostess was extremely accommodating when we ar...,[Hostess],[],[miscellaneous],[0]
2,We were a couple of minutes late for our reser...,[attitude],[],[price],[0]
3,"Though the service might be a little slow, the...","[service, waitresses]","[slow, very friendly]","[service, service]","[neutral, positive]"
4,Although we arrived at the restaurant 10 min l...,"[hostess, table]",[],"[miscellaneous, service]","[0, 0]"


In [61]:
sentiments = []
for i in range(0, len(pred_df)):
    sentiments.append(dict(zip(pred_df.pred_topic[i], pred_df.pred_sentiment[i])))
pred_df['sentiment_pairs'] = sentiments
pred_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['sentiment_pairs'] = sentiments


Unnamed: 0,text,aspects,descriptions,pred_topic,pred_sentiment,sentiment_pairs
0,It might be the best sit down food I've had in...,[food],[best],[food],[positive],{'food': 'positive'}
1,Hostess was extremely accommodating when we ar...,[Hostess],[],[miscellaneous],[0],{'miscellaneous': 0}
2,We were a couple of minutes late for our reser...,[attitude],[],[price],[0],{'price': 0}
3,"Though the service might be a little slow, the...","[service, waitresses]","[slow, very friendly]","[service, service]","[neutral, positive]",{'service': 'positive'}
4,Although we arrived at the restaurant 10 min l...,"[hostess, table]",[],"[miscellaneous, service]","[0, 0]","{'miscellaneous': 0, 'service': 0}"


In [62]:
for cat in categories:
        pred_df[f'{cat}'] = pred_df['sentiment_pairs'].apply(lambda x: encode_category(x, cat))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df[f'{cat}'] = pred_df['sentiment_pairs'].apply(lambda x: encode_category(x, cat))


In [63]:
# pred_df = pred_df.fillna(None)
pred_df = pred_df.replace({np.nan: None})
pred_df.head()

Unnamed: 0,text,aspects,descriptions,pred_topic,pred_sentiment,sentiment_pairs,food,place,staff,miscellaneous,service,price,menu,ambience
0,It might be the best sit down food I've had in...,[food],[best],[food],[positive],{'food': 'positive'},positive,,,,,,,
1,Hostess was extremely accommodating when we ar...,[Hostess],[],[miscellaneous],[0],{'miscellaneous': 0},,,,0.0,,,,
2,We were a couple of minutes late for our reser...,[attitude],[],[price],[0],{'price': 0},,,,,,0.0,,
3,"Though the service might be a little slow, the...","[service, waitresses]","[slow, very friendly]","[service, service]","[neutral, positive]",{'service': 'positive'},,,,,positive,,,
4,Although we arrived at the restaurant 10 min l...,"[hostess, table]",[],"[miscellaneous, service]","[0, 0]","{'miscellaneous': 0, 'service': 0}",,,,0.0,0,,,


In [64]:
cat_df = create_label_df(corpus_df, categories)
cat_df.head()

Unnamed: 0,food,place,staff,miscellaneous,service,price,menu,ambience
0,positive,neutral,,,,,,
1,,,positive,neutral,,,,
2,,,negative,neutral,,,,
3,,,positive,,negative,,,
4,,,negative,neutral,,,,


In [74]:
lda_df = create_label_df(pred_df, categories)
lda_df = lda_df.replace(0, 'neutral')
lda_df.head()

Unnamed: 0,food,place,staff,miscellaneous,service,price,menu,ambience
0,positive,,,,,,,
1,,,,neutral,,,,
2,,,,,,neutral,,
3,,,,,positive,,,
4,,,,neutral,neutral,,,


In [75]:
calculate_accuracy(cat_df, lda_df)

64.08831003811945

In [76]:
cat_df = add_label_col(cat_df, categories)
lda_df = add_label_col(lda_df, categories)

In [77]:
precision, recall, f1 = calculate_scores(lda_df, cat_df)
precision, recall, f1

(0.8942442712243536, 0.6148499210110584, 0.6740012224014724)