## Aim of this notebook:

- split sentence by punctuation
- filter by length
- run each phrase through LDA
- sentiment analysis on each phrase
- POS tags for extra info
- post-processing: do we combine like topics? etc

In [10]:
import pandas as pd
import numpy as np
import gradio as gr
import nltk
import spacy
import pickle
import re
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nlp = spacy.load("en_core_web_sm")
nltk.download('vader_lexicon')

from absa_functions import *

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Evan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [44]:
def parse_targets(nlp, review):
    doc = nlp(review)
    targets = []
    target = ''

    for token in doc:
        if (token.dep_ in ['nsubj','dobj', 'pobj', 'ROOT']) and (token.pos_ in ['NOUN', 'PROPN']):
            target = token.text
            targets.append(target)

    return targets

In [3]:
def parse_adjectives(nlp, review):
    doc = nlp(review)
    adjectives = []
    adjective = ''

    for token in doc:
        if token.pos_ == 'ADJ':
            prepend = ''
            for child in token.children:
                if child.pos_ != 'ADV':
                    continue
                prepend += child.text + ' '
            adjective = prepend + token.text
            adjectives.append(adjective)

    return adjectives

In [4]:
def get_topic_from_word(word, lda_model, topic_map):
    try:
        topics_raw = lda_model.get_term_topics(word, minimum_probability=0.0000001)
        topic_dict = {topic_map[tup[0]]: tup[1] for tup in topics_raw}
        best_topic = max(topic_dict, key=topic_dict.get)
    except:
        best_topic = 'miscellaneous'

    return best_topic

In [5]:
corpus = pickle.load(open('corpus.pkl', 'rb'))
dictionary = corpora.Dictionary.load('dictionary.gensim')
lda_model = LdaModel.load('best_lda_model.gensim')

In [6]:
sid = SentimentIntensityAnalyzer()

In [7]:
#numerical mapping for topics in LDA model
topic_map = {0: 'menu', 1: 'service', 2: 'miscellaneous', 3: 'place', 4: 'price', 5: 'food', 6: 'staff'}

In [14]:
review = '''Great restaurant, the food was great and I liked their non alcoholic drink option. It is on the pricey side but that’s expected and both indoor/outdoor seating is beautiful. The service was okay, I didn’t find my server to be very friendly but by no means was it a bad experience he just wasn’t talkative and was straight to the point which is okay. This is a great location and I’d definitely recommend to others for a special day/night out.'''

In [16]:
split_review = re.split('[?.,;!]', review)
split_review

['Great restaurant',
 ' the food was great and I liked their non alcoholic drink option',
 ' It is on the pricey side but that’s expected and both indoor/outdoor seating is beautiful',
 ' The service was okay',
 ' I didn’t find my server to be very friendly but by no means was it a bad experience he just wasn’t talkative and was straight to the point which is okay',
 ' This is a great location and I’d definitely recommend to others for a special day/night out',
 '']

In [36]:
def lda_prediction(restaurant_review):
    clean_sample = prepare_text_for_lda(restaurant_review)
    sample_2bow = dictionary.doc2bow(clean_sample)

    topics = lda_model.get_document_topics(sample_2bow)
    topic_dict = {topic_map[x[0]]:x[-1] for x in topics}
    top_topic = max(topic_dict.items(), key=lambda x:x[1])
    return top_topic

In [37]:
for phrase in split_review:
    print(lda_prediction(phrase))

('place', 0.38115704)
('service', 0.45517653)
('price', 0.3819581)
('service', 0.6870697)
('food', 0.3230243)
('price', 0.33639187)
('menu', 0.14285715)


In [39]:
for phrase in split_review:
    score = sid.polarity_scores(phrase)['compound']
    sentiment = 'positive' if score > 0 else ('neutral' if score == 0 else 'negative')
    print(score, sentiment)

0.6249 positive
0.7845 positive
0.7469 positive
0.2263 positive
-0.3825 negative
0.9001 positive
0.0 neutral


In [45]:
for phrase in split_review:
    print(parse_targets(nlp, phrase))

['restaurant']
['food', 'option']
['side', 'seating']
['service']
['server', 'means', 'point']
['others', 'night']
[]


In [42]:
for phrase in split_review:
    print(parse_adjectives(nlp, phrase))

['Great']
['great', 'non', 'alcoholic']
['pricey', 'indoor', 'outdoor', 'beautiful']
['okay']
['very friendly', 'bad', 'talkative', 'straight', 'okay']
['great', 'special']
[]


In [53]:
def pos_chunk_prediction(restaurant_review):
    nlp = spacy.load("en_core_web_sm")
    outputs = []

    phrases = re.split('[?.,;!]', restaurant_review)
    phrases = [phrase for phrase in phrases if len(phrase) > 4]

    for phrase in phrases:
        output = {}
        topic = lda_prediction(phrase)[0]
        score = sid.polarity_scores(phrase)['compound']
        sentiment = 'positive' if score > 0 else ('neutral' if score == 0 else 'negative')
        subjects = parse_targets(nlp, phrase)
        descriptors = parse_adjectives(nlp, phrase)
        output.update({'phrase': phrase, 'topic': topic, 'sentiment': sentiment, 'subjects': subjects, 'descriptors': descriptors})
        outputs.append(output)
        
    
    df = pd.DataFrame(outputs)
    return df
        


In [54]:
sample = '''I expected more. And maybe this is where Google reviews fall short. 
I wonder how many people have actually been to Spain and experienced real paella. 
This one was as decent as it could be but scallops were not juicy, the rice was quite dry, and seafood just not as fresh as you’d want. 
I appreciate the effort and the ambiance created at the restaurant. 
I can say good thing about the pimiento stuffed with beef cheeks — very tender and soft, as they should be. 
Then shrimps pil pil style definitely missed that strong garlic flavour and salt. 
Lovely service that made the stay very enjoyable'''

In [55]:
pos_chunk_prediction(sample)

Unnamed: 0,phrase,topic,sentiment,subjects,descriptors
0,I expected more,service,neutral,[],[more]
1,And maybe this is where Google reviews fall s...,menu,neutral,[reviews],[short]
2,\nI wonder how many people have actually been...,staff,neutral,"[people, Spain, paella]","[how many, real]"
3,\nThis one was as decent as it could be but s...,food,neutral,"[one, scallops]","[as decent, juicy]"
4,the rice was quite dry,staff,neutral,[rice],[quite dry]
5,and seafood just not as fresh as you’d want,food,negative,[],[just as fresh]
6,\nI appreciate the effort and the ambiance cr...,food,positive,"[effort, restaurant]",[]
7,\nI can say good thing about the pimiento stu...,menu,positive,"[thing, pimiento, cheeks]","[good, very tender, soft]"
8,as they should be,menu,neutral,[],[]
9,\nThen shrimps pil pil style definitely misse...,service,positive,"[style, flavour]","[strong, garlic]"


In [30]:
iface = gr.Interface(fn=pos_prediction, 
                    inputs=gr.inputs.Textbox(lines=2, placeholder='Enter restaurant review here...'),
                    outputs=gr.outputs.Dataframe(headers=['Aspect','Adjective', 'Topic', 'Polarity']),
                    examples=[
                        ['The restaurant is too dark, and the bathroom was not clean. Also, everyone there is rude.'],
                        ['Fabulous dinner & environment but the older waiters have a real sense of entitlement.'],
                        ['Entrees were way too expensive.'],
                        ['The dinner was great, and the waiter was super friendly.']
                    ])
iface.launch(share=True)

Running locally at: http://127.0.0.1:7861/
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted
Running on External URL: https://58123.gradio.app
Interface loading below...


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7861/',
 'https://58123.gradio.app')

[2021-10-06 21:59:26,341] ERROR in app: Exception on /api/predict/ [POST]
Traceback (most recent call last):
  File "<ipython-input-28-2b8dce64a55e>", line 24, in pos_prediction
    topic = get_topic_from_word(prepare_text_for_lda(targets[i])[0], lda_model, topic_map)
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\nlpenv\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\ProgramData\Anaconda3\envs\nlpenv\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\ProgramData\Anaconda3\envs\nlpenv\lib\site-packages\flask_cors\extension.py", line 165, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "C:\ProgramData\Anaconda3\envs\nlpenv\lib\site-packages\flask\app.py", line 1821, in handle_user_

In [None]:
['topic', 'meta', 'polarity']

#group by topics, meta: ['delicious-beautiful', 'experience-outstanding']
#hide rows that are None


TODO:
- split sentence by punctuation
- filter by length
- run each phrase through LDA
- sentiment analysis on each phrase
- POS tags for extra info
- post-processing: do we combine like topics? etc