## Aim of this notebook:

- split each review into phrases on punctuation
- filter the phrases by length
- run each phrase through LDA to predict its topic
- run sentiment analysis on each phrase
- extract POS tags for extra information
- post-processing: should we combine similar topics, etc.?

In [3]:
import pandas as pd
import numpy as np
import gradio as gr
import nltk
import spacy
import pickle
import re
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared spaCy pipeline; the parse_* helpers below read token.pos_ and token.dep_ from its output.
nlp = spacy.load("en_core_web_sm")
# Fetch the VADER lexicon used for sentiment scoring further down.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Evan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
from absa_functions import *

In [5]:
def parse_targets(nlp, review):
    """Return the text of every noun-like token that fills a target role.

    Args:
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``dep_``, ``pos_`` and ``text`` (a spaCy pipeline in this notebook).
        review: Text to analyse.

    Returns:
        list[str]: Token texts, in document order, whose dependency label is
        one of nsubj / dobj / pobj / ROOT and whose part of speech is NOUN
        or PROPN.
    """
    wanted_deps = ['nsubj', 'dobj', 'pobj', 'ROOT']
    wanted_pos = ['NOUN', 'PROPN']
    return [
        tok.text
        for tok in nlp(review)
        if tok.dep_ in wanted_deps and tok.pos_ in wanted_pos
    ]

In [6]:
def parse_adjectives(nlp, review):
    """Return each adjective in ``review`` together with its adverb modifiers.

    Args:
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``pos_``, ``text`` and ``children`` (a spaCy pipeline in this
            notebook).
        review: Text to analyse.

    Returns:
        list[str]: One entry per ADJ token, in document order. Every direct
        child tagged ADV is placed in front of the adjective, separated by
        single spaces (e.g. ``'very friendly'``), in the order the children
        are yielded.
    """
    described = []
    for tok in nlp(review):
        if tok.pos_ != 'ADJ':
            continue
        # Only adverb children count as modifiers; everything else is ignored.
        adverbs = [kid.text for kid in tok.children if kid.pos_ == 'ADV']
        described.append(' '.join(adverbs + [tok.text]))
    return described

In [7]:
def get_topic_from_word(word, lda_model, topic_map):
    """Return the label of the topic most strongly associated with ``word``.

    Args:
        word: Term to look up.
        lda_model: Object exposing ``get_term_topics(word, minimum_probability=...)``
            that returns ``(topic_id, probability)`` pairs (a gensim LdaModel
            in this notebook).
        topic_map: Mapping from numeric topic id to a human-readable label.

    Returns:
        str: The label with the highest probability, or ``'miscellaneous'``
        when the lookup fails for any ordinary reason (for example no topic
        is returned, or a topic id is missing from ``topic_map``).
    """
    try:
        topics_raw = lda_model.get_term_topics(word, minimum_probability=0.0000001)
        topic_dict = {topic_map[tup[0]]: tup[1] for tup in topics_raw}
        return max(topic_dict, key=topic_dict.get)
    except Exception:
        # Deliberate best-effort fallback. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt / SystemExit propagate.
        return 'miscellaneous'

In [8]:
# Load the artefacts the rest of the notebook relies on.
# NOTE: unpickling can execute arbitrary code, so only load corpus.pkl from a trusted source.
with open('corpus.pkl', 'rb') as corpus_file:  # context manager closes the handle after loading
    corpus = pickle.load(corpus_file)
dictionary = corpora.Dictionary.load('dictionary.gensim')
lda_model = LdaModel.load('best_lda_model.gensim')

In [9]:
# Shared sentiment scorer; its polarity_scores(...)['compound'] value is used below to label phrases.
sid = SentimentIntensityAnalyzer()

In [10]:
# Human-readable label for each numeric topic id in the LDA model.
topic_map = {
    0: 'menu',
    1: 'service',
    2: 'miscellaneous',
    3: 'place',
    4: 'price',
    5: 'food',
    6: 'staff',
}

In [11]:
# Multi-sentence restaurant review used as the worked example in the cells that follow.
review = '''Great restaurant, the food was great and I liked their non alcoholic drink option. It is on the pricey side but that’s expected and both indoor/outdoor seating is beautiful. The service was okay, I didn’t find my server to be very friendly but by no means was it a bad experience he just wasn’t talkative and was straight to the point which is okay. This is a great location and I’d definitely recommend to others for a special day/night out.'''

In [12]:
# Cut the review at every ? . , ; or ! character; text after the last delimiter
# is kept even when it is empty.
punctuation = re.compile('[?.,;!]')
split_review = punctuation.split(review)
split_review

['Great restaurant',
 ' the food was great and I liked their non alcoholic drink option',
 ' It is on the pricey side but that’s expected and both indoor/outdoor seating is beautiful',
 ' The service was okay',
 ' I didn’t find my server to be very friendly but by no means was it a bad experience he just wasn’t talkative and was straight to the point which is okay',
 ' This is a great location and I’d definitely recommend to others for a special day/night out',
 '']

In [13]:
def lda_prediction(restaurant_review):
    """Return the most probable topic for a piece of review text.

    The text is passed through ``prepare_text_for_lda`` (imported elsewhere;
    presumably tokenises and cleans the text -- confirm in absa_functions),
    converted to a bag of words with the module-level ``dictionary`` and
    scored by the module-level ``lda_model``.

    Args:
        restaurant_review: Text to classify.

    Returns:
        tuple: ``(topic_label, probability)`` for the highest-scoring topic,
        with the label taken from ``topic_map``.
    """
    tokens = prepare_text_for_lda(restaurant_review)
    bag_of_words = dictionary.doc2bow(tokens)

    # Re-key the model's (topic_id, probability) pairs by human-readable label.
    scores = {}
    for entry in lda_model.get_document_topics(bag_of_words):
        scores[topic_map[entry[0]]] = entry[-1]

    winner = max(scores, key=scores.get)
    return winner, scores[winner]

In [14]:
# Show the top (topic, probability) pair for every fragment of the example review.
for phrase in split_review:
    print(lda_prediction(phrase))

('place', 0.3811573)
('service', 0.45506796)
('price', 0.38191462)
('service', 0.6870692)
('food', 0.32325694)
('price', 0.3363734)
('menu', 0.14285715)


In [15]:
# Label every fragment by the sign of its compound sentiment score.
for phrase in split_review:
    score = sid.polarity_scores(phrase)['compound']
    if score > 0:
        sentiment = 'positive'
    elif score == 0:
        sentiment = 'neutral'
    else:
        sentiment = 'negative'
    print(score, sentiment)

0.6249 positive
0.7845 positive
0.7469 positive
0.2263 positive
-0.3825 negative
0.9001 positive
0.0 neutral


In [16]:
# Show the noun targets that parse_targets finds in every fragment.
for phrase in split_review:
    print(parse_targets(nlp, phrase))

['restaurant']
['food', 'option']
['side', 'seating']
['service']
['server', 'means', 'point']
['others', 'night']
[]


In [17]:
# Show the adjectives (with any adverb modifiers) that parse_adjectives finds in every fragment.
for phrase in split_review:
    print(parse_adjectives(nlp, phrase))

['Great']
['great', 'non', 'alcoholic']
['pricey', 'indoor', 'outdoor', 'beautiful']
['okay']
['very friendly', 'bad', 'talkative', 'straight', 'okay']
['great', 'special']
[]


In [18]:
def pos_chunk_prediction(restaurant_review):
    """Split a review into phrases and describe each one.

    The review is cut at every ``? . , ; !`` character. Fragments of four
    characters or fewer, and fragments made only of whitespace, are dropped.
    Each remaining fragment gets a topic (``lda_prediction``), a sentiment
    label from the sign of its compound score (``sid``), its noun targets
    (``parse_targets``) and its adjectives (``parse_adjectives``).

    Args:
        restaurant_review: Full review text.

    Returns:
        pandas.DataFrame: One row per kept fragment with the columns
        ``phrase``, ``topic``, ``sentiment``, ``subjects`` and
        ``descriptors``. The columns are present even when no fragment
        survives the filter.
    """
    columns = ['phrase', 'topic', 'sentiment', 'subjects', 'descriptors']

    # The length check counts raw characters (including surrounding spaces);
    # the strip() check additionally rejects whitespace-only fragments.
    phrases = [
        phrase
        for phrase in re.split('[?.,;!]', restaurant_review)
        if len(phrase) > 4 and phrase.strip()
    ]

    outputs = []
    for phrase in phrases:
        topic = lda_prediction(phrase)[0]
        score = sid.polarity_scores(phrase)['compound']
        sentiment = 'positive' if score > 0 else ('neutral' if score == 0 else 'negative')
        # Reuse the module-level spaCy pipeline rather than reloading the model on every call.
        subjects = parse_targets(nlp, phrase)
        descriptors = parse_adjectives(nlp, phrase)
        outputs.append({
            'phrase': phrase,
            'topic': topic,
            'sentiment': sentiment,
            'subjects': subjects,
            'descriptors': descriptors,
        })

    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(outputs, columns=columns)
        


In [19]:
# Single-sentence input whose only splitting punctuation is the final period.
sample = '''Increased amounts of kimchi in your diet leads to increased abundance of Lactobacillus in your gut.'''

In [20]:
# Run the full phrase pipeline on the sample; the returned DataFrame is shown as the cell output.
pos_chunk_prediction(sample)

Unnamed: 0,phrase,topic,sentiment,subjects,descriptors
0,Increased amounts of kimchi in your diet leads...,menu,positive,"[amounts, kimchi, diet, abundance, Lactobacill...",[]
