In [38]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [39]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Farza
[nltk_data]     Nurifan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [42]:
import pandas as pd

df = pd.read_csv('res_mul_all.csv')
df.head()

Unnamed: 0,id,reviewID,sentenceID,review,category,polarity,entity,preprocessed_sentence,type_sentence
0,0,RL#3,RL#3:1,I am not necessarily fanatical about this plac...,VALUE#PRICES,positive,VALUE,i am not necessarily fanatical about this plac...,compound_sentence
1,2,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
2,3,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
3,4,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence
4,5,TR#2,TR#2:2,The high prices you're going to pay is for the...,VALUE#PRICES,negative,VALUE,the high prices you 're going to pay is for th...,complex_sentence


In [43]:
text_data = []
for sentence in df['review']:
    tokens = prepare_text_for_lda(sentence)
    text_data.append(tokens)

In [44]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [173]:
import gensim

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 30, id2word=dictionary, random_state = 42)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics()

In [176]:
for a in topics:
    print(a)
    print()

(20, '0.026*"bagel" + 0.018*"location" + 0.018*"place" + 0.018*"different" + 0.018*"especially" + 0.018*"service" + 0.018*"always" + 0.009*"means" + 0.009*"tomato" + 0.009*"simple"')

(5, '0.034*"excellent" + 0.034*"great" + 0.023*"price" + 0.012*"shanty" + 0.012*"drink" + 0.012*"phenomenal" + 0.012*"glass" + 0.012*"staff" + 0.012*"music" + 0.012*"quality"')

(2, '0.037*"place" + 0.037*"service" + 0.037*"price" + 0.029*"plentiful" + 0.029*"flavorful" + 0.029*"reasonably" + 0.022*"great" + 0.015*"offer" + 0.015*"would" + 0.015*"garden"')

(17, '0.023*"delicious" + 0.023*"service" + 0.016*"setting" + 0.016*"special" + 0.016*"staff" + 0.016*"thing" + 0.016*"several" + 0.016*"excellent" + 0.016*"cheap" + 0.016*"serve"')

(12, '0.042*"service" + 0.017*"notch" + 0.017*"waiter" + 0.017*"location" + 0.017*"great" + 0.017*"attentive" + 0.009*"better" + 0.009*"check" + 0.009*"letting" + 0.009*"husband"')

(15, '0.090*"great" + 0.049*"place" + 0.033*"plentiful" + 0.033*"reasonably" + 0.033*"flavo

In [159]:
all_topics = []
for topic in topics:
    for pair in topic[1].split(' + '):
        word = pair.split('*')[1].replace("\"", '')
        all_topics.append(word)

In [171]:
import pandas as pd

def aspect_topic(all_topics):
    sf = pd.DataFrame(columns=['id','review','category','term'])
    count = 0
    index = 0
    res = []
    for sentence in df['review']:
        lowercased = sentence.lower()
        term = []
        category = []
        for cat in df['category'][index].split(','):
            category.append(cat.split('#')[0])
        id_name = df['id'][index]
        for topic in all_topics:
            if topic in lowercased:
                term.append(topic)
#         print(term)
        if len(term) == 0:
            print(sentence)
            count += 1
        sf = sf.append({'id': id_name, 'review': sentence.strip().lower().replace('  ', ' '), 'category': '|'.join(category), 'term': '|'.join(term)}, ignore_index=True)
        index += 1
    print(count)
    sf.to_csv("lda.csv")
    sf.to_excel("lda.xlsx")

In [172]:
aspect_topic(list(dict.fromkeys(all_topics)))

Food is bad.
Food is disgusting.
The rice to fish ration was also good and they didn't try to overpack the rice.
The food here is not good.
I like the somosas, chai, and the chole, but the dhosas and dhal were kinda dissapointing.
The lava cake dessert was terrible.
My wife had the fried shrimp which are huge and loved it.
The hostess is rude to the point of being offensive.
The food was bland oily.
I went there for lunch and the lunch was not as good as I expected from the reviews I read.
Sauce was watery and the food didn't have much flavor.
It may be a bit packed on weekends, but the vibe is good and it is the best French food you will find in the area.
The food was bad.
The tuna and wasabe potatoes are bad.
It's simply the best meal in NYC.
My chow fun and chow see was really bland and oily.
Salads were bad.
Ingredients are organic which is a real plus for me.
The Yellowtail was particularly good as well.
We even had a visit from the Manager who wanted to make sure we were enjoying

In [99]:
list(dict.fromkeys(all_topics))

['atmosphere',
 'casual',
 'relax',
 'absolutely',
 'always',
 'staff',
 'price',
 'appropriate',
 'moderate',
 'serve',
 'flavorful',
 'plentiful',
 'reasonably',
 'place',
 'great',
 'amaze',
 'sushi',
 'service',
 'appetizer',
 'would',
 'pleasant',
 'raise',
 'french',
 'attentive',
 'taste',
 'selection',
 'salad',
 'decor',
 'restaurant',
 'waiter',
 'server',
 'closing',
 'special',
 'consider',
 'another',
 'manager',
 'italian',
 'never',
 'dish',
 'outside',
 'ambience',
 'pizza',
 'caviar',
 'constitute',
 'fresh',
 'everything',
 'quick',
 'thing',
 'terrible',
 'different',
 'despite',
 'outstanding',
 'really',
 'worst',
 'every',
 'cheese',
 'spicy',
 'delicious',
 'indian',
 'could',
 'ingredient',
 'value',
 'drink',
 'quality',
 'sweet',
 'prompt',
 'lasagna',
 'friendly',
 'worth',
 'going',
 'pasta',
 'slice',
 'portion',
 'bagel',
 'tasty',
 'fancy',
 'ask',
 'beginning',
 'shabu',
 'vegetarian',
 'pumkin',
 'tortelini',
 'following',
 'particular',
 'order',
 'cra