In [1]:
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance reused for every token handled by preprocess().
lemmatizer = WordNetLemmatizer()

# Translation table that deletes all ASCII punctuation; built once instead of
# once per stop word.
_punctuation_table = str.maketrans('', '', string.punctuation)

# NLTK's English stop words with their punctuation removed, so they can be
# compared against tokens taken from text that was stripped the same way.
# Kept as a set: `word in stop_words` is then a hash lookup rather than a scan
# over the whole list for every token.
nltk_words = set(nltk.corpus.stopwords.words('english'))
stop_words = {word.translate(_punctuation_table) for word in nltk_words}

def preprocess(sentence):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    Steps, in order: lower-case the text, delete ASCII punctuation, tokenize
    with NLTK, drop tokens found in ``stop_words``, lemmatize, remove every
    character that is not an ASCII letter, and finally keep only tokens longer
    than three characters.

    Parameters
    ----------
    sentence : str
        The text to process.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned_text = sentence.lower().translate(punctuation_table)

    kept_tokens = []
    for token in nltk.word_tokenize(cleaned_text):
        if token in stop_words:
            continue
        # Lemmatize first, then strip digits/symbols the tokenizer left behind.
        letters_only = re.sub(r"[^A-Za-z]+", '', lemmatizer.lemmatize(token))
        if len(letters_only) > 3:
            kept_tokens.append(letters_only)
    return kept_tokens

In [2]:
import pandas as pd

# Load the annotated review sentences. Later cells read the 'sentenceID',
# 'review', 'target', 'category' and 'polarity' columns of this frame.
# NOTE(review): the preview below also shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns, which look like index columns written out by earlier saves —
# confirm and consider dropping them.
df = pd.read_csv('dataset/res16_baru.csv')
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0.1,Unnamed: 0,reviewID,sentenceID,review,target,category,polarity
0,0,1014458,1014458:3,The wine list is interesting and has many good...,"wine list,wine list","FOOD,PRICES","positive,positive"
1,1,1028246,1028246:1,"Service was devine, oysters where a sensual as...","Service,oysters,NULL","SERVICE,FOOD,PRICES","positive,positive,positive"
2,2,1064477,1064477:4,We took advanatage of the half price sushi dea...,"half price sushi deal,half price sushi deal","FOOD,PRICES","positive,positive"
3,3,1084394,1084394:3,The Prix Fixe menu is worth every penny and yo...,"Prix Fixe menu,Prix Fixe menu,Prix Fixe menu","FOOD,FOOD,PRICES","positive,positive,positive"
4,4,1086415,1086415:2,"$6 and there is much tasty food, all of it fre...","food,food,food","FOOD,FOOD,PRICES","positive,positive,positive"


In [3]:
# One token list per review, in the same order as the rows of df.
text_data = [preprocess(review_text) for review_text in df['review']]

In [4]:
from gensim import corpora
# Build the token <-> id mapping from the tokenized reviews, then encode each
# review with doc2bow so it can be fed to the topic model.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# Persist both artefacts for later reuse. The context manager guarantees the
# pickle file is flushed and closed even if dump() raises; passing a bare
# open(...) to pickle.dump would leave the handle open.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [5]:
import gensim

# Fit an LDA topic model on the encoded corpus; random_state is fixed at 42,
# presumably so that repeated runs give the same topics.
# NOTE(review): num_topics is not passed, so gensim's default topic count is
# used — confirm that is intended given the '5' in the saved file name.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, random_state = 42)
ldamodel.save('model5.gensim')
# Topic descriptions as returned by print_topics(); the following cell reads
# element [1] of each entry and parses it as 'weight*"word" + weight*"word" ...'.
topics = ldamodel.print_topics()

In [6]:
# Flatten every topic description ('weight*"word" + weight*"word" + ...') into
# a single list of bare words, keeping duplicates and encounter order.
all_topics = [
    weighted_term.split('*')[1].replace("\"", '')
    for topic_entry in topics
    for weighted_term in topic_entry[1].split(' + ')
]
# Show the distinct topic words alphabetically.
print(sorted(set(all_topics)))

['actually', 'admittedly', 'allergy', 'almost', 'alone', 'also', 'always', 'amazing', 'area', 'asked', 'atmosphere', 'avenue', 'away', 'back', 'baluchi', 'bargain', 'bathroom', 'best', 'better', 'bgel', 'bland', 'block', 'came', 'charm', 'clean', 'completely', 'crowded', 'decor', 'delight', 'devine', 'dinner', 'drink', 'eaten', 'elegant', 'elsewhere', 'european', 'even', 'evening', 'ever', 'everchanging', 'every', 'excellent', 'fairly', 'feel', 'fish', 'food', 'fought', 'freindly', 'french', 'friend', 'garden', 'good', 'gras', 'great', 'green', 'greeting', 'hand', 'hate', 'hesititate', 'high', 'hostess', 'inedible', 'inside', 'kill', 'laugh', 'lawn', 'like', 'little', 'lively', 'located', 'look', 'made', 'magnificent', 'make', 'making', 'manager', 'many', 'menu', 'modern', 'month', 'much', 'music', 'need', 'nice', 'night', 'nothing', 'offer', 'open', 'oyster', 'part', 'phenomenal', 'pick', 'place', 'plate', 'portion', 'possible', 'price', 'priced', 'prompt', 'quality', 'quite', 'rather

In [7]:
import pandas as pd

def aspect_topic(tipe, all_topics):
    """Tag each review in the global ``df`` with the topic words it contains and export the result.

    Every review is lower-cased and split on single spaces. A topic word is
    recorded once for each token that contains it as a substring (so a topic
    such as ``'ever'`` also matches the token ``'every'``), following the order
    of ``all_topics``. Reviews with no match are printed, and the number of
    such reviews is printed once all rows have been processed.

    Parameters
    ----------
    tipe : str
        Base name of the output files; the table is written to
        ``Results/ATE/<tipe>.csv`` and ``Results/ATE/<tipe>.xlsx``.
    all_topics : list of str
        Candidate topic words to search for in each review.

    Returns
    -------
    None
        The result is only written to disk.
    """
    import os  # local import: only needed to make sure the output folder exists

    columns = ['id', 'review', 'target', 'category', 'term', 'polarity']
    records = []
    count = 0  # number of reviews in which no topic word was found

    # Walk the needed columns in lockstep instead of looking rows up by label
    # with a hand-maintained counter, which only works on a default RangeIndex.
    rows = zip(df['sentenceID'], df['review'], df['target'],
               df['category'], df['polarity'])
    for id_name, sentence, target, category, polarity in rows:
        lowercased = sentence.lower()
        tokens = lowercased.split(' ')  # split once per review, not once per topic
        term = [topic for topic in all_topics for token in tokens if topic in token]
        if len(term) == 0:
            print(lowercased)
            count += 1
        records.append({'id': id_name,
                        'review': sentence.strip().lower().replace('  ', ' '),
                        'target': target,
                        'category': category,
                        'term': '|'.join(term),
                        'polarity': polarity})
    print(count)

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    sf = pd.DataFrame(records, columns=columns)

    output_dir = "Results/ATE/"
    os.makedirs(output_dir, exist_ok=True)
    sf.to_csv(output_dir + tipe + ".csv")
    sf.to_excel(output_dir + tipe + ".xlsx")

In [8]:
# Run the tagging with the LDA topic words, de-duplicated via dict.fromkeys so
# each word is searched once while keeping first-seen order; results are saved
# under the base name 'lda'.
aspect_topic('lda',list(dict.fromkeys(all_topics)))

but the pizza is way to expensive.
a large is $20, and toppings are about $3 each.
still, any quibbles about the bill were off-set by the pour-your-own measures of liquers which were courtesey of the house...
what you are paying for is the environment and the name.
the four seasons has history and it is a sort of landmark of new york city restaurants, but trust me, they will charge you through the nose just so that you can say "i've been to the four seasons restaurant".
the lunch buffet is expensive but is deff worth it.
too bad i had paid an extra $2 for the stone bowl.
can’t believe how an expensive nyc restaurant can be so disrespectful to its clients.
ambiance- relaxed and stylish.
the lava cake dessert was incredible and i recommend it.
this tiny restaurant is as cozy as it gets, with that certain parisian flair.
took my mom for mother's day, and the maitre d' was pretty rude.
raga's is a romantic, cozy restaurant.
the staff is incredibly helpful and attentive.
the staff is no non