In [1]:
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer instance used by the preprocessing step.
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words with every punctuation character removed (an entry
# such as "don't" becomes "dont"), so they can be compared against tokens taken
# from text whose punctuation has already been stripped.
nltk_words = set(nltk.corpus.stopwords.words('english'))
stop_words = [
    stop_word.translate(str.maketrans('', '', string.punctuation))
    for stop_word in nltk_words
]

def preprocess(sentence):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, delete all ``string.punctuation``
    characters, tokenize with NLTK, drop tokens found in ``stop_words``,
    lemmatize each remaining token, strip any non-letter characters, and keep
    only the results longer than three characters.

    Args:
        sentence: The text to process (must support ``.lower()``).

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned_text = sentence.lower().translate(punctuation_table)

    kept_tokens = []
    for token in nltk.word_tokenize(cleaned_text):
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Remove digits and any other non-ASCII-letter characters left over.
        letters_only = re.sub(r"[^A-Za-z]+", '', lemma)
        if len(letters_only) > 3:
            kept_tokens.append(letters_only)
    return kept_tokens

In [2]:
import pandas as pd

# Load the review dataset; the preview below shows the columns this notebook
# relies on later (sentenceID, review, target, category, polarity).
reviews_path = 'dataset/res16_baru.csv'
df = pd.read_csv(reviews_path)
df.head()

Unnamed: 0.1,Unnamed: 0,reviewID,sentenceID,review,target,category,polarity
0,0,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",staff,SERVICE,negative
1,1,1004293,1004293:3,The food was lousy - too sweet or too salty an...,"food,portions","FOOD,FOOD","negative,negative"
2,2,1014458,1014458:0,"I have eaten at Saul, many times, the food is ...",food,FOOD,positive
3,3,1014458,1014458:2,The duck confit is always amazing and the foie...,"foie gras terrine with figs,duck confit","FOOD,FOOD","positive,positive"
4,4,1014458,1014458:3,The wine list is interesting and has many good...,"wine list,wine list","FOOD,PRICES","positive,positive"


In [3]:
# One token list per review sentence, in the same order as df['review'].
text_data = [preprocess(sentence) for sentence in df['review']]

In [4]:
from gensim import corpora
import pickle

# Build the token <-> id mapping from the tokenized reviews, then encode each
# review as a bag-of-words vector using that mapping.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artifacts. The context manager guarantees the pickle file is
# flushed and closed even if dumping fails (the bare open() call previously
# left the handle open until garbage collection).
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [5]:
import gensim

# Fit a 4-topic LDA model on the bag-of-words corpus with a fixed random seed,
# then store the trained model on disk.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=4,
    random_state=42,
)
ldamodel.save('model5.gensim')

# Textual topic descriptions; the following cells parse the second element of
# each entry as 'weight*"word" + weight*"word" + ...'.
topics = ldamodel.print_topics()

In [6]:
# Pull the bare word out of every 'weight*"word"' term of every topic,
# keeping duplicates and the order in which the terms appear.
all_topics = [
    weighted_word.split('*')[1].replace("\"", '')
    for topic_entry in topics
    for weighted_word in topic_entry[1].split(' + ')
]
# Show the distinct words alphabetically.
print(sorted(dict.fromkeys(all_topics)))

['always', 'atmosphere', 'cheap', 'food', 'friendly', 'good', 'great', 'like', 'never', 'nice', 'pizza', 'place', 'price', 'restaurant', 'rude', 'service', 'staff', 'time', 'tuna', 'wine', 'would']


In [7]:
# NOTE(review): this cell repeats the preceding one verbatim; it rebuilds
# all_topics from `topics` and prints the same alphabetical word list.
all_topics = []
for topic_entry in topics:
    all_topics.extend(
        weighted_word.split('*')[1].replace("\"", '')
        for weighted_word in topic_entry[1].split(' + ')
    )
print(sorted(list(dict.fromkeys(all_topics))))

['always', 'atmosphere', 'cheap', 'food', 'friendly', 'good', 'great', 'like', 'never', 'nice', 'pizza', 'place', 'price', 'restaurant', 'rude', 'service', 'staff', 'time', 'tuna', 'wine', 'would']


In [10]:
import pandas as pd

def aspect_topic(tipe, all_topics):
    """Tag each review with the topic words it contains and export the result.

    Reads the module-level ``df`` (columns ``review``, ``sentenceID``,
    ``target``, ``category`` and ``polarity``). For every review, the
    lowercased text is split on single spaces and each topic word is recorded
    once per token that contains it as a substring (so ``price`` also matches
    ``prices``). Reviews with no match are printed, and the number of such
    reviews is printed at the end.

    Args:
        tipe: Base name (without extension) of the exported files.
        all_topics: Sequence of topic words to search for.

    Returns:
        None. Writes ``<tipe>.csv`` and ``<tipe>.xlsx`` under
        ``Results/Aspect Terms Extraction/``.
    """
    import os

    columns = ['id', 'review', 'target', 'category', 'term', 'polarity']
    output_dir = "Results/Aspect Terms Extraction/"

    records = []
    count = 0  # number of reviews in which no topic word was found

    # Walk the columns positionally so the rows stay aligned even when df does
    # not carry a default 0..n-1 index (label lookups by a manual counter
    # would misalign or raise KeyError in that case).
    rows = zip(df['review'], df['sentenceID'], df['target'],
               df['category'], df['polarity'])
    for sentence, id_name, target, category, polarity in rows:
        lowercased = sentence.lower()
        tokens = lowercased.split(' ')  # split once, not once per topic
        term = [topic for topic in all_topics
                for token in tokens if topic in token]
        if len(term) == 0:
            print(lowercased)
            count += 1
        records.append({'id': id_name,
                        'review': sentence.strip().lower().replace('  ', ' '),
                        'target': target,
                        'category': category,
                        'term': '|'.join(term),
                        'polarity': polarity})
    print(count)

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    sf = pd.DataFrame(records, columns=columns)

    # Make sure the target folder exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    sf.to_csv(output_dir + tipe + ".csv")
    sf.to_excel(output_dir + tipe + ".xlsx")

In [11]:
# De-duplicate the topic words while keeping first-seen order, then export the
# tagged reviews under the 'lda' file name.
unique_topic_words = list(dict.fromkeys(all_topics))
aspect_topic('lda', unique_topic_words)

ive asked a cart attendant for a lotus leaf wrapped rice and she replied back rice and just walked away.
i had the duck breast special on my last visit and it was incredible.
the only thing i moderately enjoyed was their grilled chicken special with edamame puree.
their sake list was extensive, but we were looking for purple haze, which wasn't listed but made for us upon request!
ambiance- relaxed and stylish.
the sushi seemed pretty fresh and was adequately proportioned.
the prix fixe menu is worth every penny and you get more than enough (both in quantity and quality).
if you've ever been along the river in weehawken you have an idea of the top of view the chart house has to offer.
the lava cake dessert was incredible and i recommend it.
once you step into cosette, you're miraculously in a small, off-the-beaten path parisian bistro.
i think i've had some the best meals of my life at minnow.
the combination of super-fresh ingredients in the dishes are unusual but really delicious.
my 