In [100]:
import xml.etree.ElementTree as et
import pandas as pd
import random
import pickle

import spacy
spacy.load('en_core_web_sm')
from spacy.lang.en import English
parser  = English()

import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim import corpora

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Evan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
tree = et.parse('train.xml')
root = tree.getroot()

In [3]:
root.tag

'sentences'

In [13]:
root[0][1].tag

'aspectCategories'

In [15]:
len(root[0][1])

2

In [23]:
root[0][0].text

"It might be the best sit down food I've had in the area, so if you are going to the upright citizen brigade, or the garden, it could be just the place for you."

In [40]:
root[0][1][0].attrib['category']

'food'

In [21]:
len(root)

3149

In [38]:
m = len(root) - 1
max_aspects = 0

for i in range(m):
    if len(root[i][1]) > max_aspects:
        max_aspects = len(root[i][1])

print('Most aspect categories in a single document: ', max_aspects)

Most aspect categories in a single document:  5


In [62]:
corpus = {}

for i in range(m):
    doc = {}
    doc['text'] = root[i][0].text
    for j in range(len(root[i][1])):
        doc[f'aspectCategory{j}'] = root[i][1][j].attrib['category']
        doc[f'aspectPolarity{j}'] = root[i][1][j].attrib['polarity']
    corpus.update({i: doc})

In [67]:
df_corpus = pd.DataFrame(corpus).transpose()
df_corpus.sample(3)

Unnamed: 0,text,aspectCategory0,aspectPolarity0,aspectCategory1,aspectPolarity1,aspectCategory2,aspectPolarity2,aspectCategory3,aspectPolarity3,aspectCategory4,aspectPolarity4
606,After a couple of drinks upstairs before our m...,food,neutral,miscellaneous,positive,,,,,,
1761,We would have to flag down the bored looking w...,staff,negative,food,neutral,,,,,,
2045,The Scene The bright mix of colors painted on ...,miscellaneous,positive,place,neutral,,,,,,


In [72]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens 

In [92]:
print(tokenize(df_corpus['text'][0]))

['it', 'might', 'be', 'the', 'best', 'sit', 'down', 'food', 'i', "'ve", 'had', 'in', 'the', 'area', ',', 'so', 'if', 'you', 'are', 'going', 'to', 'the', 'upright', 'citizen', 'brigade', ',', 'or', 'the', 'garden', ',', 'it', 'could', 'be', 'just', 'the', 'place', 'for', 'you', '.']


In [74]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

In [95]:
get_lemma('looking')

'looking'

In [75]:
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [96]:
get_lemma2('looking')

'looking'

In [77]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [97]:
prepare_text_for_lda(df_corpus['text'][0])

['might', 'going', 'upright', 'citizen', 'brigade', 'garden', 'could', 'place']

In [98]:
text_data = []
for i in range(len(df_corpus) - 1):
    tokens = prepare_text_for_lda(df_corpus['text'][i])
    if random.random() > .99:
        print(tokens)
        text_data.append(tokens)

['decide', 'place', 'bigger', 'portion', 'normal']
['someone', 'trashy', 'review', 'favorite', 'coffee', 'place', 'compare', 'place', 'great']
['brand', 'gloss', 'table', 'waiter', 'throw']
['intrigue', 'ambience', 'restaurant', 'friend', 'decide', 'dinner', 'paladar', 'saturday', 'night']
['chance', 'garden', 'hop', 'happy', 'slopers', 'sip', 'wine', 'start', 'bottle', 'munch', 'dozen', 'kind', 'press', 'sandwich']
['worst', 'service', 'enter', 'restaurant', 'wait', 'minutes', 'deciding', 'extremely', 'small', 'waitress']
['along', 'every', 'patron', 'force', 'overcrowd', 'employee', 'carry', 'chair', 'head', 'prepare', 'dinner', 'shoulder', 'shoulder', 'stand', 'cattle', 'waiting']
['saying', 'dinner', 'actually', 'take', 'service']
['friend', 'dinner', 'walk', 'stay', 'decor', 'charm', 'want', 'french']
['service', 'inadequate', 'warn', 'bring', 'ready-', 'order', 'serve', 'order', 'entree']
['treat', 'instead', 'snooty', 'attitude', 'waiter', 'maitre', 'dinner', 'could']
['end', 'h

In [101]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [104]:
NUM_TOPICS = 8
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words = 8)
for topic in topics:
    print(topic)

(0, '0.025*"dinner" + 0.025*"restaurant" + 0.025*"decide" + 0.025*"friend" + 0.025*"intrigue" + 0.025*"ambience" + 0.025*"night" + 0.025*"saturday"')
(1, '0.027*"dinner" + 0.027*"entree" + 0.027*"want" + 0.027*"quickly" + 0.014*"restaurant" + 0.014*"selection" + 0.014*"include" + 0.014*"decor"')
(2, '0.043*"manager" + 0.030*"cutting" + 0.030*"basically" + 0.016*"table" + 0.016*"serve" + 0.016*"waiter" + 0.016*"ask" + 0.016*"someone"')
(3, '0.024*"restaurant" + 0.024*"shoulder" + 0.024*"service" + 0.013*"dinner" + 0.013*"take" + 0.013*"small" + 0.013*"drink" + 0.013*"prepare"')
(4, '0.038*"place" + 0.026*"service" + 0.026*"order" + 0.026*"review" + 0.014*"price" + 0.014*"great" + 0.014*"serve" + 0.014*"snooty"')
(5, '0.029*"dinner" + 0.029*"order" + 0.029*"better" + 0.029*"noodle" + 0.015*"price" + 0.015*"table" + 0.015*"waitress" + 0.015*"instead"')
(6, '0.024*"small" + 0.024*"waiter" + 0.024*"anything" + 0.024*"stay" + 0.024*"check" + 0.024*"place" + 0.024*"end" + 0.024*"hours"')
(7, 