In [104]:
# ref :https://blog.naver.com/upennsolution/221437143732
import re # We clean text using regex
import csv # To read the csv

from collections import defaultdict # For accumlating values
from nltk.corpus import stopwords # To remove stopwords

from gensim import corpora # To create corpus and dictionary for the LDA model
from gensim.models import LdaModel # To use the LDA model

import pyLDAvis.gensim # To visualise LDA model effectively

import pandas as pd

In [105]:
fileContents = defaultdict(list)

with open('../../../data/reviews_sample.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader: # read a row as {column1: value1, column2: value2,...}
        for (k,v) in row.items(): # go over each column name and value
            fileContents[k].append(v) # append the value into the appropriate list

In [106]:
print(fileContents.keys())

reviews = fileContents['review_body']

dict_keys(['\ufeff', 'id', 'author', 'instance_id', 'rating', 'review_body', 'review_date', 'review_header', 'review_link', 'review_votes', 'sentiment', 'verified_purchase'])


# Cleansing

In [107]:
# Punchuation marks
reviews = [re.sub(r'[^\w\s]','',str(item)) for item in reviews]

# Stop word, conversion to lowcase
stop_words = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stop_words] for document in reviews]

# 사용빈도 낮은 단어 제거하기
# feequcney ={ Token: frequency}
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1
        
texts = [[token for token in text if frequency[token] > 1] for text in texts]
texts[:5]

[['doesnt',
  'know',
  'much',
  'cant',
  'answer',
  'questions',
  'google',
  'instead'],
 ['easy', 'use'],
 ['alexa', 'assistant', 'around'],
 ['fun'],
 ['think',
  'google',
  'home',
  'much',
  'works',
  'better',
  'also',
  'great',
  'kids',
  'love']]

# Build Dictionary

In [108]:
# Build dictioary
dictionary = corpora.Dictionary(texts)

In [109]:
print(dictionary,len(dictionary))

Dictionary(196 unique tokens: ['answer', 'cant', 'doesnt', 'google', 'instead']...) 196


In [110]:
# corpus (word_id, word_frequency)
corpus = [dictionary.doc2bow(text) for text in texts]
print('\ncorpus ==> ', corpus[50])
print('\ntexts ==> ', texts[50])
print('\ndictioary ==> ',dictionary[26],dictionary[40])   # word_id 26,40이 texts의 50행에서 가각 2회 사용


corpus ==>  [(22, 1), (23, 1), (24, 1), (26, 2), (38, 1), (40, 2), (47, 1), (96, 1), (106, 1), (107, 1), (117, 1), (140, 1), (157, 1), (163, 1), (164, 1), (165, 1), (166, 1)]

texts ==>  ['bought', 'second', 'echo', 'dot', 'one', 'night', 'stand', 'one', 'room', 'enjoy', 'different', 'music', 'day', 'play', 'sleep', 'music', 'set', 'alarm', 'stuff']

dictioary ==>  one music


# LDA Model Training

In [111]:
NUM_TOPICS = 9 # This is an assumption.
ldamodel = LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15) #This might take some time.

# 분석결과

In [112]:
topics = ldamodel.show_topics()
for topic in topics:
    print('\n',topic)


 (0, '0.069*"alexa" + 0.044*"speaker" + 0.035*"bluetooth" + 0.027*"laptop" + 0.027*"assistant" + 0.027*"awesome" + 0.027*"better" + 0.025*"sound" + 0.021*"much" + 0.018*"love"')

 (1, '0.086*"love" + 0.067*"like" + 0.045*"much" + 0.045*"product" + 0.041*"excellent" + 0.031*"great" + 0.031*"kids" + 0.026*"questions" + 0.020*"useful" + 0.017*"google"')

 (2, '0.060*"still" + 0.031*"also" + 0.031*"bought" + 0.031*"price" + 0.031*"learning" + 0.030*"small" + 0.021*"great" + 0.017*"love" + 0.016*"use" + 0.016*"product"')

 (3, '0.032*"sound" + 0.031*"price" + 0.031*"use" + 0.031*"alexa" + 0.031*"quality" + 0.031*"also" + 0.031*"ordered" + 0.031*"im" + 0.021*"echo" + 0.021*"actual"')

 (4, '0.057*"alexa" + 0.041*"would" + 0.033*"ask" + 0.033*"apps" + 0.033*"amazon" + 0.033*"link" + 0.032*"google" + 0.025*"one" + 0.025*"make" + 0.025*"honeywell"')

 (5, '0.052*"great" + 0.041*"one" + 0.033*"echo" + 0.029*"dot" + 0.029*"alexa" + 0.025*"set" + 0.025*"music" + 0.025*"dont" + 0.022*"love" + 0.02

- 단어 앞의 숫자는 단어의 해당 토픽에 대한 기여도

In [116]:
word_dict ={}
for i in range(NUM_TOPICS):
    words = ldamodel.show_topic(i, topn =10)
    word_dict['Topic#'+'{:02d}'.format(i+1)] = [i[0] for i in words]
    
pd.DataFrame(word_dict)    

Unnamed: 0,Topic#01,Topic#02,Topic#03,Topic#04,Topic#05,Topic#06,Topic#07,Topic#08,Topic#09
0,alexa,love,still,sound,alexa,great,really,easy,amazon
1,speaker,like,also,price,would,one,love,use,get
2,bluetooth,much,bought,use,ask,echo,use,connection,product
3,laptop,product,price,alexa,apps,dot,anything,music,great
4,assistant,excellent,learning,quality,amazon,alexa,voice,crackle,like
5,awesome,great,small,also,link,set,great,dot,alexa
6,better,kids,great,ordered,google,music,cant,via,home
7,sound,questions,love,im,one,dont,home,theatre,old
8,much,useful,use,echo,make,love,alexa,echo,price
9,love,google,product,actual,honeywell,works,good,digital,little


# PyLDAvis 시각화

In [17]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
