# TOPIC MODELING
Latent Dirichlet Allocation (LDA)
The LDA is based upon two general assumptions:

-Documents that have similar words usually have the same topic
-Documents that have groups of words frequently occurring together usually have the same topic

In [239]:
reviews=pd.read_csv("./review_data.csv",usecols=["useful","text", "cuisine"])
reviews=reviews.groupby('cuisine').apply(lambda s: s.sample(50000))
reviews["labels"]= reviews["useful"].apply(lambda x: 1 if x >= 1  else 0)
reviews.cuisine.value_counts()

Asian       50000
American    50000
Mexican     50000
Name: cuisine, dtype: int64

In [240]:
reviews_datasets = reviews
reviews_datasets = reviews[reviews.cuisine == "Asian"]
# reviews_datasets = reviews_datasets[reviews_datasets.labels == 1]
reviews_datasets.dropna()
reviews_datasets.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cuisine,useful,text,labels
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Asian,1158252,Asian,0,This place is an amazing deal for hibachi! Eac...,0
Asian,25513,Asian,0,Good stuff here!!! We've been going to Phoeni...,0
Asian,53457,Asian,1,I got take out from here tonight and it was go...,1
Asian,107523,Asian,0,My friend recommended Malee's one day for lunc...,0
Asian,330831,Asian,0,"I wasn't sure what I was expecting, but wow! W...",0


In [241]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')  
doc_term_matrix = count_vect.fit_transform(reviews_datasets['text'].values.astype('U'))

In [242]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=3, random_state=42)  
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [243]:
#Fetching words randomly to check that the words are present in the vocabulary
import random

for i in range(10):  
    random_id = random.randint(0,len(count_vect.get_feature_names()))
    print(count_vect.get_feature_names()[random_id])

talkative
eyebrows
tells
chai
molded
seaside
blinded
barbeque
redecorating
chaffing


In [244]:
#Find top 20 words with highest probability of 5 topics
first_topic = LDA.components_[0]  
second_topic = LDA.components_[1]
third_topic = LDA.components_[2] 
# fourth_topic = LDA.components_[3]
# fifth_topic = LDA.components_[4]

In [245]:
#Sorting the index according to probability

top_topic_words_first = first_topic.argsort()[-20:] 
top_topic_words_second = second_topic.argsort()[-20:]  
top_topic_words_third = third_topic.argsort()[-20:]  
# top_topic_words_fourth = fourth_topic.argsort()[-20:]  
# top_topic_words_fifth = fifth_topic.argsort()[-20:]

In [246]:
top_topic_words_first
top_topic_words_second
top_topic_words_third
# top_topic_words_fourth
# top_topic_words_fifth

array([11010,   952, 14001,  4844, 10241,  7213, 14114,  8611,  1908,
       10481,  7191, 14864,  8143, 14857, 15555,  7017,  7689, 13101,
        7839, 17299])

In [247]:
topic_1 = []
topic_2 = []
topic_3 = []
# topic_4 = []
# topic_5 = []

for i in range(0,20):  
    topic_1.append(count_vect.get_feature_names()[top_topic_words_first[i]])
    topic_2.append(count_vect.get_feature_names()[top_topic_words_second[i]])
    topic_3.append(count_vect.get_feature_names()[top_topic_words_third[i]])
#     topic_4.append(count_vect.get_feature_names()[top_topic_words_fourth[i]])
#     topic_5.append(count_vect.get_feature_names()[top_topic_words_fifth[i]])

In [248]:
import pandas as pd
df = pd.DataFrame(list(zip(topic_1, topic_2, topic_3)), 
               columns =['Topic 1', 'Topic 2', 'Topic 3']) 
# df = pd.DataFrame(list(zip(topic_1, topic_2, topic_3, topic_4, topic_5)), 
#                columns =['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5']) 
df 

Unnamed: 0,Topic 1,Topic 2,Topic 3
0,went,really,menu
1,did,service,amazing
2,asked,beef,ramen
3,good,restaurant,delicious
4,don,ve,like
5,said,just,friendly
6,restaurant,sauce,really
7,table,soup,hour
8,ordered,ordered,best
9,got,fried,love


In [259]:
#Words with highest probabilties for all five topics
for i,topic in enumerate(LDA.components_):  
    print(f'Top 10 words for topic #{i+1}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

Top 10 words for topic #1:
['said', 'restaurant', 'table', 'ordered', 'got', 'minutes', 'came', 'didn', 'place', 'like', 'service', 'time', 'just', 'order', 'food']


Top 10 words for topic #2:
['just', 'sauce', 'soup', 'ordered', 'fried', 'pho', 'chinese', 'great', 'like', 'rice', 'thai', 'place', 'chicken', 'good', 'food']


Top 10 words for topic #3:
['friendly', 'really', 'hour', 'best', 'love', 'fresh', 'rolls', 'happy', 'roll', 'service', 'food', 'good', 'place', 'great', 'sushi']




In [250]:
#assign probability of all the topics to each document and adds a new column to show about which topic the review belongs
topic_values = LDA.transform(doc_term_matrix)
reviews_datasets['Topic'] = topic_values.argmax(axis=1)+1 
reviews_datasets.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Unnamed: 1_level_0,cuisine,useful,text,labels,Topic
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Asian,1158252,Asian,0,This place is an amazing deal for hibachi! Eac...,0,3
Asian,25513,Asian,0,Good stuff here!!! We've been going to Phoeni...,0,2
Asian,53457,Asian,1,I got take out from here tonight and it was go...,1,2
Asian,107523,Asian,0,My friend recommended Malee's one day for lunc...,0,2
Asian,330831,Asian,0,"I wasn't sure what I was expecting, but wow! W...",0,3
Asian,1004316,Asian,1,This place is so good. I usually call ahead an...,1,2
Asian,924761,Asian,1,I just wanna talk about the employees here. W...,1,1
Asian,308604,Asian,2,"I""m a fan, I like it I love it I'll eat more o...",1,2
Asian,890146,Asian,0,The only place you should buy potstickers! The...,0,2
Asian,1078992,Asian,0,Third time and still amazing! Had egg rolls an...,0,2


In [269]:
reviews_datasets.groupby(['Topic','labels']).size()

Topic  labels
1      0          5539
       1          4916
2      0         11309
       1          9318
3      0         11631
       1          7287
dtype: int64

In [258]:
#Topic 2: Service/Ambience
reviews_datasets['text'][4]

"I wasn't sure what I was expecting, but wow! What a great price for so much food! I was surprised how fresh the fish I was. I wish this was here when I lived in Hassy... I would've been here everyday"

In [255]:
#Topic 1: Service/Time
reviews_datasets['Topic'][15]

1

In [254]:
df.to_csv('asian_lda.csv', index=False)