### Import modules

In [46]:
from utils import * 

import numpy as np
import pandas as pd
from pprint import pprint
import os
import matplotlib.pyplot as plt
from collections import defaultdict

# Gensim
from gensim.test.utils import datapath
from gensim.test.utils import common_texts, get_tmpfile

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.option_context('display.max_colwidth', 500);

In [2]:
# Import dataset
df = pd.read_pickle('raw_data/netflix.pkl')
reviews = df.review

NLPpipeline

In [3]:
term_doc = pd.read_pickle('preprocessed_data/term_doc.pkl')
data_lemmatized = pd.read_pickle('preprocessed_data/data_lemmatized.pkl')
dictionary = pd.read_pickle('preprocessed_data/dictionary.pkl')
tf_idf = pd.read_pickle('preprocessed_data/tf_idf.pkl')

<string>Code for piepelining</strong>
<code style="font-size: 10px; background-color:transparent;">
nlp_pipe = NLPpipe()
term_doc = nlp_pipe.fit_transform(reviews, min_count = 3, threshold = -0.5)
tf_idf = nlp_pipe.transform(reviews, tf_idf = True)
data_lemmatized = nlp_pipe.clean_text
dictionary = create_dictionary(data_lemmatized)
</code>

### (Standard) LDA model after tuning

In [33]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=term_doc,
                                           id2word=dictionary,
                                           num_topics= 6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=2000,
                                           passes=10,
                                           alpha= 1.5,
                                           per_word_topics=True)

#### Visualize Standard LDA model

In [34]:
vis_data = pyLDAvis.gensim.prepare(lda_model, term_doc, dictionary, sort_topics=False)
pyLDAvis.save_html(vis_data, 'std_lda_vis/std_lda_topics=6&a=1.5&batchsize=1.html')

*With chunksize = 100 after tuning, although the coherence scores are higher than stochastic one('update_every=1'), the topics are not much distinguishable and uninterpretable.*

In [35]:
lda_model_100 = gensim.models.ldamodel.LdaModel(corpus=term_doc,
                                           id2word=dictionary,
                                           num_topics= 15, 
                                           random_state=100,
                                           update_every=100,
                                           chunksize=2000,
                                           passes=10,
                                           alpha= 1.5,
                                           per_word_topics=True)

vis_data = pyLDAvis.gensim.prepare(lda_model_100, term_doc, dictionary, sort_topics=False)
pyLDAvis.save_html(vis_data, 'std_lda_vis/std_lda_topics=15&a=1.5&batchsize=100.html')

In [36]:
coherence_model = CoherenceModel(model=lda_model_100, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
np.mean(coherence_model.get_coherence_per_topic())

-0.07157395340265418

## <mark>Mallet's LDA model after tuning &#8592; Best Model
*The difference between Mallet and Gensim’s standard LDA is that Gensim uses a Variational Bayes sampling method which is faster but less precise that Mallet’s Gibbs Sampling.*  [link](https://towardsdatascience.com/basic-nlp-on-the-texts-of-harry-potter-topic-modeling-with-latent-dirichlet-allocation-f3c00f77b0f5)

In [37]:
# mallet_path = '../mallet-2.0.8/bin/mallet' # update this path

import os
os.environ.update({'MALLET_HOME':r'C:/mallet-2.0.8/'})

# mallet_path = 'mallet-2.0.8/bin/mallet.bat' # update this path
mallet_path = 'C:/mallet-2.0.8/bin/mallet.bat'

ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, 
                                             corpus = term_doc,
                                             num_topics = 6, 
                                             random_seed = 100,
                                             id2word = dictionary,
                                             alpha = 1.5)

In [38]:
ldamallet.save(datapath("model"))
# ldamallet = gensim.models.wrappers.LdaMallet.load(datapath("model"))

#### Visualize Mallet's LDA model

In [39]:
# Visualize the topics
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
pyLDAvis.enable_notebook()

In [40]:
vis = pyLDAvis.gensim.prepare(model, term_doc, dictionary)
pyLDAvis.save_html(vis, 'mallet_lda_vis/mallet_lda_topics=6&a=1.5.html')

  default_term_info = default_term_info.sort_values(


In [41]:
coherence_model_m = CoherenceModel(model=model, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model_m.get_coherence_per_topic()

[-0.059887677995452145,
 -0.08540541347219917,
 -0.026354790404547596,
 -0.04867638381820212,
 0.002685968866744477,
 -0.08213902199226049]

In [42]:
# model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
# model.top_topics(corpus = term_doc,topn=10)

#### For each topic, we could look at frequent and relevant words

In [43]:
frequencies = ldamallet.word_topics.sum(axis=0)
p_word = frequencies / ldamallet.word_topics.sum()
p_word_given_topic = ldamallet.word_topics / np.sum(ldamallet.word_topics, axis = 0)

lamda = 0.5
relevance = lamda * p_word_given_topic + (1-lamda) * p_word_given_topic / p_word 

  p_word_given_topic = ldamallet.word_topics / np.sum(ldamallet.word_topics, axis = 0)


In [44]:
topic_dict = {0.: "Platform/Device", 1.: "User Experience", 2.: "Value", 3.: "Service", 4.: "Trouble-shooting", 5.:"Shows"}

for topic_id in range(6):
    words = []
    for id in np.argsort(relevance[topic_id,])[::-1][:15]:
        words.append(dictionary[id])
    print(f"Topic: {topic_dict[topic_id]}")
    print(words)
    print('\n')

Topic: Platform/Device
['take', 'none', 'become', 'one', 'please', 'allow', 'welcome', 'come', 'mean', 'anywhere', 'believe', 'com', 'th', 'try', 'out']


Topic: User Experience
['ok', 'everyone', 'one', 'seeing', 'que', 'believe', 'together', 'try', 'second', 'sorry', 'down', 'become', 'course', 'considering', 'trying']


Topic: Value
['sorry', 'value', 'wonder', 'last', 'etc', 'self', 'course', 'novel', 'over', 'nothing', 'together', 'try', 'd', 'help', 'like']


Topic: Service
['need', 'value', 'believe', 's', 'way', 'seeing', 'thank', 'tell', 'something', 'might', 'out', 'enough', 'other', 'whole', 'want']


Topic: Trouble-shooting
['way', 'together', 'd', 'enough', 'welcome', 'everyone', 'novel', 'etc', 'one', 'anywhere', 'overall', 'might', 'try', 'th', 'wonder']


Topic: Shows
['like', 'allow', 'need', 'd', 'up', 'enough', 'name', 'look', 'over', 'want', 'overall', 'might', 'please', 'thank', 'value']




## Interpret the topic model
1. Finding the dominant topic in each document
2. Find the most representative document for each topic
3. Topic distribution across documents
**The code used here for interpretation of the model are based on this website with a little modification by the user myself: <br>
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling**

### Standard LDA - Interpret the model

In [48]:
# df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=term_doc, texts=data_lemmatized, df=df)
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=term_doc, texts=data_lemmatized)

Getting main topic for document...
0  1000  2000  3000  4000  5000  6000  7000  8000  9000  10000  11000  12000  

#### 1. Standard LDA - Finding the dominant topic in each document

In [49]:
df_dominant_topic = find_dominant_topic_in_each_doc(df_topic_sents_keywords, df=df)
print("Finding the dominant topic in each document")
df_dominant_topic.head(5).style.set_properties(subset=['review'], **{'width': '600px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.1667,"service, thing, program, product, kid, cable, option, shows_movie, stuff, device",[],5,0,0,Five Stars netflix is great!
1,1.0,0.3545,"time, show, price, computer, watch_movie, content, screen, movie, quality, player","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,2.0,0.2711,"love, phone, year, episode, month, lot, list, day, picture, series","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.1667,"service, thing, program, product, kid, cable, option, shows_movie, stuff, device",[],5,0,0,Five Stars The best
4,2.0,0.1729,"love, phone, year, episode, month, lot, list, day, picture, series",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Standard LDA - Find the most representative document for each topic

In [50]:
print("Find the most representative document for each topic")
sent_topics_sorteddf = find_most_representative_doc_for_each_doc(df_topic_sents_keywords,df=df)
sent_topics_sorteddf.style.set_properties(subset=['review'], **{'width': '1000px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.7518,"service, thing, program, product, kid, cable, option, shows_movie, stuff, device",5,18,21,"What's not to love about streaming movies and TV? I've taken the 1 month free trial on streaming netflix. I used to be a netflix DVD, and then blue-ray, but had two kids and it became an expense that wasn't worth it (we'd kept a movie for 30 days, never watched it, and sent it back). I wasn't sure how well it would work this go around with just the streaming service, but so far the whole family has enjoyed it. The streaming service has improved a lot since 3 years ago when I cancelled the DVD/streaming. So far the service has been positive and I'm leaning heavily on reup'ing when the free trial ends. This has probably been said a hundred times over, so I'll bullet the features that are great: Pros: - Multi-platform support: iPhone, Android, Kindle, PS3, Wii - and the kids (both under 5) can operate them all but the PS3 - Kid only section - the kids don't need to sift through the adult stuff to find their own interests (younger kids) - Low buffering even with HD movies on a slower internet connection speed (Time Warners $40 month solo plan) - 3 different throttling options for network speed issues and data plans with smaller allotments - Simultaneous use on multiple platforms - one kid watches one thing on one device, wife and I on another, other kid on another device - Decent selection of TV shows - lots of good kids programing from Disney, Nick, Cartoon Network, HUB (kids really like the superhero shows - would be an extra $6 with DirecTV to get these monthly, and I'd have to worry about configuring DVR (not hard, but videos corrupt over time), and deal with commercial breaks and then shows starting up for a min that then end. Granted that setting can be changed for some programing, but it then cuts off the end of the show. - Movie selection is good for Parents that haven't seen some movies and don't want to try and rent through, or want to catch up on some shows that aren't on anymore - The venture into new content solely available on netflix is an interesting concept, will be nice to see where it goes Cons - not too many yet but... - Movies are older, about what you'd expect to see as filler with HBO, Cinemax, Showtime, etc., but not a lot of new stuff on the streaming. - noticeable strain on ISP network speed - but that was to be expected, especially with HD, but kinda of a bummer when trying to stay with lower tiered network plans (but that's more of an infrastructure issue with the US and ISPs than Netflix) - however the throttling options to help that. - shows drop off - over time, things fall off the streaming availability, but I guess with licensing fees and storage constraints they can't stay up forever... Would be nice if there was a feature to allow you to request an older show that would let you live stream but store instead for a day or two - I'd be willing to pay extra money for that ($2-$3 extra a month perhaps?)"
1,1.0,0.8159,"time, show, price, computer, watch_movie, content, screen, movie, quality, player",3,1,2,"What DOES Netflix do with all their money? Netflix, or the Android Netflix app, I don't know which, is pretty much the minimum one might expect from streaming video. If all you want to do is watch a movie or TV show continuously, or with a few pauses and resumes, then it does okay. A TIVO this is not and I find that simple things such as rewinding and playing back a challenge the app just is not up to. At least, this has been my experience. Here is a list of some issues I have met: 1. The app does not respond to the pause and resume buttons on my Android G-Box remote. I have to use their on-screen stuff. 2. The app almost always stalls a few seconds into streaming and I have to click on the red button in the progress bar in order to resume. It might go on its own if I were to give it enough time. I am not patient enough with it.. 3. When I pause, I have to click on the pause icon; as I said, the remote is not recognized. Almost always, the app will not resume clicking the play icon. I have to use the escape key to get out of the video and then renter it. 4. The video is pixilated when it first starts to run. I assume that is to stream lower quality video until it has pumped out enough to get going reliably. As you can see from above, it has trouble even doing that. 5. Clicking on the replay icon takes you back so short a distance as to be virtually worthless. There is a mental delay from the time you think you want to rewind, and then the time it takes to get the keyboard out and do it; that time is longer than tha time Netflix goes back. When I click the icon several times, I often lose the stream or it can go WAY back erratically. 6. Fast forward? Forget about it. You can shift the video forward by manipulating the slider bar image but it is hit and miss because you don't have any time info on the screen, so you have to guess where you might want to be. I have to do that to the end of the show if I want to fool Netflix into believing that I have watched the show. 7. Some things such as changing lists need to be set on Netflix on your computer, rather than through the G-Box on the TV screen. It surprises me that Netflix seems to have such a clunky, kludgey interface. Netflix has been around since the late 1990s but the G-Box is much more recent. Computers don't usually have remote controls, nor do phones. Perhaps this is partly why Netflix is remote challenged. My G-Box came with a Netflix app installed, but one day Netflix crashed and would no longer open. I downloaded this app and upgraded the old app. That didn't fix the problem. A call to Netflix eventually led to my solution which was to flush the caches on the app. This app has the same functionality, and disfunctionality, of the original app had. I had read the reviews on this app and I frankly wonder what planet some of the gushy and gooey reviews are from. My experience is different. Perhaps I am a victim of expectations. I have Dish and its TIVO-like pause. rewind, and fast forward work like a dream. Netflix is Cro-Magnon in this respect. There are benefits to Netflix. It usually streams more reliably that 1 Channel or Ice on the XBMC app on the box. On the other side, Netflix has limited content and is usually at least a year behind on TV shows currently running. It must be licensing issues for the TV shows. I like old movies and this is not a forte of Netflix. Movies are spotty. Don't expect to find Casablanca, Cabaret, or The Maltese Falcon on Netflix. I find four, count them four, John Wayne movies on Netflix. Classic Sci-Fi? Unlikely. Downton Abbey? No, Amazon has that. (Amazon does not have an Android app yet for its Prime Instant Video movies, alas.) Netflix focuses on the current and that liked by today's (juvenile) tastes. Netflix has made a recent profit of $8M. What is it doing with its money? Is it making Netflix better? Is it making its Android app any better? I would hope so. Hmm, maybe I'll check out Hulu? Update: I forgot about subtitles. We often use subtitles because many actors mumble and swallow their words, especially at the punch line, some show's music drowns out dialog, and we watch some British stuff (\\""Sherlock\\"" and \\""Doc Martin\\"", for example) that is often audibly challenged to my American ear. Netflix's subtitles proceed the actual spoken words, sometimes by several seconds. If there is to be a lag, I would prefer if the subtitle followed. That way, if I don't understand something I can keep an eye out for it to come up in a second. Netflix's way is perverse. Netflix's subtitles on my G-Box are yellow upper and lower case on a transparent background. This makes them difficult to read, especially on a brighter image. As with most of Android Land, documentation is scarce so I do not know if I have choices available for subtitles on Netflix. (Update this update: I did find a way to modify subtitle settings on my computer and logged into my account. Now I have small caps in white on a black background. Now if I could solve the sync thing...) The sorry documentation is a general lament for Android apps. It seems that using Android is a never ending parade of Easter Egg opportunities; that is, fining out hidden and obscure surprises built in by the programmers for those who are not old enough for the term \\""easter Egg\\"". This of clever Android programmers coming up with all sorts of goodies but failing to let us in on them. If we are lucky, we stumble on them from time to time. Netfix support can be a little Nitwitflix at times. I called early on to see how I could scroll a window and I got several bat $^!# crazy answers, none of which were at all correct. Eventually I found out by accident to hold down the left mouse key and drag the window, one direction or the other, I forget which. All together, the Netflix app appears to be a dull knife of the cutting edge./> Hmm, maybe I'll check out Hulu? Update: I forgot about subtitles. We often use subtitles because many actors mumble and swallow their words, especially at the punch line, some show's music drowns out dialog, and we watch some British stuff (\\""Sherlock\\"" and \\""Doc Martin\\"", for example) that is often audibly challenged to my American ear. Netflix's subtitles proceed the actual spoken words, sometimes by several seconds. If there is to be a lag, I would prefer if the subtitle followed. That way, if I don't understand something I can keep an eye out for it to come up in a second. Netflix's way is perverse. Netflix's subtitles on my G-Box are yellow upper and lower case on a transparent background. This makes them difficult to read, especially on a brighter image. As with most of Android Land, documentation is scarce so I do not know if I have choices available for subtitles on Netflix. (Update this update: I did find a way to modify subtitle settings on my computer and logged into my account. Now I have small caps in white on a black background. Now if I could solve the sync thing...) The sorry documentation is a general lament for Android apps. It seems that using Android is a never ending parade of Easter Egg opportunities; that is, fining out hidden and obscure surprises built in by the programmers for those who are not old enough for the term \\""easter Egg\\"". This of clever Android programmers coming up with all sorts of goodies but failing to let us in on them. If we are lucky, we stumble on them from time to time. Netfix support can be a little Nitwitflix at times. I called early on to see how I could scroll a window and I got several bat $^!# crazy answers, none of which were at all correct. Eventually I found out by accident to hold down the left mouse key and drag the window, one direction or the other, I forget which. All together, the Netflix app appears to be a dull knife of the cutting edge."
2,2.0,0.6028,"love, phone, year, episode, month, lot, list, day, picture, series",2,9,14,"Netflix Ignores Customer Feedback I like Netflix, but this company is notorious for not listening to their customers. The latest app update is the worst yet. I like watching the intros and credits of my shows and movies, but now the app skips the TV show intros and always minimizes the show or movie when the credits roll. Often the minimizing happens while the show is still playing!!!! This is extremely irritating. To top it all off, Netflix no longer allows user feedback on its Facebook page. I desperately wish Netflix had more competition, because I would have left years ago. Amazon Prime and Hulu Plus are good, but unfortunately Netflix is still the industry leader. Overall the service is valuable, but the user experience is terrible and continues to get worse regularly. I've had Netflix for years, but I hate it more and more everyday."
3,3.0,0.641,"movie, tv, selection, tablet, choice, film, family, commercial, use, watch",3,0,0,"life saver!! My husband and I are staying with his sister until our new place is ready to move in and her husband is a total d-Dag so I have been avoiding him by staying in our room and watching netflix. While Netflix is good and it has some awesome films on its service, I really wish there was a bigger selection of genres. For an example, I'm from Ireland so I tend to miss the Irish shows now that I am in the States, I also miss my period dramas like Emma (Kate Beckinsale one) and the newest Persuasion."
4,4.0,0.6779,"account, kindle, problem, tv_show, money, access, streaming, version, people, subscription",5,5,7,"Great Movies, Shows, and Money Saver You have to pay fee every month to gain access to a large library of movies/TV shows. Pro: - There are no commercials - Low monthly fee (I think about 9$) - Stream through multiple devices: I used it on my PC, tabet (Ipad/Samsung), and PS3/4 - Great anime library (I watch a lot of anime) - Great collection of TV shows: SuperNatural, Walking Dead, and other well known shows Cons - Several seasons behind current shows - The user interface is a little getting used to. I think it could be better Overall, I really enjoy Netflix. If you don't mind waiting for shows to eventually stream through Netflix, this app could save you a lot on cable bills. I don't pay or have cable. For the prices of NetFlix compared to cable tv, it can't be beat."
5,5.0,0.6531,"app, fire, video, issue, problem, update, device, review, load, way",2,22,26,"constant problems on the kindle fire The kindle Fire version of this app has persistent problems. Most of the time it will load the video, but then we just get a black screen with sound. Since the Fire lacks a task manager there and basic menu access for each app, there is no way to force close the app and restart it. I think this is a fundamental design flaw in the Fire, that it is difficult to close/exit an app and restart it. Netflix needs to issue an update."


#### 3. Standard LDA - Topic distribution across documents

In [51]:
# Show
print("Topic distribution across documents")
df_dominant_topic = topic_distribution_across_docs(df_topic_sents_keywords)
df_dominant_topic.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"service, thing, program, product, kid, cable, option, shows_movie, stuff, device",3853,0.3066
1,1.0,"time, show, price, computer, watch_movie, content, screen, movie, quality, player",1883,0.1498
2,2.0,"love, phone, year, episode, month, lot, list, day, picture, series",1810,0.144
3,3.0,"movie, tv, selection, tablet, choice, film, family, commercial, use, watch",1760,0.1401
4,4.0,"account, kindle, problem, tv_show, money, access, streaming, version, people, subscription",1672,0.1331
5,5.0,"app, fire, video, issue, problem, update, device, review, load, way",1588,0.1264


In [52]:
coherence_model = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model.get_coherence_per_topic()

[-0.15454075846523566,
 -0.2798088158452113,
 -0.20324172476237276,
 -0.1508941120870968,
 -0.1815778265361459,
 -0.01638147407704615]

### <mark>Mallet Model - Interpret the model</mark>

In [53]:
# ldamallet = gensim.models.wrappers.LdaMallet.load(datapath('model'))
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [55]:
# df_topic_sents_keywords_m = format_topics_sentences(model, term_doc, data_lemmatized, df)
df_topic_sents_keywords_m = format_topics_sentences(model, term_doc, data_lemmatized)

Getting main topic for document...
0  1000  2000  3000  4000  5000  6000  7000  8000  9000  10000  11000  12000  

In [56]:
all_topics = model.get_document_topics(bow = term_doc)

In [57]:
df_topic_sents_keywords_m

Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,0
0,0.0,0.1667,"time, work, app, fire, love, phone, movie, tab...",[]
1,2.0,0.7205,"movie, program, cable, tv, show, selection, lo...","[buck, show, buck]"
2,4.0,0.4058,"app, video, problem, issue, update, time, devi...","[navigation, movies_show, hitch, device]"
3,0.0,0.1667,"time, work, app, fire, love, phone, movie, tab...",[]
4,5.0,0.5000,"love, movie, tv_show, season, lot, watch, tv, ...",[original_series]
...,...,...,...,...
12561,4.0,0.6429,"app, video, problem, issue, update, time, devi...","[zone, edge]"
12562,0.0,0.4958,"time, work, app, fire, love, phone, movie, tab...",[work]
12563,3.0,0.4965,"service, streaming, time, product, year, price...",[service]
12564,0.0,0.1667,"time, work, app, fire, love, phone, movie, tab...",[]


#### 1. Mallet LDA - Finding the dominant topic in each document

In [58]:
df_dominant_topic_m = find_dominant_topic_in_each_doc(df_topic_sents_keywords_m, df)
print("Finding the dominant topic in each document")
df_dominant_topic_m.head(5).style.set_properties(subset=['review'], **{'width': '400px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.1667,"time, work, app, fire, love, phone, movie, tablet, download, kindle",[],5,0,0,Five Stars netflix is great!
1,2.0,0.7205,"movie, program, cable, tv, show, selection, love, service, year, choice","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,4.0,0.4058,"app, video, problem, issue, update, time, device, fire, work, fix","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.1667,"time, work, app, fire, love, phone, movie, tablet, download, kindle",[],5,0,0,Five Stars The best
4,5.0,0.5,"love, movie, tv_show, season, lot, watch, tv, watch_movie, love_great, month",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Mallet LDA - Find the most representative document for each topic

In [36]:
print("Find the most representative document for each topic")
sent_topics_sorteddf_m = find_most_representative_doc_for_each_doc(df_topic_sents_keywords_m, df)
sent_topics_sorteddf_m.style.set_properties(subset=['review'], **{'width': '400px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.8936,"time, work, app, fire, love, phone, movie, tablet, download, kindle",4,2,3,"very good I like this app overall but I don't like that their can only be two people on it at a time. It would be better if you could use it without Internet. Sometimes it takes a long to to load or buffer an it makes you shows blurry. There is not as much movies and shows that some other thing like netflix have. Netflix is very good app for sitting down with you family and watching a movie or being alone and watching a show or movie on the couch or even in your bed. Netflix has a lot of great movies and shows to watch. If you find one that you like but you don't want to watch it the you can add it to your list. Overall netflix is really great app and if  you like to watch movies then I would get this app for sure. I like the fact that it is free so for sure get it.I guess you can pay more money monthly for netflix to work on more than two devices at a time. Overall other than the long load times that we encounter once in a great while we will continue to be a netflix customer. We have been now for 4 years and it runs great on out kindles,laptops,and I pods. You can also use it where they have free WiFi like mcdonalds."
1,1.0,0.9232,"movie, kid, profile, account, selection, family, child, shows_movie, site, watch",4,10,12,"NETFLIX is great! I really enjoy using NETFLIX. It is user-friendly, very convenient and inexpensive. The one thing I would change about NETFLIX is the content availability, for example, my children, ages 7 and 8 love to watch movies on NETFLIX and they do have a great selection of children/family content, but at the same time, my kids have access to R-rated, gay-lesbian, horror, etc. I wish they had a setting which I could prevent them from viewing those genres. For now, I just closely monitor them, as any good parent should do anyway!"
2,2.0,0.9235,"movie, program, cable, tv, show, selection, love, service, year, choice",4,1,2,"It's worth the monthly fee! Considering the cost of a movie theater ticket, what a bargain! I can enjoy a movie without a considerable amount of commercials, if I am interruped, I can pause the movie and if I am hard of hearing, most movies have closed caption! I can even make my own popcorn at home! I can also just stop the movie if it think it's something not of my liking, and start another!! I don't have to drive to and from the theater, no long lines, etc. and if you happen to own a large screen .......... what more could you ask for????????"
3,3.0,0.9433,"service, streaming, time, product, year, price, customer, movie, film, option",1,21,29,"Netflix and the Arrogant App The arrogance of this application is amazing. I am sure some ""team"" thought they new enough, did enough market research, etc. to believe the credits to a film don't matter. The ""team"" also apparently believes giving people fewer options is better because it's all just too advanced for us sheep to understand. I now have to scroll more, hit the back button more, use the navigation ring on Fire TV remote more, and I don't enjoy Netflix as much as I used to. Eventually there will be increasing options outside of Netflix and that will be a good thing. I'm thinking maybe I should try Amazon Prime. Although Amazon is getting arrogant too, at least the Prime service lets the movie play to the end without shrinking the credits into oblivion. I'll decide when to stop watching a movie Netflix, you don't need to prompt me."
4,4.0,0.9348,"app, video, problem, issue, update, time, device, fire, work, fix",3,0,0,"good, browsing ui can be a bit too sluggish Streaming works fairly well. I've noticed that the browsing UI can be a bit sluggish. My guess is that some tasks are being handled in the UI thread, which should actually be delegated to a background thread (I'm a professional developer). This issue should be a high priority fix -- it really detracts from the experience of browsing the available movies and shows, when the Netfix app takes seconds to respond to a drag request. Also, this should have been caught during QA -- it's pretty basic and important stuff.."
5,5.0,0.8667,"love, movie, tv_show, season, lot, watch, tv, watch_movie, love_great, month",5,0,0,Who really doesn't love netflix. It is everything in one place Who really doesn't love netflix. It is everything in one place. I mean they could have a few more things but if not there is project TV or hulu.


#### 3. Mallet LDA - Topic distribution across documents

In [37]:
# Show
print("Topic distribution across documents")
df_dominant_topic_m = topic_distribution_across_docs(df_topic_sents_keywords_m)
df_dominant_topic_m.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"time, work, app, fire, love, phone, movie, tablet, download, kindle",4030,0.3207
1,1.0,"movie, kid, profile, account, selection, family, child, shows_movie, site, watch",1985,0.158
2,2.0,"movie, program, cable, tv, show, selection, love, service, year, choice",1823,0.1451
3,3.0,"service, streaming, time, product, year, price, customer, movie, film, option",1629,0.1296
4,4.0,"app, video, problem, issue, update, time, device, fire, work, fix",1615,0.1285
5,5.0,"love, movie, tv_show, season, lot, watch, tv, watch_movie, love_great, month",1484,0.1181


In [38]:
coherence_model = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model.get_coherence_per_topic()

[-0.059887677995452145,
 -0.08540541347219917,
 -0.026354790404547596,
 -0.04867638381820212,
 0.002685968866744477,
 -0.08213902199226049]

## Try the model with TF-IDF dataset

### Standard LDA with TF-IDF

In [5]:
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus=tf_idf,
                                           num_topics= 6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=2000,
                                           passes=10,
                                           alpha=1.5,
                                           per_word_topics=True)

In [6]:
vis_data = pyLDAvis.gensim.prepare(lda_model_tfidf, tf_idf, dictionary, sort_topics=False)
pyLDAvis.save_html(vis_data, 'std_lda_vis/std_lda_vis_tfidf_num_topics=6&alpha=1.5.html')

  default_term_info = default_term_info.sort_values(


In [7]:
coherence_model_m = CoherenceModel(model=lda_model_tfidf, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model_m.get_coherence_per_topic()

[-0.12354832277346144,
 -0.27506340107114513,
 -0.21936770506896217,
 -0.15886486640839953,
 -0.315369990391699,
 -0.06717851894875705]

In [8]:
df_topic_sents_keywords_tfidf = format_topics_sentences(ldamodel=lda_model_tfidf, corpus=term_doc, texts=data_lemmatized)

Getting main topic for document...
0  1000  2000  3000  4000  5000  6000  7000  8000  9000  10000  11000  12000  

#### 1. Standard LDA with TF-IDF - Finding the dominant topic in each document

In [15]:
df_dominant_topic_tfidf = find_dominant_topic_in_each_doc(df_topic_sents_keywords_tfidf, df)
print("Finding the dominant topic in each document")
df_dominant_topic_tfidf.head(5).style.set_properties(subset=['review'], **{'width': '400px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.1667,"168, 272, 84, 190, 64, 358, 364, 122, 118, 240",[],5,0,0,Five Stars netflix is great!
1,1.0,0.3401,"721, 59, 85, 18, 529, 165, 19, 1, 284, 128","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,2.0,0.2738,"109, 173, 230, 303, 186, 132, 23, 102, 111, 332","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.1667,"168, 272, 84, 190, 64, 358, 364, 122, 118, 240",[],5,0,0,Five Stars The best
4,2.0,0.169,"109, 173, 230, 303, 186, 132, 23, 102, 111, 332",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Standard LDA with TF-IDF - Find the most representative document for each topic

In [16]:
print("Find the most representative document for each topic")
sent_topics_sorteddf_tfidf = find_most_representative_doc_for_each_doc(df_topic_sents_keywords_tfidf, df)
sent_topics_sorteddf_tfidf.style.set_properties(subset=['review'], **{'width': '400px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.54,"168, 272, 84, 190, 64, 358, 364, 122, 118, 240",5,0,5,i ate a mushroom cloud I ate a mushroom cloud out of the toilet and it smelled bad. I had to spray perfume in the toilet....oopsie.
1,1.0,0.5233,"721, 59, 85, 18, 529, 165, 19, 1, 284, 128",4,11,14,"Excellent streaming service, probably the best one. Netflix is great, still a good deal for the price and NO COMMERCIALS. Interface is fluid and easy to navigate. Content is good. Content used to be great but there doesn't seem to be as many new releases(good new releases that is) and blockbusters as in the past but there is still a lot of offerings. When you think Netflix most people think movies but Netflix has tons of good shows. Complete collections of old and new t.v. shows. Lots of documentaries(but not enough serious science or political ones in my opinion). Plenty of cartoons and other kids shows. Some anime shows and movies. One thing I don't like is how they recommend shows to you based on what you watched previously. ""Because you watched Shrek"" and then it'll show you one related movie and twenty unrelated movies. It wouldn't be so bad if it didn't take up appx.50% of its interface. Its not bad just mostly inaccurate in my opinion. Also I would like to be able to search/browse by genre. In the main interface it lists movies under genre but you can't browse every movie or show under that genre.These are very minor issues and more like pet peeves than actual issues. There's a lot of B-movies(which I like) and then there are movies that are so bad they shouldn't exist but you are forced to navigate through them to find something good to watch. I'd like would like if I didn't have dig through these worse than B-movies. Overall Netflix is the best streaming service of its kind and if they would start offering just a few more blockbusters or above 3 star new releases a month then I give it 5 stars."
2,2.0,0.5602,"109, 173, 230, 303, 186, 132, 23, 102, 111, 332",2,1,2,"1000's of movies, but none you want to watch Every time I hear about a movie I want to see it's never available on Netflix. Even basic classic movies like Clockwork Orange. Only about 20% of the titles I actually want to watch are available. Another thing is that I will often times see movies I'm interested in watching and all of a sudden they disappear and I can't even search for them. Even though it was recommended under a week ago. Not a good value."
3,3.0,0.5044,"47, 266, 77, 20, 85, 216, 541, 1, 2, 204",5,1,1,"Quick turn around I get three discs at a time, but the price is going up yet again, so I'm going to let the meteor land and the dinosaurs turn into fossils, uh landfill at the end their long and expensive reign, and go 100% offsite and online for movies and tv, which frankly is inevitable anyway. The price hike just makes the time to go become clear."
4,4.0,0.5565,"76, 117, 212, 35, 96, 328, 53, 218, 469, 610",2,0,1,Netflix Kindle version doesn't support latest features (profiles) of netflix Netflix recently added a profile feature that allows each member of the family to have their own queues and viewing history instead of the old way of just having a single primary shared profile. The Kindle version of Netflix only supports interacting with the original primary profile not my own personal profile. My TV has the same problem but there's no updated firmware for it. You'd think Amazon would keep the app up to date. Or maybe not since Netflix competes with Amazon Instant. Other than that the movies play great on my new Kindle Fire HDX!
5,5.0,0.5677,"85, 49, 242, 302, 297, 30, 452, 237, 339, 420",3,0,0,"great looking app but some probs that need to.b fixed while I love the layout of netflix on the fire, they need to fix video playback. Hulu plus blows away netflix in that respect, and the qulaity of hulu plus makes me hopful that netflix can fix their playback issues. Netflix also needs to get rid of that bar at the bottom of the screen. there should also b a way to change volume without pausing video like u can in hulu plus."


#### 3. Standard LDA with TF-IDF - Topic distribution across documents

In [18]:
# Show
print("Topic distribution across documents")
df_dominant_topic_tfidf = topic_distribution_across_docs(df_topic_sents_keywords_tfidf)
df_dominant_topic_tfidf.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"168, 272, 84, 190, 64, 358, 364, 122, 118, 240",3901,0.3104
1,1.0,"721, 59, 85, 18, 529, 165, 19, 1, 284, 128",1851,0.1473
2,2.0,"109, 173, 230, 303, 186, 132, 23, 102, 111, 332",1797,0.143
3,3.0,"47, 266, 77, 20, 85, 216, 541, 1, 2, 204",1705,0.1357
4,4.0,"76, 117, 212, 35, 96, 328, 53, 218, 469, 610",1691,0.1346
5,5.0,"85, 49, 242, 302, 297, 30, 452, 237, 339, 420",1621,0.129


In [20]:
coherence_model_tfidf = CoherenceModel(model=lda_model_tfidf, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')

In [21]:
coherence_model_tfidf.get_coherence_per_topic()

[-0.12354832277346144,
 -0.27506340107114513,
 -0.21936770506896217,
 -0.15886486640839953,
 -0.315369990391699,
 -0.06717851894875705]

### Mallet's LDA with TF-IDF

In [22]:
import os
os.environ.update({'MALLET_HOME':r'C:/mallet-2.0.8/'})

mallet_path = 'C:/mallet-2.0.8/bin/mallet.bat'

ldamallet_tfidf = gensim.models.wrappers.LdaMallet(mallet_path, 
                                                   corpus=tf_idf, 
                                                   num_topics=6, 
                                                   random_seed = 100,
                                                   id2word=dictionary,
                                                   alpha = 1.5)

In [23]:
# Visualize the topics
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet_tfidf)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, tf_idf, dictionary)
pyLDAvis.save_html(vis, 'mallet_lda_vis/mallet_lda_vis_tfidf_num_topics=6&alpha=1.5.html')

In [24]:
coherence_model_m = CoherenceModel(model=model, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')
coherence_model_m.get_coherence_per_topic()

[-0.4393429433580468,
 -0.30618507026926095,
 -0.30168656037564956,
 -0.3299832958579991,
 -0.2606296052781025,
 -0.35151733260671064]

In [25]:
df_topic_sents_keywords_tfidf_m = format_topics_sentences(ldamodel=ldamallet_tfidf, corpus=term_doc, texts=data_lemmatized)

Getting main topic for document...
0  1000  2000  3000  4000  5000  6000  7000  8000  9000  10000  11000  12000  

#### 1. Mallet's LDA with TF-IDF - Finding the dominant topic in each document

In [27]:
# df_dominant_topic_tfidf_m = find_dominant_topic_in_each_doc(df_topic_sents_keywords_tfidf_m)
df_dominant_topic_tfidf_m = find_dominant_topic_in_each_doc(df_topic_sents_keywords_tfidf_m, df)
print("Finding the dominant topic in each document")
df_dominant_topic_tfidf_m.head(5).style.set_properties(subset=['review'], **{'width': '400px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,star_rating,helpful_votes,total_votes,review
0,0.0,0.1667,"love, lot, film, house, choice_movie, account, picture, freeze, content, great_movie",[],5,0,0,Five Stars netflix is great!
1,2.0,0.5247,"tv, four_great, fire, program, tv_show, service, price, option, newer_movie, good_movie","['buck', 'show', 'buck']",5,0,0,GREAT Why pay per view like the others? 8 bucks for all you want to watch. Want to watch every episode of a show? 8 bucks.
2,0.0,0.3025,"love, lot, film, house, choice_movie, account, picture, freeze, content, great_movie","['navigation', 'movies_show', 'hitch', 'device']",3,0,0,"Satisfactory App runs fine on kindle fire HD. Navigation thru app efficient, movies/shows play without hitch, sound is great. Video quality needs work, expected better from an HD device."
3,0.0,0.1667,"love, lot, film, house, choice_movie, account, picture, freeze, content, great_movie",[],5,0,0,Five Stars The best
4,3.0,0.4556,"love, kid, cable, version, interface, issue, order, excelent, year, friend",['original_series'],5,0,0,Excellent! Love netflix! Worked great on my Kindle. Watched one of the original series that I've been wanting to watch for awhile.


#### 2. Mallet's LDA with TF-IDF - Find the most representative document for each topic

In [29]:
print("Find the most representative document for each topic")
sent_topics_sorteddf_tfidf_m = find_most_representative_doc_for_each_doc(df_topic_sents_keywords_tfidf_m, df)
sent_topics_sorteddf_tfidf_m.style.set_properties(subset=['review'], **{'width': '400px'})

Find the most representative document for each topic


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,star_rating,helpful_votes,total_votes,review
0,0.0,0.7727,"love, lot, film, house, choice_movie, account, picture, freeze, content, great_movie",4,1,2,"Enjoy selection of films Enjoy having such a large selection of movies, but was not able to locate an older film. Would like to have the ability to pull up some classic films."
1,1.0,0.8077,"movie, app, watch, choice, thing, commercial, phone, download, month, four_work",1,3,5,"Fails to install on Velocity Cruz T301 with Android 2.2 Amazon reports that this device works with my tablet, which is linked to my Amazon account. But the installation process fails. My tablet is a Velocity Cruz T301 with Android 2.2. Note that this is not the only app that fails to install in the T301. About 1/2 of the apps I download from Amazon fail to install, and I have read that other users have not been able to install Netflix on the T301. So this is a limitation on the T301. But would be nice if Amazon accurately identified that the app does not install on the T301, or that it is not tested."
2,2.0,0.7906,"tv, four_great, fire, program, tv_show, service, price, option, newer_movie, good_movie",5,0,1,"keep up the excellent service and quality productions Outstanding service! My only issue isn't with the company, it's with me. I'm upset with myself for waiting until this year to purchase the service. I have been busy trying to catch up to everything I have been missing. Thank you NETFLIX, keep up the excellent service and quality productions."
3,3.0,0.7222,"love, kid, cable, version, interface, issue, order, excelent, year, friend",2,0,0,Netflix After I downloaded it said you need a membership which is $8 a month I couldn't watch anything because of the stupid membership. No good if u don't have a membership.
4,3.0,0.7222,"love, kid, cable, version, interface, issue, order, excelent, year, friend",5,0,0,love netflix! I wish there were a few more movies on here but the app is fantastic. The interface is very User friendly...my kids have it figured out! We don't have cable but who needs it when there's so many shows on here?! I do wonder where TNT shows are on here...everything else is stellar.
5,3.0,0.7222,"love, kid, cable, version, interface, issue, order, excelent, year, friend",1,0,0,"netflix I do not have, or want netflex. I did not order Netflix. idid not order Netflix. please do not include me in this order."
6,3.0,0.7222,"love, kid, cable, version, interface, issue, order, excelent, year, friend",5,0,0,Grandchild I have two grandchildren the problem was that one of them is 17 and her brother is 3 so they do not want to watch the same thing but they want to be in the same room I have headphones and I give the 3 year old the Kindle and let him watch videos and the problem is solved
7,4.0,0.7727,"selection, time, watch_movie, tablet, show, love_great, computer, recent_movie, series, episode",3,4,4,Do not like the netflix app on amazon fire Do not like the netflix app on amazon fire. It does not work as good as the computer and doesnt easily show you what you watched and also some show that are on the computer dont show up on the app.
8,5.0,0.8333,"work, kindle, problem, family, shows_movie, update, entertainment, good_good, season, rock",3,0,0,"Such Profile. Does what you expect it to do, except there are no profiles! Where are my profiles! Needs more profiles! Wow, such profile, many profile, amaze profile!!"


#### 3. Mallet's LDA with TF-IDF - Topic distribution across documents

In [30]:
# Show
print("Topic distribution across documents")
df_dominant_topic_tfidf_m = topic_distribution_across_docs(df_topic_sents_keywords_tfidf_m)
df_dominant_topic_tfidf_m.style.set_properties(subset=['Keywords'], **{'width': '400px'})

Topic distribution across documents


Unnamed: 0,Dominant_Topic,Keywords,Num_Documents,Perc_Documents
0,0.0,"love, lot, film, house, choice_movie, account, picture, freeze, content, great_movie",4131,0.3287
1,1.0,"movie, app, watch, choice, thing, commercial, phone, download, month, four_work",2224,0.177
2,2.0,"tv, four_great, fire, program, tv_show, service, price, option, newer_movie, good_movie",1873,0.1491
3,3.0,"love, kid, cable, version, interface, issue, order, excelent, year, friend",1601,0.1274
4,4.0,"selection, time, watch_movie, tablet, show, love_great, computer, recent_movie, series, episode",1598,0.1272
5,5.0,"work, kindle, problem, family, shows_movie, update, entertainment, good_good, season, rock",1139,0.0906


In [31]:
coherence_model_tfidf_m = CoherenceModel(model=ldamallet_tfidf, texts=data_lemmatized, dictionary=dictionary, coherence='c_npmi')

In [32]:
coherence_model_tfidf_m.get_coherence_per_topic()

[-0.4393429433580468,
 -0.30618507026926095,
 -0.30168656037564956,
 -0.3299832958579991,
 -0.2606296052781025,
 -0.35151733260671064]