In [26]:
"""
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/EPFL/AppliedDataScience/Project/ada-2022-project-adlucere2022/src/"
"""
path=""

In [27]:
# Basic stuff
import pandas as pd

# NLP
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline


# pyLDAvis
!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis



Load data

In [28]:
# Read the list of movie ids to keep and flatten it to a 1-D array.
preferred_index_frame = pd.read_csv(path + '../data/preferred_index.csv')
preferred_index = preferred_index_frame.values.flatten()

In [29]:
# Movie metadata without its raw plot text; the preprocessed text is merged in below.
movie_metadata = pd.read_csv(path + '../data/movie_data_imdbscores_final.csv').drop(columns='plot_summary')
plots_processed = pd.read_csv(path + '../data/plot/plot_summaries_preprocessed.csv')

# Attach the preprocessed summaries, then keep only the preferred movies, in the order of
# `preferred_index`.
plots = (
    movie_metadata
    .merge(plots_processed, on='movie_id')
    .set_index('movie_id')
    .loc[preferred_index]
    .reset_index()
)

In [30]:
plots.head()

Unnamed: 0,movie_id,movie_name,release_date,english_language,german_language,silent_film_language,spanish_language,japanese_language,italian_language,tamil_language,...,united_kingdom,germany,japan,france,italy,india,F_gender,averageRating,numVotes,plot_summary
0,30332673,#1 cheerleader camp,2010,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.666667,3.7,3222,horny college guy summer job cheerleader camp ...
1,4213160,$,1971,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.25,6.3,2631,set hamburg west germany criminal advantage ge...
2,20624798,$9.99,2008,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.111111,7.2,22,film mainly focus year old dave peck unemploye...
3,2250713,'68,1988,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,5.9,82,father escaped soviet invasion budapest run hu...
4,10331139,'gator bait,1974,True,False,False,False,False,False,False,...,False,False,False,False,False,False,1.0,5.4,1277,film follows poacher named desiree life deep s...


Transform sentences to list of words

In [31]:
# One list of whitespace-separated tokens per plot summary.
summaries_corpus = [summary_text.split() for summary_text in plots.plot_summary]

Transform list of words to list of bigrams

In [32]:
#enrich our tokens by creating bigrams for each summary

def make_bigrams(summaries, min_count=5, threshold=100):
    """Merge frequently co-occurring token pairs of each summary into single bigram tokens.

    The phrase model is fitted on ``summaries`` itself and then applied to every
    summary, so the result depends only on the argument and not on any
    notebook-level variable.

    Parameters
    ----------
    summaries : list of list of str
        Tokenised summaries. The collection is iterated twice (once to fit the
        phrase model, once to transform), so it must be re-iterable, e.g. a list
        rather than a generator.
    min_count : int, optional
        Forwarded to ``gensim.models.Phrases``; defaults to 5.
    threshold : float, optional
        Forwarded to ``gensim.models.Phrases``; defaults to 100, which is stricter
        than gensim's own default of 10.0.

    Returns
    -------
    list
        One entry per input summary: the phrase model's output for that summary.
    """
    phrases = gensim.models.Phrases(summaries, min_count=min_count, threshold=threshold)
    bigram_model = gensim.models.phrases.Phraser(phrases)  # frozen model used for the transform
    return [bigram_model[summary] for summary in summaries]

# Disabled sketch of a trigram extension (not executed; it would have to live inside a function
# that already has the bigram `Phrases` object and its frozen model available):
#   trigram = gensim.models.Phrases(bigram[summaries], threshold=100)
#   trigram_model = gensim.models.phrases.Phraser(trigram)  # create the model
#   return [trigram_model[bigram_model[summary]] for summary in summaries]


# NOTE: this overwrites `summaries_corpus` with its bigram-enriched version, so re-running this
# cell without re-running the tokenisation cell applies the bigram step a second time.
summaries_corpus=make_bigrams(summaries_corpus)

Extract dictionary and corpus

In [33]:
# Build the dictionary (a mapping between tokens and their integer ids) from the preprocessed summaries.
dictionary = corpora.Dictionary(summaries_corpus)

# Encode every summary as a bag of words, i.e. a list of (token id, frequency) tuples.
# This list is the corpus used for the LDA analysis.
corpus = list(map(dictionary.doc2bow, summaries_corpus))

Number of plot summaries

In [34]:
len(summaries_corpus)

31291

Number of tokens in dictionary

In [35]:
len(dictionary)

120627

Corpus representation

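Each summary is represented as a bag of words: a list of `(token_id, count)` tuples produced by `dictionary.doc2bow`. This corpus is the input of the LDA model trained below.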

In [36]:
# LDA hyper-parameters, gathered in one place so they are easy to read and tune.
lda_settings = dict(
    num_topics=4,          # number of topics to extract
    random_state=100,      # fixed seed
    update_every=1,        # how often the model parameters should be updated
    chunksize=100,         # number of summaries used in each training chunk
    passes=10,             # training passes over the corpus
    alpha='auto',          # hyperparameter (document-topic prior)
    per_word_topics=True,
)

# Fit LDA on the bag-of-words corpus: it estimates the distribution of words over topics
# and of topics over summaries.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

In [37]:
lda_model.print_topics()

[(0,
  '0.007*"life" + 0.007*"father" + 0.007*"love" + 0.006*"film" + 0.006*"family" + 0.006*"mother" + 0.006*"friend" + 0.006*"find" + 0.005*"home" + 0.005*"tell"'),
 (1,
  '0.006*"men" + 0.005*"kill" + 0.004*"soldier" + 0.004*"order" + 0.004*"army" + 0.004*"killed" + 0.004*"war" + 0.004*"police" + 0.004*"village" + 0.004*"escape"'),
 (2,
  '0.007*"tom" + 0.007*"jack" + 0.006*"money" + 0.006*"car" + 0.006*"joe" + 0.006*"frank" + 0.005*"chris" + 0.005*"police" + 0.005*"town" + 0.005*"jim"'),
 (3,
  '0.010*"find" + 0.006*"kill" + 0.005*"escape" + 0.005*"tell" + 0.004*"body" + 0.004*"try" + 0.004*"man" + 0.004*"house" + 0.004*"car" + 0.004*"begin"')]

In [38]:
# Compute the perplexity-related score on the training corpus.
# NOTE(review): per the gensim documentation, `log_perplexity` returns the per-word likelihood
# bound rather than the perplexity itself (perplexity = 2 ** (-bound)). A higher, i.e. less
# negative, value therefore indicates a better fit; it is the derived perplexity that should be low.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute the c_v coherence score of the topics against the tokenised summaries
# (higher values indicate more coherent topics).
coherence_model_lda = CoherenceModel(model=lda_model, texts=summaries_corpus, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.304194553748708

Coherence Score:  0.3681100788222885


In [39]:

# Interactive plot of the topic representation.
# On the left, each circle represents a topic and its size reflects the topic's importance
# (presumably its prevalence in the corpus; see the pyLDAvis docs).
# On the right are the most representative words of the selected topic.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [40]:
# Save the interactive visualisation as an HTML file next to the other clustering outputs.
pyLDAvis.save_html(vis, path+'../data/clustering/lda_vis.html')

In [41]:
# Term-to-topic distribution matrix (currently disabled).
# NOTE(review): the code below is switched off by wrapping it in a string literal. Because that
# string is the last expression of the cell, Jupyter echoes it as the cell output. The snippet
# also uses `np`, which the import cell does not provide; add `import numpy as np` before
# re-enabling it.
"""
topics_terms = lda_model.state.get_lambda() 
topics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)
words = [lda_model.id2word[i] for i in range(topics_terms_proba.shape[1])]
pd.DataFrame(topics_terms_proba,columns=words)
"""

'\ntopics_terms = lda_model.state.get_lambda() \ntopics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)\nwords = [lda_model.id2word[i] for i in range(topics_terms_proba.shape[1])]\npd.DataFrame(topics_terms_proba,columns=words)\n'

In [42]:
# Document (movie) -> topic distribution matrix: one row per summary, one column per topic.
# The number of topics is read from the model instead of being hard-coded, and probabilities
# are looked up by topic id rather than by position. gensim returns a sparse list of
# (topic_id, probability) pairs and may leave out topics with a vanishingly small probability
# even when `minimum_probability=0.0`; positional indexing would then misalign the columns
# or raise an IndexError. Missing topics are filled with 0.0.
num_topics = lda_model.num_topics
train_vecs = []
for bow in corpus:
    topic_probabilities = dict(lda_model.get_document_topics(bow, minimum_probability=0.0))
    train_vecs.append([topic_probabilities.get(topic_id, 0.0) for topic_id in range(num_topics)])

In [43]:
# Movie-topic distribution as a dataframe: one probability column per topic, plus the movie id
# taken from `plots` (rows are in the same order as `plots`).
topic_columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4']
movie_topic_distribution = (
    pd.DataFrame(train_vecs, columns=topic_columns)
    .assign(movie_id=plots.movie_id.values)
)

In [44]:
movie_topic_distribution

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,movie_id
0,0.633801,0.099370,0.236645,0.030183,30332673
1,0.165583,0.188212,0.245007,0.401198,4213160
2,0.801630,0.024238,0.107932,0.066200,20624798
3,0.635494,0.272188,0.073477,0.018841,2250713
4,0.378305,0.029088,0.236551,0.356057,10331139
...,...,...,...,...,...
31286,0.846602,0.098983,0.010952,0.043463,1719500
31287,0.632356,0.174139,0.080308,0.113198,31799966
31288,0.919264,0.058464,0.006877,0.015395,13983035
31289,0.729333,0.202205,0.009401,0.059061,25818705


In [45]:
# Persist the movie-topic distribution (without the dataframe index) for later use.
movie_topic_distribution.to_csv(path+'../data/clustering/movie_topic_distribution.csv', index=False)

In [46]:
def top_movies_for_topic(topic_column, distribution, movies, n_top=5):
    """Return the rows of ``movies`` that are most representative of one topic.

    Parameters
    ----------
    topic_column : str
        Name of the topic probability column in ``distribution`` (e.g. ``'topic_1'``).
    distribution : pandas.DataFrame
        Movie-topic distribution; its row positions must match those of ``movies``.
    movies : pandas.DataFrame
        Movie table with a ``movie_id`` column.
    n_top : int, optional
        Number of highest-probability rows to select; defaults to 5.

    Returns
    -------
    pandas.DataFrame
        The rows of ``movies`` whose ``movie_id`` belongs to the selected rows, in the
        original order of ``movies`` (not sorted by topic probability).
    """
    top_positions = distribution[topic_column].sort_values(ascending=False).index[:n_top]
    top_movie_ids = movies.iloc[top_positions].movie_id.values
    return movies[movies.movie_id.isin(top_movie_ids)]


# Top 5 most representative movies for each topic, based on the movie-topic distribution matrix.
most_representaive_movies_topic1 = top_movies_for_topic('topic_1', movie_topic_distribution, plots)
most_representaive_movies_topic2 = top_movies_for_topic('topic_2', movie_topic_distribution, plots)
most_representaive_movies_topic3 = top_movies_for_topic('topic_3', movie_topic_distribution, plots)
most_representaive_movies_topic4 = top_movies_for_topic('topic_4', movie_topic_distribution, plots)

Topic 1 => Seems to be romantic comedies:

https://en.wikipedia.org/wiki/Puccini_for_Beginners

https://en.wikipedia.org/wiki/Waiter_(film)

https://en.wikipedia.org/wiki/Heaven_Can_Wait_%281943_film%29

https://en.wikipedia.org/wiki/Ullam_Ketkumae

https://en.wikipedia.org/wiki/2_Young

In [47]:
most_representaive_movies_topic1

Unnamed: 0,movie_id,movie_name,release_date,english_language,german_language,silent_film_language,spanish_language,japanese_language,italian_language,tamil_language,...,united_kingdom,germany,japan,france,italy,india,F_gender,averageRating,numVotes,plot_summary
113,2014153,2 young,2005,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.333333,6.5,525,natalie different family background fu father ...
10645,77653,heaven can wait,1943,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.333333,7.6,6693,aged henry van cleve enters opulent reception ...
19136,12091274,puccini for beginners,2007,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.833333,6.0,2459,story begin samantha breaking allegra lesbian ...
27244,7342730,waiter,2006,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.6,7.0,3875,waiter tell story edgar waiter flair unfortuna...
29490,5045106,ullam ketkumae,2005,False,False,False,False,False,False,True,...,False,False,False,False,False,True,0.428571,7.1,450,movie group college friend shaam emaan arya po...


Topic 2 => Some kind of historical movies about periods of upheaval and adversity

https://en.wikipedia.org/wiki/Stalingrad_(1990_film)

https://en.wikipedia.org/wiki/Operation_Thunderbolt_(film)

https://en.wikipedia.org/wiki/The_Unforgettable_Year_1919

https://en.wikipedia.org/wiki/Battle_of_Britain_(film)

https://en.wikipedia.org/wiki/The_Sino-Dutch_War_1661

In [48]:
most_representaive_movies_topic2

Unnamed: 0,movie_id,movie_name,release_date,english_language,german_language,silent_film_language,spanish_language,japanese_language,italian_language,tamil_language,...,united_kingdom,germany,japan,france,italy,india,F_gender,averageRating,numVotes,plot_summary
2927,240483,battle of britain,1969,True,True,False,False,False,False,False,...,True,False,False,False,False,False,0.052632,6.9,22836,battle france raf pilot escaping german blitzk...
15947,5826081,mivtsa yonatan,1977,True,True,False,False,False,False,False,...,False,False,False,False,False,False,0.2,6.6,947,july air france flight tel aviv paris athens h...
22221,29677455,stalingrad,1990,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,7.6,83,january adolf hitler appoints fedor von bock c...
27567,28869936,the sino-dutch war 1661,2000,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.666667,6.4,80,story begin northern china controlled manchu l...
28082,33113634,the unforgettable year 1919,1952,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,6.1,51,city petrograd bolsheviks stronghold russia at...


Topic 3 

https://en.wikipedia.org/wiki/Smart_Alecks

https://en.wikipedia.org/wiki/The_Crime_Patrol

https://en.wikipedia.org/wiki/The_Casino_Job

https://en.wikipedia.org/wiki/Roarin%27_Lead

https://en.wikipedia.org/wiki/Mr._Muggs_Rides_Again

In [49]:
most_representaive_movies_topic3

Unnamed: 0,movie_id,movie_name,release_date,english_language,german_language,silent_film_language,spanish_language,japanese_language,italian_language,tamil_language,...,united_kingdom,germany,japan,france,italy,india,F_gender,averageRating,numVotes,plot_summary
16267,32173262,mr. muggs rides again,1945,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,6.1,329,jockey muggs mistakenly accused cheating big r...
20014,32852772,roarin' lead,1936,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,6.1,87,hackett cattlemen association bankrupting men ...
21715,24121382,smart alecks,1942,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.076923,5.7,516,hank leaf east kid apprentice crook job lookou...
24169,25432586,the casino job,2009,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.333333,3.5,964,sexy vega vixen master plan hoping beat odds r...
24403,24190384,the crime patrol,1936,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.133333,5.5,58,boxer bob neal join police losing fight office...


Topic 4 => Seems to be cartoon movies

https://en.wikipedia.org/wiki/Zoom_and_Bored

https://en.wikipedia.org/wiki/Ready,_Woolen_and_Able

https://en.wikipedia.org/wiki/Hare-Breadth_Hurry

https://en.wikipedia.org/wiki/Lickety-Splat

https://en.wikipedia.org/wiki/Whoa,_Be-Gone!

In [50]:
most_representaive_movies_topic4

Unnamed: 0,movie_id,movie_name,release_date,english_language,german_language,silent_film_language,spanish_language,japanese_language,italian_language,tamil_language,...,united_kingdom,germany,japan,france,italy,india,F_gender,averageRating,numVotes,plot_summary
10471,23617806,hare-breadth hurry,1963,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,7.1,374,introduction cartoon open typical wile coyote ...
14220,9930507,lickety-splat,1961,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,7.2,348,introduction wile coyote standing road pull ar...
19593,14205601,"ready, woolen and able",1960,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,7.6,454,introduction like sam sheepdog ralph wolf shor...
30631,10131383,"whoa, be-gone!",1958,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,7.4,751,introduction road runner chased roadway wile c...
31257,10131333,zoom and bored,1957,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,7.5,855,introduction pair zoom view begin chase freezi...
