In [26]:
"""
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/EPFL/AppliedDataScience/Project/ada-2022-project-adlucere2022/src/"
"""
# Prefix prepended to every data path in this notebook; left empty for a local run.
path = ""

In [27]:
# Basic stuff
import pandas as pd

# NLP
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline


# pyLDAvis
!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis



Load data

In [28]:
# Movie ids used further down to select and order the rows of `plots`.
preferred_index_table = pd.read_csv(path + '../data/preferred_index.csv')
preferred_index = preferred_index_table.values.flatten()

In [29]:
# Movie metadata without its raw plot summary, which is replaced by the preprocessed one.
movie_metadata = pd.read_csv(path + '../data/movie_data_imdbscores_final.csv').drop(columns='plot_summary')
plots_processed = pd.read_csv(path + '../data/plot/plot_summaries_preprocessed.csv')

# Attach the preprocessed summaries (inner join on movie_id), then keep the rows
# listed in `preferred_index`, in that order, with movie_id back as a regular column.
plots = (
    movie_metadata
    .merge(plots_processed, on='movie_id')
    .set_index('movie_id')
    .loc[preferred_index]
    .reset_index()
)

In [30]:
# Preview the first rows of the merged, re-ordered table.
plots.head()

Unnamed: 0,movie_id,movie_name,release_date,english_language,german_language,silent_film_language,spanish_language,japanese_language,italian_language,tamil_language,...,united_kingdom,germany,japan,france,italy,india,F_gender,averageRating,numVotes,plot_summary
0,30332673,#1 cheerleader camp,2010,False,False,False,False,False,False,False,...,False,False,False,False,False,False,0.666667,3.7,3222,horny college guy summer job cheerleader camp ...
1,4213160,$,1971,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.25,6.3,2631,set hamburg west germany criminal advantage ge...
2,20624798,$9.99,2008,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.111111,7.2,22,film mainly focus year old dave peck unemploye...
3,2250713,'68,1988,True,False,False,False,False,False,False,...,False,False,False,False,False,False,0.0,5.9,82,father escaped soviet invasion budapest run hu...
4,10331139,'gator bait,1974,True,False,False,False,False,False,False,...,False,False,False,False,False,False,1.0,5.4,1277,film follows poacher named desiree life deep s...


Transform sentences to list of words

In [31]:
# Each preprocessed summary is a whitespace-separated string; split it into its tokens.
summaries_corpus = [summary.split() for summary in plots.plot_summary]

Transform list of words to list of bigrams

In [32]:
#enrich our tokens by creating bigrams for each summary

def make_bigrams(summaries, min_count=5, threshold=100):
    """Merge frequently co-occurring token pairs of each summary into bigram tokens.

    A gensim ``Phrases`` model is fitted on ``summaries`` and then applied to
    every summary.

    Args:
        summaries: tokenised summaries (a list of lists of str). The collection
            is iterated twice (fit, then transform), so it must be re-iterable.
        min_count: forwarded to ``Phrases``; minimum count for a token or pair
            to be considered.
        threshold: forwarded to ``Phrases``; score a pair must exceed to be
            merged. The value 100 is stricter than gensim's default of 10.0.

    Returns:
        A list with one token list per input summary, detected pairs merged
        into single tokens.
    """
    # Fit on the argument, not on the global `summaries_corpus`, so the function
    # gives the right result for whatever corpus it is called with.
    bigram = gensim.models.Phrases(summaries, min_count=min_count, threshold=threshold)
    # Frozen, transformation-only version of the fitted phrase model.
    bigram_model = gensim.models.phrases.Phraser(bigram)
    return [bigram_model[summary] for summary in summaries]

# Disabled sketch of a trigram extension (these lines would belong inside make_bigrams):
# trigram = gensim.models.Phrases(bigram[summaries], threshold=100)
# trigram_model = gensim.models.phrases.Phraser(trigram)
# return [trigram_model[bigram_model[summary]] for summary in summaries]


# NOTE: this overwrites `summaries_corpus` with its own transformation, so re-running
# this cell applies the bigram model again to tokens that were already merged.
summaries_corpus=make_bigrams(summaries_corpus)

Extract dictionary and corpus

In [33]:
# Dictionary: a mapping between each token and its integer id, built from the
# preprocessed summaries.
dictionary = corpora.Dictionary(summaries_corpus)

# Bag-of-words corpus for the LDA analysis: every summary becomes a list of
# (token id, count within that summary) tuples.
corpus = list(map(dictionary.doc2bow, summaries_corpus))

Number of plot summaries

In [34]:
# One token list per movie summary.
len(summaries_corpus)

31291

Number of tokens in dictionary

In [35]:
# Number of distinct tokens seen across all summaries.
len(dictionary)

120627

Corpus representation

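Each summary is represented as a bag of words: a list of `(token_id, count)` tuples produced by `dictionary.doc2bow`. The full corpus is the list of these bags, one per summary.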

In [None]:
# LDA fitted on the bag-of-words corpus built above.
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=4,
    random_state=100,      # fixed seed
    update_every=1,        # how often the model parameters should be updated
    chunksize=100,         # number of summaries used in each training chunk
    passes=10,             # training passes over the corpus
    alpha='auto',          # hyperparameter
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)
# The fitted model describes each of the 4 topics as a distribution over words
# and each summary as a distribution over topics.

In [None]:
# List the most significant words of every topic.
lda_model.print_topics()

In [None]:
# Goodness of fit on the training corpus.
# `log_perplexity` returns the per-word likelihood bound (log scale, higher is better),
# not the perplexity itself. The perplexity is 2 ** (-bound), and lower is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute the c_v coherence score of the topics against the tokenised summaries.
coherence_model_lda = CoherenceModel(model=lda_model, texts=summaries_corpus, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:

# Interactive plot of the topic representation.
# On the left, each circle represents a topic and its size indicates the topic's importance;
# on the right are the most representative words of the selected topic.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis

In [None]:
# Export the interactive visualisation as a standalone HTML file.
# NOTE(review): presumably the ../data/clustering/ directory must already exist - confirm.
pyLDAvis.save_html(vis, path+'../data/clustering/lda_vis.html')

In [None]:
# Term-to-topic distribution matrix.
# NOTE(review): the snippet below is disabled by wrapping it in a string literal, so
# nothing in it is executed. It also references `np`, which the visible import cell
# does not import; add `import numpy as np` before re-enabling it.
"""
topics_terms = lda_model.state.get_lambda() 
topics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)
words = [lda_model.id2word[i] for i in range(topics_terms_proba.shape[1])]
pd.DataFrame(topics_terms_proba,columns=words)
"""

In [None]:
# Document (movie) to topic distribution matrix: one row per summary, one
# probability per topic, ordered by topic id.
num_topics = lda_model.num_topics
train_vecs = []
for bow in corpus:
    # gensim never returns probabilities below 1e-8 even with minimum_probability=0.0,
    # so a topic can be missing from the result. Look each topic up by id (defaulting
    # to 0.0) rather than by position in the returned list.
    topic_probs = dict(lda_model.get_document_topics(bow, minimum_probability=0.0))
    train_vecs.append([topic_probs.get(topic_id, 0.0) for topic_id in range(num_topics)])

In [None]:
# Per-movie topic distribution: one column per topic plus the movie id.
# Rows follow the order of `plots`, the same order the corpus was built in.
topic_columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4']
movie_topic_distribution = (
    pd.DataFrame(train_vecs, columns=topic_columns)
    .assign(movie_id=plots.movie_id.values)
)

In [None]:
# Display the movie/topic table.
movie_topic_distribution

In [None]:
# Persist the movie/topic table; index=False keeps the row index out of the CSV.
movie_topic_distribution.to_csv(path+'../data/clustering/movie_topic_distribution.csv', index=False)

In [None]:
# Top 5 most representative movies for each topic, based on the movie-topic distribution.
def _top_movies_for_topic(topic_column, n=5):
    """Return the rows of `plots` whose movie_id is among the `n` highest-weighted movies of a topic.

    The index labels of `movie_topic_distribution` are used as row positions in
    `plots`. The returned rows keep the order of `plots`; they are not ranked
    by topic weight.
    """
    top_positions = movie_topic_distribution[topic_column].sort_values(ascending=False).index[:n]
    top_movie_ids = plots.iloc[top_positions].movie_id.values
    return plots[plots.movie_id.isin(top_movie_ids)]


most_representaive_movies_topic1 = _top_movies_for_topic('topic_1')
most_representaive_movies_topic2 = _top_movies_for_topic('topic_2')
most_representaive_movies_topic3 = _top_movies_for_topic('topic_3')
most_representaive_movies_topic4 = _top_movies_for_topic('topic_4')

Topic 1 => Seems to be romantic comedies:

https://en.wikipedia.org/wiki/Puccini_for_Beginners

https://en.wikipedia.org/wiki/Waiter_(film)

https://en.wikipedia.org/wiki/Heaven_Can_Wait_%281943_film%29

https://en.wikipedia.org/wiki/Ullam_Ketkumae

https://en.wikipedia.org/wiki/2_Young

In [None]:
# Rows of `plots` selected for topic 1.
most_representaive_movies_topic1

Topic 2 => Some kind of historical movies about periods of upheaval and adversity

https://en.wikipedia.org/wiki/Stalingrad_(1990_film)

https://en.wikipedia.org/wiki/Operation_Thunderbolt_(film)

https://en.wikipedia.org/wiki/The_Unforgettable_Year_1919

https://en.wikipedia.org/wiki/Battle_of_Britain_(film)

https://en.wikipedia.org/wiki/The_Sino-Dutch_War_1661

In [None]:
# Rows of `plots` selected for topic 2.
most_representaive_movies_topic2

Topic 3 

https://en.wikipedia.org/wiki/Smart_Alecks

https://en.wikipedia.org/wiki/The_Crime_Patrol

https://en.wikipedia.org/wiki/The_Casino_Job

https://en.wikipedia.org/wiki/Roarin%27_Lead

https://en.wikipedia.org/wiki/Mr._Muggs_Rides_Again

In [None]:
# Rows of `plots` selected for topic 3.
most_representaive_movies_topic3

Topic 4 => Seems to be cartoon movies

https://en.wikipedia.org/wiki/Zoom_and_Bored

https://en.wikipedia.org/wiki/Ready,_Woolen_and_Able

https://en.wikipedia.org/wiki/Hare-Breadth_Hurry

https://en.wikipedia.org/wiki/Lickety-Splat

In [None]:
# Rows of `plots` selected for topic 4.
most_representaive_movies_topic4