In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [2]:
df = pd.read_csv('./data/film_festivals_with_plots.csv', keep_default_na=False)
df.head()

Unnamed: 0,id,title_english,title_original,director,country,winner,festival,year_festival,link_film,link_director,...,Response,InternetMovieDatabaseRating,RottenTomatoesRating,MetacriticRating,Error,totalSeasons,Ratings,Plot_Wikipedia,Plot_Selected,Plot_Combined
0,0,Adieu Bonaparte,وداعا بونابرت,Youssef Chahine,Egypt,0,cannes,1985,https://en.wikipedia.org/wiki/Adieu_Bonaparte,https://en.wikipedia.org/wiki/Youssef_Chahine,...,True,6.4/10,,,,,,,A story during the French Occupation of Egypt ...,A story during the French Occupation of Egypt ...
1,1,Birdy,,Alan Parker,United States,0,cannes,1985,https://en.wikipedia.org/wiki/Birdy_(film),https://en.wikipedia.org/wiki/Alan_Parker,...,True,7.3/10,85%,,,,,In a 1960s working-class neighborhood in Phila...,In a 1960s working-class neighborhood in Phila...,In a 1960s working-class neighborhood in Phila...
2,2,Bliss,,Ray Lawrence,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/Bliss_(1985_film),https://en.wikipedia.org/wiki/Ray_Lawrence_(fi...,...,True,6.9/10,,,,,,"Harry Joy, an advertising executive in an unna...","Harry Joy, an advertising executive in an unna...","Harry Joy, an advertising executive in an unna..."
3,3,Chicken with Vinegar,Poulet au vinaigre,Claude Chabrol,France,0,cannes,1985,https://en.wikipedia.org/wiki/Chicken_with_Vin...,https://en.wikipedia.org/wiki/Claude_Chabrol,...,False,,,,Movie not found!,,,"In a small town in Normandy, Louis Cuno, a you...","In a small town in Normandy, Louis Cuno, a you...","In a small town in Normandy, Louis Cuno, a you..."
4,4,The Coca-Cola Kid,,Dušan Makavejev,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/The_Coca-Cola_Kid,https://en.wikipedia.org/wiki/Du%C5%A1an_Makav...,...,True,6.0/10,44%,,,,,"Becker, a hotshot American marketing executive...","Becker, a hotshot American marketing executive...","Becker, a hotshot American marketing executive..."


In [3]:
plots = df['Plot_Selected']
plots.head()

0    A story during the French Occupation of Egypt ...
1    In a 1960s working-class neighborhood in Phila...
2    Harry Joy, an advertising executive in an unna...
3    In a small town in Normandy, Louis Cuno, a you...
4    Becker, a hotshot American marketing executive...
Name: Plot_Selected, dtype: object

### preprocessing

In [4]:
plots = [plot.lower() for plot in plots] # lowercase
plots = [re.sub('[!#$\(\)*+,./:;<=>?@^_`|~]', '', plot) for plot in plots] # remove punctuation
plots = [re.sub('\w*\d\w*', '', plot) for plot in plots] # remove numbers and words that contain numbers
plots = [re.sub('-', ' ', plot) for plot in plots] # handle compound words
plots = [word_tokenize(plot) for plot in plots] # tokenize
stopwords = set(stopwords.words('english'))
stopwords.update({'``', '\'\'', '\'s', 'n\'t', '\''})
fdist = FreqDist([word for plot in plots for word in plot])
commonwords = [word[0] for word in fdist.most_common(1000)] # try 1000
stopwords.update(commonwords) # remove most common words in corpus of film plots
lemmatizer = WordNetLemmatizer()
plots = [
    [lemmatizer.lemmatize(word) for word in plot if word not in stopwords] # remove stopwords
    for plot in plots
]
df['bag_of_words'] = plots
df.head()

Unnamed: 0,id,title_english,title_original,director,country,winner,festival,year_festival,link_film,link_director,...,InternetMovieDatabaseRating,RottenTomatoesRating,MetacriticRating,Error,totalSeasons,Ratings,Plot_Wikipedia,Plot_Selected,Plot_Combined,bag_of_words
0,0,Adieu Bonaparte,وداعا بونابرت,Youssef Chahine,Egypt,0,cannes,1985,https://en.wikipedia.org/wiki/Adieu_Bonaparte,https://en.wikipedia.org/wiki/Youssef_Chahine,...,6.4/10,,,,,,,A story during the French Occupation of Egypt ...,A story during the French Occupation of Egypt ...,"[occupation, egypt, depicting, conflict, tradi..."
1,1,Birdy,,Alan Parker,United States,0,cannes,1985,https://en.wikipedia.org/wiki/Birdy_(film),https://en.wikipedia.org/wiki/Alan_Parker,...,7.3/10,85%,,,,,In a 1960s working-class neighborhood in Phila...,In a 1960s working-class neighborhood in Phila...,In a 1960s working-class neighborhood in Phila...,"[neighborhood, philadelphia, pennsylvania, tee..."
2,2,Bliss,,Ray Lawrence,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/Bliss_(1985_film),https://en.wikipedia.org/wiki/Ray_Lawrence_(fi...,...,6.9/10,,,,,,"Harry Joy, an advertising executive in an unna...","Harry Joy, an advertising executive in an unna...","Harry Joy, an advertising executive in an unna...","[joy, advertising, executive, unnamed, austral..."
3,3,Chicken with Vinegar,Poulet au vinaigre,Claude Chabrol,France,0,cannes,1985,https://en.wikipedia.org/wiki/Chicken_with_Vin...,https://en.wikipedia.org/wiki/Claude_Chabrol,...,,,,Movie not found!,,,"In a small town in Normandy, Louis Cuno, a you...","In a small town in Normandy, Louis Cuno, a you...","In a small town in Normandy, Louis Cuno, a you...","[normandy, cuno, postman, disabled, eccentric,..."
4,4,The Coca-Cola Kid,,Dušan Makavejev,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/The_Coca-Cola_Kid,https://en.wikipedia.org/wiki/Du%C5%A1an_Makav...,...,6.0/10,44%,,,,,"Becker, a hotshot American marketing executive...","Becker, a hotshot American marketing executive...","Becker, a hotshot American marketing executive...","[becker, hotshot, marketing, executive, robert..."


### LDA

In [5]:
dictionary = Dictionary(df['bag_of_words'])
dictionary.filter_extremes(no_below = 10) # try other values for no_below
lda_corpus = df['bag_of_words'].apply(dictionary.doc2bow)
print(dictionary)

Dictionary(5233 unique tokens: ['conflict', 'egypt', 'fighting', 'occupation', 'ability']...)


In [6]:
lda = LdaModel(corpus=lda_corpus, id2word=dictionary, num_topics=15)

In [7]:
lda.print_topics()

[(0,
  '0.006*"manuel" + 0.006*"eva" + 0.005*"betty" + 0.004*"ward" + 0.004*"gabriel" + 0.003*"rebel" + 0.003*"band" + 0.003*"circus" + 0.003*"bride" + 0.003*"monastery"'),
 (1,
  '0.009*"cooper" + 0.007*"lili" + 0.004*"yu" + 0.004*"woo" + 0.004*"dean" + 0.004*"mi" + 0.003*"friday" + 0.003*"hong" + 0.003*"agnes" + 0.003*"partisan"'),
 (2,
  '0.006*"jin" + 0.003*"rose" + 0.003*"rita" + 0.003*"hook" + 0.002*"kang" + 0.002*"cause" + 0.002*"paulo" + 0.002*"larry" + 0.002*"ranch" + 0.002*"barry"'),
 (3,
  '0.004*"catherine" + 0.004*"adam" + 0.003*"gloria" + 0.003*"snow" + 0.003*"george" + 0.003*"foster" + 0.002*"thief" + 0.002*"key" + 0.002*"royal" + 0.002*"crash"'),
 (4,
  '0.006*"carter" + 0.005*"victoria" + 0.005*"jerry" + 0.004*"eddie" + 0.004*"”" + 0.004*"“" + 0.003*"claire" + 0.003*"salvatore" + 0.003*"bernard" + 0.003*"roy"'),
 (5,
  '0.006*"mill" + 0.005*"albert" + 0.005*"samurai" + 0.005*"sophie" + 0.005*"harold" + 0.004*"grace" + 0.004*"rita" + 0.004*"wei" + 0.003*"al" + 0.003*"ci

In [8]:
for i in range(15):
    print('Terms: ' + ', '.join([term for term, _ in lda.show_topic(i)]))
    print()

Terms: manuel, eva, betty, ward, gabriel, rebel, band, circus, bride, monastery

Terms: cooper, lili, yu, woo, dean, mi, friday, hong, agnes, partisan

Terms: jin, rose, rita, hook, kang, cause, paulo, larry, ranch, barry

Terms: catherine, adam, gloria, snow, george, foster, thief, key, royal, crash

Terms: carter, victoria, jerry, eddie, ”, “, claire, salvatore, bernard, roy

Terms: mill, albert, samurai, sophie, harold, grace, rita, wei, al, circus

Terms: eric, henri, linda, mark, johnson, isabelle, minister, francis, rosa, jerry

Terms: nick, miriam, edward, jackie, maurice, ray, fernando, miller, felix, fanny

Terms: lou, emily, sword, roger, smith, carmen, ann, antoine, hang, nun

Terms: martha, bobby, guy, jane, ghost, marina, cause, pick, lena, patient

Terms: dora, jeanne, count, steve, jim, victim, stone, ball, danny, mountain

Terms: jimmy, howard, karen, alain, e, el, di, hunter, giovanni, refugee

Terms: pete, dan, christine, julia, rené, antonio, jones, ira, duke, cathol