Notebook containing initial analyses and data handling pipelines. We will grade the correctness, quality of code, and quality of textual descriptions.


# Plots and Actors in Blockbusters: What Do People Favor?


# 0.0 Load the dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/EPFL_course_project/ada-2022-project-superplainteamname2022/

/content/drive/MyDrive/EPFL_course_project/ada-2022-project-superplainteamname2022


In [3]:
import pandas as pd
import numpy as np

In [4]:
# `movie.metadata.tsv` [3.4 M]

# Metadata for 81,741 movies, extracted from the Noverber 4, 2012 dump of Freebase.  Tab-separated; columns:

# 1. Wikipedia movie ID
# 2. Freebase movie ID
# 3. Movie name
# 4. Movie release date
# 5. Movie box office revenue
# 6. Movie runtime
# 7. Movie languages (Freebase ID:name tuples)
# 8. Movie countries (Freebase ID:name tuples)
# 9. Movie genres (Freebase ID:name tuples)

movie_metadata = pd.read_csv(
    "./data/MovieSummaries/movie.metadata.tsv",
    sep="\t",
    header=None,
    names=[
        "movie_id",
        "freebase_movie_id",
        "movie_name",
        "movie_release_date",
        "movie_box_office_revenue",
        "movie_runtime",
        "movie_languages",
        "movie_countries",
        "movie_genres",
    ],
    parse_dates=["movie_release_date"],
    date_parser=lambda x: pd.to_datetime(x, errors="coerce"),
)

movie_metadata.head()
# movie id is wikipedia page id
# https://en.wikipedia.org/?curid={movie_id}

# How to use query freebase id?
# https://edstem.org/eu/courses/134/discussion/3845

# https://query.wikidata.org/#PREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E%0APREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E%0APREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E%0A%0ASELECT%20%20%3Fs%20%3FsLabel%20%3Fp%20%20%3Fo%20%3FoLabel%20WHERE%20%7B%0A%20%3Fs%20wdt%3AP646%20%22%2Fm%2F0181lj%22%20%0A%0A%20%20%20SERVICE%20wikibase%3Alabel%20%7B%0A%20%20%20%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%0A%20%20%20%7D%0A%20%7D

Unnamed: 0,movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988-01-01,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987-01-01,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983-01-01,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [5]:
# `character.metadata.tsv` [14 M]

# Metadata for 450,669 characters aligned to the movies above, extracted from the Noverber 4, 2012 dump of Freebase.  Tab-separated; columns:

# 1. Wikipedia movie ID
# 2. Freebase movie ID
# 3. Movie release date
# 4. Character name
# 5. Actor date of birth
# 6. Actor gender
# 7. Actor height (in meters)
# 8. Actor ethnicity (Freebase ID)
# 9. Actor name
# 10. Actor age at movie release
# 11. Freebase character/actor map ID
# 12. Freebase character ID
# 13. Freebase actor ID

character_metadata = pd.read_csv(
    "./data/MovieSummaries/character.metadata.tsv",
    sep="\t",
    header=None,
    names=[
        "movie_id",
        "freebase_movie_id",
        "movie_release_date",
        "character_name",
        "actor_birthdate",
        "actor_gender",
        "actor_height",
        "actor_ethnicity",
        "actor_name",
        "actor_age",
        "freebase_character_actor_map_id",
        "freebase_character_id",
        "freebase_actor_id",
    ],
    parse_dates=["movie_release_date", "actor_birthdate"],
    date_parser=lambda x: pd.to_datetime(x, errors="coerce", utc=True),
)
character_metadata['movie_release_date']= character_metadata['movie_release_date'].dt.date
character_metadata['actor_birthdate']= character_metadata['actor_birthdate'].dt.date
character_metadata.head()

Unnamed: 0,movie_id,freebase_movie_id,movie_release_date,character_name,actor_birthdate,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [6]:
movie_metadata[movie_metadata['movie_id'] == 5676692] # Iron Man

Unnamed: 0,movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
46672,5676692,/m/0dzlbx,Iron Man,2008-04-14,585174222.0,126.0,"{""/m/032f6"": ""Persian Language"", ""/m/0jzc"": ""A...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."


In [19]:
# Movie plots data.

# 42303 movies' plots in pure text

# Movie Wiki ID \t Plot summary
movie_plot = pd.read_csv('./data/MovieSummaries/plot_summaries.txt',sep='\t', \
                         names = ['movie_id', \
                               'Plot'])
plot_example = movie_plot[movie_plot['movie_id'] == 5676692]['Plot'].iloc[0] # Iron Man
movie_plot[movie_plot['movie_id'] == 5676692]

Unnamed: 0,movie_id,Plot
2999,5676692,"Playboy and genius Tony Stark, who has inherit..."


In [8]:
# merge plot data and movie metadata
movie_data = pd.merge(left = movie_metadata, right = movie_plot, left_on = 'movie_id', right_on = 'movie_id', how = 'inner')
print('Total Movie Number: ', movie_data.shape[0])
movie_data[movie_data['movie_id'] == 5676692]

Total Movie Number:  42204


Unnamed: 0,movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,Plot
24161,5676692,/m/0dzlbx,Iron Man,2008-04-14,585174222.0,126.0,"{""/m/032f6"": ""Persian Language"", ""/m/0jzc"": ""A...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Playboy and genius Tony Stark, who has inherit..."


# 0.0.1 Load some word vectors
#### Use GloVe from https://nlp.stanford.edu/projects/glove/

In [9]:
PADDING_WORD = '<PAD>'
UNKNOWN_WORD = '<UNK>'
embedding_file_name = './data/GloVe/glove.6B.100d.txt'


def load_glove_embeddings(embedding_file, padding_idx=0, padding_word=PADDING_WORD, unknown_word=UNKNOWN_WORD):
    """
    The function to load GloVe word embeddings
    
    :param      embedding_file:  The name of the txt file containing GloVe word embeddings
    :type       embedding_file:  str
    :param      padding_idx:     The index, where to insert padding and unknown words
    :type       padding_idx:     int
    :param      padding_word:    The symbol used as a padding word
    :type       padding_word:    str
    :param      unknown_word:    The symbol used for unknown words
    :type       unknown_word:    str
    
    :returns:   (a vocabulary size, vector dimensionality, embedding matrix, mapping from words to indices)
    :rtype:     a 4-tuple
    """
    word2index, embeddings, N = {}, [], 0
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            data = line.split()
            word = data[0]
            vec = [float(x) for x in data[1:]]
            embeddings.append(vec)
            word2index[word] = N
            N += 1
    D = len(embeddings[0])
    
    if padding_idx is not None and type(padding_idx) is int:
        embeddings.insert(padding_idx, [0]*D)
        embeddings.insert(padding_idx + 1, [-1]*D)
        for word in word2index:
            if word2index[word] >= padding_idx:
                word2index[word] += 2
        word2index[padding_word] = padding_idx
        word2index[unknown_word] = padding_idx + 1
                
    return N, D, np.array(embeddings, dtype=np.float32), word2index
num_word_vec, dim, embeddings, word2index = load_glove_embeddings(embedding_file = embedding_file_name)
print('Number of word vectors: ', num_word_vec)
print('Dimension of word vectors: ', dim)

Number of word vectors:  400000
Dimension of word vectors:  100


# 0.1 Genre counts and group


In [10]:
# convert dict-format string to dictionary
from collections import defaultdict
genres = movie_data['movie_genres'].apply(eval)
def zero_count():
  return 0
# count the time that a certain genre occurs
genre_count = defaultdict(zero_count)
for i, genre_dict in enumerate(genres):
  for key in genre_dict.keys():
    genre_count[genre_dict[key]] += 1
print('Total Number of Genre: ', len(genre_count.keys()))

Total Number of Genre:  363


In [11]:
genre_count_sorted = sorted(genre_count.items(), key = lambda x:x[1], reverse = True)
genre_count_bigger = [x for x in genre_count_sorted if x[1] >= 1000]
len(genre_count_bigger)

33

## 0.1.1 Decrease the number of genres and record new genres for each movie

In [12]:
# 0.1.1

## 1. What do those blockbusters have in their plots?
We want to see whether the top-selling movies are characterized by certain topics or keywords both qualitatively and quantitatively. We can also consider certain character personas in the plots.

### 1.1 Topic modeling of plots: LDA
#### 1.1.1 Clean the text. 
  (1) Excluding stop words and punctuations. \\
  (2) Tokenizing the sentence.

In [30]:
# 1.1
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
import nltk
nltk.download('popular', quiet=True)
from nltk.corpus import stopwords # load stopwords to exclude 
from nltk.tokenize import word_tokenize
import string
from nltk.stem.wordnet import WordNetLemmatizer # word tokenizer
import spacy
import en_core_web_sm

In [34]:
from nltk.corpus.reader.categorized_sents import word_tokenize
# load stop words for English
stop = set(stopwords.words('english'))
# load punctuations
exclude = set(string.punctuation)
# initialize work tokenizer
lemma = WordNetLemmatizer()
# initialize the NER model
NER = en_core_web_sm.load()

def clean(text):
  # exclude stop words
  ner_result = NER(text)
  name_list = []
  for x in ner_result:
    if(x.ent_iob_ != 'O'):
      name_list.append(x.text.lower())
  word_list = [word.lower() for word in word_tokenize(plot_example)]
  no_name_no_stop = ' '.join([word for word in word_list if word not in name_list and word not in stop])
  # exclude punctuations
  # punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
  # turn the sentence into a list of words->word tokenize
  tokenized = ' '.join([lemma.lemmatize(word) for word in no_name_no_stop.split()])
  return tokenized.split()
movie_data['plot_clean']=movie_data['Plot'].apply(clean)
movie_data[movie_data['movie_id'] == 5676692]

  config_value=config["nlp"][key],


Unnamed: 0,movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,Plot,plot_clean
24161,5676692,/m/0dzlbx,Iron Man,2008-04-14,585174222.0,126.0,"{""/m/032f6"": ""Persian Language"", ""/m/0jzc"": ""A...","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Playboy and genius Tony Stark, who has inherit...","[genius, inherited, defense, contractor, fathe..."


In [55]:
#create dictionary
plot_word_dictionary = corpora.Dictionary(movie_data['plot_clean'])
# remove the word that appears less than 5 times.
plot_word_dictionary.filter_extremes(no_below=5, no_above=1.0, keep_n=None)
# fill in the gap between word indices after removing word 
plot_word_dictionary.compactify()
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print('Total Unique word: ',len(plot_word_dictionary))

doc_term_matrix = [plot_word_dictionary.doc2bow(doc) for doc in movie_data['plot_clean'] ]
print('Total Documents: ',len(doc_term_matrix))

# lda = gensim.models.ldamodel.LdaModel
lda = LdaMulticore

num_topics=20
ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=plot_word_dictionary,passes=50,minimum_probability=0)

Total Unique word:  36846
Total Documents:  42204


In [58]:
ldamodel.topics(num_topics=num_topics)

AttributeError: ignored

### 1.2 Assign each plot to a certain set of topics. 
  (1) Get the vector representation for each topic by adding all word vectors (GloVe here maybe). \\
  (2) Build a topic vector for each plot by adding of the topic vectors, weighted by the probability of the plot belonging to this topic. 

In [None]:
# 1.2

### 1.3 Keywords extraction of each plot, by KeyBERT

In [None]:
# 1.3

### 1.4 Build keyword vector for each plot by summing up all word vectors in keyword list of each plot.

In [None]:
# 1.4

### 1.5 Clustering of topics and keywords.
This part is the backbone of this first part analysis. 
#### 1.5.1 K-means of all plots' topic & keyword vectors.
#### 1.5.2 Observe clusters and visualize the revenue of each movie by color (discrete revenue level or continuous revenue color mapping)
#### 1.5.3 See whether certain clusters contain all top-selling or all bad-selling movies. 
#### 1.5.4 Here ends the qualitative analysis. For quantitative analysis, compare the averaged revenues in each cluster along with uncertainty (CI maybe) to see if the effect of different clusters of topics and keywords are significant.

In [None]:
# 1.5.1

In [None]:
# 1.5.2 & 1.5.3

### 1.6 Split movies into different genres (groupby)
#### 1.6.1 Decide main genres by frequency
#### 1.6.2 For movies belonging to each genres, do 1.5 again to see genre-related visualization.

In [None]:
# 1.6.1

In [None]:
# 1.6.2

### 1.7 Time-related topic & keywords changing
#### 1.7.1 Discretize time into decades.
#### 1.7.2 In each decade, do 1.5 to see time-related changing of topics and keywords.

In [None]:
# 1.7.1

In [None]:
# 1.7.2

# End of first part. 
#### Problems:
1. Do we need to combine genres and time? May result in too many groups of data and lack of amount of data in each group.

# 2. Can this actor contribute to more revenues?

## 2.1 Actor fitness score definition and calculation
Definition: The difference between the movie’s revenue and the average of movies including that actor. \\
Calculation: For each actor, there is a fitness score for each movie that the actor is in. 
Can only do this to protagonist or main characters (how to extract this)


## 2.2 For each actor, build a regression model
(1) Input features: \\
  1) Movie: plot keyword vectors, topic vectors (not needed to add together, can split as features), movie genres (main genres only), movie all properties (time, area, etc) Time can be split over to a independent variable. \\
  2) Actor: all related, age, sex, etc. \\
(2) Output: the fitness score for this actor in this movie.

## 2.3 Analysis of the coeff in the model for each actor.
(1) Are there any actors that the regression is not significant or coefficient all close to 0? This means the actor is very general.(all movies have similar revenues)

## 2.4 Build the model but exclude sex as input feature. 
#### 2.4.1 Build the model for actor and actress. Are the coefficients for man significantly higher than women?
Potential binding effect for man and woman actors. E.g., woman has higher coeff in love movie while man higher in action, or different character personas

## 2.5 Genre split. Maybe

## 2.6 Time-related analysis. Maybe
Detect outdated actor?

## 2.7 Actor recommendation
By analyzing the coefficient. Different time, different genre. 

## Play with DATA

[CMU Movie Summary Corpus](http://www.cs.cmu.edu/~ark/personas/)

`plot_summaries.txt` [29 M]

Plot summaries of 42,306 movies extracted from the November 2, 2012 dump of English-language Wikipedia.  Each line contains the Wikipedia movie ID (which indexes into movie.metadata.tsv) followed by the summary.


`corenlp_plot_summaries.tar` [628 M, separate download]

The plot summaries from above, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).


### TEST DATA
`tvtropes.clusters.txt`

72 character types drawn from tvtropes.com, along with 501 instances of those types.  The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv.

`name.clusters.txt`


970 unique character names used in at least two different movies, along with 2,666 instances of those types.  The ID field indexes into the Freebase character/actor map ID in character.metadata.tsv


In [None]:
# If you already downloaded CoreNLP data, you can avoid downloading by
# put it to data/corenlp_plot_summaries.tar
%cd /content/drive/MyDrive/EPFL_course_project/ada-2022-project-superplainteamname2022/
!sh ./data/setup.sh

/content/drive/MyDrive/EPFL_course_project/ada-2022-project-superplainteamname2022
./data/setup.sh: 19: ./data/setup.sh: Syntax error: "fi" unexpected (expecting "then")


In [None]:
character_metadata.query('actor_age <= 0')[['movie_release_date', 'actor_birthdate','actor_age']]

Unnamed: 0,movie_release_date,actor_birthdate,actor_age
767,1934-05-02,1963-11-07,-29.0
2286,1918-04-14,1931-03-25,-12.0
3892,1965-01-01,1983-03-03,-18.0
6666,1924-01-01,1972-11-07,-48.0
7188,1955-08-07,1973-08-01,-17.0
...,...,...,...
446570,1999-10-03,NaT,-937.0
446581,1955-01-01,1967-05-31,-12.0
446583,1944-02-23,1947-05-28,-3.0
446816,1941-06-20,1957-04-19,-15.0


In [None]:
# play with the data - check the calculation of actor age

calculated_age = (character_metadata.movie_release_date - character_metadata.actor_birthdate).astype('timedelta64[Y]')
ages = character_metadata[['freebase_actor_id', 'actor_age', 'actor_birthdate', 'movie_release_date']]
ages['calculated_age'] = calculated_age
ages['diff'] = ages['actor_age'] - ages['calculated_age']

print("diff>1 :{}".format(ages[ages['diff'].apply(lambda x: not np.isnan(x) and np.abs(x) > 1)]))
ages[ages['diff'].apply(lambda x: not np.isnan(x) and x != 0)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ages['calculated_age'] = calculated_age
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ages['diff'] = ages['actor_age'] - ages['calculated_age']


diff>1 :Empty DataFrame
Columns: [freebase_actor_id, actor_age, actor_birthdate, movie_release_date, calculated_age, diff]
Index: []


Unnamed: 0,freebase_actor_id,actor_age,actor_birthdate,movie_release_date,calculated_age,diff
34,/m/0bwh7d8,40.0,1947-01-01,1988-01-01,41.0,-1.0
164,/m/02w09gx,36.0,1949-01-01,1986-01-01,37.0,-1.0
767,/m/01wlly9,-29.0,1963-11-07,1934-05-02,-30.0,1.0
962,/m/07m9cm,44.0,1963-12-19,2008-12-18,45.0,-1.0
1179,/m/09vz5s,36.0,1937-01-01,1974-01-01,37.0,-1.0
...,...,...,...,...,...,...
447210,/m/02pb53,-9.0,1942-02-08,1932-08-09,-10.0,1.0
447504,/m/0f12r29,76.0,1933-01-01,2010-01-01,77.0,-1.0
449604,/m/0cm19f,56.0,1915-01-01,1972-01-01,57.0,-1.0
449664,/m/01g42,52.0,1913-11-02,1966-11-02,53.0,-1.0


Some ages has error 1. Some ages are negative...

In [None]:
# CoreNLP: https://stanfordnlp.github.io/CoreNLP/

def load_coreNLP_data(wiki_movie_id: int):
    """
    data/corenlp_plot_summaries/{wiki_movie_id}.xml.gz
    """
    from bs4 import BeautifulSoup
    import gzip
    
    xml = f'data/corenlp_plot_summaries/{wiki_movie_id}.xml.gz'
    with gzip.open(xml, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    return soup

In [None]:
data = load_coreNLP_data(3217)
# data is like:
# <document>
#   <sentences>
#       <sentence>
#           ...
#       </sentence>
#   </sentences>
#   <coreference>
#      <coreference>
#         ...
#      </coreference>
#  </coreference>
# </document>
print(set(tag.name for tag in data.document.find_all(recursive=False)))
print(set(tag.name for tag in data.sentences.find_all(recursive=False)))
print(set(tag.name for tag in data.sentences.sentence.find_all(recursive=False)))
# print(data.sentence)

print(set(tag.name for tag in data.coreference.find_all(recursive=False)))
print(set(tag.name for tag in data.coreference.coreference.find_all(recursive=False)))
print(data.coreference.coreference.prettify())



{'coreference', 'sentences'}
{'sentence'}
{'parse', 'basic-dependencies', 'collapsed-dependencies', 'collapsed-ccprocessed-dependencies', 'tokens'}
{'coreference'}
{'mention'}
<coreference>
 <mention representative="true">
  <sentence>
   1
  </sentence>
  <start>
   23
  </start>
  <end>
   26
  </end>
  <head>
   24
  </head>
 </mention>
 <mention>
  <sentence>
   3
  </sentence>
  <start>
   18
  </start>
  <end>
   20
  </end>
  <head>
   18
  </head>
 </mention>
</coreference>

