### Code for emotion score vector computation

In [1]:
# importing libraries
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from datetime import datetime, date, time
from dateutil.parser import parse

In [2]:
# Setting up a color blind friendly pallete
CB_color_cycle = ['#377eb8','#ff7f00','#4daf4a',
                  '#f781bf','#a65628','#984ea3',
                  '#999999','#e41a1c','#dede00']

Xiaocheng's code for summary processing

In [3]:
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def preprocess_summary(text):
    """
    Tokenize, lemmatize, remove stopwords and punctuations from an input text.
    
    Parameters
    ----------
    text: str, input text
    
    Returns
    -------
    str, preprocessed text
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    
    tokens = nltk.word_tokenize(text)
    text = [word for word in tokens if word not in string.punctuation]
    
    return " ".join([lemmatizer.lemmatize(word.lower()) for word in text if word.lower() not in stop_words])

In [5]:
"""
summaries: dictionary, with movie_id as keys and list of preprocessed words in the summary as values
"""
with open("./test_data/plot_summaries.txt", encoding='utf-8') as f:
    content = f.readlines()
original_summaries = [x.strip() for x in content] 
summaries = [preprocess_summary(d).split() for d in original_summaries]
summaries = {summary[0]: summary[1:] for summary in summaries}

End of Xiaocheng's code

In [6]:

# Loading the NRC lexicon emotion intensity data
data = pd.read_table("NRC-lexicon/NRC-Emotion-Intensity-Lexicon-v1-ForVariousLanguages-withZeroIntensityEntries.txt")
data_filt = data.iloc[:,0:9].copy()

In [None]:
# Calculating the emotion intensity vector for each of the movies
emo_vector = pd.DataFrame(columns=data_filt.columns)
emo_vector = emo_vector.drop(columns = "English Word")

for key in summaries:
 test = pd.DataFrame(columns=data_filt.columns)
 for i in range(0,len(summaries[key])):
  selection = data_filt[data_filt["English Word"] == summaries[key][i]]
  test = pd.concat([test, selection], ignore_index=True)
 test["Movie ID"] = key
 test = test.drop(columns = "English Word")
 test = test.set_index("Movie ID")
 test = test.groupby("Movie ID").sum()
 emo_vector = pd.concat([emo_vector, test], ignore_index=False)

emo_vector.reset_index(inplace=True)
emo_vector.rename(columns={"index": "Wikipedia movie ID"}, inplace=True)

# Saving the new dataframe with the emotion vectors 
emo_vector.to_csv("MovieIDs_emotions.csv")

In [12]:
emo_vector = pd.read_csv("MovieIDs_emotions.csv")
emo_vector.reset_index(inplace=True)
emo_vector.rename(columns={"index": "Movie ID"}, inplace=True)
emo_vector.head()

Unnamed: 0,Movie ID,Movie ID.1,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,0,23890098,0.545,0.508,0.0,0.0,0.514,0.0,0.578,0.656
1,1,31186339,21.194,12.817,8.796,28.742,13.018,16.055,7.102,17.189
2,2,20663735,12.641,6.514,3.917,14.27,10.557,11.749,3.398,17.851
3,3,2231378,10.029,14.45,3.774,11.866,42.855,8.775,6.217,14.474
4,4,595909,6.694,7.516,2.769,7.93,8.273,8.365,2.32,15.985


In [9]:
# Using Mya's code to parse the movie metadata:
# Define the column names based on the metadata structure.
column_names = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages",
    "Movie countries",
    "Movie genres",
]

# Read the TSV file into a pandas DataFrame and specify that it's tab-separated.
movie_md = pd.read_csv("test_data/movie.metadata.tsv", sep='\t', names=column_names, header=None) # <- Mya's code ends here

In [10]:
# Using Mya's code to extract genres:
# Function to extract the genres
def extract_genres(genre_data):
    genre_names = []
    pattern = r'"([^"]+)"\s*:\s*"([^"]+)"'
    matches = re.findall(pattern, genre_data)
    for match in matches:
        genre_names.append(match[1])  # Extract the genre name
    return ','.join(genre_names)

# Apply the function to extract genre names
movie_md["Movie genres"] = movie_md["Movie genres"].apply(extract_genres)
movie_md['Movie genres'] = movie_md['Movie genres'].apply(lambda x: x.split(','))

In [11]:
# Joinning the dataframes by 'Wikipedia movie ID' while removing all rows in movie_md for which we
# do not have plot summaries.

emo_vector.rename(columns={"Movie ID": "Wikipedia movie ID"}, inplace=True)

df = emo_vector.merge(movie_md, on='Wikipedia movie ID', how='left')
df.sample(10)

ValueError: The column label 'Wikipedia movie ID' is not unique.

In [None]:
# Remove rows for which the release date is unknown
df_filtered = df[df["Movie release date"].notna()]
len(df), len(df_filtered)

In [None]:
# Handling dates in the dataframe
df_filtered["Movie release year"] = df_filtered["Movie release date"].apply(lambda x: parse(x).year)

In [None]:
plt.scatter(df_filtered["Movie release year"], df_filtered["anger"])

In [None]:
# Seems like someone was really ahead of their time! Let's see who were our movie pioneers
df_filtered[df_filtered["Movie release year"] < 1200]

In [None]:
# By checking on the internet I saw that the correct release date is 2010, so I can just correct the dataframe
df_filtered.loc[26305, "Movie release date"] = '2010-12-02'
df_filtered.loc[26305, "Movie release year"] = 2010
df_filtered.loc[26305]

In [None]:
# Creating a new column with just the main genre of the movie
df_filtered["Main genre"] = df_filtered["Movie genres"].apply(lambda x: x[0])

In [None]:
# Count the frequency of each genre and get the 10 most frequent genres
top_10_genres = df_filtered['Main genre'].value_counts().head(10).index.tolist()

# Filter the DataFrame to keep only movies belonging to the top 10 genres
df_topmg = df_filtered[df_filtered['Main genre'].isin(top_10_genres)]

df_topmg

In [None]:
# Plotting the raw intensity scores per year for each on of the top 10 most common genres
fig, ax = plt.subplots(10, 1, figsize=(20, 70))

for j in range(len(top_10_genres)):
    for i in range(1, 9):
        sns.pointplot(
            x="Movie release year",
            y=df_topmg.columns[i],
            data=df_topmg[(df_topmg["Main genre"] == top_10_genres[j])],
            estimator="median",
            color=CB_color_cycle[i-1],
            label=df_topmg.columns[i],
            errorbar = None,
            ax=ax[j]
        )
        
        ax[j].legend(loc='upper right')
        ax[j].set_title(top_10_genres[j])
        ax[j].set_ylabel('Median Emotion Intensity Score')
        ax[j].set_xlabel('Movie Release Year')
        ax[j].tick_params(axis='x', rotation=90)

plt.show()


In [None]:
# Normalizing the emotion scores in percentage
df_topmg_norm = df_topmg.copy()
Total_score = df_topmg_norm[df_topmg_norm.columns[1:9]].sum(axis=1).copy()
for i in range(1,9):
    df_topmg_norm[df_topmg_norm.columns[i]] = df_topmg_norm[df_topmg_norm.columns[i]]*100/Total_score

df_topmg_norm[df_topmg_norm.columns[1:9]].sum(axis=1)

In [None]:
# Plotting the normalized emotion scores per year for each on of the top 10 most common genres
fig, ax = plt.subplots(10, 1, figsize=(20, 70), sharey = True)

for j in range(len(top_10_genres)):
    for i in range(1, 9):
        sns.pointplot(
            x="Movie release year",
            y=df_topmg_norm.columns[i],
            data=df_topmg_norm[(df_topmg_norm["Main genre"] == top_10_genres[j])],
            estimator="median",
            color=CB_color_cycle[i-1],
            label=df_topmg.columns[i],
            #errorbar=('ci', 95),
            errorbar = None,
            ax=ax[j]
        )
        
        ax[j].legend(loc='upper right')
        ax[j].set_title(top_10_genres[j])
        ax[j].set_ylabel('Median Emotion Intensity Score')
        ax[j].set_xlabel('Movie Release Year')
        ax[j].tick_params(axis='x', rotation=90)

plt.show()
