# 1. IMPORTS

## 1.1 Libraries

In [5]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## 1.2 Data

In [8]:
# Intermediate tables produced by earlier steps of the pipeline.
folder = "../temporary/"
df_events = pd.read_csv(f"{folder}usa_historical_events.csv")
df_treated = pd.read_csv(f"{folder}cleaned_data.csv", index_col="Wikipedia movie ID")

# Movie table that already carries the budget information.
folder = "../generated/"
df_final = pd.read_csv(f"{folder}movies_with_budget.csv", index_col="Wikipedia movie ID")

# Raw plot summaries, kept as a list with one entry per line of the text file.
PLOT_SUMMARIES = "../data/plot_summaries.txt"
with open(PLOT_SUMMARIES, mode="r", encoding="utf-8") as file:
    plot_summaries = list(file)

# 2. PLOT TONE SCORE

In [10]:
# Plot tone is scored with the "VADER sentiment analysis" library, which relies on a
# dictionary: each word it contains is associated with a polarity score reflecting how
# positive or negative that word is. One total score is computed per summary, then
# normalized to a value between -1 and +1 that is used to categorize the text as
# positive, negative or neutral.

# Single analyzer instance, read as a global by get_sentiment().
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text, threshold=0.1):
    """Classify the tone of ``text`` from the analyzer's compound score.

    Parameters
    ----------
    text : str
        Text to score. A missing scalar (``None``, ``NaN``, ``pd.NA``) is not
        passed to the analyzer.
    threshold : float, default 0.1
        Symmetric cut-off applied to the compound score: a score
        ``>= threshold`` is labelled ``'positive'``, a score ``<= -threshold``
        is labelled ``'negative'`` and anything in between ``'neutral'``.

    Returns
    -------
    tuple
        ``(sentiment, compound_score)``. Both items are ``np.nan`` when
        ``text`` is missing, which is the same placeholder the notebook uses
        for movies without a summary.
    """
    # Missing text (e.g. an empty CSV cell read back as NaN) has nothing to score.
    # Returning NaNs here keeps every caller consistent without a guard of its own,
    # rather than handing a non-text value to the analyzer.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return np.nan, np.nan

    compound_score = analyzer.polarity_scores(text)['compound']

    if compound_score >= threshold:
        sentiment = 'positive'
    elif compound_score <= -threshold:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    return sentiment, compound_score

In [4]:
# Every usable line is expected to look like "<movie id>\t<summary>".
data = []

for raw_line in plot_summaries:
    movie_id, tab, summary = raw_line.strip().partition('\t')
    if not tab:
        # No tab separator, so the line does not hold the two expected fields.
        continue
    # get_sentiment returns (label, compound score); unpack both into the record.
    data.append([movie_id, summary, *get_sentiment(summary)])

df_sent = pd.DataFrame(
    data,
    columns=['ID', 'Summary', 'Sentiment', 'Compound Score'],
).set_index('ID')

df_sent.head(10)

Unnamed: 0_level_0,Summary,Sentiment,Compound Score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha...",neutral,0.0083
31186339,The nation of Panem consists of a wealthy Capi...,negative,-0.9941
20663735,Poovalli Induchoodan is sentenced for six yea...,negative,-0.9867
2231378,"The Lemon Drop Kid , a New York City swindler,...",negative,-0.6127
595909,Seventh-day Adventist Church pastor Michael Ch...,negative,-0.9538
5272176,The president is on his way to give a speech. ...,negative,-0.9946
1952976,"{{plot}} The film opens in 1974, as a young gi...",negative,-0.7904
24225279,"The story begins with Hannah, a young Jewish t...",negative,-0.5064
2462689,Infuriated at being told to write one final co...,negative,-0.9898
20532852,A line of people drool at the window of the s...,negative,-0.8176


In [5]:
# The IDs parsed from the text file are strings; cast them to int64 so they can be
# matched against the 'Wikipedia movie ID' index of df_treated.
df_sent.index = df_sent.index.astype('int64')

# Left merge on the index: every row of df_treated is kept, with or without a summary.
df_merged = df_treated.merge(
    df_sent,
    how='left',
    left_index=True,
    right_index=True,
)

In [8]:
def _score_event(row):
    """Score an event from its description, using the title when the description is missing."""
    event_text = row['Description'] if pd.notna(row['Description']) else row['Title']
    return pd.Series(get_sentiment(event_text))


df_events[['Sentiment', 'Compound Score']] = df_events.apply(_score_event, axis=1)

In [9]:
# Preview the merge result. Because the merge is a left join, rows without a matching
# entry in df_sent keep NaN in the Summary / Sentiment / Compound Score columns.
df_merged.head(10)

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",,,
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983
261236,A Woman in Flames,,1983.0,,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604
13696889,The Gangsters,5.0,1913.0,1913-05-29,,35.0,"['Silent film', 'English']",['United States of America'],"['Short Film', 'Silent film', 'Indie', 'Black-...",,,
18998739,The Sorcerer's Apprentice,,2002.0,,,86.0,['English'],['South Africa'],"['Family Film', 'Fantasy', 'Adventure', 'World...","Every hundred years, the evil Morgana returns...",negative,-0.8885
10408933,Alexander's Ragtime Band,8.0,1938.0,1938-08-16,76195730.0,106.0,['English'],['United States of America'],"['Musical', 'Comedy', 'Black-and-white']",,,
9997961,Contigo y aquí,,1974.0,,,,['Spanish'],['Argentina'],"['Musical', 'Drama', 'Comedy']",,,
2345652,City of the Dead,,1960.0,,,76.0,['English'],['United Kingdom'],"['Horror', 'Supernatural']",,,


In [12]:
def _score_summary(row):
    """Score a movie's summary; movies without a summary get NaN for both values."""
    summary = row['Summary']
    if pd.isna(summary):
        return pd.Series([np.nan, np.nan])
    return pd.Series(get_sentiment(summary))


df_final[['Sentiment', 'Compound Score']] = df_final.apply(_score_summary, axis=1)

In [13]:
# Preview the final dataset after assigning the Sentiment and Compound Score columns.
df_final.head(10)

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,Summary,Sentiment,Compound Score,Theme,vote_average,budget
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['English'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","Set in the second half of the 22nd century, th...",negative,-0.9913,Survival,5.127,49504060.0
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['English'],['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",Dramatization of the story behind the murder o...,negative,-0.9451,Mystery,,
28463795,Brun bitter,,1988.0,,,83.0,['Norwegian'],['Norway'],"['Crime Fiction', 'Drama']",,,,,,
9363483,White Of The Eye,,1987.0,,,110.0,['English'],['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",A series of murders of rich young women throug...,negative,-0.9983,Nihilism,,
261236,A Woman in Flames,5.0,1983.0,1983-05-11,,106.0,['German'],['Germany'],['Drama'],"Eva, an upper class housewife, becomes frustra...",positive,0.9604,Empowerment,5.3,
13696889,The Gangsters,5.0,1913.0,1913-05-29,,35.0,"['Silent film', 'English']",['United States of America'],"['Short Film', 'Silent film', 'Indie', 'Black-...",An amusing burlesque of gang fighters. The pol...,positive,0.7269,Comedy,6.0,
18998739,The Sorcerer's Apprentice,4.0,2002.0,2002-04-12,,86.0,['English'],['South Africa'],"['Family Film', 'Fantasy', 'Adventure', 'World...","Every hundred years, the evil Morgana returns...",negative,-0.8885,Choice,4.6,
10408933,Alexander's Ragtime Band,8.0,1938.0,1938-08-16,76195730.0,106.0,['English'],['United States of America'],"['Musical', 'Comedy', 'Black-and-white']","Classical violinist, Roger Grant disappoints h...",positive,0.8873,Love,6.6,42330960.0
9997961,Contigo y aquí,9.0,1974.0,1974-09-04,,,['Spanish'],['Argentina'],"['Musical', 'Drama', 'Comedy']",,,,,,
2345652,City of the Dead,,1960.0,,,76.0,['English'],['United Kingdom'],"['Horror', 'Supernatural']",,,,,,


# 3. EXPORT CLEAN DATASETS

In [12]:
# Write the movie table enriched with summaries and sentiment scores.
# index=True keeps the movie-ID index in the file.
df_merged.to_csv('../generated/cleaned_data.csv', index=True, encoding='utf-8')

In [13]:
# Write the events table with its Sentiment / Compound Score columns; the index is
# not written (index=False).
df_events.to_csv('../generated/usa_historical_events.csv',index = False)

In [14]:
# Write the final movie dataset, index included.
df_final.to_csv('../generated/USE_THIS_DATASET.csv',index = True)

In [15]:
# Number of missing values per column of the final dataset.
df_final.isna().sum()

Movie name                      0
Movie release month         21171
Movie release year           6834
Movie release date          21171
Movie box office revenue    70535
Movie runtime               20379
Movie languages                 0
Movie countries                 0
Movie genres                    0
Summary                     19378
Sentiment                   19378
Compound Score              19378
Theme                       19605
vote_average                39682
budget                      73232
dtype: int64