# 1. IMPORTS

## 1.1 Libraries

In [37]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## 1.2 Data

In [53]:
folder = "../temporary/"

df_events=pd.read_csv(folder + "usa_historical_events.csv")
df_treated=pd.read_csv(folder + "cleaned_data.csv",index_col = 'Wikipedia movie ID')

PLOT_SUMMARIES = "../data/plot_summaries.txt"

with open(PLOT_SUMMARIES, 'r', encoding='utf-8') as file:
    plot_summaries = file.readlines()

# 2. PLOT TONE SCORE

In [55]:
# For this, we use the "VADER sentiment analysis" library which contains a dictionary. Each of the words inside are associated to a polarity score depending on how positive or negative they are. One total 
#score is computed per summary, and then it is normalized and takes a value between -1 and +1 to categorize the text as positive, negative or neutral. 

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    compound_score = score['compound']
    if compound_score >= 0.1:
        sentiment = 'positive'
    elif compound_score <= -0.1:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    return sentiment, compound_score

In [57]:
data = []

for line in plot_summaries:
    parts = line.strip().split('\t', 1) # Split each line by the tab character
    if len(parts) == 2:
        movie_id, summary = parts
        sentiment, compound_score = get_sentiment(summary)
        data.append([movie_id, summary, sentiment, compound_score])

df_sent = pd.DataFrame(data, columns=['ID', 'Summary', 'Sentiment', 'Compound Score'])
df_sent = df_sent.set_index('ID')

df_sent.head(10)

Unnamed: 0_level_0,Summary,Sentiment,Compound Score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23890098,"Shlykov, a hard-working taxi driver and Lyosha...",neutral,0.0083
31186339,The nation of Panem consists of a wealthy Capi...,negative,-0.9941
20663735,Poovalli Induchoodan is sentenced for six yea...,negative,-0.9867
2231378,"The Lemon Drop Kid , a New York City swindler,...",negative,-0.6127
595909,Seventh-day Adventist Church pastor Michael Ch...,negative,-0.9538
5272176,The president is on his way to give a speech. ...,negative,-0.9946
1952976,"{{plot}} The film opens in 1974, as a young gi...",negative,-0.7904
24225279,"The story begins with Hannah, a young Jewish t...",negative,-0.5064
2462689,Infuriated at being told to write one final co...,negative,-0.9898
20532852,A line of people drool at the window of the s...,negative,-0.8176


In [58]:
df_sent.index = df_sent.index.astype('int64')
df_merged = pd.merge(df_treated, df_sent, left_index=True, right_index=True, how='left')

In [62]:
df_event[['Sentiment', 'Compound Score']] = df_event.apply(
    lambda row: pd.Series(get_sentiment(row['Description'])) if pd.notna(row['Description']) else pd.Series(get_sentiment(row['Title'])),
    axis=1)

NameError: name 'df_event' is not defined

In [None]:
df_merged.head(10)

In [None]:
df_event.head(10)

# 3. EXPORT CLEAN DATASETS

In [None]:
df_merged.to_csv('../generated/cleaned_data.csv', index=True, encoding='utf-8')

In [None]:
df_event.to_csv('../generated/usa_historical_events.csv',index = False)