# 1. IMPORTS

## 1.1 Libraries

In [22]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## 1.2 Data

In [24]:
# Load the merged movie dataset, using the Wikipedia movie ID as the row index
folder = "../temporary/"
df_treated = pd.read_csv(
    f"{folder}merged_data.csv",
    index_col="Wikipedia movie ID",
)


# 2. PLOT TONE SCORE

In [26]:
# Plot tone is scored with the "VADER sentiment analysis" library, which relies on a dictionary (lexicon)
# in which each word is associated with a polarity score reflecting how positive or negative it is.
# One total score is computed per summary and then normalized to a value between -1 and +1, which is
# used to categorize the text as positive, negative or neutral.

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text, threshold=0.1):
    """Label a text as positive, negative or neutral from its VADER compound score.

    Parameters
    ----------
    text : str
        Text to score (in this notebook, a movie plot summary).
    threshold : float, default 0.1
        Symmetric cut-off applied to the compound score: a score
        ``>= threshold`` is 'positive', a score ``<= -threshold`` is
        'negative', anything in between is 'neutral'.
        NOTE: the VADER README suggests +/-0.05 as the typical cut-off; 0.1 is
        kept as the default so existing results of this notebook do not change.

    Returns
    -------
    tuple
        ``(sentiment, compound_score)`` where ``sentiment`` is one of
        'positive', 'negative' or 'neutral'. For non-string input (e.g. a
        missing summary read as NaN or None) returns ``(np.nan, np.nan)``.
    """
    # Missing summaries come out of read_csv as float NaN; the analyzer expects
    # text, so report "no sentiment" instead of raising on such values.
    if not isinstance(text, str):
        return np.nan, np.nan

    compound_score = analyzer.polarity_scores(text)['compound']

    if compound_score >= threshold:
        sentiment = 'positive'
    elif compound_score <= -threshold:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    return sentiment, compound_score

In [28]:
# Score every plot summary in a single pass over the 'Summary' column.
# Iterating over the column (instead of a row-wise DataFrame.apply that builds
# one pd.Series per row) avoids a large per-row overhead on this dataset.
# Rows without a summary get NaN in both new columns.
sentiment_rows = [
    get_sentiment(summary) if pd.notna(summary) else (np.nan, np.nan)
    for summary in df_treated['Summary']
]

# Build the result on the same index as df_treated so the assignment is
# positional and does not require index alignment.
df_treated[['Sentiment', 'Compound Score']] = pd.DataFrame(
    sentiment_rows,
    index=df_treated.index,
    columns=['Sentiment', 'Compound Score'],
)

In [29]:
# Preview the first 10 rows to check the new 'Sentiment' and 'Compound Score' columns
df_treated.head(10)

Unnamed: 0_level_0,Movie name,Movie release month,Movie release year,Movie release date,Movie box office revenue,Movie runtime,Movie countries,Movie genres,Summary,budget,Sentiment,Compound Score
Wikipedia movie ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
975900,Ghosts of Mars,8.0,2001.0,2001-08-24,24771180.0,98.0,['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...","In 2176, a Martian police unit is sent to pick...",49504060.0,negative,-0.9112
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2.0,2000.0,2000-02-16,,95.0,['United States of America'],"['Mystery', 'Biographical film', 'Drama', 'Cri...",Dramatization of the story behind the murder o...,,negative,-0.9451
28463795,Brun bitter,,1988.0,,,83.0,['Norway'],"['Crime Fiction', 'Drama']",,,,
9363483,White Of The Eye,,1987.0,,,110.0,['United Kingdom'],"['Thriller', 'Erotic thriller', 'Psychological...",,,,
261236,A Woman in Flames,5.0,1983.0,1983-05-11,,106.0,['Germany'],['Drama'],"Eva, an upper-class housewife, frustratedly le...",,negative,-0.2732
13696889,The Gangsters,5.0,1913.0,1913-05-29,,35.0,['United States of America'],"['Short Film', 'Silent film', 'Indie', 'Black-...",An amusing burlesque of gang fighters. The pol...,,positive,0.7269
18998739,The Sorcerer's Apprentice,4.0,2002.0,2002-04-12,,86.0,['South Africa'],"['Family Film', 'Fantasy', 'Adventure', 'World...","Every hundred years, the evil sorceress Morgan...",,negative,-0.8885
10408933,Alexander's Ragtime Band,8.0,1938.0,1938-08-16,76195730.0,106.0,['United States of America'],"['Musical', 'Comedy', 'Black-and-white']","Classical violinist, Roger Grant disappoints h...",42330960.0,positive,0.8873
9997961,Contigo y aquí,9.0,1974.0,1974-09-04,,,['Argentina'],"['Musical', 'Drama', 'Comedy']",,,,
2345652,City of the Dead,,1960.0,,,76.0,['United Kingdom'],"['Horror', 'Supernatural']",,,,


# 3. EXPORT CLEAN DATASETS

In [36]:
# Save the dataset enriched with the plot-tone columns next to the merged input.
# `folder` comes from the data-loading cell, so the working directory is defined
# in a single place; index=True keeps the 'Wikipedia movie ID' index in the file.
df_treated.to_csv(folder + "with_plottone_data.csv", index=True)

In [40]:
# Count missing values per column. 'Sentiment' and 'Compound Score' are only filled
# for rows that have a 'Summary', so their missing counts should match Summary's.
df_treated.isna().sum()

Movie name                      0
Movie release month         21171
Movie release year           6834
Movie release date          21171
Movie box office revenue    70535
Movie runtime               20379
Movie countries                 0
Movie genres                    0
Summary                     32459
budget                      73232
Sentiment                   32459
Compound Score              32459
dtype: int64