# 1. Imports

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup 
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.express as px


In [None]:
# Directory holding the pre-processed CSV exports, relative to the notebook.
folder = "generated/"

# Load the historical-events table and the cleaned movie table.
events_path = folder + "usa_historical_events"
movies_path = folder + "cleaned_data"

df_events = pd.read_csv(events_path)
df_movies = pd.read_csv(movies_path)

In [None]:
# Preview the first rows of both tables. Jupyter only renders the value of the
# last bare expression in a cell, so display() is used to show both frames.
display(df_events.head(5), df_movies.head(5))

# 2. Preliminary Analysis

## 2.1 Number of movies

In [None]:
# Count movies per release year, ordered chronologically.
movies_per_year = df_movies['Movie release year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
movies_per_year.plot(kind='line', ax=ax)
ax.set_xlabel("Movie Release Year")
ax.set_ylabel("Number of Movies")
ax.set_title("Number of Movies by Release Year")
plt.xticks(rotation=45)
plt.show()

## 2.2 Box office

In [None]:
# Keep only movies that have both a release year and a box-office figure.
df_plot = df_movies.dropna(subset=['Movie release year', 'Movie box office revenue'])

# Per-year mean revenue together with the number of movies backing that mean.
average_revenue_stats = (
    df_plot
    .groupby('Movie release year')['Movie box office revenue']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Average Revenue', 'count': 'Sample Count'})
)
average_revenue_stats.head(5)

In [None]:
# Dual-axis figure: mean revenue as a line (left axis) and the number of movies
# behind each mean as translucent bars (right axis).
release_years = average_revenue_stats.index
revenue_color = "tab:blue"
count_color = "tab:orange"

fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: average revenue per year.
ax1.plot(
    release_years,
    average_revenue_stats['Average Revenue'],
    color=revenue_color,
    label="Average Revenue",
)
ax1.set_xlabel("Movie Release Year")
ax1.set_ylabel("Average Box Office Revenue", color=revenue_color)
ax1.tick_params(axis='y', labelcolor=revenue_color)
ax1.grid(True)

# Right axis shares the x-axis and shows the sample size per year.
ax2 = ax1.twinx()
ax2.bar(
    release_years,
    average_revenue_stats['Sample Count'],
    color=count_color,
    alpha=0.3,
    label="Sample Count",
)
ax2.set_ylabel("Sample Count", color=count_color)
ax2.tick_params(axis='y', labelcolor=count_color)

fig.suptitle("Average Box Office Revenue and Sample Count by Movie Release Year")
fig.tight_layout()
plt.show()

## 2.3 Languages

In [None]:
# Flatten the per-movie language lists and count how often each language occurs.
# NOTE(review): df_treated is not created in any cell visible above -- confirm it
# is defined before this cell runs on a fresh kernel.
# Assumes every entry of 'Movie languages' is a list -- TODO confirm.
# A nested comprehension flattens in linear time; sum(lists, []) re-copies the
# accumulated list on every addition and is quadratic in the number of movies.
language_list = [
    language
    for movie_languages in df_treated['Movie languages']
    for language in movie_languages
]
language_counts = pd.Series(language_list).value_counts()

# Bar chart of the 20 most frequent languages.
plt.figure(figsize=(10, 6))
language_counts[:20].plot(kind='bar')
plt.xlabel("Language")
plt.ylabel("Frequency")
plt.title("Frequency of Languages in Movie Data")
plt.xticks(rotation=45)
plt.show()

# Movies with an empty language list contribute nothing to the counts, i.e. an
# empty list behaves like a missing value here.
# print() renders the table with real line breaks; a bare to_string() as the
# last expression would be shown as a quoted string with literal "\n".
print(language_counts.to_string())

## 2.4 Genres

In [None]:
# Flatten the per-movie genre lists and count how often each genre occurs.
# NOTE(review): df_treated is not created in any cell visible above -- confirm it
# is defined before this cell runs on a fresh kernel.
# Assumes every entry of 'Movie genres' is a list -- TODO confirm.
# A nested comprehension flattens in linear time; sum(lists, []) re-copies the
# accumulated list on every addition and is quadratic in the number of movies.
genres_list = [
    genre
    for movie_genres in df_treated['Movie genres']
    for genre in movie_genres
]

genre_counts = pd.Series(genres_list).value_counts()

# Bar chart of the 20 most frequent genres.
plt.figure(figsize=(10, 6))
genre_counts[:20].plot(kind='bar')
plt.xlabel("Genre")
plt.ylabel("Frequency")
plt.title("Frequency of Genres in Movie Data")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Show the full genre frequency table. print() renders it with real line
# breaks; a bare to_string() would be echoed as a quoted string with "\n".
print(genre_counts.to_string())

## 2.5 Countries

In [None]:
# Flatten the per-movie country lists and count how often each country occurs.
# NOTE(review): df_treated is not created in any cell visible above -- confirm it
# is defined before this cell runs on a fresh kernel.
# Assumes every entry of 'Movie countries' is a list -- TODO confirm.
# A nested comprehension flattens in linear time; sum(lists, []) re-copies the
# accumulated list on every addition and is quadratic in the number of movies.
countries_list = [
    country
    for movie_countries in df_treated['Movie countries']
    for country in movie_countries
]

country_counts = pd.Series(countries_list).value_counts()

# Bar chart of the 20 most frequent countries.
plt.figure(figsize=(10, 6))
country_counts[:20].plot(kind='bar')
plt.xlabel("Country")
plt.ylabel("Frequency")
plt.title("Frequency of Countries in Movie Data")
plt.xticks(rotation=45)
plt.show()
# Movies with an empty country list contribute nothing to the counts, i.e. an
# empty list behaves like a missing value here.

In [None]:
# Show the full country frequency table. print() renders it with real line
# breaks; a bare to_string() would be echoed as a quoted string with "\n".
print(country_counts.to_string())

## 2.6 Sentiment Score

In [None]:
# Mean compound sentiment score per release year, as a one-column DataFrame.
# NOTE(review): df_merged is not created in any cell visible above -- confirm it
# is defined before this cell runs on a fresh kernel.
plot_df = df_merged.dropna(subset=['Movie release year', 'Compound Score'])

mean_compound_score_per_year = (
    plot_df
    .groupby('Movie release year')['Compound Score']
    .mean()
    .to_frame()
)


In [None]:
# Static line chart of the yearly mean compound score.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    mean_compound_score_per_year.index,
    mean_compound_score_per_year['Compound Score'],
)
ax.set_xlabel('Movie Release Year')
ax.set_ylabel('Mean Compound Score')
ax.set_title('Mean Compound Score per Movie Release Year')
ax.grid(True)
plt.show()

In [None]:
# Interactive version of the yearly mean compound score chart.
axis_labels = {'Movie release year': 'Year', 'Compound Score': 'Mean Compound Score'}
hover_text = "Year: %{x}<br>Compound Score: %{y:.2f}"

fig = px.line(
    mean_compound_score_per_year,
    x=mean_compound_score_per_year.index,
    y='Compound Score',
    title='Mean Compound Score per Movie Release Year',
    labels=axis_labels,
)

# Custom hover box: year plus the score rounded to two decimals.
fig.update_traces(hovertemplate=hover_text)

fig.show()

In [None]:
# Mean compound sentiment score per release month, pooled over all years.
# NOTE(review): df_merged is not created in any cell visible above -- confirm it
# is defined before this cell runs on a fresh kernel.
plot_df = df_merged.dropna(subset=['Movie release month', 'Compound Score'])

# Group by 'Movie release month' and calculate the mean of 'Compound Score'.
mean_compound_score_per_month = (
    plot_df
    .groupby('Movie release month')['Compound Score']
    .mean()
    .to_frame()
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    mean_compound_score_per_month.index,
    mean_compound_score_per_month['Compound Score'],
)
ax.set_xlabel('Movie Release Month')
ax.set_ylabel('Mean Compound Score')
ax.set_title('Mean Compound Score per Movie Release Month')
ax.grid(True)
plt.show()

In [None]:
# Monthly mean compound score, one line per decade, restricted to the five
# decades that have the most scored movies.
# NOTE(review): df_merged is not created in any cell visible above -- confirm it
# is defined before this cell runs on a fresh kernel.
plot_df = (
    df_merged
    # Rows without a release year cannot be assigned a decade, so drop them up
    # front together with rows missing the month or the score.
    .dropna(subset=['Movie release year', 'Movie release month', 'Compound Score'])
    # assign() builds a new frame, avoiding a SettingWithCopyWarning from
    # writing a column onto the result of dropna().
    .assign(Decade=lambda frame: (frame['Movie release year'] // 10) * 10)
)

top_decades = plot_df['Decade'].value_counts().nlargest(5).index

# The mask must come from plot_df itself (the original referenced an undefined
# name, filtered_df, which raised NameError).
top_decades_df = plot_df[plot_df['Decade'].isin(top_decades)]

# Rows: decade, columns: release month, values: mean compound score.
mean_compound_score_per_decade_month = (
    top_decades_df
    .groupby(['Decade', 'Movie release month'])['Compound Score']
    .mean()
    .unstack()
)

plt.figure(figsize=(10, 6))

for decade in mean_compound_score_per_decade_month.index:
    plt.plot(
        mean_compound_score_per_decade_month.columns,
        mean_compound_score_per_decade_month.loc[decade],
        # int() keeps the legend as "1990s" even when the year column is float.
        label=f'{int(decade)}s',
    )

plt.xlabel('Movie Release Month')
plt.ylabel('Mean Compound Score')
plt.title('Mean Compound Score per Movie Release Month (Top 5 Decades with Most Data)')
plt.legend(title='Decade')
plt.grid(True)
plt.show()


In [None]:
# Average the monthly means across each decade's row. This is an unweighted
# mean of the monthly values, not a mean over individual movies.
row_means = mean_compound_score_per_decade_month.mean(axis=1)

mean_compound_score_per_decade = row_means.to_frame(name='Mean Compound Score')

mean_compound_score_per_decade

In [None]:
# Interactive time series of the compound sentiment score of the events table.
layout_options = dict(
    xaxis_title='Date',
    yaxis_title='Compound Score',
    hovermode="x unified",
    template="plotly_white",
    width=800,
    height=500,
)

fig = px.line(
    df_events,
    x='Date',
    y='Compound Score',
    title='Compound Sentiment Score Over Time',
    labels={'Date': 'Date', 'Compound Score': 'Compound Sentiment Score'},
)

# Draw a marker on every data point in addition to the connecting line.
fig.update_traces(mode='lines+markers')
fig.update_layout(**layout_options)

fig.show()