In [235]:
import pandas as pd
import numpy as np
import plotly
import dataset
import statsmodels as sts
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [236]:
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.0)
plt.rcParams['grid.linewidth'] = 0.5

# Dataset Loading

Keep only necessary dataset(s)

In [237]:
augmented_cmu = dataset.get_augmented_cmu()
imdb_df = dataset.get_imdb_dataset()


Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.



### Community Labels

In [238]:
community_dict = {0:'Romantic Movies & Social Commentary (C0)', 1:'Diverse Drama & Action (C1)', 2:'Lighthearted Entertainment (C2)', 3:'Dark & Suspenseful Fiction (C3)', 4:'Historical & Cultural Narratives (C4)'}

movie_community = pd.read_csv('Output/cmu_community_assignment.csv').drop(columns=['Unnamed: 0'])
movie_community['community_label'] = movie_community['hard_assignment'].apply(lambda x: community_dict.get(x))
cmu_community = pd.merge(left=augmented_cmu, right=movie_community, on='movie_wikipedia_id', how='inner')

In [239]:
cmu_community.columns

Index(['movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 'release_year',
       'revenue', 'runtime', 'languages', 'countries', 'genres',
       'plot_summary', 'language', 'word_count', 'char_count',
       'avg_word_length', 'sentence_count', 'lexical_diversity',
       'sentiment_polarity', 'topic', 'mood', 'target_audience',
       'temporal_setting', 'location_setting', 'imdb_id', 'imdb_name',
       'imdb_year', 'prob_c0', 'prob_c1', 'prob_c2', 'prob_c3', 'prob_c4',
       'hard_assignment', 'community_label'],
      dtype='object')

Get community assignments and labels

### Customize Dataset

To prevent merge errors later, we should always initialize the dataset used in this way. Then, everything below this cell is "guaranteed" to run.

##### Box Office Mojo Revenue Data

Source: scraped

In [240]:
boxofficemojo = dataset.get_boxofficemojo_dataset()
boxofficemojo = boxofficemojo[['imdb_id', 'budget', 'performance_worldwide']]
boxofficemojo.columns = ['imdb_id', 'budget', 'revenue']
boxofficemojo = boxofficemojo[(boxofficemojo['budget'].notnull()) | (boxofficemojo['revenue'].notnull())].reset_index(drop=True)
boxofficemojo = boxofficemojo.replace(np.nan, 0.0)

##### Kaggle Movie Revenue Data

Source: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [241]:
kaggle_movie = pd.read_csv('Data/kaggle_movies_dataset/movies_metadata.csv')
kaggle_movie = kaggle_movie[['imdb_id', 'budget', 'revenue']]
kaggle_movie['budget'] = kaggle_movie['budget'].apply(lambda x: float(x) if isinstance(x, int) or isinstance(x, float) or x.isdigit() else 0)
kaggle_movie['revenue'] = kaggle_movie['revenue'].apply(lambda x: float(x) if isinstance(x, int) or isinstance(x, float) or x.isdigit() else 0)
kaggle_movie = kaggle_movie[(kaggle_movie['budget'] > 0.0) |(kaggle_movie['revenue'] > 0.0)].reset_index(drop=True)


Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.



##### CMU Revenue Data

Source: CMU Movie Corpus Dataset

In [242]:
revenue_df = augmented_cmu[['imdb_id', 'revenue']]
revenue_df.columns = ['imdb_id', 'revenue_z']
revenue_df = revenue_df[revenue_df['revenue_z'].notnull()]

##### Merging Revenue Data

In [243]:
# @Function: get prioritized revenue -> Box Office Mojo > Kaggle > CMU
def get_revenue(x):
    # Box Office Mojo -> most recent data
    if x.get('revenue_x') != 0.0:
        return x.get('revenue_x')
    # Kaggle Movie Dataset -> updated 2019
    elif x.get('revenue_y') != 0.0:
        return x.get('revenue_y')
    # CMU revenue data -> from 2015
    elif x.get('revenue_z') != 0.0:
        return x.get('revenue_z')
    # Did not find revenue
    return 0.0
    
# @Function: get prioritized revenue -> Box Office Mojo > Kaggle > CMU
def get_budget(x):
    # Box Office Mojo -> most recent data
    if x.get('budget_x') != 0.0:
        return x.get('budget_x')
    # Kaggle Movie Dataset -> updated 2019
    elif x.get('budget_y') != 0.0:
        return x.get('budget_y')
    # Did not find revenue
    return 0.0

In [244]:
xrevenue = pd.merge(left=boxofficemojo, right=kaggle_movie, on='imdb_id', how='outer')
xrevenue = pd.merge(left=xrevenue, right=revenue_df, on='imdb_id', how='outer')
xrevenue = xrevenue.replace(np.nan, 0.0)

xrevenue['revenue'] = xrevenue.apply(get_revenue, axis=1)
xrevenue['budget'] = xrevenue.apply(get_budget, axis=1)

xrevenue = xrevenue[['imdb_id', 'budget', 'revenue']]

In [245]:
cmu_community.columns

Index(['movie_wikipedia_id', 'movie_freebase_id', 'movie_name', 'release_year',
       'revenue', 'runtime', 'languages', 'countries', 'genres',
       'plot_summary', 'language', 'word_count', 'char_count',
       'avg_word_length', 'sentence_count', 'lexical_diversity',
       'sentiment_polarity', 'topic', 'mood', 'target_audience',
       'temporal_setting', 'location_setting', 'imdb_id', 'imdb_name',
       'imdb_year', 'prob_c0', 'prob_c1', 'prob_c2', 'prob_c3', 'prob_c4',
       'hard_assignment', 'community_label'],
      dtype='object')

In [246]:
revenue_df = cmu_community.copy()[['movie_freebase_id', 'imdb_id', 'imdb_name', 'hard_assignment', 'runtime','languages','countries','plot_summary','word_count','char_count',
                                  'avg_word_length','sentence_count','lexical_diversity','sentiment_polarity','topic','mood',
                                  'target_audience','temporal_setting','location_setting',
                                  ]]
revenue_df['community_labels'] = revenue_df['hard_assignment'].apply(lambda x: community_dict.get(x))

# MERGE FOR IMDB DATASET
revenue_df = pd.merge(left=revenue_df, right=imdb_df, how='inner', on='imdb_id')

# MERGE FOR REVENUE
revenue_df = pd.merge(left=revenue_df, right=xrevenue, how='inner', on='imdb_id')

# Basic Visualizations

Some basic dataviz to illustrate distributions, label counts, etc. Just to describe the dataset

# Plot Feature Analysis

Here, we can hopefully include some of the work done by the russians

In [389]:
plot_features = pd.read_csv('Data/plot_features.csv')

In [390]:
plot_features = pd.read_csv('Data/plot_features.csv')

plot_features[['languages', 'countries', 'genres', 'topic', 'mood', 'target_audience',
       'temporal_setting', 'location_setting']] = plot_features[
            ['languages', 'countries', 'genres', 'topic', 'mood', 'target_audience',
       'temporal_setting', 'location_setting']
].applymap(json.loads, na_action='ignore')

plot_features[['languages', 'countries', 'genres']] = \
    plot_features[['languages', 'countries', 'genres']].applymap(
        lambda x: [y.lower() for y in x.values()], na_action='ignore')

In [391]:
set(y for x in plot_features['topic'].dropna() for y in x) - \
    set(y for x in plot_features['genres'].dropna() for y in x)

{'conflict', 'historical', 'romance'}

In [392]:
plot_features['genres'] = (plot_features['genres'] + plot_features['topic'])\
    .map(lambda x: list(set(x)), na_action='ignore')
plot_features = plot_features.drop(columns='topic')

In [393]:
plot_features['language'].unique()

array(['en'], dtype=object)

In [394]:
plot_features = plot_features.drop(columns='language')

In [395]:
plot_features = plot_features[
    ~plot_features[['genres', 'mood', 'target_audience', 'temporal_setting', 'location_setting']]\
    .isna().any(axis=1)]

In [415]:
mood_values = set(y for x in plot_features['mood'].dropna() for y in x)
target_audience_values = set(y for x in plot_features['target_audience'].dropna() for y in x)
temporal_setting_values = set(y for x in plot_features['temporal_setting'].dropna() for y in x)
location_setting_values = set(y for x in plot_features['location_setting'].dropna() for y in x)

In [416]:
plot_features

Unnamed: 0,movie_wikipedia_id,movie_freebase_id,movie_name,release_year,revenue,runtime,languages,countries,genres,plot_summary,...,char_count,avg_word_length,sentence_count,lexical_diversity,sentiment_polarity,mood,target_audience,temporal_setting,location_setting,originality_score
0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,[english language],[united states of america],"[space western, adventure, thriller, horror, s...","Set in the second half of the 22nd century, th...",...,2181,6.109244,15,0.627451,-0.085095,"[dark, dramatic]",[adults],[future],[fictional],0.461384
1,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,[english language],[united kingdom],"[thriller, psychological thriller, mystery, dr...",A series of murders of rich young women throug...,...,3301,5.594915,36,0.542373,0.035867,[dark],[adults],[modern],[real],0.432487
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,[german language],[germany],[drama],"Eva, an upper class housewife, becomes frustra...",...,2339,5.490610,24,0.582160,0.133259,[dark],[adults],[modern],[real],0.497707
3,18998739,/m/04jcqvw,The Sorcerer's Apprentice,2002,,86.0,[english language],[south africa],"[world cinema, family film, fantasy, adventure]","Every hundred years, the evil Morgana returns...",...,870,5.337423,7,0.631902,0.040568,[inspirational],[children],[past],[fictional],0.682414
4,6631279,/m/0gffwj,Little city,1997,,93.0,[english language],[united states of america],"[romantic drama, romance film, comedy, romance...","Adam, a San Francisco-based artist who works a...",...,1234,5.484444,9,0.626667,0.165202,"[romantic, dramatic]",[adults],[modern],[real],0.618238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25564,26482675,/m/0bbwngb,Eşrefpaşalılar,2010,1847671.0,,,,"[conflict, comedy film, romance, drama]","The film is about two friends, Tayyar , a mafi...",...,601,5.564815,4,0.731481,0.258333,[dramatic],[adults],[modern],[real],
25565,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011,,120.0,[english language],[united states of america],"[mystery, drama, science fiction]",Two former National Oceanic Atmospheric Admini...,...,664,6.384615,5,0.759615,0.108333,[dark],[adults],[modern],[real],
25566,34980460,/m/0g4pl34,Knuckle,2011,,96.0,[english language],"[ireland, united kingdom]","[biographical film, documentary, conflict, drama]",{{No plot}} This film follows 12 years in the ...,...,368,5.750000,3,0.781250,0.010000,[dramatic],[adults],[modern],[real],
25567,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992,,150.0,[japanese language],[japan],"[adventure, animation, anime, drama, japanese ...","The story takes place in the year 2092,The Sup...",...,1237,6.216080,8,0.713568,0.194003,"[exciting, fantastical]",[adults],[future],[fictional],


In [417]:
common_columns = set(plot_features.columns).intersection(set(revenue_df.columns))
common_columns.discard('movie_freebase_id')
revenue_df = revenue_df.drop(columns=common_columns)
revenue_df

Unnamed: 0,movie_freebase_id,imdb_id,imdb_name,hard_assignment,topic,community_labels,title_type,is_adult,runtime_minutes,avg_rating,...,inspirational,lighthearted,romantic,dark,dramatic,exciting,fantastical,inspirational.1,lighthearted.1,romantic.1
0,/m/03vyhn,tt0228333,Ghosts of Mars,3,"['Science Fiction', 'Horror']",Dark & Suspenseful Fiction (C3),movie,0,98,4.9,...,0,0,0,1,1,0,0,0,0,0
1,/m/016ywb,tt0097499,Henry V,1,"['Historical', 'Drama', 'Romance']",Diverse Drama & Action (C1),movie,0,137,7.5,...,0,0,0,0,1,0,0,0,0,0
2,/m/014k4y,tt0255819,Baby Boy,0,"['Drama', 'Romance']",Romantic Movies & Social Commentary (C0),movie,0,130,6.4,...,0,0,0,0,1,0,0,0,0,0
3,/m/0b6kc_5,tt0166158,Daddy and Them,2,"['Comedy', 'Romance']",Lighthearted Entertainment (C2),movie,0,101,5.6,...,0,1,0,0,1,0,0,0,1,0
4,/m/02vlsqt,tt0405393,Rudo y Cursi,0,"['Drama', 'Romance']",Romantic Movies & Social Commentary (C0),movie,0,106,6.7,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7501,/m/05mc7l,tt0087004,The Brother from Another Planet,2,"['Science Fiction', 'Comedy', 'Drama']",Lighthearted Entertainment (C2),movie,0,108,6.8,...,0,1,0,0,0,1,1,0,1,0
7502,/m/0kvgqb,tt0100666,Spaced Invaders,2,"['Comedy', 'Science Fiction']",Lighthearted Entertainment (C2),movie,0,100,5.3,...,0,1,0,0,0,0,0,0,1,0
7503,/m/0660qx,tt0120202,State and Main,2,"['Comedy', 'Drama']",Lighthearted Entertainment (C2),movie,0,105,6.7,...,0,1,0,0,0,0,0,0,1,0
7504,/m/030xw6,tt0107057,Guilty as Sin,3,"['Mystery', 'Drama', 'Thriller']",Dark & Suspenseful Fiction (C3),movie,0,107,5.7,...,0,0,0,1,1,0,0,0,0,0


## Merged info on plot summaries and revenues 

In [418]:
revenue_df = pd.merge(revenue_df, plot_features, on='movie_freebase_id', how='inner')

In [419]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Assuming the lists in 'mood' column are not NaN and are cleaned (e.g., no empty strings)
# If you have NaNs or empty lists, you'll need to handle them before this step
revenue_df['mood'] = revenue_df['mood'].apply(lambda x: x if isinstance(x, list) else [])

# Perform one-hot encoding on the 'mood' column
mood_encoded = mlb.fit_transform(revenue_df['mood'])

# Create a DataFrame with the encoded mood variables
mood_encoded_df = pd.DataFrame(mood_encoded, columns=mlb.classes_)

# Merge the new one-hot encoded columns back with the original DataFrame
revenue_df = pd.concat([revenue_df, mood_encoded_df], axis=1)

# Display the DataFrame to verify the changes
revenue_df.head()

Unnamed: 0,movie_freebase_id,imdb_id,imdb_name,hard_assignment,topic,community_labels,title_type,is_adult,runtime_minutes,avg_rating,...,temporal_setting,location_setting,originality_score,dark,dramatic,exciting,fantastical,inspirational,lighthearted,romantic
0,/m/03vyhn,tt0228333,Ghosts of Mars,3,"['Science Fiction', 'Horror']",Dark & Suspenseful Fiction (C3),movie,0,98,4.9,...,[future],[fictional],0.461384,1,1,0,0,0,0,0
1,/m/016ywb,tt0097499,Henry V,1,"['Historical', 'Drama', 'Romance']",Diverse Drama & Action (C1),movie,0,137,7.5,...,[past],[real],,0,1,0,0,0,0,0
2,/m/014k4y,tt0255819,Baby Boy,0,"['Drama', 'Romance']",Romantic Movies & Social Commentary (C0),movie,0,130,6.4,...,[modern],[real],,0,1,0,0,0,0,0
3,/m/0b6kc_5,tt0166158,Daddy and Them,2,"['Comedy', 'Romance']",Lighthearted Entertainment (C2),movie,0,101,5.6,...,[modern],[real],,0,1,0,0,0,1,0
4,/m/02vlsqt,tt0405393,Rudo y Cursi,0,"['Drama', 'Romance']",Romantic Movies & Social Commentary (C0),movie,0,106,6.7,...,[modern],[fictional],0.675307,0,1,0,0,0,0,0


In [420]:
mood_data

Unnamed: 0,revenue,mood,is_mood
0,14.010832,dark,1
4,3.416846,dark,1
7,0.529677,dark,1
8,4.650000,dark,1
10,10.600000,dark,1
...,...,...,...
83118,9.689816,romantic,1
83124,35.387212,romantic,1
83128,40.693477,romantic,1
83138,11.124511,romantic,1


In [421]:
# Calculate the 95th percentile for the 'revenue' column
percentile_95 = revenue_df['revenue'].quantile(0.95)

# Filter the DataFrame to exclude revenues above the 95th percentile
filtered_df = revenue_df[revenue_df['revenue'] <= percentile_95]

# Convert 'revenue' to millions
filtered_df['revenue'] = filtered_df['revenue'] / 1e6

# Reshape the DataFrame
mood_data = pd.melt(filtered_df, 
                    id_vars=['revenue'], 
                    value_vars=['dark', 'dramatic', 'exciting', 'fantastical', 'inspirational', 'lighthearted', 'romantic'],
                    var_name='mood', 
                    value_name='is_mood')
mood_data = mood_data[mood_data['is_mood'] == 1]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [422]:
mood_data

Unnamed: 0,revenue,mood,is_mood
0,14.010832,dark,1
4,3.416846,dark,1
7,0.529677,dark,1
8,4.650000,dark,1
10,10.600000,dark,1
...,...,...,...
110838,9.689816,romantic,1
110844,35.387212,romantic,1
110848,40.693477,romantic,1
110858,11.124511,romantic,1


In [423]:
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px

# Assuming mood_data is already prepared as per the code provided

# Create the boxplot with Plotly
fig = go.Figure()

# Add a box trace for each mood
for mood in mood_data['mood'].unique():
    mood_subset = mood_data[mood_data['mood'] == mood]
    fig.add_trace(go.Box(
        y=mood_subset['revenue'],
        name=mood,
        boxpoints='outliers',  # Only show outliers beyond the whiskers
        jitter=0.5,  # Spread out the points for better visibility
        marker=dict(
            size=2,
        ),
        line=dict(width=1),
    ))

# Customize the layout
fig.update_layout(
    title='Revenue Distribution by Movie Mood (95th Percentile Cutoff)',
    yaxis=dict(
        title='Revenue (Millions $)',
        tickformat=',.0f',  # No decimal places
        tickprefix='$'  # Add dollar sign prefix
    ),
    xaxis=dict(title='Mood'),
    showlegend=False
)

# Calculate the number of observations in each category
num_obs = mood_data['mood'].value_counts().sort_index()

# Add the count annotations
for i, mood in enumerate(num_obs.index):
    fig.add_annotation(
        x=mood,
        y=mood_data[mood_data['mood'] == mood]['revenue'].max(),
        text=f'n={num_obs[mood]}',
        showarrow=False,
        yshift=10
    )

# Show the figure
fig.show()

# Save the figure to an HTML file
fig.write_html("revenue_distribution_by_mood.html", full_html=False, include_plotlyjs='cdn')


In [424]:
print(mood_data[mood_data['mood'] == 'exciting']['revenue'].median())
print(mood_data[mood_data['mood'] == 'exciting']['revenue'].quantile(0.25))
print(mood_data[mood_data['mood'] == 'exciting']['revenue'].quantile(0.75))

36.891985500000004
11.538235
92.991835


In [425]:
num_obs['exciting']

2328

**Movies in the 'exciting' mood category have a median revenue of approximately 37 million, suggesting a strong box office performance for this type of film. The interquartile range of 12 million to 93 million indicates that while some films achieve outstanding financial success, there is variability in the revenue performance within this category. The presence of outliers above the upper whisker shows that some 'exciting' movies significantly outperform others in terms of revenue.**

In [439]:
import plotly.express as px

# Assuming filtered_df is already defined with the necessary data
# Creating the boxplot with Plotly

fig = px.box(
    filtered_df,
    x='target_audience_str',
    y='revenue_millions',
    title='Revenue Distribution by Target Audience (95th Percentile Cutoff)',
    labels={'target_audience_str': 'Target Audience', 'revenue_millions': 'Revenue (Millions $)'}
)

# Adjusting the layout for better readability
fig.update_layout(
    xaxis_title='Target Audience',
    yaxis=dict(
        title='Revenue (Millions $)',
        tickformat=',.0f',
        tickprefix='$'
    ),
    xaxis=dict(tickangle=45)
)

# Adding the number of observations as annotations
for i, audience in enumerate(filtered_df['target_audience_str'].unique()):
    count = filtered_df[filtered_df['target_audience_str'] == audience].shape[0]
    fig.add_annotation(
        x=audience,
        y=filtered_df['revenue_millions'].max(),
        text=f'n={count}',
        showarrow=False,
        yshift=10
    )

# Show plot
fig.show()

# Save the figure to an HTML file
fig.write_html("revenue_distribution_by_target_audience", full_html=False, include_plotlyjs='cdn')

In [433]:
import plotly.graph_objects as go

# Assuming mood_target_revenue_df is already defined as per the previous code

# Create the pivot table for average revenue
pivot_revenue = mood_target_revenue_df.pivot_table(index='Mood', columns='TargetAudience', values='Revenue', aggfunc='mean')

# Create the pivot table for count of movies
pivot_count = mood_target_revenue_df.pivot_table(index='Mood', columns='TargetAudience', values='Revenue', aggfunc='count')

# Create the heatmap
fig = go.Figure(data=go.Heatmap(
    z=pivot_revenue,
    x=pivot_revenue.columns,
    y=pivot_revenue.index,
    hoverongaps=False,
    colorbar_title='Average Revenue<br>(Millions $)',
    text=pivot_count.values,
    hoverinfo="text+z",
    colorscale="blues"
))

# Update layout
fig.update_layout(
    title='Average Revenue by Mood and Target Audience',
    xaxis_title='Target Audience',
    yaxis_title='Mood',
    xaxis=dict(tickangle=45)
)

# Display the figure
fig.show()


# Save the figure to an HTML file
fig.write_html("revenue_heatmap_by_target_audience_and_mood", full_html=False, include_plotlyjs='cdn')

In [332]:
tom = pd.read_csv('rottentomatoes.csv', index_col=0)

In [440]:
import plotly.graph_objects as go
import pandas as pd
import scipy.stats

# Assuming 'tom' is your DataFrame and it contains 'release_year' and 'tomato_score' columns

# Calculate the average tomato score and count of movies per year
yearly_stats = tom.groupby('release_year')['tomato_score'].agg(['mean', 'size', 'sem']).reset_index()

# Calculate the confidence interval
confidence_interval = yearly_stats['sem'] * scipy.stats.t.ppf((1 + 0.95) / 2., yearly_stats['size'] - 1)

# Create the line chart for the average tomato scores with confidence interval
fig = go.Figure()

# Line for the average score
fig.add_trace(go.Scatter(
    x=yearly_stats['release_year'],
    y=yearly_stats['mean'],
    mode='lines+markers',
    name='Average Tomato Critic Score',
    line=dict(color='blue')
))

# Upper bound for the confidence interval
fig.add_trace(go.Scatter(
    x=yearly_stats['release_year'],
    y=yearly_stats['mean'] + confidence_interval,
    mode='lines',
    marker=dict(color="#444"),
    line=dict(width=0),
    showlegend=False
))

# Lower bound for the confidence interval
fig.add_trace(go.Scatter(
    x=yearly_stats['release_year'],
    y=yearly_stats['mean'] - confidence_interval,
    marker=dict(color="#444"),
    line=dict(width=0),
    mode='lines',
    fillcolor='rgba(68, 68, 68, 0.3)',
    fill='tonexty',
    showlegend=False
))

# Bar chart for the number of movies
fig.add_trace(go.Bar(
    x=yearly_stats['release_year'],
    y=yearly_stats['size'],
    name='Number of Movies',
    yaxis='y2',
    marker=dict(color='grey', opacity=0.5)
))

# Update the layout
fig.update_layout(
    title='Average Critic Scores Over Time with Movie Counts and Variance',
    xaxis_title='Release Year',
    yaxis_title='Average Tomato Critic Score',
    yaxis2=dict(
        title='Number of Movies',
        overlaying='y',
        side='right',
        showgrid=False,
    ),
    legend=dict(x=0.01, y=0.99)
)

# Show the figure
fig.show()

# Save the figure to an HTML file
fig.write_html("average_critic_scores_with_variance.html", full_html=False, include_plotlyjs='cdn')


In [441]:
import plotly.graph_objects as go

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import scipy.stats


data_cleaned_features = pd.read_csv('Data/plot_features.csv')

# Merging the Rotten Tomatoes data with the cleaned features data
merged_data = pd.merge(tom, data_cleaned_features, on=['movie_name', 'release_year'], how='inner')

# Function to convert JSON strings to lists
def json_str_to_list(json_str):
    return json.loads(json_str) if pd.notnull(json_str) else []

# Convert 'topic' column from JSON string to list
merged_data['topic'] = merged_data['topic'].apply(json_str_to_list)

# Explode the 'topic' column to have separate rows for each topic
exploded_data = merged_data.explode('topic')

# Filtering for movies released after 1990
filtered_data = exploded_data[exploded_data['release_year'] > 1990]

# Function to calculate the mean and 95% confidence interval
def mean_confidence_interval(data, confidence=0.95):
    n = len(data)
    mean, se = np.mean(data), scipy.stats.sem(data)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return mean, mean-h, mean+h

# Group by 'topic' and apply the confidence interval function
grouped_data = filtered_data.groupby('topic')['tomato_score']
mean_ci_data = grouped_data.apply(lambda x: mean_confidence_interval(x.dropna()))
# Sort the mean_ci_data by the mean scores before plotting
sorted_mean_ci_data = mean_ci_data.sort_values(ascending=False, key=lambda x: x.map(lambda x: x[0]))

# Create the figure with error bars for the confidence intervals
fig = go.Figure(data=[
    go.Bar(
        name='Avg Tomato Score',
        x=sorted_mean_ci_data.index,
        y=sorted_mean_ci_data.map(lambda x: x[0]),
        error_y=dict(
            type='data',
            array=sorted_mean_ci_data.map(lambda x: (x[2] - x[1]) / 2),
            visible=True
        ),
        marker_color='skyblue'
    )
])

# Update the layout
fig.update_layout(
    title='Average Tomato Scores by Topic (Post 1990, With Confidence Intervals)',
    xaxis_title='Topic',
    yaxis_title='Average Tomato Score',
    showlegend=False
)

# Show the plot
fig.show()

# Save the plot to an HTML file
fig.write_html("average_tomato_scores_by_topic.html", full_html=False, include_plotlyjs='cdn')
