## Load Box Office, IMDb and Quotebank

In [61]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import plotly.express as px
import plotly

In [2]:
# All input files are looked up in a "data" folder inside the current working directory.
data_dir = os.sep.join([os.getcwd(), 'data'])

# NOTE(review): unpickling can execute arbitrary code -- only load pickle files produced by a trusted source.
df_Quotebank = pd.read_pickle(os.sep.join([data_dir, "Quotebank_full.pkl"]))
df_boxOffice = pd.read_pickle(os.sep.join([data_dir, "boxOffice.pkl"]))

In [3]:
# The IMDb data is read from two CSV files stored in the "IMDb" sub-folder of data_dir.
IMDbMovies = pd.read_csv(os.sep.join([data_dir, "IMDb", "IMDB movies.csv"]), low_memory=False)
IMDbRatings = pd.read_csv(os.sep.join([data_dir, "IMDb", "IMDB ratings.csv"]))

# No key is passed, so pandas performs its default (inner) merge on every column name
# the two frames have in common.
# NOTE(review): presumably that is only `imdb_title_id` -- confirm against the CSV headers.
df_IMDb = pd.merge(IMDbMovies, IMDbRatings)

In [4]:
# Preview the first three Quotebank rows to check which columns were loaded.
df_Quotebank.head(3)

Unnamed: 0,quotation,speaker,qids,date,numOccurrences,probas,urls,movie,shared_ID,AFINN_label,AFINN_score,VADER_label,VADER_score,BERT_label,posBERT_score,scaledBERT_score,year-month
0,"with the short and said, `We'd like to go with...",Al Jean,[Q2829373],2020-02-28,1,"[[None, 0.6415], [Al Jean, 0.3585]]",[https://comicbook.com/tv-shows/2020/02/28/the...,Onward,3832,POSITIVE,0.285714,POSITIVE,0.7003,POSITIVE,0.999716,0.889884,2020-02
1,Thrilled to announce that I will perform the e...,Brandi Carlile,[Q164060],2020-02-19,1,"[[Brandi Carlile, 0.543], [None, 0.3348], [Myc...",[https://klaw.com/brandi-carlile-carried-me-wi...,Onward,1151,POSITIVE,0.135135,POSITIVE,0.6996,POSITIVE,0.999764,0.910927,2020-02
2,Sonic the Hedgehog might not become a kid-movi...,Brian Truitt,[Q24387573],2020-02-14,1,"[[None, 0.7308], [Brian Truitt, 0.179], [Micha...",[http://www.gamespot.com/articles/sonic-the-he...,Sonic the Hedgehog,3155,POSITIVE,0.166667,POSITIVE,0.8899,POSITIVE,0.999813,0.937261,2020-02


In [5]:
# Preview the first rows of the box-office frame.
df_boxOffice.head()

Unnamed: 0,days,dow,rank,daily,theaters,special events,movie
0,2019-05-24,Friday,1,31358935.0,4476,,Aladdin
1,2019-05-25,Saturday,1,30013295.0,4476,,Aladdin
2,2019-05-26,Sunday,1,30128699.0,4476,,Aladdin
3,2019-05-27,Monday,1,25305033.0,4476,Memorial Day,Aladdin
4,2019-05-28,Tuesday,1,12014982.0,4476,,Aladdin


In [6]:
# Preview the first rows of the merged IMDb movies/ratings frame.
df_IMDb.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,5.7,13.0,4.5,4.0,5.7,34.0,6.4,51.0,6.0,70.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,6.2,23.0,6.6,14.0,6.4,66.0,6.0,96.0,6.2,331.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,5.8,4.0,6.8,7.0,5.4,32.0,6.2,31.0,5.9,123.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,5.5,14.0,6.1,21.0,4.9,57.0,5.5,207.0,4.7,105.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,7.3,82.0,7.4,77.0,6.9,139.0,7.0,488.0,7.0,1166.0


In [25]:
# Derive a "YYYY-MM" label from each 'days' string by keeping its first two
# dash-separated parts. The values are assigned positionally (as a plain array).
df_boxOffice['year-month'] = (
    df_boxOffice['days']
    .map(lambda day: "-".join(day.split("-", 2)[:2]))
    .to_numpy()
)

# List the distinct month labels, in order of first appearance.
df_boxOffice['year-month'].unique()

array(['2019-05', '2019-06', '2019-07', '2019-08', '2019-09', '2019-10',
       '2018-12', '2019-01', '2019-02', '2019-03', '2019-04', '2015-05',
       '2015-06', '2015-07', '2018-04', '2018-05', '2018-06', '2018-07',
       '2018-08', '2018-09', '2020-01', '2020-02', '2020-03', '2016-03',
       '2016-04', '2016-05', '2016-06', '2017-03', '2017-04', '2017-05',
       '2017-06', '2017-07', '2018-02', '2018-03', '2018-11', '2016-07',
       '2016-08', '2016-09', '2016-02', '2017-08', '2017-09', '2017-10',
       '2017-11', '2017-12', '2016-11', '2016-12', '2017-01', '2017-02',
       '2016-10', '2019-11', '2019-12', '2015-04', '2018-10', '2015-08',
       '2015-09', '2015-10', '2018-01', '2015-11', '2015-12', '2016-01',
       '2020-09', '2020-10', '2020-11', '2020-12', '2021-01', '2021-02',
       '2021-03', '2021-04'], dtype=object)

In [162]:
def wrangleData(df):
    """Return the rows of ``df`` ordered by movie title and, within a movie, by month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'movie'`` column and a ``'year-month'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` sorted ascending by ``'movie'`` and,
        for equal titles, ascending by ``'year-month'``. Original index labels are
        kept. Rows whose ``'movie'`` is missing are left out. Rows that share both
        keys keep their relative input order. An empty input yields an empty frame.
    """
    # A row without a title cannot be attributed to any movie, so it is excluded.
    titled_rows = df.dropna(subset=['movie'])

    # Two stable sorts (secondary key first, primary key second) replace the former
    # "filter the whole frame once per movie, then concatenate" approach. This gives
    # the same grouping in a single pass, keeps ties in a deterministic order, and
    # also works when there are no rows at all.
    by_month = titled_rows.sort_values('year-month', ascending=True, kind='mergesort')
    return by_month.sort_values('movie', ascending=True, kind='mergesort')

In [182]:
def createPlotDF(df_all, attribute, y_label, N_samples=10, replace=True, random_state=None):
    """Build the long-format frame that drives the animated bar plot.

    For every selected movie and every ``'year-month'`` label found in the
    selection, the running (accumulated) sum of ``attribute`` up to and including
    that month is computed. A month without any row for a movie adds zero, so the
    movie keeps the total it had reached before.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain the columns ``'movie'``, ``'year-month'`` and ``attribute``.
    attribute : str
        Name of the numeric column to accumulate. Missing values count as zero.
    y_label : str
        Text inserted into the name of the value column of the result.
    N_samples : int or None, default 10
        Number of movie titles to draw. ``None`` uses ``df_all`` unchanged.
    replace : bool, default True
        Whether titles are drawn with replacement. With replacement the same title
        can be drawn more than once, so fewer than ``N_samples`` distinct movies may
        be returned. Without replacement ``N_samples`` is capped at the number of
        distinct titles.
    random_state : optional
        Forwarded to ``pandas.Series.sample`` to make the draw reproducible.

    Returns
    -------
    df_plot : pandas.DataFrame
        Columns ``'month'``, ``'movie'`` and ``f'Avg. monthly {y_label}'``; one row
        per (movie, month) pair, movies in order of first appearance and months in
        ascending order. The value column holds the accumulated sum as floats.
    df_sample : pandas.DataFrame
        The rows of ``df_all`` belonging to the selected movies.
    """
    # Draw from the distinct titles, not from the rows: sampling rows favours movies
    # with many rows and regularly returns the same title several times.
    titles = pd.Series(df_all['movie'].dropna().unique())
    if N_samples is None or titles.empty:
        df_sample = df_all
    else:
        n_titles = N_samples if replace else min(N_samples, len(titles))
        selected_movies = titles.sample(n_titles, replace=replace, random_state=random_state)
        df_sample = df_all[df_all['movie'].isin(selected_movies)]

    # Integer codes for both keys: movies in order of first appearance, months
    # sorted ascending. Missing keys are coded as -1 and skipped below.
    movie_codes, movies = pd.factorize(df_sample['movie'])
    month_codes, months = pd.factorize(df_sample['year-month'], sort=True)
    amounts = df_sample[attribute].fillna(0).to_numpy(dtype=float)

    # Sum the attribute into a (movie x month) grid in one pass instead of building
    # a boolean mask over the whole frame for every movie/month pair.
    known = (movie_codes >= 0) & (month_codes >= 0)
    monthly_totals = np.zeros((len(movies), len(months)))
    np.add.at(monthly_totals, (movie_codes[known], month_codes[known]), amounts[known])

    # Running total along the month axis; cells without data hold 0 and therefore
    # carry the previous total forward.
    accumulated = monthly_totals.cumsum(axis=1)

    # The value column keeps the 'Avg. monthly ...' name that animatedBarPlot looks
    # up, although the values are accumulated sums rather than averages.
    df_plot = pd.DataFrame({
        'month': np.tile(np.asarray(months), len(movies)),
        'movie': np.repeat(np.asarray(movies), len(months)),
        f'Avg. monthly {y_label}': accumulated.ravel(),
    })

    return df_plot, df_sample

In [183]:
def animatedBarPlot(df_plot, y_label, title, speed=0.2):
    """Show and return an animated bar chart with one bar per movie and one frame per month.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Must contain the columns ``'movie'``, ``'month'`` and
        ``f'Avg. monthly {y_label}'``.
    y_label : str
        Selects the value column (see above).
    title : str
        Figure title.
    speed : float, default 0.2
        Must satisfy ``0 <= speed < 1``. Frame and transition durations are set to
        ``(1 - speed) * 1000``, so larger values play faster.

    Returns
    -------
    The figure created by ``px.bar``; it is also displayed via ``fig.show()``.

    Raises
    ------
    ValueError
        If ``speed`` lies outside ``[0, 1)``.
    """
    if not 0 <= speed < 1:
        raise ValueError(f"speed must satisfy 0 <= speed < 1, got {speed!r}")

    value_col = f"Avg. monthly {y_label}"

    # Coerce to numbers so that a value column stored with object dtype still
    # yields a usable min/max and a numeric axis.
    heights = pd.to_numeric(df_plot[value_col], errors='coerce')
    plot_data = df_plot.assign(**{value_col: heights})

    # One fixed y-range for all frames: the overall extremes plus 5 % head-room,
    # always including zero. The range must be two scalars (not one value per
    # movie), and negative totals must stay visible. No rounding is applied,
    # because it would collapse the range for small-magnitude values.
    padding = 0.05
    tallest, deepest = heights.max(), heights.min()
    y_max = max(0.0, float(tallest) * (1 + padding)) if pd.notna(tallest) else 0.0
    y_min = min(0.0, float(deepest) * (1 + padding)) if pd.notna(deepest) else 0.0
    # A degenerate range (no data, or all zeros) is left to plotly's autorange.
    range_y = [y_min, y_max] if y_max > y_min else None

    fig = px.bar(plot_data, x='movie', y=value_col, color="movie",
                 animation_frame="month", animation_group="movie",
                 range_y=range_y,
                 title=title)

    # The play button is only looked up when the figure actually has animation
    # controls, so indexing an empty `updatemenus` is avoided.
    frame_ms = (1 - speed) * 1000
    if fig.layout.updatemenus:
        play_args = fig.layout.updatemenus[0].buttons[0].args[1]
        play_args['frame']['duration'] = frame_ms
        play_args['transition']['duration'] = frame_ms

    fig.show()

    return fig

In [174]:
# Settings for the box-office animation.
y_label = 'box office revenue'
df_boxOffice_wrangled = wrangleData(df_boxOffice)
attribute = 'daily'
N_samples = 10
title = 'Accumulated Box Office revenue across time'

# Fixed the misspelled variable name (was `df_boxOffice_wranlged`), which raised a
# NameError when the notebook was run on a fresh kernel.
df_plot, df_sample = createPlotDF(df_boxOffice_wrangled, attribute=attribute, y_label=y_label, N_samples=N_samples)

fig = animatedBarPlot(df_plot, y_label=y_label, speed=0.4, title=title)

  0%|          | 0/10 [00:00<?, ?it/s]

In [175]:
# Per-movie totals of the numeric columns for the sampled movies. `numeric_only=True`
# is explicit so the text columns are excluded on every pandas version instead of
# relying on them being dropped silently.
df_sample.groupby('movie').sum(numeric_only=True)

Unnamed: 0_level_0,rank,daily,theaters
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avengers: Infinity War,2399,678815482.0,240842
Bohemian Rhapsody,2845,216303339.0,241573
Deadpool,1470,363042311.0,206822
Despicable Me 3,3167,264624300.0,248423
Fantastic Beasts and Where to Find Them,2619,234037575.0,183213
Jurassic World,2336,652197981.0,253875
Rogue One,2415,334201140.0,243878
Sonic the Hedgehog,106,146066470.0,135100
Spider-Man: Far from Home,745,220159104.0,195496
Star Wars: Episode VIII - The Last Jedi,2247,620181382.0,197295


In [186]:
# Settings for the sentiment animation.
title = 'Accumulated BERT sentiment across time'
y_label = 'sentiment'
attribute = 'scaledBERT_score'
N_samples = 10

# Order the quotes by movie and month, then accumulate the chosen score for a sample of movies.
df_sentiment_wrangled = wrangleData(df_Quotebank)
df_plot, df_sample = createPlotDF(
    df_sentiment_wrangled,
    attribute=attribute,
    y_label=y_label,
    N_samples=N_samples,
)

fig = animatedBarPlot(df_plot, y_label=y_label, title=title, speed=0.4)

  0%|          | 0/10 [00:00<?, ?it/s]

In [185]:
# Per-movie totals of the numeric columns for the sampled movies. `numeric_only=True`
# is explicit so text and list-valued columns are excluded on every pandas version
# instead of relying on them being dropped silently.
df_sample.groupby('movie').sum(numeric_only=True)

Unnamed: 0_level_0,numOccurrences,shared_ID,AFINN_score,VADER_score,posBERT_score,scaledBERT_score
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Avengers: Endgame,3771,13470112,-32.49994,662.9563,1554.118572,327.08696
Black Panther,3084,11436103,165.57793,744.2865,1593.061993,693.228804
Captain America: Civil War,1952,7770356,-49.537743,-257.8374,926.153384,287.875669
Deadpool,882,2273567,46.590351,202.3015,334.373356,28.352087
Mission: Impossible - Fallout,5360,20014090,251.324423,1240.7229,2552.253882,773.122672
Star Wars: Episode IX - The Rise of Skywalker,4579,14944474,241.296114,-190.5125,2250.951311,877.638598
Star Wars: Episode VII - The Force Awakens,6112,20099487,311.26176,-574.2501,3049.152283,1250.200548
Thor: Ragnarok,4335,15447656,233.640981,1084.1368,2190.180933,934.852559
