In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import CubicSpline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy

In [3]:
%run data_pipeline.ipynb

########## Data pipeline ##########

Preparing CMU data
379 movies shared both name and release year, dropping
314 movies had actors with the same name, dropping

Merging Oscar dataset, after merge:
Number of different Oscar nominated movies in dataset: 952 in total 63968 different movies
Number of different Oscar nominated actors in dataset: 801 in total 134907 different actors
Number of Oscar nominated rows: 1443

Merging IMDb dataset, after merge:
Number of movies with ratings: 36758
Oscar nominated movies with rating: 939
Number of rows in data before cleaning:  443504
Number of rows in data after cleaning:  23819
Number of rows where age is < 0: 7 . Dropping these rows

FINAL STATE OF DATA
Number of rows:  23812
Number of different Oscar nominated movies in dataset: 394 in total 5987 different movies
Number of different Oscar nominated actors in dataset: 284 in total 2959 different actors
Number of Oscar nominated rows: 519
Processing done, dataset written to cache/data.csv


In [4]:
# Read data
df = pd.read_csv('cache/data.csv', sep=',', index_col=0)
unclean_df = pd.read_csv('cache/unclean_data.csv', sep=',', index_col=0)

In [5]:
df.head()

Unnamed: 0,title,box_office_revenue,runtime,languages,countries,genres,movie_identifier,actor_gender,actor_height,actor_ethnicity,...,identifier,category,winner,oscar_nominated,year,average_rating,number_of_votes,number_of_movies_starred_in,average_rating_previous_movies,average_box_office_revenue_previous_movies
140029,Down to You,24419914,92,"['French Language', 'English Language']",['United States of America'],"['Romantic comedy', 'Romance Film', 'Drama', '...",down to you_2000,M,1.88,/m/0xnvg,...,down to you_2000_adam carolla,,,False,2000,5.0,15878,1,5.0,24419914.0
60320,The Bible: In The Beginning,34900023,171,['English Language'],"['United States of America', 'Italy']","['Christian film', 'Drama', 'Epic', 'World cin...",the bible in the beginning_1966,M,1.85,/m/03bkbh,...,the bible in the beginning_1966_richard harris,,,False,1966,6.2,6385,1,6.2,34900023.0
389034,Hawaii,34562222,161,['English Language'],['United States of America'],"['Period piece', 'Roadshow theatrical release'...",hawaii_1966,M,1.85,/m/03bkbh,...,hawaii_1966_richard harris,,,False,1966,6.5,3708,1,12.7,69462245.0
130002,Camelot,31102578,178,['English Language'],['United States of America'],"['Costume drama', 'Musical', 'Roadshow theatri...",camelot_1967,M,1.85,/m/03bkbh,...,camelot_1967_richard harris,,,False,1967,6.6,7624,2,9.65,50282411.5
182566,Caprice,4075000,95,['English Language'],['United States of America'],"['Romantic comedy', 'Crime Fiction', 'Mystery'...",caprice_1967,M,1.85,/m/03bkbh,...,caprice_1967_richard harris,,,False,1967,5.5,1761,3,8.266667,34879941.0


## **Historic movie industry development**

The movie industry has like the economy grown exponentially during the last 100 years. Hence one might expect higher demands and more competition for each Oscar nomination. It is nevertheless interesting and needed to validate this hypothesis with data. We start with constructing the dataframes used for the analysis, then plot the development of movie releases and actor nomnations since the 1920s. 

Constructing df for analysis

In [6]:
oscar_nominated = df[df['oscar_nominated']==True]

# Counting movie releases per year
movie_df = df.drop_duplicates(subset='movie_identifier', keep='first', inplace=False)[['movie_identifier', 'year']]
movie_releases = movie_df.groupby('year').size().reset_index(name='count')
movie_releases.head()

Unnamed: 0,year,count
0,1928,1
1,1929,1
2,1930,1
3,1932,3
4,1933,5


In [7]:
# Counting oscar nominations per year
oscar_nominations = oscar_nominated.groupby('year').size().reset_index(name='count')
years = pd.DataFrame({'year': movie_releases['year'].unique()})
oscar_nominations = years.merge(oscar_nominations, on='year', how='left').fillna(0)
oscar_nominations.head()

Unnamed: 0,year,count
0,1928,0.0
1,1929,0.0
2,1930,0.0
3,1932,0.0
4,1933,0.0


In [8]:
# Counting nominated movies per year
nominations_per_movie_per_year = oscar_nominated.groupby(['year', 'movie_identifier']).agg({'title':'count'}).reset_index().rename(columns={'title':'count'})
unique_nominated_movies_per_year = nominations_per_movie_per_year.groupby(['year']).agg({'movie_identifier':'count'}).reset_index().rename(columns={'movie_identifier':'count'})
unique_nominated_movies_per_year.head()

Unnamed: 0,year,count
0,1934,1
1,1936,1
2,1939,3
3,1940,1
4,1941,1


In [9]:
# Merging releases and nominated movies per year
movie_releases_with_nominations = movie_releases.merge(unique_nominated_movies_per_year, on='year', how='left').fillna(0).rename(columns={'count_x' : 'releases', 'count_y':'nominated movies'})
movie_releases_with_nominations.head()

Unnamed: 0,year,releases,nominated movies
0,1928,1,0.0
1,1929,1,0.0
2,1930,1,0.0
3,1932,3,0.0
4,1933,5,0.0


In [10]:
# Merging nr. of oscar nominations per year
releases_nominations_counts_df = movie_releases_with_nominations.merge(oscar_nominations, on='year', how='left').fillna(0).rename(columns={'count' : 'oscar nominations'})
releases_nominations_counts_df.head()

Unnamed: 0,year,releases,nominated movies,oscar nominations
0,1928,1,0.0,0.0
1,1929,1,0.0,0.0
2,1930,1,0.0,0.0
3,1932,3,0.0,0.0
4,1933,5,0.0,0.0


In [11]:
# Calculating % of movies that was nominated each year
releases_nominations_counts_df['%nominated'] = releases_nominations_counts_df['nominated movies'] / releases_nominations_counts_df['releases']
releases_nominations_counts_df.head()

Unnamed: 0,year,releases,nominated movies,oscar nominations,%nominated
0,1928,1,0.0,0.0,0.0
1,1929,1,0.0,0.0,0.0
2,1930,1,0.0,0.0,0.0
3,1932,3,0.0,0.0,0.0
4,1933,5,0.0,0.0,0.0


**Plotting**

In [79]:
x = releases_nominations_counts_df['year']
y = releases_nominations_counts_df['%nominated']
cs = CubicSpline(x, y)
x_smooth = np.linspace(min(x), max(x), 500)
y_smooth = cs(x_smooth)

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(y=releases_nominations_counts_df['releases'], x=releases_nominations_counts_df['year'], name="Nr. Movie releases"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(y=100*y_smooth, x=x_smooth, name=f"% with a nominated performance"),
    secondary_y=True,
    
)

fig.update_xaxes(title_text="Year")
fig.update_yaxes(title_text="Nr. of movies", secondary_y=False)
fig.update_yaxes(title_text="%", secondary_y=True)
fig.update_layout(
    title_text="Movie relases and percentage of movies with an Oscar nominated performance since 1928"
)
fig.show()

Clearly, the number of movie releases per year has fluctuated during the last 100 years. However, since the 1960, the number of movie releases has been on a steady uptrend, until the 2010s. In the meantime, the share of movies that has nominated actors has decreased dramatically from 20-40% in thw 30s-70s to far below 10% in the 2010s. 

The movie industry has like almost all other industries experienced exponential growth during the last 100 years. However, the growth has not been constant, there has been periods of significant decline, e.g. during the 1950s. The number of movies relased has steaditly increased since the 1960s until peaking in 2006 followed by a recent decline. Visualizing this, it becomes apparant that competition has a profound effect on an actors chances to be nominated for an oscar. This likelihood has declined from averaging 20% chance to below 2.5% in the 2000s. The trend seems to continue and it therefore essential for aspiring actors to start their career as early as possible before competition toughens further. 

##

## Review & Revenue

In [14]:
review_df = df.sort_values(by=['movie_identifier', 'oscar_nominated'], ascending=False)
review_df = review_df.drop_duplicates(subset='movie_identifier', keep='first')

In [15]:
nominated_review_df = review_df[review_df['oscar_nominated'] == True]
not_nominated_review_df = review_df[review_df['oscar_nominated'] == False]

In [78]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=nominated_review_df['average_rating'],
    histnorm='probability density',
    name='Nominated',
    marker_color='blue',
    opacity=0.7
))

fig.add_trace(go.Histogram(
    x=not_nominated_review_df['average_rating'],
    histnorm='probability density',
    name='Not nominated',
    opacity=0.7,
    marker_color='red'
))

fig.add_shape(
    type="line",
    x0=np.mean(nominated_review_df['average_rating']),
    y0=0,
    x1=np.mean(nominated_review_df['average_rating']),  
    y1=1,
    line=dict(color="blue", dash="dash"),
    name="Mean nominated", 
    showlegend=True
)

fig.add_shape(
    type="line",
    x0=np.mean(not_nominated_review_df['average_rating']),
    y0=0,
    x1=np.mean(not_nominated_review_df['average_rating']),
    y1=1,
    line=dict(color="red", dash="dash"),
    name="Mean not nominated", 
    showlegend=True
)

fig.update_layout(
    title="Review distributions for movies with and without nominated performances",
    xaxis_title="Average rating",
    yaxis_title="Density",
    barmode='overlay',
    template="plotly"
)

fig.show()

In [44]:
scipy.stats.kstest(nominated_review_df['average_rating'], not_nominated_review_df['average_rating'])

KstestResult(statistic=0.5997929790773637, pvalue=3.003225674514207e-126, statistic_location=6.8, statistic_sign=-1)

The average rating a movie recieves is indicative of several importat factors to take into consideration, e.g.: Movie quality, audience tastes, specific audience, the era of release and actor popularity. The overarching issue is that when actors recieve oscar nominations, it can have an effect on all the afformentioned factors contributiong to different review scores. Looking at the review distributions for movies with nominated and not nominated actors we can only draw one conclusion with certainty, that movies with nominated actors recieve significantly better reviews than movies with not nominated actors. This is also verified throught the Kolmogorov-Smirnov test, which reciews a p-value that is infitesimally small.  

In [23]:
revenue_df = df.sort_values(by=['movie_identifier', 'oscar_nominated'], ascending=False)
revenue_df = revenue_df.drop_duplicates(subset='movie_identifier', keep='first')
revenue_nominated_df = revenue_df[revenue_df['oscar_nominated'] == True]
revenue_not_nominated_df = revenue_df[revenue_df['oscar_nominated'] == False]

In [74]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=revenue_nominated_df['box_office_revenue'],
    histnorm='probability density',
    name='Nominated',
    marker_color='blue',
    opacity=0.7
))

fig.add_trace(go.Histogram(
    x=revenue_not_nominated_df['box_office_revenue'],
    histnorm='probability density',
    name='Not nominated',
    opacity=0.7,
    marker_color='red'
))

fig.add_shape(
    type="line",
    x0=np.mean(revenue_nominated_df['box_office_revenue']),
    y0=0.000000000001,
    x1=np.mean(revenue_nominated_df['box_office_revenue']),  
    y1=0.0001,
    line=dict(color="blue", dash="dash"),
    name="Mean nominated", 
    showlegend=True
)

fig.add_shape(
    type="line",
    x0=np.mean(revenue_not_nominated_df['box_office_revenue']),
    y0=0.000000000001,
    x1=np.mean(revenue_not_nominated_df['box_office_revenue']),
    y1=0.0001,
    line=dict(color="red", dash="dash"),
    name="Mean not nominated", 
    showlegend=True
)

fig.update_layout(
    title="Revenue distribution for movies with and without nominated performances",
    xaxis_title="Box Office Revenue",
    xaxis_range=[0,10**9],
    yaxis_title="Density",
    yaxis_type='log',
    barmode='overlay',
    template="plotly"
)

fig.show()

In [71]:
scipy.stats.kstest(revenue_not_nominated_df['box_office_revenue'], revenue_nominated_df['box_office_revenue'])

KstestResult(statistic=0.2406674949923808, pvalue=3.0894188983499494e-19, statistic_location=25143818, statistic_sign=1)

In [49]:
print(np.round(np.mean(revenue_not_nominated_df['box_office_revenue']),2))
print(np.round(np.mean(revenue_nominated_df['box_office_revenue']),2))

56522471.41
90997341.69


In [53]:
not_wo_max = revenue_not_nominated_df[revenue_not_nominated_df['box_office_revenue'] !=revenue_not_nominated_df['box_office_revenue'].max()]
nom_wo_max = revenue_nominated_df[revenue_nominated_df['box_office_revenue'] !=revenue_nominated_df['box_office_revenue'].max()]


print(np.round(np.mean(not_wo_max['box_office_revenue']),2))
print(np.round(np.mean(nom_wo_max['box_office_revenue']),2))

56035033.52
85668143.31
