In [323]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import plotly.graph_objects as go

In [324]:
%run data_pipeline.ipynb

########## Data pipeline ##########

Preparing CMU data
379 movies shared both name and release year, dropping
314 movies had actors with the same name, dropping

Merging Oscar dataset, after merge:
Number of different Oscar nominated movies in dataset: 952 in total 63968 different movies
Number of different Oscar nominated actors in dataset: 801 in total 134907 different actors
Number of Oscar nominated rows: 1443

Merging IMDb dataset, after merge:
Number of movies with ratings: 36758
Oscar nominated movies with rating: 939
Number of rows in data before cleaning:  443504
Number of rows in data after cleaning:  23819
Number of rows where age is < 0: 7 . Dropping these rows

FINAL STATE OF DATA
Number of rows:  23812
Number of different Oscar nominated movies in dataset: 394 in total 5987 different movies
Number of different Oscar nominated actors in dataset: 284 in total 2959 different actors
Number of Oscar nominated rows: 519
Processing done, dataset written to cache/data.csv


In [684]:
# Read data
df = pd.read_csv('cache/data.csv', sep=',', index_col=0)
unclean_df = pd.read_csv('cache/unclean_data.csv', sep=',', index_col=0)

In [685]:
df.head()

Unnamed: 0,title,box_office_revenue,runtime,languages,countries,genres,movie_identifier,actor_gender,actor_height,actor_ethnicity,...,identifier,category,winner,oscar_nominated,year,average_rating,number_of_votes,number_of_movies_starred_in,average_rating_previous_movies,average_box_office_revenue_previous_movies
140029,Down to You,24419914,92,"['French Language', 'English Language']",['United States of America'],"['Romantic comedy', 'Romance Film', 'Drama', '...",down to you_2000,M,1.88,/m/0xnvg,...,down to you_2000_adam carolla,,,False,2000,5.0,15878,1,5.0,24419914.0
60320,The Bible: In The Beginning,34900023,171,['English Language'],"['United States of America', 'Italy']","['Christian film', 'Drama', 'Epic', 'World cin...",the bible in the beginning_1966,M,1.85,/m/03bkbh,...,the bible in the beginning_1966_richard harris,,,False,1966,6.2,6385,1,6.2,34900023.0
389034,Hawaii,34562222,161,['English Language'],['United States of America'],"['Period piece', 'Roadshow theatrical release'...",hawaii_1966,M,1.85,/m/03bkbh,...,hawaii_1966_richard harris,,,False,1966,6.5,3708,1,12.7,69462245.0
130002,Camelot,31102578,178,['English Language'],['United States of America'],"['Costume drama', 'Musical', 'Roadshow theatri...",camelot_1967,M,1.85,/m/03bkbh,...,camelot_1967_richard harris,,,False,1967,6.6,7624,2,9.65,50282411.5
182566,Caprice,4075000,95,['English Language'],['United States of America'],"['Romantic comedy', 'Crime Fiction', 'Mystery'...",caprice_1967,M,1.85,/m/03bkbh,...,caprice_1967_richard harris,,,False,1967,5.5,1761,3,8.266667,34879941.0


## Genre

To maximize the likelihood of winning an oscar, an actor has to choose genre wisely. We will below analyse what genres are relatively likely to to have oscar nominated actors, and what genres have relatively few actor nominations. 

Constructing dataframe for analysis

In [None]:
# Making a copy for this part
genre_df = df.copy()
genre_df_new = df.copy()
genre_df_new = genre_df_new[genre_df_new['year'] > 1999]

In [None]:
# Making a copy for this part
genre_df = df.copy()
genre_df_new = df.copy()
genre_df_new = genre_df_new[genre_df_new['year'] > 1999]

# One nan value, filling with no genres, i.e. [] 
genre_df['genres'] = genre_df['genres'].fillna('[]')
genre_df_new['genres'] = genre_df_new['genres'].fillna('[]')


# Splitting each entry in the genre column into lists of genres. 
genre_lists = genre_df['genres'].apply(lambda x: x[1:-1].split(', ') if x != '[]' else [])
genre_lists_new = genre_df_new['genres'].apply(lambda x: x[1:-1].split(', ') if x != '[]' else [])

# Removing "" from each list in genre_lists: "'Thriller'" -> 'Thriller'
genre_lists = genre_lists.apply(lambda x: [g[1:-1] for g in x])
genre_lists_new = genre_lists_new.apply(lambda x: [g[1:-1] for g in x])


# Removing emtpy entries in the lists: 
genre_lists = genre_lists.apply(lambda x: [g for g in x if g != ''] )
genre_lists_new = genre_lists_new.apply(lambda x: [g for g in x if g != ''] )


# Creating a set of all genres. 
all_genres = set([genre for sublist in genre_lists for genre in sublist])
all_genres_new = set([genre for sublist in genre_lists_new for genre in sublist])


# Resetting index
genre_df = genre_df.reset_index(drop=True)
genre_df_new = genre_df_new.reset_index(drop=True)

Now we build a dictionary mapping each genre to a list of 1s and 0s that indicates for each movie if they had that genre. 

In [760]:
# Adding all genres as keys to the dictionary
genre_dict = {}
for g in all_genres:
    genre_dict[g] = []

genre_dict_new = {}
for g in all_genres_new: 
    genre_dict_new[g] = []

# For each movie, for each genre, we append a 1 if the genre was in the movie, else 0
for movie_genres in genre_lists:  
    for g in all_genres:
        if g in movie_genres: 
            genre_dict[g].append(1)
        else: genre_dict[g].append(0)

for movie_genres in genre_lists_new:  
    for g in all_genres_new:
        if g in movie_genres: 
            genre_dict_new[g].append(1)
        else: genre_dict_new[g].append(0)

In [761]:
# Each genre has now a list indicating if what movies has that genre
print('Length: ', len(genre_dict['Thriller']))
print(genre_dict['Thriller'][:10])

Length:  23812
[0, 0, 0, 0, 1, 0, 1, 0, 0, 1]


The first three movies have the genre thriller. We verify this below. 

In [762]:
genre_df['genres'].head()

0    ['Romantic comedy', 'Romance Film', 'Drama', '...
1    ['Christian film', 'Drama', 'Epic', 'World cin...
2    ['Period piece', 'Roadshow theatrical release'...
3    ['Costume drama', 'Musical', 'Roadshow theatri...
4    ['Romantic comedy', 'Crime Fiction', 'Mystery'...
Name: genres, dtype: object

Now we add our dictionary data to the genre_df dataframe. 

In [763]:
genre_df = pd.concat([genre_df.reset_index(drop=True), pd.DataFrame(genre_dict).reset_index(drop=True)], axis=1)
genre_df_new = pd.concat([genre_df_new.reset_index(drop=True), pd.DataFrame(genre_dict_new).reset_index(drop=True)], axis=1)

In [764]:
# making a version containing only movies with nominated performances
genre_df_oscar_nominated = genre_df[genre_df['oscar_nominated'] == True]
genre_df_new_oscar_nominated = genre_df_new[genre_df_new['oscar_nominated'] == True]

In [765]:
print('Nr. of columns in new df: ', genre_df_oscar_nominated.columns.shape)
print('Example genres: ', genre_df_oscar_nominated.columns[100:104].values)

Nr. of columns in new df:  (283,)
Example genres:  ['Tragedy' 'Historical Epic' 'Alien Film' 'Road movie']


Now we take only the genre columns, from column index 25. We sum each column, getting the nr. of appearances for each genre.

In [None]:
genre_frequencies = genre_df.iloc[:,23:].sum(axis=0).sort_values(ascending=False)
genre_frequencies_new = genre_df_new.iloc[:,23:].sum(axis=0).sort_values(ascending=False)


nominated_genre_frequencies = genre_df_oscar_nominated.iloc[:,23:].sum(axis=0)
nominated_genre_frequencies = nominated_genre_frequencies.reindex(genre_frequencies.index)

nominated_genre_frequencies_new = genre_df_new_oscar_nominated.iloc[:,23:].sum(axis=0)
nominated_genre_frequencies_new = nominated_genre_frequencies_new.reindex(genre_frequencies_new.index)

In [767]:
assert nominated_genre_frequencies.index.all() == genre_frequencies.index.all()
assert nominated_genre_frequencies_new.index.all() == genre_frequencies_new.index.all()

In [768]:
genre_frequencies_df = pd.DataFrame({'all': genre_frequencies, 'nominated': nominated_genre_frequencies})
genre_frequencies_new_df = pd.DataFrame({'all': genre_frequencies_new, 'nominated': nominated_genre_frequencies_new})

In [None]:
import plotly.graph_objects as go
x_values = np.arange(1, len(genre_frequencies_df) + 1)
not_nominated = genre_frequencies_df['all']
nominated = genre_frequencies_df['nominated']

fig = go.Figure()

fig.add_trace(go.Bar(
    x=x_values,
    y=not_nominated,
    name='Not nominated',
    marker_color='red',
   
))

fig.add_trace(go.Bar(
    x=x_values,
    y=nominated,
    name='Nominated',
    marker_color='blue'
))

fig.update_layout(
    title='Distribution of genres in nominated and not nominated movies',
    xaxis_title='Genres (sorted)',
    yaxis_title='Nr. of appearances (log)',
    yaxis_type='log',
    bargap=0.0, 
    bargap=0.0
)
fig.show()


We can clearly see that the distributions are similair, but that the nominated movies have several genres with no nominations. To test if these are the same probability distributions we use the Kolmogorov-Smirnov test implemented in scipy stats. 

In [770]:
scipy.stats.kstest(nominated_genre_frequencies, genre_frequencies)

KstestResult(statistic=0.6192307692307693, pvalue=6.32944883717225e-47, statistic_location=7, statistic_sign=1)

P-value: 6.3295e-47
statistic= 0.6192

The extremely small P-value indicates that we can confidently say that the distributions are different. However, a statistic of 0.62 indicates that eventhough the distributions are different, they are not that dissimilar. 

We take a further look into the genres that have no nominated appearances: 

In [None]:
# Creating a new column that will be true if it has no nominated appearances. 
genre_frequencies_df['top_non_nominated_genres'] = False
mask = genre_frequencies_df['nominated'] == 0
genre_frequencies_df.loc[mask, 'top_non_nominated_genres'] = True


In [772]:
# Selecting all the genres that does not have any nominated appearances and saving to not_nominated_genres_df. 
not_nominated_genres_df = genre_frequencies_df.loc[genre_frequencies_df['top_non_nominated_genres'] == True,:].reset_index()
not_nominated_genres_df = not_nominated_genres_df.rename(columns={'index':'genre'})
print(not_nominated_genres_df.head())
print(not_nominated_genres_df.shape)

                genre   all  nominated  top_non_nominated_genres
0           Animation  1104          0                      True
1           Slapstick   786          0                      True
2  Computer Animation   505          0                      True
3           Absurdism   245          0                      True
4       Doomsday film   243          0                      True
(116, 4)


116 Genres have no oscar nominated actors. Some of these genres are self explanatory e.g. Animation. However we need further analysis to get a deeper understanding. 

In [None]:
# Adding a new columns
genre_frequencies_df['share_nominated'] = genre_frequencies_df['nominated'] / genre_frequencies_df['all']
# Sorting the dataframe on nominated movies after removing the genres with 1 nomination. 
genre_frequencies_df = genre_frequencies_df[genre_frequencies_df['nominated'] > 1].sort_values(by='nominated', ascending=False)
# Selecting top 20 genres by the nr. of nominated movies in that genre
top_20_df = genre_frequencies_df.iloc[:20,:].sort_values(by='nominated', ascending=False)

## By nr of performances: 

In [776]:
# Plotting 
fig = go.Figure()

fig.add_trace(
    go.Bar(
        y=not_nominated_genres_df.loc[:20,'genre'],  
        x=not_nominated_genres_df.loc[:20,'all'],     
        orientation='h',  
        name="Number of performances. 1928-2012",
        text=not_nominated_genres_df.loc[:20,'all'], 
        textposition='outside',
        marker=dict(color='blue'), 
        opacity=0.7   
    ),
)

fig.update_layout(
    title='Biggest genres without Oscar nominated performance',
    xaxis_title='Nr. of performances',
    yaxis_title='Genres',
    barmode='stack',  
    template='plotly',  
    showlegend=True     
)
fig.show()
fig.write_html('genres_wo_nominations.html', full_html=False, include_plotlyjs='cdn')


Like eluded to earlier, some genres might be self explanatory. However, there are clearly genres worth avoiding for oscar aspiring actors. For instance Absurdims, doomsday film, dystopia and holiday film to name a few.

In [777]:
# Plotting 
fig = go.Figure()

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=[p * f for p, f in zip(top_20_df['share_nominated'], top_20_df['all'])], 
    orientation='h',
    name='Nominated performance',
    marker=dict(color='red'), 
))

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=top_20_df['all'],  
    orientation='h',      
    name='Not nominated performances',
    marker=dict(color='blue'),
    opacity=0.7, 
))

fig.update_layout(
    title='Most popular genres at the Oscars',
    xaxis_title='Number of appearances',
    yaxis_title='Genres',
    barmode='stack', 
    template='plotly',  
    showlegend=True 
)
fig.show()
fig.write_html('most_popular_genres.html', full_html=False, include_plotlyjs='cdn')

There are clearly genres worth avoiding for oscar aspiring actors. For instance Absurdims, doomsday film, dystopia and holiday film to name a few. On the otherhand, there are also genres that have many actor nominations, e.g. Drama, Period, Romantic drama and Biography. However, they have many nominations partly because they have many movies. Hence, we can look at the genres with the highest share of oscar winners. 

In [778]:
genre_frequencies_df = genre_frequencies_df[genre_frequencies_df['nominated'] > 1].sort_values(by='share_nominated', ascending=False)
top_20_df = genre_frequencies_df.iloc[:20,:].sort_values(by='nominated', ascending=False)

In [779]:
fig = go.Figure()

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=[p * f for p, f in zip(top_20_df['share_nominated'], top_20_df['all'])], 
    orientation='h',
    name='Nominated performance',
    marker=dict(color='red'), 
))

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=top_20_df['all'],  
    orientation='h',      
    name='Not nominated performances',
    marker=dict(color='blue'),
    opacity=0.7, 
))

fig.update_layout(
    title='Most popular genres at the Oscars',
    xaxis_title='Number of appearances',
    yaxis_title='Genres',
    barmode='stack', 
    template='plotly',  
    showlegend=True 
)

fig.show()

In [780]:
genre_frequencies_df = genre_frequencies_df[genre_frequencies_df['nominated'] > 1].sort_values(by='nominated', ascending=False)
# Selecting top 20 genres by the share of nominated performances
top_20_df = genre_frequencies_df.iloc[:20,:].sort_values(by='share_nominated', ascending=False)
# Converting to %
top_20_df['%_nominated'] = top_20_df['share_nominated']*100

In [None]:

# Selecting top 20 genres by the share of nominated performances
top_20_df = genre_frequencies_df.iloc[:20,:].sort_values(by='share_nominated', ascending=False)
# Converting to %
top_20_df['%_nominated'] = top_20_df['share_nominated']*100

In [783]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(
    y=top_20_df.index,   
    x=top_20_df['%_nominated'], 
    orientation='h',
    name='% of performances nominated',
    marker=dict(color='blue'), 
    xaxis='x1', 
))

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=top_20_df['all'], 
    orientation='h',
    name='Performances',
    marker=dict(color='red'), 
    xaxis='x2'  
))

fig.update_layout(
    title='Top genres at the Oscars',
    xaxis_title='% of performances nominated', 
    yaxis_title='Genres', 
    barmode='stack', 
    template='plotly',  
    showlegend=True,
    
    xaxis=dict(
        title='% of performances nominated',
        range = [0,40], 
        overlaying='x2',
        
    ),
    
    xaxis2=dict(
        title='Performances',  
        side='top'  
        
    )
)
fig.write_html('genres_by_share2.html', full_html=False, include_plotlyjs='cdn')
fig.show()

The genre with the highest share of oscar nominated performances is New Hollywood with 27% of performances being nominated. Coming second is Biographical film with only 8% of performances being nominated. With a time machine, travelling back to the 1960s-1980s would be the most optimal for an actor winning an oscar. Hence we will need to look at more recent successful genres. 

In [786]:
genre_frequencies_new_df['share_nominated'] = genre_frequencies_new_df['nominated'] /genre_frequencies_new_df['all'] 

In [None]:
# Selecting top 20 genres by the share of nominated performances
top_20_df = genre_frequencies_new_df.iloc[:20,:].sort_values(by='share_nominated', ascending=False)
# Converting to %
top_20_df['%_nominated'] = top_20_df['share_nominated']*100

In [790]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(
    y=top_20_df.index,   
    x=top_20_df['%_nominated'], 
    orientation='h',
    name='% of performances nominated',
    marker=dict(color='blue'), 
    xaxis='x1', 
))

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=top_20_df['all'], 
    orientation='h',
    name='Performances',
    marker=dict(color='red'), 
    xaxis='x2'  
))

fig.update_layout(
    title='Top genres at the Oscars since 2000',
    xaxis_title='% of performances nominated', 
    yaxis_title='Genres', 
    barmode='stack', 
    template='plotly',  
    showlegend=True,
    
    xaxis=dict(
        title='% of performances nominated',
        range = [0,40], 
        overlaying='x2',
        
    ),
    
    xaxis2=dict(
        title='Performances',  
        side='top'  
        
    )
)
fig.write_html('genres_by_share_new.html', full_html=False, include_plotlyjs='cdn')
fig.show()

In recent years, the most popular genres at the oscars has not had as high success rate as the most popular genres in earlier years. Up top is Period piece, Romantic drama and Crime Thriller. 

## By nr of movies: 

In [None]:
genre_movie_df = genre_df.sort_values(by =['movie_identifier', 'oscar_nominated'], ascending=False)

In [None]:
# Counting movie releases per year
genre_movie_df = genre_movie_df.drop_duplicates(subset='movie_identifier', keep='first', inplace=False)

In [None]:
# Counting genre frequencies
genre_frequencies = genre_movie_df.iloc[:,23:].sum(axis=0).sort_values(ascending=False)
nominated_genre_frequencies = genre_df_oscar_nominated.iloc[:,23:].sum(axis=0)
nominated_genre_frequencies = nominated_genre_frequencies.reindex(genre_frequencies.index)

In [None]:
genre_frequencies_df = pd.DataFrame({'all': genre_frequencies, 'nominated': nominated_genre_frequencies})
genre_frequencies_df['share_nominated'] = genre_frequencies_df['nominated'] / genre_frequencies_df['all']

In [None]:
top_20_df = genre_frequencies_df.sort_values(by='nominated', ascending=False).iloc[:20,:]

In [None]:
fig = go.Figure()

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=[p * f for p, f in zip(top_20_df['share_nominated'], top_20_df['all'])], 
    orientation='h',
    name='With nominated performance',
    marker=dict(color='red'), 
))

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=top_20_df['all'],  
    orientation='h',      
    name='Without nominated performance',
    marker=dict(color='blue'),
    opacity=0.7, 
))

fig.update_layout(
    title='Most popular genres at the Oscars; By number of movies',
    xaxis_title='Number of movies ',
    yaxis_title='Genres',
    barmode='stack', 
    template='plotly',  
    showlegend=True 
)

fig.show()


In [None]:
genre_frequencies_df = genre_frequencies_df.drop(['Private military company'])

In [None]:
top_20_df = genre_frequencies_df.sort_values(by='share_nominated', ascending=False).iloc[:20,:].sort_values(by='nominated', ascending=False)
top_20_df['%_nominated'] = top_20_df['share_nominated'] * 100
top_20_df = top_20_df.sort_values(by='%_nominated', ascending=False)

In [None]:
top_20_df

Unnamed: 0,all,nominated,share_nominated,%_nominated
Propaganda film,1,1,1.0,100.0
Movies About Gladiators,2,2,1.0,100.0
British New Wave,1,1,1.0,100.0
Legal drama,6,5,0.833333,83.333333
New Hollywood,59,49,0.830508,83.050847
Tragedy,44,27,0.613636,61.363636
Prison film,5,3,0.6,60.0
Boxing,26,14,0.538462,53.846154
Outlaw biker film,2,1,0.5,50.0
Demonic child,4,2,0.5,50.0


In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    y=top_20_df.index,   
    x=top_20_df['%_nominated'], 
    orientation='h',
    name='% of performances nominated',
    marker=dict(color='Blue'), 
    opacity=0.7
))

fig.add_trace(go.Bar(
    y=top_20_df.index,   
    x=top_20_df['all'], 
    orientation='h',
    name='Performances',
    marker=dict(color='red'), 
))

fig.update_layout(
    title='Most popular genres at the Oscars; By share of movies',
    xaxis_title='Movies',
    yaxis_title='Genres',
    barmode='stack', 
    template='plotly',  
    showlegend=True 
)

fig.write_html('genres_by_share.html', full_html=False, include_plotlyjs='cdn')

fig.show()