In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

from scipy import stats

In [None]:
# Execute the data pipeline notebook before loading any data.
# NOTE(review): presumably this is what produces the cache/*.csv files read in the next cell — confirm.
%run data_pipeline.ipynb

In [None]:
# Load the two cached CSV files; in both, the first column becomes the row index.
df, unclean_df = (
    pd.read_csv(csv_path, sep=',', index_col=0)
    for csv_path in ('cache/data.csv', 'cache/unclean_data.csv')
)

In [None]:
# Preview the first rows of `df`.
df.head()

## **Actor Network Analysis**

In [None]:
import networkx as nx
from itertools import combinations

In [None]:
# Rows whose `oscar_nominated` flag is True, kept as a separate frame.
is_nominated_row = df['oscar_nominated'] == True
oscar_nominated = df.loc[is_nominated_row]

# NOTE: this is an alias, not a copy — `df_new_movies` and `df` are the same object.
df_new_movies = df


In [None]:
# Sanity check: `df_new_movies` is an alias of `df`, so both lengths are identical.
print(len(df_new_movies), len(df))

In [None]:
# Build the actor collaboration graph: one node per actor and one edge per pair
# of actors that appear in the same movie. Every edge stores the titles of the
# movies the two actors share in its `movies` attribute.
G = nx.Graph()

# An actor can occur in many rows (one per movie). Flag the actor as nominated
# if ANY of those rows is nominated; adding nodes row by row would let the last
# row silently overwrite an earlier nomination.
actor_nominated = (
    df_new_movies['oscar_nominated']
    .eq(True)
    .groupby(df_new_movies['actor_identifier'], sort=False)
    .any()
)
for actor, nominated in actor_nominated.items():
    G.add_node(actor, oscar_nominated=bool(nominated))

# Group by the movie identifier rather than the title so that distinct movies
# sharing a title (e.g. remakes) do not link their casts together.
for _, cast in df_new_movies.groupby('movie_identifier'):
    movie = cast['title'].iloc[0]
    # De-duplicate the cast: a repeated actor row would otherwise create a
    # self-loop, and missing actor ids would create attribute-less nodes.
    actors = cast['actor_identifier'].dropna().unique().tolist()
    for actor1, actor2 in combinations(actors, 2):
        if G.has_edge(actor1, actor2):
            G[actor1][actor2]['movies'].append(movie)
        else:
            G.add_edge(actor1, actor2, movies=[movie])

# Node colours for plotting: gold for nominated actors, blue for everyone else.
colors = {True: 'gold', False: 'blue'}
node_colors = [colors[G.nodes[node]['oscar_nominated']] for node in G.nodes]


In [None]:
# Split node degrees into two lists by the node's `oscar_nominated` attribute,
# then print the mean degree of each group (not nominated first).
degree_dict = dict(G.degree())
average_nominated_degree, average_not_nominated = [], []
for node, degree in degree_dict.items():
    if G.nodes[node]['oscar_nominated']:
        target = average_nominated_degree
    else:
        target = average_not_nominated
    target.append(degree)

print(np.mean(average_not_nominated))
print(np.mean(average_nominated_degree))

In [None]:
# Draw the collaboration graph; every node is annotated with its degree.
plt.figure(figsize=(30, 24))
# Fixed seed: the spring layout starts from random positions, so without a seed
# the figure changes on every run.
pos = nx.spring_layout(G, k=1, iterations=50, seed=42)
# Name labels are disabled (with_labels=False), so no font options are needed.
nx.draw(G, pos, with_labels=False, node_color=node_colors, node_size=500)
degree_dict = dict(G.degree())
for node, degree in degree_dict.items():
    x_coord, y_coord = pos[node]
    plt.text(x_coord, y_coord, str(degree), fontsize=12, ha='center', color='black')

plt.title("Actor Collaboration Graph with Colored Nodes")
plt.show()

In [None]:
# Collaboration graph restricted to Oscar-nominated rows: two actors are linked
# when both have a nominated row for the same movie. Nodes are only created
# through edges, so an actor without a nominated co-star does not appear.
# NOTE: this rebinds `G`; any graph previously stored under that name is replaced.
G = nx.Graph()
# Group by the movie identifier rather than the title so that distinct movies
# sharing a title do not link their casts together.
for _, cast in oscar_nominated.groupby('movie_identifier'):
    movie = cast['title'].iloc[0]
    # De-duplicate the cast so a repeated actor row cannot create a self-loop.
    actors = cast['actor_identifier'].dropna().unique().tolist()
    for actor1, actor2 in combinations(actors, 2):
        if G.has_edge(actor1, actor2):
            G[actor1][actor2]['movies'].append(movie)
        else:
            G.add_edge(actor1, actor2, movies=[movie])

In [None]:
# Draw the current graph `G` with small unlabeled nodes and thin edges.
# No explicit layout is passed (pos=None), so node placement is left to networkx.
nx.draw_networkx(G, pos=None, arrows=None, with_labels=False, node_size=35, width=0.5)
plt.title('Oscar Nominated Actors')

## **Year bias**

In [None]:
from scipy.interpolate import CubicSpline

Constructing the DataFrame used for the year analysis

In [None]:
# Movie releases per year: reduce the data to one row per movie (the first
# occurrence of each identifier is kept), then count the rows in each year.
movie_df = df[['movie_identifier', 'year']].drop_duplicates(subset='movie_identifier', keep='first')
movie_releases = (
    movie_df
    .groupby('year')
    .size()
    .reset_index(name='count')
)
movie_releases.head()

In [None]:
# Nominated rows per year, reported for every year that has at least one
# release; years without any nomination are filled with 0.
nominations_by_year = oscar_nominated.groupby('year').size().reset_index(name='count')
years = pd.DataFrame({'year': movie_releases['year'].unique()})
oscar_nominations = pd.merge(years, nominations_by_year, on='year', how='left').fillna(0)
oscar_nominations.head()

In [None]:
# Nominated movies per year, in two steps:
#   1. number of nominated rows (with a title) per (year, movie);
#   2. number of movies per year in that intermediate table.
nominations_per_movie_per_year = (
    oscar_nominated
    .groupby(['year', 'movie_identifier'])['title']
    .count()
    .reset_index(name='count')
)
unique_nominated_movies_per_year = (
    nominations_per_movie_per_year
    .groupby(['year'])['movie_identifier']
    .count()
    .reset_index(name='count')
)
unique_nominated_movies_per_year.head()

In [None]:
# Attach the nominated-movie count to every release year. Both inputs carry a
# `count` column, which the merge suffixes with _x / _y before they are renamed.
# Years without nominated movies get 0.
releases_and_nominated = pd.merge(movie_releases, unique_nominated_movies_per_year, on='year', how='left')
releases_and_nominated = releases_and_nominated.fillna(0)
renamed_columns = {'count_x' : 'releases', 'count_y':'nominated movies'}
movie_releases_with_nominations = releases_and_nominated.rename(columns=renamed_columns)
movie_releases_with_nominations.head()

In [None]:
# Add the yearly number of Oscar nominations as a third count column
# (0 where a year has none).
with_nomination_counts = pd.merge(movie_releases_with_nominations, oscar_nominations, on='year', how='left')
with_nomination_counts = with_nomination_counts.fillna(0)
releases_nominations_counts_df = with_nomination_counts.rename(columns={'count' : 'oscar nominations'})
releases_nominations_counts_df.head()

In [None]:
# Share of each year's releases that were nominated. Despite the column name,
# the value is a fraction, not a percentage.
nominated_share = releases_nominations_counts_df['nominated movies'].div(
    releases_nominations_counts_df['releases']
)
releases_nominations_counts_df['%nominated'] = nominated_share
releases_nominations_counts_df.head()

**Plotting**

In [None]:
# Grouped bar chart of the yearly counts (left axis) with the share of nominated
# movies drawn as a smoothed line on a secondary axis (right axis).
fig, ax = plt.subplots(figsize=(10, 6))

x = releases_nominations_counts_df['year']
y = releases_nominations_counts_df['%nominated']

# Three bars per year, each a third of a 0.8-wide slot. With the default bar
# width of 0.8 and offsets of +/-0.4 the series would cover each other and
# spill into the neighbouring years.
bar_width = 0.8 / 3
ax.bar(x + bar_width, releases_nominations_counts_df['releases'], width=bar_width, label='Releases', alpha=0.8)
ax.bar(x - bar_width, releases_nominations_counts_df['oscar nominations'], width=bar_width, label='Nominations', color='red')
ax.bar(x, releases_nominations_counts_df['nominated movies'], width=bar_width, label='Nominated Movies', color='#00008B')

ax.set_xlabel('Year')
# Pad by half a year so the outermost bar groups are not clipped.
ax.set_xlim(x.min() - 0.5, x.max() + 0.5)
# The axis carries releases, nominations and nominated movies, hence a generic label.
ax.set_ylabel('Count')
ax.set_title('Movie releases and nominations per year')

# Smooth the yearly share with a cubic spline purely for display.
cs = CubicSpline(x, y)
x_smooth = np.linspace(x.min(), x.max(), 500)
y_smooth = cs(x_smooth)

ax2 = ax.twinx()
ax2.plot(x_smooth, 100*y_smooth, color='black', label='Nominated movies (%)', alpha=0.8, linewidth=1)
ax2.set_ylabel('Nominated Movies (%)')

# A single legend covering both axes instead of one legend per axis.
bar_handles, bar_labels = ax.get_legend_handles_labels()
line_handles, line_labels = ax2.get_legend_handles_labels()
ax2.legend(bar_handles + line_handles, bar_labels + line_labels)
plt.show()


## Genre

In [None]:
# Work on a copy so that the genre columns modified and added below leave `df` untouched.
genre_df = df.copy()

In [None]:
def _split_genre_string(raw):
    """Parse one stringified genre list into a list of genre names.

    The outer brackets are stripped, the remainder is split on ', ', the first
    and last character of every item is removed (presumably the surrounding
    quotes — confirm against the data) and empty items are dropped.
    The literal '[]' yields an empty list.
    """
    if raw == '[]':
        return []
    unquoted = (item[1:-1] for item in raw[1:-1].split(', '))
    return [item for item in unquoted if item != '']


# Missing genre entries are treated as an empty list.
genre_df['genres'] = genre_df['genres'].fillna('[]')
genre_lists = genre_df['genres'].apply(_split_genre_string)
# Every distinct genre that occurs anywhere in the data.
all_genres = set(genre for sublist in genre_lists for genre in sublist)

In [None]:
# One-hot encode the genres: genre_dict[g][i] is 1 when row i lists genre g, else 0.
# Comprehension variables are used on purpose: a module-level loop variable named
# `list` would shadow the builtin for the rest of the notebook session.
genre_sets = [set(row_genres) for row_genres in genre_lists]
genre_dict = {
    g: [1 if g in row_genres else 0 for row_genres in genre_sets]
    for g in all_genres
}

In [None]:
# Append the 0/1 genre indicator columns to the right of the existing columns.
# Both frames get a fresh positional index so that rows line up one-to-one.
# NOTE: `genre_df` is rebound to the widened frame, so re-running this cell
# appends the genre columns a second time.
genre_indicator_df = pd.DataFrame(genre_dict).reset_index(drop=True)
genre_df = pd.concat([genre_df.reset_index(drop=True), genre_indicator_df], axis=1)

In [None]:
# Subset of the genre-encoded frame whose `oscar_nominated` flag is True.
nominated_rows = genre_df['oscar_nominated'] == True
genre_df_oscar_nominated = genre_df.loc[nominated_rows]

In [None]:
# Number of columns in the nominated subset (original columns plus the genre indicator columns).
genre_df_oscar_nominated.columns.shape

In [None]:
# Number of rows tagged with each genre, most frequent genre first.
# The indicator columns are selected by name: a positional slice such as
# `iloc[:, 20:]` silently breaks as soon as the number of leading columns changes.
genre_frequencies = genre_df[[*genre_dict]].sum(axis=0).sort_values(ascending=False)

In [None]:
# Number of nominated rows tagged with each genre. The indicator columns are
# selected by name rather than by a hard-coded column position.
nominated_genre_frequencies = genre_df_oscar_nominated[[*genre_dict]].sum(axis=0)
# Put the genres in the same order as the overall frequencies.
nominated_genre_frequencies = nominated_genre_frequencies.reindex(genre_frequencies.index)

In [None]:
# Both series must list the same genres in the same order, because the bar chart
# below draws them at shared x positions. `Index.equals` compares the labels
# element by element; comparing `index.all()` results only compares two scalars.
assert nominated_genre_frequencies.index.equals(genre_frequencies.index), \
    "nominated and overall genre frequencies are not aligned on the same genres"

In [None]:
def _standardize_then_rescale(frequencies):
    """Z-score `frequencies`, shift by the absolute minimum, then min-max rescale.

    The three steps are applied in exactly this order; the final min-max step
    maps the smallest value to 0 and the largest to 1.
    """
    scaled = (frequencies - frequencies.mean()) / frequencies.std()
    scaled = scaled + abs(min(scaled))
    lowest, highest = min(scaled), max(scaled)
    return (scaled - lowest) / (highest - lowest)


# Bring both frequency series onto the same 0-1 scale.
nominated_genre_frequencies_normalized = _standardize_then_rescale(nominated_genre_frequencies)
genre_frequencies_normalized = _standardize_then_rescale(genre_frequencies)


In [None]:
# Compare the normalized genre frequencies of all rows with those of the
# nominated rows. Genres are ordered by overall frequency (most frequent first).
fig, ax = plt.subplots(figsize=(10, 6))
positions = np.arange(1, len(genre_frequencies) + 1)
bar_width = 0.4

# `genre_frequencies` is summed over every row of `genre_df` (nominated or not),
# so it is labelled as the overall distribution rather than "Not nominated".
# The two series are drawn side by side so that neither hides the other.
ax.bar(positions - bar_width / 2, genre_frequencies_normalized, width=bar_width, color='red', label='All (nominated and not nominated)')
ax.bar(positions + bar_width / 2, nominated_genre_frequencies_normalized, width=bar_width, label='Nominated')

ax.set_xlabel('Genre (ordered by overall frequency)')
ax.set_ylabel('Normalized frequency')
ax.set_title('Distribution of genres: all vs. nominated')
ax.legend()
plt.show()

In [None]:
import scipy

# Kolmogorov-Smirnov test on the two normalized frequency vectors.
# NOTE(review): given two arrays, kstest treats each as a sample of values and does
# not pair them by genre — confirm this is the intended way to compare the genre
# distributions (a test on the raw per-genre counts may be more appropriate).
ks_result = scipy.stats.kstest(nominated_genre_frequencies_normalized, genre_frequencies_normalized)
ks_result

We can reject the null hypothesis that the two normalized genre-frequency samples are drawn from the same distribution.