In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Execute the data_pipeline notebook through IPython's %run magic before anything is loaded.
# NOTE(review): presumably this is what writes the cache/*.parquet files read in the next cell - confirm.
%run data_pipeline.ipynb

In [None]:
import pandas as pd

# Read the four cached parquet tables in one pass; the order of the paths
# matches the order of the names on the left-hand side.
genres_df, languages_df, countries_df, data_df = map(
    pd.read_parquet,
    (
        'cache/genres.parquet',
        'cache/languages.parquet',
        'cache/countries.parquet',
        'cache/data.parquet',
    ),
)

# Store the Oscar flag as floats; later cells compare it against 1 and 0.
data_df = data_df.assign(oscar_nominated=data_df['oscar_nominated'].astype(float))

### Find the number of movies each actor has starred in up to and including the current movie

In [None]:
# Drop rows without a release year and order the remaining rows chronologically,
# so every actor's rows run from the earliest to the latest movie.
data_df = data_df.dropna(subset=['year']).sort_values(by='year')

# Running 1-based position of each row within its actor's rows
# (cumcount is 0-based, hence the + 1).
data_df = data_df.assign(
    number_of_movies_starred_in=data_df.groupby('actor_identifier').cumcount() + 1
)

### Find distribution of number of movies actors star in

In [None]:
# The running movie count is unreliable for early years because movies made
# before the data starts are not counted. Restricting to movies after 2000
# limits the problem to actors who starred in a movie prior to 1928 and were
# nominated for an Oscar after 2000, which is unlikely.
# NOTE(review): the "never nominated" frame keeps every row whose flag is 0;
# whether that really excludes actors nominated for another movie depends on
# how `oscar_nominated` is defined upstream - confirm.
after_2000 = data_df['year'] > 2000
was_nominated = data_df['oscar_nominated'] == 1
not_nominated = data_df['oscar_nominated'] == 0

# Smallest running movie count per actor among the nominated rows.
oscar_nominated_plot_df = (
    data_df.loc[was_nominated & after_2000]
    .groupby('actor_identifier')['number_of_movies_starred_in']
    .min()
    .reset_index(name='movies_starred_in')
)
# Largest running movie count per actor among the non-nominated rows.
not_oscar_nominated_plot_df = (
    data_df.loc[not_nominated & after_2000]
    .groupby('actor_identifier')['number_of_movies_starred_in']
    .max()
    .reset_index(name='movies_starred_in')
)

fig, axs = plt.subplots(2, 1, figsize=(12, 6))

# One histogram per group: (axes, data, title).
panels = (
    (axs[0], not_oscar_nominated_plot_df,
     'Number of movies starred in by actors never nominated for an Oscar'),
    (axs[1], oscar_nominated_plot_df,
     'Number of movies starred in by actors before getting nominated for their first Oscar'),
)
for ax, frame, title in panels:
    sns.histplot(data=frame, x='movies_starred_in', ax=ax, log_scale=False)
    ax.set_title(title)
    ax.set_xlabel('Number of movies starred in')

plt.tight_layout()
plt.show()

In [None]:
# Report the mean per-actor movie count of each group, in the same order as the plots.
frames_by_message = {
    'Mean number of movies starred in by actors never nominated for an Oscar: ': not_oscar_nominated_plot_df,
    'Mean number of movies starred in by actors before getting nominated for their first Oscar: ': oscar_nominated_plot_df,
}
for message, frame in frames_by_message.items():
    print(message, frame['movies_starred_in'].mean())

In [None]:
# Two-sample t-test on the mean number of movies in the two groups.
# Nothing guarantees that the groups have equal variances (or similar sizes),
# so do not assume it: equal_var=False runs Welch's t-test instead of
# Student's t-test, which is what ttest_ind does by default.
from scipy.stats import ttest_ind

ttest_result = ttest_ind(
    not_oscar_nominated_plot_df['movies_starred_in'],
    oscar_nominated_plot_df['movies_starred_in'],
    equal_var=False,
)
print('P-value: ', ttest_result.pvalue)

The p-value is below 5%, so we reject the null hypothesis that the two groups have the same mean number of movies and conclude that they differ.

We conclude that actors who were nominated for an Oscar in the 21st century had, on average, starred in 14 movies.

### Does the chance of an Oscar nomination increase because you have more entries (as in more movies) or because of something else? (movies after 2000)

In [None]:
# Treat every movie as independent of the others and look only at movies after 2000.
filtered_df = data_df.loc[data_df['year'] > 2000]
nominated_df = filtered_df.loc[filtered_df['oscar_nominated'] == 1]
total_rows = len(filtered_df)

# Share of rows flagged as nominated = probability of a nomination from one movie.
iid_p = len(nominated_df) / total_rows
print('Probability of getting nominated for an Oscar after starring in one movie: ', iid_p)

movie_counts = range(1, 25)

# Nominated rows whose running movie count is at most i, as a share of all rows.
actual_probabilities = [
    len(nominated_df.loc[nominated_df['number_of_movies_starred_in'] <= i]) / total_rows
    for i in movie_counts
]
# Probability of at least one nomination in i independent movies.
iid_probabilities = [1 - (1 - iid_p) ** i for i in movie_counts]

plt.plot(movie_counts, actual_probabilities, label='Actual probabilities')
# IID baseline, currently disabled:
# plt.plot(movie_counts, iid_probabilities, label='IID probabilities')
plt.xlabel('Number of movies starred in')
plt.ylabel('Probability of getting nominated for an Oscar')
plt.legend()
plt.show()





In [None]:
def _latest_per_actor(rows):
    """Return one row per actor with the last running rating mean and movie count.

    `rows` must contain 'actor_identifier', 'running_average_rating' and
    'number_of_movies_starred_in'. The rating column keeps its legacy output
    name 'average_box_office' so existing references keep working; it holds a
    rating, not box office revenue.
    """
    return rows.groupby('actor_identifier').agg(
        average_box_office=('running_average_rating', 'last'),
        number_of_movies_starred_in=('number_of_movies_starred_in', 'last'),
    ).reset_index()


# Running mean of the movie ratings for every actor, row by row.
# This relies on data_df still being ordered by year (as it was when
# number_of_movies_starred_in was computed) so the sum and the counter line up.
# The mean is written to its own column: overwriting 'average_rating' would
# make this cell non-idempotent, because a re-run would average the averages.
# NOTE(review): cumsum skips missing ratings while the counter includes those
# movies, which would pull the mean down - confirm whether ratings can be missing.
data_df['cumulative_rating'] = data_df.groupby('actor_identifier')['average_rating'].cumsum()
data_df['running_average_rating'] = data_df['cumulative_rating'] / data_df['number_of_movies_starred_in']

# Split the rows by the nomination flag.
# NOTE(review): the first panel is titled "before their first Oscar nomination",
# but the 'last' aggregation takes each actor's latest flagged row - confirm intent.
oscar_nominated_df = data_df.loc[data_df['oscar_nominated'] == 1]
non_oscar_nominated_df = data_df.loc[data_df['oscar_nominated'] == 0]

# Disabled alternative that splits on the year of the first nomination:
# oscar_nominated_grouped_df = oscar_nominated_df.groupby('actor_identifier').agg(
#     first_oscar_nominated_year=('year', 'min')).reset_index()

# before_oscar_nominated_df = oscar_nominated_grouped_df.merge(data_df, on='actor_identifier')
# before_oscar_nominated_df = before_oscar_nominated_df.loc[before_oscar_nominated_df['year'] <= before_oscar_nominated_df['first_oscar_nominated_year']]

# after_oscar_nominated_df = oscar_nominated_grouped_df.merge(data_df, on='actor_identifier')
# after_oscar_nominated_df = after_oscar_nominated_df.loc[after_oscar_nominated_df['year'] > after_oscar_nominated_df['first_oscar_nominated_year']]

oscar_nominated_df = _latest_per_actor(oscar_nominated_df)
non_oscar_nominated_df = _latest_per_actor(non_oscar_nominated_df)

fig, axs = plt.subplots(1, 2, figsize=(12, 6), layout='tight', sharey=True, sharex=True)

sns.scatterplot(data=oscar_nominated_df, x='number_of_movies_starred_in', y='average_box_office', ax=axs[0])
sns.scatterplot(data=non_oscar_nominated_df, x='number_of_movies_starred_in', y='average_box_office', ax=axs[1])

axs[0].set_title('Actors before their first Oscar nomination')
axs[1].set_title('Actors who have never been nominated for an Oscar at the time of their last movie')

axs[0].set_xlabel('Number of movies starred in')
axs[1].set_xlabel('Number of movies starred in')
# The plotted value is the running mean rating, so label it as such.
axs[0].set_ylabel('Average rating up until that point')

# No legend: neither scatter plot has labelled artists, so plt.legend() would
# only emit a warning and draw an empty box.
plt.show()

# Disabled line-plot variant of the same comparison:
# fig, ax = plt.subplots(1, 1, figsize=(12, 6))

# sns.lineplot(data=before_oscar_nominated_df, x='number_of_movies_starred_in', y='running_average_rating', orient='x', ax=ax)
# sns.lineplot(data=non_oscar_nominated_df, x='number_of_movies_starred_in', y='running_average_rating', orient='x', ax=ax)
# sns.lineplot(data=after_oscar_nominated_df, x='number_of_movies_starred_in', y='running_average_rating', orient='x', ax=ax)