In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [2]:

# Scratch check: draw a random ordering of the indices 0..num_nominees-1.
num_nominees = 9

# Drawing every index without replacement yields a random permutation.
np.random.choice(num_nominees, num_nominees, replace = False)

array([0, 8, 2, 4, 6, 3, 5, 7, 1])

In [3]:
# Read the processed results table from disk. Later cells rely on its 'year',
# 'film' and 'Oscar_win' columns plus the predictor columns listed further down.
full_table = pd.read_csv('./data/processed_results/osc_df')

# Setting Up our DataFrame
First, we load the ML-ready DataFrame produced by the table_assemling notebook.

In [4]:
min_year = 1995 # This is the first year that all awards shows have occurred

# Training set: every year strictly between min_year and 2019.
# NOTE(review): the strict `>` drops min_year itself from training even though
# the comment above calls it the first complete year - confirm `>=` was not intended.
train = full_table.loc[((full_table['year'] < 2019) & (full_table['year'] > min_year))]
# Hold-out set: the 2019 rows, which are the films we want to rank.
test_2019 = full_table.loc[(full_table['year'] == 2019)]

# Sanity-check the size of each split.
print('training set contains:', train.shape[0], 'movies')
print('Prediciting on:', test_2019.shape[0], 'movies')

training set contains: 154 movies
Prediciting on: 9 movies


In [5]:
# Identify predictors
# These are the columns a simulated voter may draw on: 'year', a nom_*/winner_*
# pair per precursor award (presumably nominee/winner indicators - confirm
# against the table-assembly step), and 'Nominations'.

full_predictors = ['year','nom_gg_drama', 'winner_gg_drama', 'nom_gg_comedy', 'winner_gg_comedy',
       'nom_pga', 'winner_pga', 'nom_bafta', 'winner_bafta', 'nom_dga', 'winner_dga',
        'nom_sag', 'winner_sag', 'nom_cannes', 'winner_cannes','Nominations']

# Simulating a Voter using a Decision Tree
Each 'Voter' uses a Decision Tree trained on a smaller part of the data to produce that voter's ranking of the Best Picture nominees.

In [6]:
# The model I am using to predict per voter is a Decision Tree
# NOTE(review): random_state is fixed, so (as I read the scikit-learn docs) the
# random splitter behaves the same way on every fit of identical data; the
# voter-to-voter variation then comes from the noise column and the feature
# subset that simulate_a_vote draws for each voter - confirm this is intended.
voter1 = DecisionTreeClassifier(splitter='random',
                                max_depth=3,# Low depth allows for some randomness
                                min_samples_leaf=3,
                                random_state = 92)

In [7]:
def simulate_a_vote(model, train_df, to_predict_df, full_predictors):
    """Fit `model` as one simulated voter and return that voter's ranked ballot.

    Parameters
    ----------
    model : estimator exposing ``fit(x, y)`` and ``predict_proba(x)``
        Re-fitted in place on every call.
    train_df : pandas.DataFrame
        Training rows; must contain the predictor columns and 'Oscar_win'.
    to_predict_df : pandas.DataFrame
        Rows to rank; must contain the predictor columns.
    full_predictors : list of str
        Column names the voter may draw features from.

    Returns
    -------
    numpy.ndarray
        One integer per row of ``to_predict_df``: 1 for the row with the
        highest score in the second ``predict_proba`` column, up to n for
        the lowest.

    Notes
    -----
    Randomness comes from the global ``np.random`` state. The feature count
    is drawn from 0 .. int(len(full_predictors) / 1.7) - 1, so a voter may
    end up with the noise column only, and features are sampled with
    replacement, so a column can be picked more than once.
    """
    # Work on copies so the caller's frames never gain the 'Noise' column.
    voter_train = train_df.copy()
    voter_test = to_predict_df.copy()

    # Fresh uniform noise on every call stands in for this voter's personal bias.
    voter_train.loc[:, 'Noise'] = np.random.rand(len(train_df))
    voter_test.loc[:, 'Noise'] = np.random.rand(len(to_predict_df))

    # How much of the awards season this voter pays attention to:
    # first draw how many features they use, then which ones.
    feature_cap = int(len(full_predictors) / 1.7)
    num_features = np.random.choice(feature_cap)
    sampled_features = np.random.choice(full_predictors, num_features)
    voter_features = [*sampled_features, 'Noise']

    features_matrix = voter_train[voter_features].to_numpy()
    target = voter_train['Oscar_win'].to_numpy()
    model.fit(features_matrix, target)

    # The score in the second predict_proba column is what the voter ranks by.
    win_score = model.predict_proba(voter_test[voter_features].to_numpy())[:, 1]
    # A tiny random jitter separates films that received identical scores.
    jitter = np.random.rand(len(win_score)) / 10000
    ballot = win_score + jitter

    # Turn scores into ranks: the lowest score gets n, the highest gets 1,
    # matching a ranked ballot where 1 is the first choice.
    ascending_order = np.argsort(ballot)
    ranks = np.empty_like(ascending_order)
    ranks[ascending_order] = np.arange(len(ballot), 0, -1)
    return ranks

In [26]:
# Cast a single ballot with the demo voter and show the raw ranks.
vote = simulate_a_vote(voter1, train, test_2019, full_predictors)
print("This voter's ballot looks like:",vote)

# A rank of 1 marks the favourite, so the smallest entry locates the first choice.
position = np.argmin(vote)

# Pick the right English ordinal suffix; a hard-coded "th" would print
# "1th", "2th" or "3th" whenever one of the first three films is chosen.
place = int(position) + 1
if 11 <= place % 100 <= 13:
    suffix = 'th'
else:
    suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(place % 10, 'th')
print(f"This means their first choice is the {place}{suffix} film in our df: {list(test_2019.film)[position]}")

This voter's ballot looks like: [9 3 8 4 5 7 2 6 1]
This means their first choice is the 9th film in our df: Parasite (2019 film)


# Simulating the Entire Academy
By 'casting a vote' many times, we can collect the ballots of the entire Academy.

In [9]:
def simulate_voting_body(num_voters, model, train_df, to_predict_df, full_predictors):
    """Collect one simulated ballot per voter into a single array.

    Calls simulate_a_vote ``num_voters`` times with the same model and data.
    Returns a float array of shape (num_voters, rows in ``to_predict_df``)
    whose i-th row is the i-th ballot.
    """
    film_count = to_predict_df.shape[0]
    collected_ballots = np.zeros((num_voters, film_count))
    # Each row is a view into collected_ballots, so writing through it fills the array in place.
    for ballot_row in collected_ballots:
        ballot_row[:] = simulate_a_vote(model, train_df, to_predict_df, full_predictors)
    return collected_ballots

In [31]:
# Small demo: each printed row is one voter's ballot, each column one film of test_2019.
n = 5
print(f'Here is an example of a {n}-person Academy:')
print(simulate_voting_body(n, voter1, train, test_2019, full_predictors))

Here is an example of a 5-person Academy:
[[8. 3. 5. 9. 7. 6. 1. 4. 2.]
 [7. 8. 2. 9. 6. 4. 5. 1. 3.]
 [4. 3. 7. 2. 8. 1. 9. 5. 6.]
 [6. 8. 7. 4. 9. 5. 2. 3. 1.]
 [7. 4. 3. 9. 8. 6. 2. 5. 1.]]


In [11]:
def tally_votes(voting_body, list_of_nominees):
    """Count first-place votes per nominee.

    ``voting_body`` is the ballot array (one row per voter, one column per
    nominee) and ``list_of_nominees`` names those columns in the same order.
    Returns a DataFrame indexed by nominee with a single 'Votes' column,
    sorted from most to fewest first-place votes.
    """
    # A rank of 1 marks a top pick; summing the matches down each column counts them per nominee.
    first_place_counts = (voting_body == 1).astype(int).sum(axis=0)
    standings = pd.DataFrame({'Votes': first_place_counts}, index=list_of_nominees)
    return standings.sort_values(by='Votes', ascending=False)

In [32]:
# Simulate a larger electorate and count how many voters ranked each film first.
n = 1000
this_academy = simulate_voting_body(n, voter1, train, test_2019, full_predictors)
print(f"Overall, this {n}-person academy's top picks look like this:")
# Left as the cell's last expression so the notebook renders the table.
tally_votes(this_academy, list(test_2019.film))

Overall, this 1000-person academy's top picks look like this:


Unnamed: 0,Votes
1917 (2019 film),458
Parasite (2019 film),162
Once Upon a Time in Hollywood,125
The Irishman,80
Joker (2019 film),50
Jojo Rabbit,47
Ford v Ferrari,27
Little Women (2019 film),26
Marriage Story,25


# Tiered Voting Changes
We start by eliminating the film with the fewest first-place votes from every ballot and then re-ranking the remaining films.

In [13]:
def remove_least(voting_body, list_of_nominees):
    """Eliminate the nominee with the fewest first-place votes.

    Parameters
    ----------
    voting_body : numpy.ndarray
        Ballots, one row per voter and one column per nominee; a value of 1
        marks that voter's top pick.
    list_of_nominees : list
        Nominee names in the same order as the columns of ``voting_body``.

    Returns
    -------
    (numpy.ndarray, list)
        The ballots with the eliminated column removed, and a new list of
        the remaining nominees. Neither input is modified. The remaining
        ranks are not renumbered here, so a ballot whose top pick was
        eliminated has no 1 left until it is re-ranked.
    """
    first_place_counts = np.sum(voting_body == 1, axis=0)
    # np.argmin returns the first minimum, so ties go against the earliest column.
    least_votes_index = int(np.argmin(first_place_counts))
    remaining_ballots = np.delete(voting_body, least_votes_index, axis=1)
    # Drop by position into a fresh list. list.remove() deletes the first
    # matching *value*, which picks the wrong entry when a name appears twice,
    # and it would also alter the list the caller passed in.
    remaining_nominees = [
        name for idx, name in enumerate(list_of_nominees) if idx != least_votes_index
    ]
    return remaining_ballots, remaining_nominees

In [14]:
def re_rank_ballots(voting_body):
    """Renumber every ballot so its ranks run 1..k without gaps.

    ``voting_body`` is a 2-D numpy array with one ballot per row. Within
    each row the relative order of the values is kept; only the numbers are
    compacted, so the smallest value becomes 1 and the largest becomes the
    number of columns. Returns a new float array of the same shape.
    """
    compacted = np.zeros(voting_body.shape)
    for row in range(voting_body.shape[0]):
        ballot = voting_body[row, :]
        ascending_order = np.argsort(ballot)
        # Scatter 1..k back to the original positions: the smallest entry receives 1.
        new_ranks = np.empty_like(ascending_order)
        new_ranks[ascending_order] = np.arange(1, len(ballot) + 1)
        compacted[row, :] = new_ranks
    return compacted

In [15]:
def run_one_round_of_eliminations(voting_body, list_of_nominees):
    """Run one elimination round: drop the weakest nominee, then renumber the ballots.

    Returns the re-ranked ballot array together with the remaining nominees,
    exactly as produced by remove_least followed by re_rank_ballots.
    """
    surviving_ballots, surviving_nominees = remove_least(voting_body, list_of_nominees)
    return re_rank_ballots(surviving_ballots), surviving_nominees

In [33]:
# Run a single elimination round on the 1000-voter academy. A fresh copy of the
# film names is passed in so the round works on its own list.
new_votes, new_noms = run_one_round_of_eliminations(this_academy, list(test_2019.film))

print(len(new_noms), 'films remaining')
print('\nNew Standings:')
# Left as the cell's last expression so the notebook renders the updated table.
tally_votes(new_votes, new_noms)

8 films remaining

New Standings:


Unnamed: 0,Votes
1917 (2019 film),462
Parasite (2019 film),165
Once Upon a Time in Hollywood,125
The Irishman,87
Joker (2019 film),53
Jojo Rabbit,50
Ford v Ferrari,31
Little Women (2019 film),27


## Re-Rank until one film has more than 50% of the vote
This is where the real simulation comes in.
We put together all the previous functions to simulate the result of the 2019 Best Picture voting

In [17]:
def run_preferential_voting(voting_body,list_of_nominees, show_steps = False):
    """Eliminate films round by round until one holds at least half of the first-place votes.

    Parameters
    ----------
    voting_body : numpy.ndarray
        Ballots, one row per voter and one column per nominee (1 = top pick).
    list_of_nominees : list
        Nominee names in the same order as the columns of ``voting_body``.
        The list passed in is not modified.
    show_steps : bool, default False
        When True, print the standings after every elimination round.

    Returns
    -------
    (numpy.ndarray, list)
        The ballots and nominee names remaining when the loop stops.
    """
    # Work on a private copy so the elimination rounds never alter the caller's list.
    list_of_nominees = list(list_of_nominees)

    # Tally once per round and select the 'Votes' column by label; positional
    # `[0]` on a label-indexed Series is deprecated in recent pandas releases.
    standings = tally_votes(voting_body, list_of_nominees)
    top_pick_percent = standings['Votes'].max() / standings['Votes'].sum()

    # NOTE(review): the loop stops at a share of exactly 50%, while the
    # surrounding text says "more than 50%" - confirm which rule is intended.
    while top_pick_percent < 0.5:
        voting_body, list_of_nominees = run_one_round_of_eliminations(voting_body, list_of_nominees)
        standings = tally_votes(voting_body, list_of_nominees)
        top_pick_percent = standings['Votes'].max() / standings['Votes'].sum()
        if show_steps:
            print(standings,'\n')
    return voting_body, list_of_nominees

# Let's Simulate the Oscars!

In [34]:
# NOTE(review): this cell repeats the split, predictor list and model settings
# from the earlier setup cells so that it can be read on its own; keep the two
# copies in sync if either changes.
min_year = 1995

# Training Set - Excluding 2019
train = full_table.loc[((full_table['year'] < 2019) & (full_table['year'] > min_year))]
test_2019 = full_table.loc[(full_table['year'] == 2019)]

print('training set contains:', train.shape[0], 'movies')
print('Prediciting on:', test_2019.shape[0], 'movies')

# Identify features to predict on
full_predictors = ['year','nom_gg_drama', 'winner_gg_drama', 'nom_gg_comedy', 'winner_gg_comedy',
       'nom_pga', 'winner_pga', 'nom_bafta', 'winner_bafta', 'nom_dga', 'winner_dga',
        'nom_sag', 'winner_sag', 'nom_cannes', 'winner_cannes','Nominations']

# Pick the model we want for each random voter
voter_model = DecisionTreeClassifier(splitter='random',
                                max_depth=3,
                                min_samples_leaf=3,
                                random_state = 92)

# Size of the simulated electorate; every voter is one simulate_a_vote call.
# NOTE(review): no np.random.seed call is visible in this notebook, so the
# tallies below will differ from run to run - consider seeding for reproducibility.
num_voters_academy = 7000
print(f'\nSimulating an Academy with {num_voters_academy} random voters.....')
academy_sim = simulate_voting_body(num_voters=num_voters_academy, model = voter_model, train_df = train, to_predict_df = test_2019, full_predictors=full_predictors)

# First-place counts before any film has been eliminated.
print('\nInitial Rankings:\n----------------------------------------')
print(tally_votes(academy_sim, list(test_2019.film)),'\n')
print("Now we start eliminating films untill there one has more than 50% of the top picks:\n----------------------------------------")
# show_steps=True prints the standings after every elimination round.
final_ballot, final_films = run_preferential_voting(academy_sim, list(test_2019.film),True)

training set contains: 154 movies
Prediciting on: 9 movies

Simulating an Academy with 7000 random voters.....

Initial Rankings:
                               Votes
1917 (2019 film)                3053
Parasite (2019 film)            1334
Once Upon a Time in Hollywood    735
The Irishman                     636
Jojo Rabbit                      346
Joker (2019 film)                344
Marriage Story                   221
Little Women (2019 film)         188
Ford v Ferrari                   143 

Now we start eliminating films untill there one has more than 50% of the top picks
                               Votes
1917 (2019 film)                3076
Parasite (2019 film)            1351
Once Upon a Time in Hollywood    748
The Irishman                     662
Joker (2019 film)                357
Jojo Rabbit                      353
Marriage Story                   241
Little Women (2019 film)         212 

                               Votes
1917 (2019 film)                3112
Parasi

# And the Oscar goes to...

In [42]:
# tally_votes returns the standings sorted best-first, so the first index label
# is the winner; anything from the first '(' onward (e.g. "(2019 film)") is trimmed.
bp_winner = tally_votes(final_ballot, final_films).index[0].split('(')[0].strip()
print(f'And the Oscar goes to...\n🎉🏆{bp_winner}🏆🎉')

And the Oscar goes to...
🎉🏆1917🏆🎉
