In [93]:
import pandas as pd
import pickle

# Load season 47 data - the finale airs Wednesday, December 12, 2024 and is the season I'm trying to predict the winner!
# NOTE(review): December 12, 2024 falls on a Thursday, not a Wednesday -- confirm the finale date.
# NOTE(review): this is an absolute, user-specific path; the cell only runs on a machine
# that has the CSV at exactly this location.
file_path_season_47 = '/Users/ginagrainda/Desktop/survivor_data/survivor_season_47.csv'
# Read the CSV into a DataFrame that the following cells extend and slice.
survivor_47_data = pd.read_csv(file_path_season_47)

In [94]:
# Interaction features: every new column is the element-wise product of two
# columns that already exist in survivor_47_data.
interaction_pairs = {
    'gender_man_and_region_West': ('gender_M', 'region_West'),
    'gender_woman_and_age_26_35': ('gender_F', 'age_band_26-35'),
    'poc_and_age_26_35': ('poc', 'age_band_26-35'),
    'gender_woman_and_new_era_season': ('gender_F', 'new_era_season'),
    'tribe_blue_and_capricorn': ('tribe_color_Blue', 'astrological_sign_Capricorn'),
}
for new_column, (left_column, right_column) in interaction_pairs.items():
    survivor_47_data[new_column] = survivor_47_data[left_column] * survivor_47_data[right_column]

# Columns handed to the model: the base columns first, then the interaction
# columns in the order they were created above.
base_features = [
    'gender_M',
    'industry_Law, Public Safety, Military',
    'industry_Business, Finance, Administrative',
    'industry_Arts, Media, Entertainment',
    'industry_Advertising, Marketing, Sales',
    'region_Northeast',
    'region_West',
    'age_band_26-35',
    'tribe_color_Blue',
    'tribe_color_Yellow',
]
features = base_features + list(interaction_pairs)

# Keep only the model inputs in a separate frame.
processed_47_data = survivor_47_data[features]

In [95]:
# Load the pre-trained model
# The model is deserialized from a local pickle file and bound to `model` for the
# prediction cells below.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load files
# that you created yourself or otherwise trust.
# NOTE(review): absolute, user-specific path; not portable to another machine.
model_path = '/Users/ginagrainda/Desktop/survivor_data/survivor_winner_model.pkl'
with open(model_path, 'rb') as file:
    model = pickle.load(file)

In [111]:
# Make predictions for every Season 47 contestant
predictions = model.predict(processed_47_data)

# If the model supports predict_proba, rank contestants by the probability in column 1.
# (assumes column 1 is the "winner" class, i.e. binary labels ordered [0, 1] --
# TODO confirm against model.classes_)
if hasattr(model, 'predict_proba'):
    prediction_probs = model.predict_proba(processed_47_data)
    winner_scores = prediction_probs[:, 1]
else:
    # No probabilities available: rank by the predicted classes instead, so the
    # report below still runs rather than failing on an undefined prediction_probs.
    prediction_probs = None
    winner_scores = predictions
winner_index = winner_scores.argmax()  # Row position of the highest-scoring contestant

# processed_47_data is a column subset of survivor_47_data, so row positions line up
# and the winner's full record (including the name) can be read from survivor_47_data.
predicted_winner = survivor_47_data.iloc[winner_index]

print('Winner probabilities are as follows for all contestants in Season 47:')
for i, (name, prob) in enumerate(zip(survivor_47_data['contestant_name'], winner_scores)):
    print(f"{i}: {name} - Win Probability: {prob:.2f}")

Winner probabilities are as follows for all contestants in Season 47:
0: Jon Lovett - Win Probability: 0.02
1: TK Foster - Win Probability: 0.09
2: Aysha Welch - Win Probability: 0.15
3: Kishan Patel - Win Probability: 0.05
4: Anika Dhar - Win Probability: 0.01
5: Rome Cooney - Win Probability: 0.05
6: Tiyana Hallums - Win Probability: 0.32
7: Sierra Wright - Win Probability: 0.01
8: Sol Yi - Win Probability: 0.00
9: Gabe Ortis - Win Probability: 0.00
10: Kyle Ostwald - Win Probability: 0.06
11: Caroline Vidmar - Win Probability: 0.11
12: Andy Rueda - Win Probability: 0.06
13: Genevieve Mushaluk - Win Probability: 0.74
14: Rachel LaMont - Win Probability: 0.61
15: Sam Phalen - Win Probability: 0.01
16: Sue Smey - Win Probability: 0.03
17: Teeny Chirichillo - Win Probability: 0.00


In [110]:
# Print the winner's name
# `predicted_winner` is the row selected by the prediction cell; .get() falls back to
# 'Unknown' when that row has no 'contestant_name' entry.
winner_name = predicted_winner.get('contestant_name', 'Unknown')  
print(f"The predicted winner of Season 47 is {winner_name}.")

The predicted winner of Season 47 is Genevieve Mushaluk.


In [97]:
# Genevieve Mushaluk was voted out in Part I of the season finale. The model still seems to be effective, though, as she played a great game! Rachel LaMont has the next-highest winner probability, which makes her a viable pick.
# In an effort to predict the actual winner, I want to test the model on just the remaining 4 contestants: Rachel LaMont, Sue Smey, Teeny Chirichillo, and Sam Phalen.

# Load season 47 data and select ONLY the 4 remaining contestants
file_path_season_47 = '/Users/ginagrainda/Desktop/survivor_data/survivor_season_47.csv'
survivor_47_data = pd.read_csv(file_path_season_47)
final_4_names = ['Rachel LaMont', 'Sue Smey', 'Teeny Chirichillo', 'Sam Phalen']

# Filter the frame that was just loaded (the name `season_47_data` is never defined,
# so referencing it fails on a fresh kernel).
final_4_data = survivor_47_data[survivor_47_data['contestant_name'].isin(final_4_names)]

# Fail loudly if a name is misspelled or missing, instead of silently predicting
# on fewer than four contestants.
missing_names = set(final_4_names) - set(final_4_data['contestant_name'])
assert not missing_names, f"Contestants not found in season 47 data: {sorted(missing_names)}"

In [98]:
# Show the filtered rows as plain text to check which contestants were selected.
print(final_4_data)

      contestant_name  age           hometown           profession  \
14      Rachel LaMont   34     Southfield, MI     Graphic Designer   
15         Sam Phalen   24      Nashville, TN      Sports Reporter   
16           Sue Smey   58  Putnam Valley, NY  Flight School Owner   
17  Teeny Chirichillo   23     Manahawkin, NJ     Freelance Writer   

    num_season  old_school_season  new_era_season  finish  winner  black  ...  \
14          47                  0               1     NaN     NaN      0  ...   
15          47                  0               1     NaN     NaN      0  ...   
16          47                  0               1     NaN     NaN      0  ...   
17          47                  0               1     NaN     NaN      0  ...   

       region  astrological_sign  tribe_color  age_band  age_band_18-25  \
14    Midwest             Pisces       Yellow     26-35               0   
15      South              Aries       Yellow     18-25               1   
16  Northeast     

In [99]:
# Work on an independent copy so the column assignments below do not trigger
# SettingWithCopyWarning on the filtered slice.
final_4_data = final_4_data.copy()

# Interaction features for the final 4: every new column is the element-wise
# product of two columns that already exist in final_4_data.
interaction_pairs = {
    'gender_man_and_region_West': ('gender_M', 'region_West'),
    'gender_woman_and_age_26_35': ('gender_F', 'age_band_26-35'),
    'poc_and_age_26_35': ('poc', 'age_band_26-35'),
    'gender_woman_and_new_era_season': ('gender_F', 'new_era_season'),
    'tribe_blue_and_capricorn': ('tribe_color_Blue', 'astrological_sign_Capricorn'),
}
for new_column, (left_column, right_column) in interaction_pairs.items():
    final_4_data.loc[:, new_column] = final_4_data[left_column] * final_4_data[right_column]

# Columns handed to the model: the base columns first, then the interaction
# columns in the order they were created above.
base_features = [
    'gender_M',
    'industry_Law, Public Safety, Military',
    'industry_Business, Finance, Administrative',
    'industry_Arts, Media, Entertainment',
    'industry_Advertising, Marketing, Sales',
    'region_Northeast',
    'region_West',
    'age_band_26-35',
    'tribe_color_Blue',
    'tribe_color_Yellow',
]
features = base_features + list(interaction_pairs)

# Keep only the model inputs in a separate frame.
processed_final_4_data = final_4_data[features]

In [112]:
# Make predictions for the final 4 contestants only
predictions = model.predict(processed_final_4_data)

# If the model supports predict_proba, rank contestants by the probability in column 1.
# (assumes column 1 is the "winner" class, i.e. binary labels ordered [0, 1] --
# TODO confirm against model.classes_)
if hasattr(model, 'predict_proba'):
    prediction_probs = model.predict_proba(processed_final_4_data)
    winner_scores = prediction_probs[:, 1]
else:
    # No probabilities available: rank by the predicted classes instead, so the
    # report below still runs rather than failing on an undefined (or stale)
    # prediction_probs.
    prediction_probs = None
    winner_scores = predictions
winner_index = winner_scores.argmax()  # Row position of the highest-scoring contestant

# processed_final_4_data is a column subset of final_4_data, so the positional
# index can be used to pull the winner's full record.
predicted_winner = final_4_data.iloc[winner_index]

print('Winner probabilities are as follows for the final 4 in Season 47:')
for i, (name, prob) in enumerate(zip(final_4_data['contestant_name'], winner_scores)):
    print(f"{i}: {name} - Win Probability: {prob:.2f}")

Winner probabilities are as follows for the final 4 in Season 47:
0: Rachel LaMont - Win Probability: 0.61
1: Sam Phalen - Win Probability: 0.01
2: Sue Smey - Win Probability: 0.03
3: Teeny Chirichillo - Win Probability: 0.00


In [113]:
# Print the winner's name or identifier
winner_name = predicted_winner.get('contestant_name', 'Unknown')  
print(f"The predicted winner of Season 47 is {winner_name}.")

The predicted winner of Season 47 is Rachel LaMont.
