In [7]:
# The code assumes that you have already constructed your initial player list

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the pre-built player list (one row per player) from the working directory.
player_list = pd.read_csv('./playerlist.csv')

In [4]:
# guess is a single row from the player_list dataframe that contains all
# information on a player. answer is the same format. 
# returns an updated dictionary new_info based on the information gained from guessing guess
def find_info(guess, current_information, answer):
    """Return the feedback that guessing ``guess`` yields when ``answer`` is the target.

    Parameters
    ----------
    guess, answer : one-row DataFrames holding a player's attributes.
    current_information : dict keyed by column name; only its keys are used,
        and it is left unmodified (a copy is updated and returned).

    Returns
    -------
    dict with the same keys, where each value is
        1 if the guess and the answer agree on that column, 0 if they do not,
        with two refinements:
        * 'League' and 'Div' are both 2 when exactly one of the two agrees
          (the real game does not reveal which one);
        * 'Age' is 2 when the ages differ by at most two years.
    """
    feedback = current_information.copy()
    feedback.update({
        column: int(guess[column].iloc[0] == answer[column].iloc[0])
        for column in current_information
    })

    # Exactly one of League/Div matched: collapse both to the ambiguous code 2.
    if feedback['League'] != feedback['Div']:
        feedback['League'] = feedback['Div'] = 2

    # A wrong age that is still within two years is reported as 2.
    if feedback['Age'] == 0:
        guess_age = guess['Age'].iloc[0]
        answer_age = answer['Age'].iloc[0]
        if answer_age - 2 <= guess_age <= answer_age + 2:
            feedback['Age'] = 2
    return feedback

# given the current remaining possible players stored in df, the current known information info
# relative to the most recent guess, and the values of the most recent guess, filter the remaining
# players to only leave those that align with the current information
def find_remaining_players(df, info, guess):
    """Keep only the players in ``df`` that are consistent with the feedback ``info``.

    Parameters
    ----------
    df : DataFrame of players that are still possible answers.
    info : dict of feedback codes for the most recent guess (0/1 for the
        binary columns, 0/1/2 for 'Age' and 'League').
    guess : one-row DataFrame describing the most recent guess.

    Returns
    -------
    A filtered DataFrame; ``df`` itself is not modified.

    NOTE(review): this reads the module-level name ``binary_columns``, which is
    not defined in any of the visible cells -- confirm it is defined before
    this function is called on a fresh kernel.
    """
    survivors = df.copy()
    for column in binary_columns:
        survivors = binary_event(survivors, column, info[column], guess)

    # --- Age -------------------------------------------------------------
    guess_age = guess['Age'].iloc[0]
    ages = survivors['Age']
    age_code = info['Age']
    if age_code == 0:
        # 0 rules out the exact age AND every age within two years of the guess
        age_mask = (ages - 2 > guess_age) | (ages + 2 < guess_age)
    elif age_code == 1:
        age_mask = ages == guess_age
    else:
        # 2: within two years of the guess, but not the same age
        age_mask = (ages - 2 <= guess_age) & (ages + 2 >= guess_age) & (ages != guess_age)
    survivors = survivors[age_mask]

    # --- League / Div ----------------------------------------------------
    guess_league = guess['League'].iloc[0]
    guess_div = guess['Div'].iloc[0]
    league_code = info['League']
    if league_code == 0:
        # neither League nor Div is shared with the guess
        division_mask = (survivors['League'] != guess_league) & (survivors['Div'] != guess_div)
    elif league_code == 1:
        # both are shared
        division_mask = (survivors['League'] == guess_league) & (survivors['Div'] == guess_div)
    else:
        # exactly one is shared, and the player is not told which: exclusive or
        division_mask = (survivors['League'] == guess_league) ^ (survivors['Div'] == guess_div)
    return survivors[division_mask]

# given an initial set of remaining players old_df, the column col of interest, 
# the event of interest, and the current guess find the remaining players
# after filtering for the event under the current information
# Only for use with events that can take on only 0 or 1 as their value
def binary_event(old_df, col, event, guess):
    """Filter ``old_df`` on whether column ``col`` equals the guess's value.

    Parameters
    ----------
    old_df : DataFrame of candidate players.
    col : name of the column to compare.
    event : truthy keeps the rows equal to the guess's value, falsy keeps the
        rows that differ from it.
    guess : one-row DataFrame; ``guess[col].iloc[0]`` is the reference value.

    Returns
    -------
    The filtered DataFrame.
    """
    reference = guess[col].iloc[0]
    keep = (old_df[col] == reference) if event else (old_df[col] != reference)
    return old_df[keep]


# for the given guess, a dataframe for one player with all of their relevant information,
# compute the expected number of remaining players that could possibly be the answer
# after making this guess given the current remaining players in the dataframe current_player_list
# Note: each player in current_player_list is assumed to have equal probability of being the answer
def find_xp(current_player_list, guess):
    """Expected number of players still possible after guessing ``guess``.

    Every player in ``current_player_list`` is assumed equally likely to be the
    answer. Each candidate answer produces one feedback signature
    (Team, Bats, Throws, Born, Position: match / no match; Age: 0 / 1 / 2;
    League+Div: 0 / 1 / 2). If ``n`` candidates share a signature, that
    signature occurs with probability ``n / N`` and leaves ``n`` players, so

        xp = sum(n * n / N  over all observed signatures)

    where ``N`` is ``len(current_player_list)``. Grouping the candidates by
    signature visits each observed outcome exactly once, which is what the
    chain-rule filtering over every event combination computes, without the
    deep loop nesting.

    Parameters
    ----------
    current_player_list : DataFrame of the players that are still possible.
    guess : one-row DataFrame describing the player being evaluated.

    Returns
    -------
    The expected count as a float, or 0 when no outcome leaves any player.

    Notes
    -----
    The "both League and Div match" outcome is computed from the 'League' and
    'Div' columns, the same way find_remaining_players does, instead of relying
    on a separate 'Division' column.
    """
    initial_remaining = len(current_player_list)

    # If the Name matches, the game is over and nobody remains, so that outcome
    # contributes nothing to the expectation: drop those rows up front.
    candidates = current_player_list[current_player_list['Name'] != guess['Name'].iloc[0]]
    if candidates.empty:
        return 0

    # Plain match / no-match columns.
    feedback = pd.DataFrame({
        col: (candidates[col] == guess[col].iloc[0]).to_numpy()
        for col in ('Team', 'Bats', 'Throws', 'Born', 'Position')
    })

    # Age: 1 = same age, 2 = within two years but different, 0 = further away.
    guess_age = guess['Age'].iloc[0]
    ages = candidates['Age']
    same_age = (ages == guess_age).to_numpy()
    near_age = ((ages - 2 <= guess_age) & (ages + 2 >= guess_age) & (ages != guess_age)).to_numpy()
    far_age = ((ages - 2 > guess_age) | (ages + 2 < guess_age)).to_numpy()
    feedback['Age'] = np.select([same_age, near_age, far_age], [1, 2, 0], default=-1)

    # League/Div: 1 = both shared, 2 = exactly one shared, 0 = neither shared.
    same_league = (candidates['League'] == guess['League'].iloc[0]).to_numpy()
    same_div = (candidates['Div'] == guess['Div'].iloc[0]).to_numpy()
    feedback['League'] = np.where(same_league & same_div, 1,
                                  np.where(same_league ^ same_div, 2, 0))

    # Rows whose age satisfies none of the three comparisons (e.g. a missing
    # age) match no outcome and are therefore never counted.
    feedback = feedback[feedback['Age'] >= 0]
    if feedback.empty:
        return 0

    group_sizes = feedback.groupby(list(feedback.columns), sort=False).size()
    return float((group_sizes ** 2).sum() / initial_remaining)

# for every player in the player_list dataframe, compute the expected number of remaining players that could be the answer
# if that player were guessed
# return an updated copy of player_list with the new information
def update_player_list_withxps(player_list):
    """Return a copy of ``player_list`` with an added 'xp' column.

    For each row, 'xp' is ``find_xp(player_list, row)``: the expected number of
    players that would remain possible if that player were guessed against the
    full ``player_list``. The input DataFrame is not modified.
    """
    # Each guess is passed as a one-row DataFrame slice (not a Series), which is
    # the form find_xp expects. An explicit loop is kept instead of df.apply()
    # so a failure inside find_xp is easy to trace to a row.
    expected_remaining = np.array(
        [find_xp(player_list, player_list.iloc[row:row + 1]) for row in range(len(player_list))],
        dtype=float,
    )
    return player_list.assign(xp=expected_remaining)

In [None]:
# Attach the xp value to every player, then write both the original-order table
# and the xp-sorted table (each indexed by Name) so they can be committed to the
# repository.
updated_player_list = update_player_list_withxps(player_list)
updated_player_list.set_index('Name').to_csv('playerlist_withxp.csv')
updated_player_list_xpsorted = updated_player_list.sort_values(by='xp')
updated_player_list_xpsorted.set_index('Name').to_csv('playerlist_withxp_sorted.csv')

In [5]:
# for use in the luck score computation
# for a given guess, find the number of players that would remain after guessing that player for each
# possible remaining answer in remaining_players
def possible_remaining_finder(remaining_players, guess):
    """For each possible answer, count the players left after guessing ``guess``.

    Parameters
    ----------
    remaining_players : DataFrame of players that could still be the answer.
    guess : one-row DataFrame describing the guessed player.

    Returns
    -------
    numpy array with one entry per row of ``remaining_players``: the number of
    players consistent with the feedback that row would produce as the answer.
    The entry is 0 when the only consistent player is the guess itself.
    """
    # find_info works on a copy of this template, so one instance serves every row.
    blank_information = {'Name': 0, 'Team': 0, 'Bats': 0, 'Throws': 0, 'Born': 0, 'Age': 0, 'Position': 0,
                         'League': 0, 'Div': 0}
    counts = []
    for row in range(len(remaining_players)):
        candidate_answer = remaining_players.iloc[row:row + 1]
        feedback = find_info(guess, blank_information, candidate_answer)
        consistent = find_remaining_players(remaining_players, feedback, guess)

        left = len(consistent)
        # a lone survivor that IS the guess means the game is solved: nothing remains
        if left == 1 and guess.Name.iloc[0] == consistent.Name.iloc[0]:
            left = 0
        counts.append(left)
    return np.array(counts)

In [None]:
# Compute, for each player used as the first guess, the possible numbers of
# remaining players (one per candidate answer) and attach that array to the
# player. The Name -> array table is written out so it can be committed to the
# repository.
# NOTE(review): each cell holds a NumPy array with one entry per player; confirm
# that the CSV text is not abbreviated by NumPy's print threshold for long arrays.
player_list_with_possible_remaining = player_list.assign(
    possible_num_remaining_players=player_list.apply(
        lambda row: possible_remaining_finder(player_list.copy(), row.to_frame().T),
        axis=1,
    )
)
player_list_possible_remaining = player_list_with_possible_remaining[
    ['Name', 'possible_num_remaining_players']
].copy()
player_list_possible_remaining.set_index('Name').to_csv('initial_possible_num_remaining.csv')