In [1]:
import pandas as pd
from pybaseball.lahman import *
download_lahman()
from itertools import combinations
import time
import random

# --- Load inputs -------------------------------------------------------------
# careerteams.csv supplies the 'code' and 'Tm' columns used below;
# codes.csv supplies the 'bwar' value that is joined on 'code'.
df = pd.read_csv('careerteams.csv')
wardf = pd.read_csv('codes.csv')

# NOTE: this rebinds `people` from the loader function pulled in by the
# star-import to the DataFrame it returns. It only works on re-run because the
# import at the top of this cell restores the function first.
people = people()
people['Name'] = people['nameFirst'] + " " + people['nameLast']

# Attach bwar and a display name to every row (left joins keep all rows of df).
df = df.merge(wardf[['code', 'bwar']], on='code', how='left')
df = df.merge(people[['bbrefID', 'Name']], left_on='code', right_on='bbrefID', how='left')

# --- Normalize team codes ----------------------------------------------------
# Collapse alternate team codes onto one code each so they count as the same
# team below. (These look like historical / relocated franchise abbreviations
# -- confirm against the data source.) No value on the right-hand side also
# appears as a key, so a single replace() call is equivalent to applying the
# substitutions one after another.
team_code_aliases = {
    'SEP': 'MIL',
    'BSN': 'ATL',
    'MLN': 'ATL',
    'WSA': 'TEX',
    'ANA': 'LAA',
    'CAL': 'LAA',
    'KCA': 'OAK',
    'PHA': 'OAK',
    'MON': 'WSN',
    'FLA': 'MIA',
    'BRO': 'LAD',
    'SLB': 'BAL',
    'TBD': 'TBR',
    'NYG': 'SFG',
    'WSH': 'MIN',
}
df['Tm'] = df['Tm'].replace(team_code_aliases)

# The join key from `people` is redundant with 'code' once the merge is done.
df = df.drop('bbrefID', axis=1)

# --- Drop single-team players --------------------------------------------------
# A player with only one distinct team cannot link any pair of teams.
tmp = df.groupby('code').Tm.nunique().reset_index()
tmp = tmp[tmp.Tm == 1]
ridlist = list(tmp.code)
df = df[~df.code.isin(ridlist)]

# --- Keep one representative per team profile ----------------------------------
# Map each player to a canonical string of the teams they played for:
# the unique team codes, sorted and comma-joined.
players_to_teams = {
    player: ','.join(sorted(teams))
    for player, teams in df.groupby('code')['Tm'].unique().items()
}

# Map the team-profile strings back onto the DataFrame
df['teams_played_for'] = df['code'].map(players_to_teams)

# Sort by bwar, highest first, so the first row of each group is its best player
df = df.sort_values('bwar', ascending=False)

# Keep the player with the highest bwar for every team profile
keeplist = list(df.groupby('teams_played_for').first()['code'])

df = df[df.code.isin(keeplist)]

In [10]:
def find_minimal_players(df, iterations=10, verbose=True, seed=None):
    """Search for a small set of players that jointly cover every team pair.

    A pair of teams counts as covered by a player when both teams appear in
    that player's rows of ``df``. Each iteration shuffles the player order
    and then runs a greedy set-cover pass: it repeatedly picks the player who
    covers the most still-uncovered pairs, with ties going to whoever comes
    first in the shuffled order.

    An iteration is abandoned as soon as it can no longer produce a solution
    strictly smaller than the best one found so far, so ``results`` only ever
    gains an entry when an iteration improves on the current best.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'code' column (player identifier) and a 'Tm' column
        (team identifier).
    iterations : int, default 10
        Number of randomized greedy passes to run.
    verbose : bool, default True
        When True, print the number of uncovered pairs before each pick, the
        picked player with the number of pairs gained, and each new best size.
    seed : optional
        When given, shuffling uses a private ``random.Random(seed)`` so the
        run is reproducible. When None, the module-level ``random`` state is
        used.

    Returns
    -------
    dict
        Maps solution size (number of players) to a list of player codes of
        that size, in the order they were picked. The smallest key is the
        best solution found. Empty when ``iterations`` is 0.
    """
    rng = random.Random(seed) if seed is not None else random

    # Each player's set of (sorted) team pairs does not change between
    # iterations or greedy steps, so compute it once up front.
    player_pairs = {
        player: {tuple(sorted(pair)) for pair in combinations(teams, 2)}
        for player, teams in df.groupby('code')['Tm'].unique().items()
    }
    all_pairs = set().union(*player_pairs.values())

    results = {}
    curr_best = float('inf')  # Initialized to infinity for easy comparison

    for _ in range(iterations):
        # A fresh random order changes how ties between players are broken.
        candidates = list(player_pairs)
        rng.shuffle(candidates)

        uncovered = set(all_pairs)
        chosen = []
        abandoned = False

        while uncovered:
            # At least one more player is needed; if that already matches or
            # exceeds the best known size this pass cannot improve on it.
            if len(chosen) + 1 >= curr_best:
                abandoned = True
                break

            if verbose:
                print(len(uncovered))

            # Pick the candidate covering the most uncovered pairs. Every
            # uncovered pair belongs to some remaining candidate, so a pick
            # with a non-empty gain always exists.
            best_player = None
            best_gain = set()
            for player in candidates:
                gain = player_pairs[player] & uncovered
                if len(gain) > len(best_gain):
                    best_player = player
                    best_gain = gain

            chosen.append(best_player)
            uncovered -= best_gain
            candidates.remove(best_player)

            if verbose:
                print(f"{best_player}\t{len(best_gain)}")

        if abandoned:
            continue

        # Store the result keyed by its size
        results[len(chosen)] = chosen
        if len(chosen) < curr_best:
            curr_best = len(chosen)
            if verbose:
                print("Current best:", curr_best)

    return results


In [11]:
# Time a single randomized greedy pass. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() follows the
# wall clock and can jump if the system time is adjusted mid-run.
start_time = time.perf_counter()
results = find_minimal_players(df, iterations=1)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)



435
jacksed01	91
344
hillri01	63
281
villoro01	54
227
stairma01	40
187
doteloc01	33
154
mulhote01	28
126
claytro01	25
101
chenbr01	19
82
hawkila01	17
65
guilljo01	15
50
cookde01	11
39
sierrru01	10
29
whiteri01	6
23
bakopa01	5
18
klinero01	5
13
cruzne02	4
9
blairwi01	4
5
mahayro01	3
2
mitchke01	2
Current best: 19
Elapsed time:  0.6695067882537842


In [12]:
# results is keyed by solution size, so the smallest key is the best solution.
min_key = min(results)
print(min_key)

19


In [13]:
# Look up display names for the players in the smallest solution found.
selected_mask = df['code'].isin(results[min_key])
filtered_df = df.loc[selected_mask]
# df can hold several rows per player, so collapse repeated names.
player_names = filtered_df['Name'].unique().tolist()
print(player_names)

['Nelson Cruz', 'Kevin Mitchell', 'Royce Clayton', 'LaTroy Hawkins', 'Rich Hill', 'Ruben Sierra', 'Ron Kline', 'Octavio Dotel', 'Matt Stairs', 'Terry Mulholland', 'Dennis Cook', 'Bruce Chen', 'Edwin Jackson', 'Ron Mahay', 'Jose Guillen', 'Ron Villone', 'Willie Blair', 'Rick White', 'Paul Bako']
