In [26]:
import json
import pandas as pd

# Load the roster JSON. Each top-level key is a team name whose value carries
# 'team_id', 'colors' and a 'players' mapping of player name -> player info.
file_path = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_with_ids_and_colors_and_race_and_team_ids.json'
# Explicit UTF-8 so decoding does not depend on the machine's locale.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the nested team -> players structure into one record per player.
team_data = []

for team, team_info in data.items():
    team_id = team_info.get('team_id')
    # `.get` returns None for a missing key; fall back to empty containers so a
    # team without colors or players contributes no rows instead of raising.
    colors = team_info.get('colors') or []
    players = team_info.get('players') or {}

    for player_name, player_info in players.items():
        team_data.append({
            'team_id': team_id,
            'team_name': team,
            'player_id': player_info.get('player_id'),
            'player_name': player_name,
            # Fresh list per row. The previous comma join/split round-trip
            # turned an empty color list into [''] and would have split any
            # color string that itself contained a comma.
            'team_colors': list(colors),
            'position': player_info.get('position'),
            'jersey_number': player_info.get('number'),
            'race': player_info.get('race')
        })

# Convert to DataFrame (one row per player, columns in the key order above).
df = pd.DataFrame(team_data)

# Save to CSV. NOTE: 'team_colors' holds Python lists, so each cell is written
# as the list's string form and must be parsed again by whoever reads the CSV.
csv_file_path = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_df.csv'
df.to_csv(csv_file_path, index=False)

In [27]:
# Display the flattened roster DataFrame built in the previous cell.
df

# What features will we extract for each player?
    # ['jersey_number', 'race', 'team_colors', 'position']
    # {'jersey_number', 'race', 'team' (based on jersey colors), 'player_position'}
# How do we encode each feature?
    # 1. team_name: one-hot encoding


Unnamed: 0,team_id,team_name,player_id,player_name,team_colors,position,jersey_number,race
0,1610612738,boston_celtics,202340,"Bradley, Avery","[#007A33, #BA9653, #963821, #000000]",PG,0,black
1,1610612738,boston_celtics,203109,"Crowder, Jae","[#007A33, #BA9653, #963821, #000000]",SF,99,black
2,1610612738,boston_celtics,203089,"Holland, John","[#007A33, #BA9653, #963821, #000000]",SG,30,black
3,1610612738,boston_celtics,1626154,"Hunter, R. J.","[#007A33, #BA9653, #963821, #000000]",SG,28,black
4,1610612738,boston_celtics,201973,"Jerebko, Jonas","[#007A33, #BA9653, #963821, #000000]",PF,8,white
...,...,...,...,...,...,...,...,...
447,1610612759,san_antonio_spurs,1889,"Miller, Andre","[#C0C0C0, #000000]",PG,24,black
448,1610612759,san_antonio_spurs,201988,"Mills, Patty","[#C0C0C0, #000000]",PG,8,black
449,1610612759,san_antonio_spurs,2225,"Parker, Tony","[#C0C0C0, #000000]",PG,9,white
450,1610612759,san_antonio_spurs,203613,"Simmons, Jonathon","[#C0C0C0, #000000]",SG,17,black


In [17]:
import json
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

# Load the roster JSON. Each top-level key is a team name whose value carries
# 'colors', 'team_id' and a 'players' mapping of player name -> player info.
fp = '/playpen-storage/levlevi/player-re-id/src/data/team_rosters_with_ids_and_colors_and_race_and_team_ids.json'
# Explicit UTF-8 so decoding does not depend on the machine's locale.
with open(fp, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract player information
players = []
team_player_mapping = {}  # (team_id, player_id) -> raw (un-encoded) player entry
player_row_index = {}     # (team_id, player_id) -> row of that player in `players`

for team_name, team_info in data.items():
    team_colors = team_info['colors']
    team_id = team_info['team_id']
    for player_name, player_info in team_info['players'].items():
        player_entry = {
            "number": player_info['number'],
            # A position such as "A/B" is split on '/' into several labels.
            "positions": player_info['position'].split('/'),
            "race": player_info['race'],
            "colors": team_colors
        }
        key = (team_id, player_info['player_id'])
        # Remember the row now so the final mapping needs no list search later.
        player_row_index[key] = len(players)
        players.append(player_entry)
        team_player_mapping[key] = player_entry

# Prepare data for encoding (parallel lists, one element per row of `players`)
numbers = [int(player['number']) for player in players]
positions = [player['positions'] for player in players]
races = [player['race'] for player in players]
colors = [player['colors'] for player in players]

# Multi-hot encode positions (a player may have several)
mlb_positions = MultiLabelBinarizer()
positions_encoded = mlb_positions.fit_transform(positions)

# One-hot encode races
encoder_race = OneHotEncoder(sparse_output=False)
races_encoded = encoder_race.fit_transform(np.array(races).reshape(-1, 1))

# One-hot encode colors: fit on every color seen across all players, then sum
# each player's per-color one-hot rows into a single per-color count vector.
flat_colors = [color for sublist in colors for color in sublist]
encoder_colors = OneHotEncoder(sparse_output=False)
encoder_colors.fit(np.array(flat_colors).reshape(-1, 1))

colors_encoded = np.array([
    encoder_colors.transform(np.array(color_list).reshape(-1, 1)).sum(axis=0)
    for color_list in colors
])

# Combine all features into a single feature vector per player, laid out as
# [jersey number | positions | race | colors].
feature_vectors = np.hstack((
    np.array(numbers).reshape(-1, 1),
    positions_encoded,
    races_encoded,
    colors_encoded,
))

# Map "<team_id>_<player_id>" to that player's feature vector. The recorded row
# index replaces a per-player `players.index(...)` scan, which was quadratic
# overall and matched entries by dict equality rather than by identity.
encoded_team_player_mapping = {
    f"{team_id}_{player_id}": feature_vectors[row].tolist()
    for (team_id, player_id), row in player_row_index.items()
}

# Save the result to a JSON file (relative to the current working directory)
output_fp = 'team_player_feature_vectors_with_ids.json'
with open(output_fp, 'w') as outfile:
    json.dump(encoded_team_player_mapping, outfile, indent=4)

print(f"Feature vectors saved to {output_fp}")

Feature vectors saved to team_player_feature_vectors_with_ids.json
