In [2]:

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
import os
import pandas as pd
import re 

from utility.constants import *

# Utility: Load files
def load_file(folder, filename):
    """Read ``filename`` inside ``folder`` as CSV and return the resulting DataFrame.

    Raises:
        FileNotFoundError: If the joined path does not exist.
    """
    csv_path = os.path.join(folder, filename)
    # Guard clause: report the caller-supplied names rather than the joined path.
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File {filename} not found in folder {folder}")
    return pd.read_csv(csv_path)

def extract_years(folder_path):
    """Return, sorted ascending, the leading four-digit numbers of the CSV names in ``folder_path``.

    ``folder_path`` must provide ``.glob`` and yield entries with a ``.name``
    attribute (e.g. a ``pathlib.Path``). CSV files whose name does not start
    with four digits are ignored.
    """
    year_prefix = re.compile(r"^\d{4}")
    found = []
    for csv_file in folder_path.glob("*.csv"):
        hit = year_prefix.match(csv_file.name)
        if hit:
            found.append(int(hit.group()))
    found.sort()
    return found

# Load ADP file
def load_adp_file(adp_dir, given_year=None):
    """Load one season's ADP CSV from ``adp_dir`` and tag it with its year.

    Args:
        adp_dir: Directory holding files named ``<year>ADP.csv``. When
            ``given_year`` is omitted it must support ``.glob`` (e.g. a
            ``pathlib.Path``) because the available years are discovered
            through ``extract_years``.
        given_year: Season to load. When ``None``, a year is drawn at random
            from the years found in ``adp_dir`` and announced on stdout.

    Returns:
        pd.DataFrame: The ADP table with an added ``year`` column holding the
        selected year (same value and type as ``given_year`` when supplied).

    Raises:
        FileNotFoundError: If the file for the selected year does not exist.
    """
    if given_year is None:
        years = extract_years(adp_dir)
        year = random.choice(years)
        print("Randomly selected ADP year of:", year)
    else:
        year = given_year

    file_name = f"{year}ADP.csv"
    # Read from the directory that was passed in, not from the global ADP_DIR,
    # so the year discovery and the file load always target the same folder.
    adp_df = load_file(adp_dir, file_name)
    adp_df['year'] = year
    return adp_df

# Load stats
def load_seasonal_stats(seasonal_stats_dir, year):
    """Load ``player_stats_<year>.csv`` from ``seasonal_stats_dir`` via ``load_file``."""
    stats_filename = f"player_stats_{year}.csv"
    return load_file(seasonal_stats_dir, stats_filename)

def load_defensive_stats(defensive_stats_dir, year):
    """Load ``seasonal_defensive_stats_<year>.csv`` from ``defensive_stats_dir`` via ``load_file``."""
    stats_filename = f"seasonal_defensive_stats_{year}.csv"
    return load_file(defensive_stats_dir, stats_filename)

# Merge stats into ADP
def merge_stats(adp_df, seasonal_stats_df, defensive_stats_df):
    """Attach realized fantasy points to the ADP table.

    Args:
        adp_df (pd.DataFrame): ADP rows; must contain ``player_id`` and ``POSITION``.
        seasonal_stats_df (pd.DataFrame): Must contain ``player_id`` and ``fppr``.
        defensive_stats_df (pd.DataFrame): Must contain ``pa_team`` and ``fpts``;
            ``pa_team`` is matched against ``player_id`` of the ADP rows.

    Returns:
        pd.DataFrame: ``adp_df`` left-joined with both sources, with three added
        columns: ``fppr``, ``def_fpts`` and ``fpts``. ``fpts`` takes ``def_fpts``
        for rows whose ``POSITION`` is ``"DST"`` and ``fppr`` otherwise; rows
        without a match keep NaN. The inputs are not modified.
    """
    offense = seasonal_stats_df[["player_id", "fppr"]]
    defense = defensive_stats_df.rename(
        columns={"pa_team": "player_id", "fpts": "def_fpts"}
    )[["player_id", "def_fpts"]]

    merged = (
        adp_df
        .merge(offense, on="player_id", how="left")
        .merge(defense, on="player_id", how="left")
    )

    # Vectorized selection instead of a Python-level apply over every row.
    is_dst = merged["POSITION"] == "DST"
    merged["fpts"] = merged["def_fpts"].where(is_dst, merged["fppr"])
    return merged

In [55]:
def get_min_player_count_by_position(adp_dir, dstats_dir):
    """
    Find the smallest per-position player pool seen in any ADP season.

    Every year discovered in ``adp_dir`` is loaded; players are counted per
    ``POSITION`` and unique ``pa_team`` values are counted in that year's
    defensive stats.

    Args:
        adp_dir (Path): Directory containing ADP data files.
        dstats_dir (Path): Directory containing defensive stats files.

    Returns:
        dict: Position -> minimum player count over the years in which that
              position appears (a year where a position is absent does not
              count as zero). The 'DST' entry is always the minimum number of
              unique teams in the defensive stats, replacing any 'DST' count
              derived from the ADP files.
    """
    counts_by_position = {}
    teams_per_year = []  # unique defensive teams, one entry per season

    for season in extract_years(adp_dir):
        season_adp = load_adp_file(adp_dir, season)
        players_per_position = season_adp.groupby("POSITION").size()

        season_defense = load_defensive_stats(dstats_dir, season)
        teams_per_year.append(season_defense['pa_team'].nunique())

        for pos, n_players in players_per_position.items():
            counts_by_position.setdefault(pos, []).append(n_players)

    result = {pos: min(per_year) for pos, per_year in counts_by_position.items()}
    result["DST"] = min(teams_per_year)
    return result


In [56]:
# Minimum player count per position (unique-team count for 'DST') over every ADP season found in ADP_DIR.
# ADP_DIR and DEFENSIVE_STATS_DIR are not defined in this notebook; presumably they come from
# the star import of utility.constants — confirm.
min_player_dict = get_min_player_count_by_position(ADP_DIR, DEFENSIVE_STATS_DIR)

In [57]:
# Build the merged player table for a single season.
# NOTE(review): the year is passed as the string "2021", so the 'year' column (and `year` below)
# holds a string here, whereas extract_years() produces ints — confirm which type later cells expect.
adp_df = load_adp_file(ADP_DIR, "2021")
year = adp_df['year'].iloc[0]
seasonal_stats_df = load_seasonal_stats(SEASONAL_STATS_DIR, year)
defensive_stats_df = load_defensive_stats(DEFENSIVE_STATS_DIR, year)
data_df = merge_stats(adp_df, seasonal_stats_df, defensive_stats_df)

# Sort players by FPPRAVG
data_df = data_df.sort_values(by="FPPRAVG").reset_index(drop=True)

# Initialize draft setup
# NOTE(review): NUM_MANAGERS is not defined in this notebook (presumably from utility.constants).
# The shuffle is unseeded, so DRAFT_ORDER differs between runs.
DRAFT_ORDER = list(range(1, NUM_MANAGERS + 1))
random.shuffle(DRAFT_ORDER)

# Not referenced by any later cell visible in this notebook excerpt.
results = []
pick_order = 1

In [58]:
# (rows, columns) of the merged table, before the per-position filter is applied.
data_df.shape

(527, 13)

In [59]:
def filter_top_players_by_position(data_df, top_players):
    """
    Keep the top players for each position based on FPPRAVG (lower is better).

    Args:
    - data_df (pd.DataFrame): Player data; must contain 'POSITION' and 'FPPRAVG'.
    - top_players (dict): Position -> number of top players to retain. Positions
      not listed here are dropped from the result.

    Returns:
    - pd.DataFrame: The retained rows, sorted by FPPRAVG ascending, with their
      original index labels. When `top_players` is empty, an empty frame with
      the columns of `data_df` is returned.
    """
    # Gather the per-position slices first and concatenate once: this avoids
    # re-copying and re-sorting the growing result on every iteration, and
    # avoids seeding the concat with a column-less empty DataFrame.
    pieces = [
        data_df[data_df["POSITION"] == position].sort_values("FPPRAVG").head(top_n)
        for position, top_n in top_players.items()
    ]
    if not pieces:
        # Preserve the schema so callers can still reference the usual columns.
        return data_df.head(0)
    return pd.concat(pieces).sort_values(by="FPPRAVG")

In [60]:
# Trim each position to the minimum pool size computed in min_player_dict.
data_df2 = filter_top_players_by_position(data_df, min_player_dict)

In [61]:
def calculate_tiers_by_metric_geo(data_df, metric_col='FPPRAVG', num_tiers=5, r=2):
    """
    Split players into tiers whose sizes follow a geometric progression.

    Players are ranked by `metric_col` ascending (lowest value first). Tier k
    (1-based) receives a share of the players proportional to r ** (k - 1),
    truncated to an integer; the players left over by truncation are added to
    the last tier.

    Args:
    - data_df (pd.DataFrame): DataFrame containing player data.
    - metric_col (str): Column used for ranking (e.g., 'FPPRAVG').
    - num_tiers (int): Number of tiers to create.
    - r (float): Common ratio of the geometric progression (default=2).

    Returns:
    - pd.DataFrame: Ranked copy of the data with a fresh 0..n-1 index and an added "Tier" column.
    - list: Number of players in each tier.
    """
    ranked = data_df.sort_values(by=metric_col, ascending=True).reset_index(drop=True)
    n_players = len(ranked)

    # Unnormalized tier sizes: 1, r, r**2, ...
    raw_weights = [r ** exponent for exponent in range(num_tiers)]

    tier_sizes = []
    for weight in raw_weights:
        share = weight / sum(raw_weights)
        tier_sizes.append(int(n_players * share))

    # Truncation can leave players unassigned; the last tier absorbs them.
    tier_sizes[-1] += n_players - sum(tier_sizes)

    # One label per ranked row: tier 1 repeated for its size, then tier 2, ...
    labels = []
    for tier_number, size in enumerate(tier_sizes, start=1):
        labels += [tier_number] * size

    ranked['Tier'] = labels
    return ranked, tier_sizes


# Tier the filtered players by FPPRAVG; num_tiers is left at its default of 5, ratio r=2.
data_df3, players_per_tier_geo = calculate_tiers_by_metric_geo(data_df2, metric_col='FPPRAVG', r=2)

# Tier sizes, tier 1 first.
print("Players per Tier (Geometric Progression):", players_per_tier_geo)


Players per Tier (Geometric Progression): [11, 22, 44, 89, 182]


In [68]:
# Inspect the first 30 rows of the tiered table (ranked by FPPRAVG ascending).
data_df3.head(30)

Unnamed: 0,player_name,player_id,FPPRPOS,FPPRAVG,HPPRPOS,HPPRAVG,STRDPOS,STRDAVG,POSITION,year,fppr,def_fpts,fpts,Tier
0,Christian McCaffrey,00-0033280,RB1,1.0,RB1,1.0,RB1,1.0,RB,2021,127.5,,127.5,1
1,Dalvin Cook,00-0033893,RB2,2.0,RB2,2.0,RB2,2.0,RB,2021,206.300001,,206.300001,1
2,Alvin Kamara,00-0033906,RB3,3.0,RB4,4.0,RB4,4.0,RB,2021,234.700005,,234.700005,1
3,Derrick Henry,00-0032764,RB4,4.0,RB3,3.0,RB3,3.0,RB,2021,193.300005,,193.300005,1
4,Ezekiel Elliott,00-0033045,RB5,5.0,RB5,5.0,RB5,5.0,RB,2021,250.660006,,250.660006,1
5,Davante Adams,00-0031381,WR1,6.0,WR1,8.0,WR1,8.0,WR,2021,344.300003,,344.300003,1
6,Travis Kelce,00-0030506,TE1,7.0,TE1,12.0,TE1,12.0,TE,2021,262.8,,262.8,1
7,Aaron Jones,00-0033293,RB6,8.0,RB7,7.0,RB7,7.0,RB,2021,229.000004,,229.000004,1
8,Saquon Barkley,00-0034844,RB7,9.0,RB9,10.0,RB9,10.0,RB,2021,148.6,,148.6,1
9,Nick Chubb,00-0034791,RB10,10.0,RB6,6.0,RB6,6.0,RB,2021,215.300001,,215.300001,1


In [81]:
# Per-tier summary of the 'fpts' column. The aggregations are passed as a list rather
# than a set so the column order of the result is the same on every run.
data_df3.groupby('Tier')['fpts'].agg(['mean', 'std', 'min', 'median', 'max'])

Unnamed: 0_level_0,mean,min,max,std,median
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,232.387275,127.5,344.300003,68.362919,229.000004
2,226.997275,48.100001,402.580002,99.249043,230.950004
3,202.94381,2.0,439.499997,92.65111,201.149999
4,143.823371,8.6,338.159997,73.716104,133.0
5,86.06443,0.0,259.100003,62.112826,81.0


In [85]:
def calculate_scaled_tier_weights_with_std(data_df, tier_col='Tier', metric_col='fpts', max_weight=10, scaling_factor=1.2):
    """
    Calculate tier weights from each tier's mean, penalized by its std deviation.

    For every tier the raw weight is
    ``(mean / best_mean) ** scaling_factor / (1 + std / 100)``, where
    ``best_mean`` is the largest tier mean. The raw weights are then rescaled
    so that the largest one equals ``max_weight`` and rounded to 2 decimals.

    Args:
    - data_df (pd.DataFrame): DataFrame containing tier and performance data.
    - tier_col (str): Column name for tier information (e.g., 'Tier').
    - metric_col (str): Column name for the performance metric (e.g., 'fpts').
    - max_weight (float): Weight given to the highest-weighted tier.
    - scaling_factor (float): Exponent applied to the normalized mean to
      control non-linearity.

    Returns:
    - dict: Tier -> scaled weight, with a maximum of `max_weight`. Empty when
      `data_df` contains no tiers.
    """
    # Group by tier and calculate mean and std deviation
    tier_stats = data_df.groupby(tier_col)[metric_col].agg(['mean', 'std'])
    if tier_stats.empty:
        return {}

    # The sample std of a single-player tier is NaN; treat it as "no spread"
    # so that tier still gets a finite weight instead of NaN.
    tier_std = tier_stats['std'].fillna(0)

    # Normalize every tier mean against the best (largest) tier mean
    best_mean = tier_stats['mean'].max()
    normalized_means = tier_stats['mean'] / best_mean

    # Adjust weights based on normalized mean and penalize by std deviation
    adjusted_weights = {
        tier: (normalized_means.loc[tier] ** scaling_factor) * (1 / (1 + tier_std.loc[tier] / 100)) * max_weight
        for tier in tier_stats.index
    }

    # Rescale so the largest weight equals max_weight (previously hard-coded
    # to 10, which made the max_weight argument ineffective).
    max_calculated_weight = max(adjusted_weights.values())
    scaled_weights = {
        tier: round((weight / max_calculated_weight) * max_weight, 2)
        for tier, weight in adjusted_weights.items()
    }

    return scaled_weights

# Example usage
# The keyword arguments below repeat the function's default values.
tier_weights = calculate_scaled_tier_weights_with_std(
    data_df3, tier_col='Tier', metric_col='fpts', max_weight=10, scaling_factor=1.2
)

# Prints the dict of tier -> weight.
print("Scaled Tier Weights with Mean and Std Adjustment:", tier_weights)


Scaled Tier Weights with Mean and Std Adjustment: {1: np.float64(10.0), 2: np.float64(8.22), 3: np.float64(7.43), 4: np.float64(5.45), 5: np.float64(3.15)}


In [86]:
def calculate_scarcity(data_df, tier_weights, position_col='POSITION', tier_col='Tier'):
    """
    Calculate scarcity for each position from the tier mix of its players.

    For each position the score is the sum, over the tiers present at that
    position, of ``(players in tier / players at position) / tier_weights[tier]``.

    NOTE: a later cell in this notebook defines another `calculate_scarcity`
    with a different formula; whichever cell ran last is the one in effect.

    Args:
    - data_df (pd.DataFrame): One row per available player, with position and tier columns.
    - tier_weights (dict): Dictionary of weights for each tier (e.g., {1: 10, 2: 7, ...}).
    - position_col (str): Column name for player positions.
    - tier_col (str): Column name for tier information.

    Returns:
    - dict: Scarcity scores for each position, in order of first appearance.

    Raises:
    - KeyError: If a tier present in the data has no entry in `tier_weights`.
    """
    scarcity_scores = {}

    for position in data_df[position_col].unique():
        position_data = data_df[data_df[position_col] == position]
        total_players = len(position_data)  # Total players in this position
        scarcity = 0

        for tier in position_data[tier_col].unique():
            # Rows are players, so the row count is the number of players in this tier.
            remaining_players = len(position_data[position_data[tier_col] == tier])

            # Add the tier contribution to scarcity
            scarcity += (remaining_players / total_players) / tier_weights[tier]

        scarcity_scores[position] = scarcity

    return scarcity_scores



In [88]:
def calculate_scarcity_dynamic(data_df, tier_weights, position_col='POSITION', tier_col='Tier', use_top_tier=True):
    """
    Score each position by the weighted count of its players, scaled by a reference weight.

    For a position, every player contributes the weight of its tier; the sum is
    divided by either the largest tier weight or the sum of all tier weights.

    Args:
    - data_df (pd.DataFrame): One row per player, with position and tier columns.
    - tier_weights (dict): Weights assigned to each tier (e.g., {1: 10, 2: 7, ...}).
    - position_col (str): Column name for position information.
    - tier_col (str): Column name for tier information.
    - use_top_tier (bool): Divide by the largest tier weight when True, by the
      sum of all tier weights when False.

    Returns:
    - dict: Scarcity scores for each position, in order of first appearance.
    """
    top_weight = max(tier_weights.values())
    weight_total = sum(tier_weights.values())
    divisor = top_weight if use_top_tier else weight_total

    result = {}
    positions = data_df[position_col]
    for pos in positions.unique():
        tiers_at_pos = data_df.loc[positions == pos, tier_col]

        weighted_count = 0
        for tier in tiers_at_pos.unique():
            n_in_tier = int((tiers_at_pos == tier).sum())
            weighted_count += n_in_tier * tier_weights[tier]

        result[pos] = weighted_count / divisor

    return result

# Example setup
# tier_weights = {1: 10, 2: 7, 3: 5, 4: 3, 5: 2}  # Define weights for each tier
# With the line above commented out, `tier_weights` is whatever an earlier cell bound it to.
# NOTE(review): the saved output under this cell starts with "Scarcity Scores by Position:",
# which neither print below emits — the output appears stale; re-run the cell.


# Calculate scarcity using top-tier normalization
scarcity_top_tier = calculate_scarcity_dynamic(data_df3, tier_weights, use_top_tier=True)

# Calculate scarcity using total tier weights normalization
scarcity_total_tier = calculate_scarcity_dynamic(data_df3, tier_weights, use_top_tier=False)

print("Scarcity (Top Tier Reference):", scarcity_top_tier)
print("Scarcity (Total Tier Reference):", scarcity_total_tier)

Scarcity Scores by Position: {'RB': np.float64(506.63), 'WR': np.float64(551.65), 'TE': np.float64(196.03), 'QB': np.float64(192.85), 'DST': np.float64(123.8), 'K': np.float64(105.15)}


In [None]:
def calculate_scarcity(data_df, tier_weights, position_col='POSITION', tier_col='Tier'):
    """
    Score each position by the weighted number of players it has in each tier.

    For every position, the number of players in each tier listed in
    `tier_weights` is multiplied by that tier's weight and summed; tiers that
    are absent from `tier_weights` contribute nothing. The total is rounded to
    2 decimals.

    Args:
    - data_df (pd.DataFrame): DataFrame containing player data with tiers and positions.
    - tier_weights (dict): Dictionary of weights for each tier (e.g., {1: 10, 2: 7, ...}).
    - position_col (str): Column name for player positions (e.g., 'POSITION').
    - tier_col (str): Column name for tier information (e.g., 'Tier').

    Returns:
    - dict: Scarcity scores for each position, in order of first appearance.
    """
    scores = {}

    for pos in data_df[position_col].unique():
        position_rows = data_df[data_df[position_col] == pos]

        weighted_total = 0
        for tier, weight in tier_weights.items():
            in_tier = position_rows[tier_col] == tier
            weighted_total += int(in_tier.sum()) * weight

        scores[pos] = round(weighted_total, 2)

    return scores

# Example usage
# NOTE: this rebinds `tier_weights` to hard-coded literals, replacing whatever an earlier cell
# computed; the values match the rounded output of calculate_scaled_tier_weights_with_std shown above.
tier_weights = {1: 10, 2: 8.22, 3: 7.43, 4: 5.45, 5: 3.15}  # Replace with your actual tier weights
scarcity_scores = calculate_scarcity(data_df3, tier_weights, position_col='POSITION', tier_col='Tier')

print("Scarcity Scores by Position:", scarcity_scores)


In [19]:
def calculate_dynamic_tier_weights(data_df, fpts_col='fpts', tier_col='Tier', max_weight=10):
    """
    Derive tier weights from the median fpts of each tier.

    The tier with the highest median gets `max_weight`; every other tier gets
    a weight proportional to its median relative to that highest median.
    Tiers whose median is NaN are left out.

    Args:
    - data_df (pd.DataFrame): DataFrame containing player data with 'fpts' and 'Tier' columns.
    - fpts_col (str): Column name for fpts values.
    - tier_col (str): Column name for tier information.
    - max_weight (int): Weight for the tier with the highest median (default=10).

    Returns:
    - dict: Tier -> weight.
    """
    median_by_tier = data_df.groupby(tier_col)[fpts_col].median().dropna()
    top_median = median_by_tier.max()

    weights = {}
    for tier, tier_median in median_by_tier.items():
        # Scale relative to the best tier so that it lands exactly on max_weight.
        weights[tier] = max_weight * (tier_median / top_median)

    return weights

# Simulate fpts for demonstration (use actual fpts in real data)
# data_df['fpts'] = data_df['FPPRAVG'] * 10  # Example calculation of fpts based on FPPRAVG

# Apply the function
# The keyword arguments repeat the function's defaults; the result is kept under its own
# name, so `tier_weights` is not modified by this cell.
tier_weights_dynamic = calculate_dynamic_tier_weights(data_df3, fpts_col='fpts', tier_col='Tier', max_weight=10)

# Display the dynamic weights
tier_weights_dynamic


{1: np.float64(10.0),
 2: np.float64(8.94267413888344),
 3: np.float64(6.2158657795253704),
 4: np.float64(4.506699153347334),
 5: np.float64(2.9252289331737065)}