In [None]:
import numpy as np 
import pandas as pd 
#df = pd.read_csv("../Analyzed_Games/twic1556_15_analyzed.csv")
df=pd.read_csv("../huge_analyzed_games/combined_analyzed_games.csv")
#df= pd.read_csv("../Analyzed_Games/twic920_15_analyzed.csv")


# Calculate Winning Chance for each position

Convert Evaluation to numeric (For example, M6 means Mate in 6, we convert it to +20)

In [None]:

df['Evaluation'] = df['Evaluation'].astype(str).str.strip()
df['PlayerToMove'] = np.where(df['MoveNumber'] % 2 == 1, 'White', 'Black')

# Function to convert 'Evaluation' to 'New_evaluations'
def convert_evaluation(row):
    """
    Convert the evaluation M to a numeric value.

    Parameters:
    row (pd.Series): A row from a DataFrame containing the 'Evaluation' column.

    Returns:
    float: The numeric evaluation value. Returns 0.0 for mate in 0 moves, 20.0 if White can mate, -20.0 if Black can mate, 
           the numeric evaluation if it can be parsed, or NaN if the evaluation cannot be parsed.
    """
    eval_str = row['Evaluation']
    
    if eval_str in ['+M0', '-M0', 'M0']:
        return 0.0  # Mate in 0 moves
    elif eval_str.startswith('+M') or (eval_str.startswith('M') and not eval_str.startswith('-M')):
        return 20.0  # White can mate
    elif eval_str.startswith('-M'):
        return -20.0  # Black can mate
    else:
        # Try to convert the evaluation to a float
        try:
            eval_float = float(eval_str)
            return eval_float  # Numeric evaluation remains the same
        except ValueError:
            return np.nan  # Unable to parse evaluation

# Apply the function to each row of df
df['Evaluation'] = df.apply(convert_evaluation, axis=1)

Create auxillary functions get_outcome and calculate_chances

calculate_chances calculates the chances of winning, losing and drawing for a given evaluation range based on all previous games and gives the number of games in which this evaluation occurs.

In [None]:
# Map 'Result' to outcome from White's perspective
def get_outcome(result):
    if result == '1-0':
        return 'Win'    # White won
    elif result == '0-1':
        return 'Loss'   # White lost
    elif result == '1/2-1/2':
        return 'Draw'   # Draw
    else:
        return None     # Exclude other results
    
    
def calculate_chances(df, lower_eval, upper_eval):
    """
    Calculate the chances of winning, drawing, and losing for positions within a specified evaluation range.

    Parameters:
    df (pd.DataFrame): DataFrame containing chess game data with columns 'GameID', 'Evaluations', 'Result', etc.
    lower_eval (float): The lower bound of the evaluation range.
    upper_eval (float): The upper bound of the evaluation range.

    Returns:
    list: A list containing the winning chance, drawing chance, losing chance, total number of valid games, and outcome counts.
    """
    
    # Filter positions where 'Evaluation' is between lower_eval and upper_eval
    positions_in_range = df[(df['Evaluation'] >= lower_eval) & (df['Evaluation'] <= upper_eval)].copy()
    
    # Get unique GameIDs where this occurs
    games_in_range = positions_in_range['GameID'].unique()
    
    # Get the results of these games
    game_results = df[df['GameID'].isin(games_in_range)][['GameID', 'Result']].drop_duplicates()
    
    # Apply the mapping
    game_results['Outcome'] = game_results['Result'].apply(get_outcome)
    
    # Exclude games with 'Other' outcomes
    valid_results = game_results.dropna(subset=['Outcome'])
    
    # Total number of valid games
    total_valid_games = valid_results.shape[0]
    outcome_counts=None
    if total_valid_games == 0:
        winning_chance = drawing_chance = losing_chance = 0.0
    else:
        # Count the number of games in each category
        outcome_counts = valid_results['Outcome'].value_counts()
        
        # Calculate percentages
        winning_chance = (outcome_counts.get('Win', 0) / total_valid_games) * 100
        drawing_chance = (outcome_counts.get('Draw', 0) / total_valid_games) * 100
        losing_chance = (outcome_counts.get('Loss', 0) / total_valid_games) * 100
    
    return [winning_chance, drawing_chance, losing_chance, total_valid_games,outcome_counts]

Apply the functions to all games and all evaluations, to get a table of winning, drawing and losing chance for each evaluation. 

In [None]:
# Define the evaluation bins
intervals = np.arange(-20.2, 20.2, 0.2)
intervals = np.round(intervals, decimals=1)
# Prepare a list to hold the results
results = []

# Loop over intervals
for i in range(len(intervals) - 1):
    lower_eval = intervals[i]
    upper_eval = intervals[i + 1]
    
    # Call the calculate_chances function
    winning_chance, drawing_chance, losing_chance, total_valid_games = calculate_chances(df, lower_eval, upper_eval)[:4]
    
    # Store the results
    results.append({
        'Interval': f"({lower_eval}, {upper_eval}]",
        'LowerEval': lower_eval,
        'UpperEval': upper_eval,
        'WinningChance': winning_chance,
        'DrawingChance': drawing_chance,
        'LosingChance': losing_chance,
        'TotalGames': total_valid_games,
    })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Now, adjust the DataFrame to fill intervals with TotalGames == 0
# Find the closest interval with TotalGames > 0 and copy its chances

# Create a DataFrame of intervals with TotalGames > 0
non_zero_df = results_df[results_df['TotalGames'] > 0].reset_index(drop=True)

# Function to fill in chances for intervals with TotalGames == 0
def fill_chances(row):
    if row['TotalGames'] > 0:
        # Keep original values
        return row[['WinningChance', 'DrawingChance', 'LosingChance']]
    else:
        lower_eval = row['LowerEval']
        # Compute absolute difference in LowerEval
        diffs = (non_zero_df['LowerEval'] - lower_eval).abs()
        min_idx = diffs.idxmin()
        closest_row = non_zero_df.loc[min_idx]
        return closest_row[['WinningChance', 'DrawingChance', 'LosingChance']]

# Apply the function to fill in the missing chances
filled_chances = results_df.apply(fill_chances, axis=1)

# Assign the filled values back to the DataFrame
results_df[['WinningChance', 'DrawingChance', 'LosingChance']] = filled_chances

# Remove the 'LowerEval' and 'UpperEval' columns
results_df = results_df.drop(columns=['LowerEval', 'UpperEval'])

#results_df.to_csv('winning_chances.csv', index=False)

Some evaluations are rare, so results could be skewed. In particular, the evaluation reflects the state of the game. Thus, the winning chance should be monotonically increasing with evaluation, so we impose this on the win and loss chance table. 

We then save the table to memory. 

In [None]:
# Adjust the 'WinningChance' column to be monotonically increasing
winning_chances = results_df['WinningChance'].values
for i in range(1, len(winning_chances)):
    if winning_chances[i] < winning_chances[i-1]:
        winning_chances[i] = winning_chances[i-1]
results_df['WinningChance'] = winning_chances

# Adjust the 'LosingChance' column to be monotonically decreasing
losing_chances = results_df['LosingChance'].values
for i in range(len(losing_chances)-2, -1, -1):
    if losing_chances[i] < losing_chances[i+1]:
        losing_chances[i] = losing_chances[i+1]
results_df['LosingChance'] = losing_chances

# Save the modified DataFrame back to CSV
results_df.to_csv('winning_chances_adjusted.csv', index=False)

## Calculate Winning Chance Loss for each move

We start by reading the games

In [None]:
pd.set_option('display.max_columns', None)
#df=pd.read_csv("../huge_analyzed_games/combined_analyzed_games.csv")
#df= pd.read_csv("../Cleaned_Analyzed_Games/twic920_15_processed.csv")
df=pd.read_csv("../huge_analyzed_games/combined_analyzed_games_20.csv")

This function takes a table of games and a winning chance table and calculates for each move its WCL and LCL (Losing Chance Loss) as two nes columns.

In [2]:
def process_chess_data(df, winning_chance_table=pd.read_csv('winning_chances_all_moves.csv'), intervals=np.arange(-13, 13.2, 0.2)):
    """
    Processes chess data by binning evaluation values, merging with winning chances,
    and computing WCL, LCL, Player, and 'a' columns.

    Parameters:
    df (pd.DataFrame): DataFrame containing chess game data with 'Evaluation', 'GameID', and 'MoveNumber' columns.
    winning_chance_table (pd.DataFrame): DataFrame containing winning chances with 'Interval', 'WinningChance', 'LosingChance', and 'TotalGames' columns.
    intervals (np.array): Numpy array of interval edges used for binning evaluations.

    Returns:
    pd.DataFrame: Modified DataFrame with additional columns added.
    """
    import numpy as np
    import pandas as pd

    # Ensure intervals are rounded to one decimal place
    intervals = np.round(intervals, decimals=1)
    edges = [-np.inf] + list(intervals) + [np.inf]

    # Create bin labels
    bin_labels = []
    for i in range(len(edges) - 1):
        lower = edges[i]
        upper = edges[i + 1]
        if np.isneginf(lower):
            label = f"(-infty, {upper}]"
        elif np.isposinf(upper):
            label = f"({lower}, infty)"
        else:
            label = f"({lower}, {upper}]"
        bin_labels.append(label)

    # Ensure that the bin labels in 'winning_chance_table' match the ones we're creating
    # This is important for a correct merge
    winning_chance_table['Interval'] = winning_chance_table['Interval'].astype(str)
    bin_labels = [str(label) for label in bin_labels]

    # Bin the 'Evaluation' values in 'df' to create an 'Interval' column
    df['Interval'] = pd.cut(
        df['Evaluation'],
        bins=edges,
        labels=bin_labels,
        right=True,
        include_lowest=True,
    )

    # Ensure 'Interval' in 'df' is of type string
    df['Interval'] = df['Interval'].astype(str)

    # Select the columns to merge
    columns_to_merge = ['Interval', 'WinningChance', 'LosingChance', 'TotalGames']

    # Merge 'df' with 'winning_chance_table' on 'Interval'
    df = df.merge(
        winning_chance_table[columns_to_merge],
        on='Interval',
        how='left'
    )

    # Compute 'WCL' and 'LCL' differences per game
    df['WCL'] = df.groupby('GameID')['WinningChance'].diff().abs()
    df['LCL'] = df.groupby('GameID')['LosingChance'].diff().abs()

    # Assign 'Player' based on move number
    df['Player'] = np.where(df['MoveNumber'] % 2 != 0, 'White', 'Black')

    # Compute 'a' = max(|WCL|, |LCL|) for each move
    df['a'] = df[['WCL', 'LCL']].abs().max(axis=1)

    return df


This function is then applied to all games

In [None]:

df=process_chess_data(df)

## Binning the WCL into mistake bins

This function creates a new DataFrame that contains two lines per game: one for White, containing his Name, Elo, Color, the properties of the Game (Opening, GameID etc) and the number of mistakes they made, binned by gravity (`mistake_bins`).

In [None]:
def create_summary_table(df, mistake_bins= [5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 100], winning_chance_table=None, intervals=None):
    """
    Processes the chess DataFrame to create a summary table of mistakes per interval per player per game.

    Parameters:
    df (pd.DataFrame): DataFrame containing chess game data.
    mistake_bins (list): List of bin edges for mistake intervals.
    winning_chance_table (pd.DataFrame, optional): DataFrame containing winning chances.
                                         Default is loaded from 'winning_chances_all_moves.csv'.
    intervals (np.array, optional): Numpy array of interval edges used for binning evaluations.
                                    Default is np.arange(-13, 13.2, 0.2).

    Returns:
    pd.DataFrame: Summary table with mistakes per interval per player per game.
    """
    import numpy as np
    import pandas as pd

    # If 'WCL' does not exist, apply the 'process_chess_data' function
    if 'WCL' not in df.columns or 'a' not in df.columns:
        if winning_chance_table is None:
            winning_chance_table = pd.read_csv('winning_chances_adjusted.csv')
        if intervals is None:
            intervals = np.arange(-13, 13.2, 0.2)
        df = process_chess_data(df, winning_chance_table, intervals)

    # Step 1: Define mistake labels based on mistake_bins
    mistake_labels = []
    for i in range(len(mistake_bins)-1):
        label = f'({mistake_bins[i]},{mistake_bins[i+1]}]'
        mistake_labels.append(label)

    # Step 2: Assign each 'a' to a mistake interval
    df['MistakeInterval'] = pd.cut(
        df['a'],
        bins=mistake_bins,
        labels=mistake_labels,
        right=True,
        include_lowest=True
    )

    # Step 3: Identify the player making the move if 'Player' column doesn't exist
    if 'Player' not in df.columns:
        df['Player'] = np.where(df['MoveNumber'] % 2 != 0, 'White', 'Black')

    # Step 4: Group and count the number of mistakes per interval, per player, per game
    mistake_moves = df.dropna(subset=['MistakeInterval'])
    mistake_counts = mistake_moves.groupby(['GameID', 'Player', 'MistakeInterval']).size().reset_index(name='MistakeCount')

    # Step 5: Pivot the data to get a summary table per game and player
    summary_table = mistake_counts.pivot_table(
        index=['GameID', 'Player'],
        columns='MistakeInterval',
        values='MistakeCount',
        fill_value=0
    ).reset_index()

    # Flatten the column MultiIndex if necessary
    summary_table.columns.name = None
    summary_table.columns = [col if isinstance(col, str) else col for col in summary_table.columns]

    # Step 6: Compute Total Moves per game
    total_moves = df.groupby('GameID')['MoveNumber'].max().reset_index(name='TotalMoves')

    # Step 7: Extract game-level metadata: Opening, Variation, Result
    game_metadata = df.groupby('GameID').agg({
        'Opening': 'first',
        'Variation': 'first',
        'Result': 'first'
    }).reset_index()

    # Merge TotalMoves into game_metadata
    game_metadata = game_metadata.merge(total_moves, on='GameID', how='left')

    # Step 8: Extract player-level metadata
    player_metadata = df.groupby('GameID').agg({
        'WhiteName': 'first',
        'WhiteElo': 'first',
        'WhiteFideId': 'first',
        'BlackName': 'first',
        'BlackElo': 'first',
        'BlackFideId': 'first'
    }).reset_index()

    # Prepare player metadata for merging
    # For White players
    white_players = player_metadata[['GameID', 'WhiteName', 'WhiteElo', 'WhiteFideId']].copy()
    white_players['Player'] = 'White'
    white_players = white_players.rename(columns={
        'WhiteName': 'Name',
        'WhiteElo': 'Elo',
        'WhiteFideId': 'FideId'
    })

    # For Black players
    black_players = player_metadata[['GameID', 'BlackName', 'BlackElo', 'BlackFideId']].copy()
    black_players['Player'] = 'Black'
    black_players = black_players.rename(columns={
        'BlackName': 'Name',
        'BlackElo': 'Elo',
        'BlackFideId': 'FideId'
    })

    # Concatenate player metadata
    player_metadata_long = pd.concat([white_players, black_players], ignore_index=True)

    # Step 9: Merge player metadata with the summary table
    summary_table = summary_table.merge(player_metadata_long, on=['GameID', 'Player'], how='left')

    # Step 10: Merge game metadata with the summary table
    summary_table = summary_table.merge(game_metadata, on='GameID', how='left')
    total_moves_bins = [0, 30, 40, 50, 60, 70, 80, 90, 100, 120, np.inf]
    total_moves_labels = [
        '(0,30]', '(30,40]', '(40,50]', '(50,60]', '(60,70]',
        '(70,80]', '(80,90]', '(90,100]', '(100,120]', '(120,∞)'
    ]

    # Step 2: Assign each game to a TotalMovesInterval
    summary_table['TotalMovesInterval'] = pd.cut(
        summary_table['TotalMoves'],
        bins=total_moves_bins,
        labels=total_moves_labels,
        right=True,
        include_lowest=True
    )
    # Rearranging columns for better readability
    cols = ['GameID', 'Player', 'Name', 'Elo', 'FideId', 'Opening', 'Variation', 'Result', 'TotalMoves', 'TotalMovesInterval'] + mistake_labels
    summary_table = summary_table[cols]

    return summary_table


In [5]:
summary_table=create_summary_table(df)
#summary_table.to_csv("../huge_analyzed_games/big_summary_table.csv")

  mistake_counts = mistake_moves.groupby(['GameID', 'Player', 'MistakeInterval']).size().reset_index(name='MistakeCount')
  summary_table = mistake_counts.pivot_table(


# Linear Regression Model

In [7]:
table=create_summary_table(df)

  mistake_counts = mistake_moves.groupby(['GameID', 'Player', 'MistakeInterval']).size().reset_index(name='MistakeCount')
  summary_table = mistake_counts.pivot_table(


In [12]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', None)
df= pd.read_csv("../Cleaned_Analyzed_Games/twic920_15_processed.csv")

In [6]:
mistake_bins= [5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 100]
mistake_labels = []
for i in range(len(mistake_bins)-1):
    label = f'({mistake_bins[i]},{mistake_bins[i+1]}]'
    mistake_labels.append(label)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load your summary_table
#summary_table = pd.read_csv("../huge_analyzed_games/big_summary_table.csv")

# Define the mistake intervals and labels
mistake_bins = [5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 100]
mistake_labels = []
for i in range(len(mistake_bins)-1):
    label = f'({mistake_bins[i]},{mistake_bins[i+1]}]'
    mistake_labels.append(label)

# Ensure 'TotalMovesInterval' is added to summary_table
total_moves_bins = [0, 30, 40, 50, 60, 70, 80, 90, 100, 120, np.inf]
total_moves_labels = []
for i in range(len(total_moves_bins)-1):
    lower = total_moves_bins[i]
    upper = total_moves_bins[i+1]
    if np.isinf(upper):
        label = f'({lower},∞]'
    else:
        label = f'({lower},{upper}]'
    total_moves_labels.append(label)
    
summary_table['TotalMovesInterval'] = pd.cut(
    summary_table['TotalMoves'],
    bins=total_moves_bins,
    labels=total_moves_labels,
    right=True,
    include_lowest=True
)

# Handle missing values in the target variable
summary_table = summary_table.dropna(subset=['Elo'])

# Handle missing values in categorical features by adding 'Unknown' to categories
categorical_features = ['Opening', 'Variation', 'Result', 'TotalMovesInterval', 'Player']

for col in categorical_features:
    # Ensure the column is of 'category' dtype
    if not pd.api.types.is_categorical_dtype(summary_table[col]):
        summary_table[col] = summary_table[col].astype('category')
    # Add 'Unknown' to categories if not already present
    if 'Unknown' not in summary_table[col].cat.categories:
        summary_table[col] = summary_table[col].cat.add_categories(['Unknown'])
    # Fill NaN values with 'Unknown'
    summary_table[col] = summary_table[col].fillna('Unknown')

# Handle missing values in numerical features (mistake intervals)
mistake_intervals = mistake_labels  # List of mistake interval columns
summary_table[mistake_intervals] = summary_table[mistake_intervals].fillna(0)

# Step 1: Read the 'winning_chances_all_moves.csv' Table
results_df = pd.read_csv('winning_chances_all_moves.csv')

# Define the features
feature_columns = ['Opening',  'Result', 'TotalMovesInterval', ] + mistake_intervals
X = summary_table[feature_columns]

# Identify categorical and numerical features
categorical_features = ['Opening', 'Result', 'TotalMovesInterval', ]
numerical_features = mistake_intervals

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a ColumnTransformer to apply OneHotEncoder to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # Pass through numerical features without changes
)

# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

# Create bin labels
bin_labels = []
for i in range(len(edges) - 1):
    lower = edges[i]
    upper = edges[i + 1]
    if np.isneginf(lower):
        label = f"(-infty, {upper}]"
    elif np.isposinf(upper):
        label = f"({lower}, infty)"
    else:
        label = f"({lower}, {upper}]"
    bin_labels.append(label)

print(f"Percentage of predictions within ±{threshold} Elo: {percentage_within_threshold:.2f}%")


  if not pd.api.types.is_categorical_dtype(summary_table[col]):
  if not pd.api.types.is_categorical_dtype(summary_table[col]):
  if not pd.api.types.is_categorical_dtype(summary_table[col]):
  if not pd.api.types.is_categorical_dtype(summary_table[col]):


Root Mean Squared Error (RMSE): 265.69
R-squared Score (R²): 0.12
Percentage of predictions within ±300 Elo: 75.97%


In [17]:
# Specify the player's name or FIDE ID
player_name = 'Caruana Fabiano'  # Replace with the player's name
player_fide_id = 2020009       # Replace with the player's FIDE ID (if available)

# Extract the 5 games for the player
player_games = summary_table[
    (summary_table['Name'] == player_name) | (summary_table['FideId'] == player_fide_id)
].head(10)  # Get the first 5 games

# If you have specific GameIDs
#game_ids = [1, 2, 3, 4, 5]  # Replace with the actual GameIDs
#player_games = summary_table[summary_table['GameID'].isin(game_ids)]
# Handle missing values in categorical features
categorical_features = ['Opening', 'Variation', 'Result', 'TotalMovesInterval', 'Player']

for col in categorical_features:
    # Ensure the column is of 'category' dtype
    if not pd.api.types.is_categorical_dtype(player_games[col]):
        player_games[col] = player_games[col].astype('category')
    # Add 'Unknown' to categories if not already present
    if 'Unknown' not in player_games[col].cat.categories:
        player_games[col] = player_games[col].cat.add_categories(['Unknown'])
    # Fill NaN values with 'Unknown'
    player_games[col] = player_games[col].fillna('Unknown')

# Handle missing values in numerical features (mistake intervals)
player_games[mistake_intervals] = player_games[mistake_intervals].fillna(0)

# Define the features
feature_columns = ['Opening', 'Variation', 'Result', 'TotalMovesInterval', 'Player'] + mistake_intervals
X_player = player_games[feature_columns]

  if not pd.api.types.is_categorical_dtype(player_games[col]):


In [18]:
# Handle missing values in categorical features
categorical_features = ['Opening', 'Variation', 'Result', 'TotalMovesInterval', 'Player']

for col in categorical_features:
    # Ensure the column is of 'category' dtype
    if not pd.api.types.is_categorical_dtype(player_games[col]):
        player_games[col] = player_games[col].astype('category')
    # Add 'Unknown' to categories if not already present
    if 'Unknown' not in player_games[col].cat.categories:
        player_games[col] = player_games[col].cat.add_categories(['Unknown'])
    # Fill NaN values with 'Unknown'
    player_games[col] = player_games[col].fillna('Unknown')

# Handle missing values in numerical features (mistake intervals)
player_games[mistake_intervals] = player_games[mistake_intervals].fillna(0)

# Define the features
feature_columns = ['Opening', 'Variation', 'Result', 'TotalMovesInterval', 'Player'] + mistake_intervals
X_player = player_games[feature_columns]

y_player_pred = pipeline.predict(X_player)

# Print the predictions for each game
for i, pred in enumerate(y_player_pred):
    print(f"Predicted Elo for Game {i+1}: {pred:.2f}")
    
average_predicted_elo = y_player_pred.mean()
print(f"\nAverage Predicted Elo for {player_name}: {average_predicted_elo:.2f}")


Predicted Elo for Game 1: 2258.58
Predicted Elo for Game 2: 2216.70
Predicted Elo for Game 3: 2371.31
Predicted Elo for Game 4: 2376.94
Predicted Elo for Game 5: 2287.00
Predicted Elo for Game 6: 2467.72
Predicted Elo for Game 7: 2269.58
Predicted Elo for Game 8: 2402.88
Predicted Elo for Game 9: 2242.95
Predicted Elo for Game 10: 2418.32

Average Predicted Elo for Caruana Fabiano: 2331.20


  if not pd.api.types.is_categorical_dtype(player_games[col]):


In [62]:
import pandas as pd
import numpy as np

# Step 3: Bin the 'Evaluation' Values in 'df' to Create an 'Interval' Column
# Assuming 'df' is your analyzed chess DataFrame and 'Evaluation' column exists
df['Interval'] = pd.cut(
    df['Evaluation'],
    bins=edges,
    labels=bin_labels,
    right=True,
    include_lowest=True,
)

# Step 4: Merge 'df' with 'results_df' on 'Interval'
# Select the columns to merge
columns_to_merge = ['Interval', 'WinningChance', 'LosingChance', 'TotalGames']

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

# Get the names of the categorical features after one-hot encoding
onehot_feature_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)

# Combine with numerical feature names
all_feature_names = np.concatenate([onehot_feature_names, numerical_features])

# Get the coefficients from the linear regression model
coefficients = pipeline.named_steps['regressor'].coef_

# Create a DataFrame to display feature names and their coefficients
coef_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient': coefficients
})

# Sort the coefficients by absolute value
coef_df['AbsCoefficient'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='AbsCoefficient', ascending=False)

# Display the top 20 features with the highest absolute coefficients
print("\nTop 20 features by absolute coefficient value:")
print(coef_df[['Feature', 'Coefficient']].head(20))


  if not pd.api.types.is_categorical_dtype(summary_table[col]):
  if not pd.api.types.is_categorical_dtype(summary_table[col]):
  if not pd.api.types.is_categorical_dtype(summary_table[col]):
  if not pd.api.types.is_categorical_dtype(summary_table[col]):


Root Mean Squared Error (RMSE): 269.73
R-squared Score (R²): 0.08

Top 20 features by absolute coefficient value:
                                               Feature  Coefficient
254                             Opening_Scotch opening  -451.073717
130    Opening_King's Indian Defense: Kazakh Variation  -415.913072
36                              Opening_Canard opening  -372.655133
282  Opening_Trompowsky Attack: Classical Defense, ...   357.048160
112                   Opening_Gruenfeld with e3    Bd3  -334.868810
206                             Opening_Queen's Gambit  -328.209702
270  Opening_Sicilian, Szen variation, Dely-Kasparo...  -319.903968
52                           Opening_Damiano's defence  -317.661726
40                           Opening_Caro-Masi defence  -314.348275
181          Opening_Pirc Defense: Classical Variation   286.606038
293          Opening_Vienna gambit, Steinitz variation   266.532981
294             Opening_Vienna gambit, Wurzburger trap  -256.344930
31

In [49]:
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared Score (R²): {r2:.2f}')

# Calculate the percentage of predictions within ±300 Elo
absolute_errors = np.abs(y_pred - y_test)
threshold = 300
within_threshold = np.sum(absolute_errors <= threshold)
total_predictions = len(y_test)
percentage_within_threshold = (within_threshold / total_predictions) * 100

print(f"Percentage of predictions within ±{threshold} Elo: {percentage_within_threshold:.2f}%")

Root Mean Squared Error (RMSE): 266.19
R-squared Score (R²): 0.11
Percentage of predictions within ±300 Elo: 77.62%


In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load your summary_table
#summary_table = pd.read_csv("../huge_analyzed_games/big_summary_table.csv")

# Filter data for White players only
summary_table_white = summary_table[summary_table['Player'] == 'White']

# Define the mistake intervals and labels
mistake_labels = [
    '(3,5]', '(5,10]', '(10,15]', '(15,20]', '(20,25]', '(25,30]',
    '(30,35]', '(35,40]', '(40,50]', '(50,60]', 
]

# Handle missing values in mistake intervals
summary_table_white[mistake_labels] = summary_table_white[mistake_labels].fillna(0)

# Ensure 'Elo' is numeric
summary_table_white['Elo'] = pd.to_numeric(summary_table_white['Elo'], errors='coerce')

# Remove rows with missing 'Elo' values
summary_table_white = summary_table_white.dropna(subset=['Elo'])

# Step 1: Bin the Elo Ratings
# Define Elo rating bins
elo_bins = [0, 1200, 1400, 1600, 1800, 2000, 2200, 2400, 2600, 2800, np.inf]
elo_labels = ['<1200', '1200-1399', '1400-1599', '1600-1799', '1800-1999',
              '2000-2199', '2200-2399', '2400-2599', '2600-2799', '2800+']

# Perform the merge
df = df.merge(
    results_df[columns_to_merge],
    on='Interval',
    how='left'
)

# Now 'df' has the new columns added


KeyError: "['(3,5]'] not in index"

In [None]:
df['WCL'] = df.groupby('GameID')['WinningChance'].diff().abs()
df['LCL'] = df.groupby('GameID')['LosingChance'].diff().abs()
df.loc[df['MoveNumber'] % 2 == 0, 'WCL'] = None
df.loc[df['MoveNumber'] % 2 != 0, 'LCL'] = None