In [116]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings

In [117]:
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))
warnings.simplefilter(action='ignore', category=(FutureWarning))

In [118]:
players_df = pd.read_csv("players_db/fm23/fm23db_processed.csv")
team_ratings = pd.read_csv("players_db/fm23/team_ratings.csv")
win_df = pd.read_csv("players_db/fm23/win_probability.csv", index_col=0)

<br><br><br><br>
<h2 style="color:blue;">  Win Probability</h3>

In [119]:
def Win_Probability(win_prob_df, power_y, power_x):
    logit_sum = 0
    prob_dict = {}
    for match_result in [1,0,2]:
        tpr_y = win_prob_df.loc["tpr_y", str(match_result)]
        tpr_x = win_prob_df.loc["tpr_x", str(match_result)]
        intercept = win_prob_df.loc["intercept", str(match_result)]
        fx = tpr_y*power_y + tpr_x*power_x + intercept
        logit_sum += np.e ** fx
        prob_dict[match_result] = np.e**fx
    for match_result in [1,0,2]:
        prob_dict[match_result] = prob_dict[match_result] / logit_sum
    prob_sum = sum(prob_dict.values())
    normalized_probs = {k: v / prob_sum for k, v in prob_dict.items()}
    return normalized_probs

In [120]:
Win_Probability(win_df, 90, 85)

{1: 0.5232914373372837, 0: 0.2342638818946376, 2: 0.24244468076807876}

<br><br><br><br>
<h2 style="color:blue;">  League Simuation Test</h3>

In [121]:
team_ratings.head(n=5)

Unnamed: 0,Club,Club_id,League_id,Division,Based,tpr,gk,def,pas,dri,fin,sta,str,hed,men,iq,fbref_name
0,Manchester City,6827,354,English Premier Division,England (Premier Division),93,88,89,98,94,92,101,95,86,94,92,Manchester City
1,Paris Saint-Germain,7994,773,Ligue 1 Uber Eats,France (Ligue 1 Uber Eats),93,81,86,100,97,90,93,91,88,90,91,Paris S-G
2,Liverpool,6518,354,English Premier Division,England (Premier Division),92,90,89,95,94,87,101,94,94,92,90,Liverpool
3,FC Barcelona,1435,1215,Spanish First Division,Spain (First Division),91,91,87,92,93,91,98,94,93,94,92,Barcelona
4,Internazionale,5215,710,Italian Serie A,Italy (Serie A),91,82,90,91,87,91,98,102,100,86,85,Inter


In [122]:
import pandas as pd
import numpy as np
from itertools import product

current_league_id = 354	
teams_in_league_ratings = team_ratings.query(f"League_id == {current_league_id}")

# Initialize dictionary to keep track of points for each team
points_table = {team: 0 for team in teams_in_league_ratings['Club']}

# Create a list of all possible home-away matchups
matchups = list(product(teams_in_league_ratings['Club'], repeat=2))

# Filter out matches where the team plays against itself
matchups = [match for match in matchups if match[0] != match[1]]

# Simulate each match
for home_team, away_team in matchups:
    home_tpr = teams_in_league_ratings[teams_in_league_ratings['Club'] == home_team]['tpr'].values[0]
    away_tpr = teams_in_league_ratings[teams_in_league_ratings['Club'] == away_team]['tpr'].values[0]
    
    # Simulate home match
    probs_home = Win_Probability(win_df, home_tpr, away_tpr)
    result_home = np.random.choice(list(probs_home.keys()), p=list(probs_home.values()))
    if result_home == 1:
        points_table[home_team] += 3
    elif result_home == 0:
        points_table[home_team] += 1
        points_table[away_team] += 1
    else:
        points_table[away_team] += 3

# Print final standings
standings = sorted(points_table.items(), key=lambda x: x[1], reverse=True)
print("Final League Standings:")
for i, (team, points) in enumerate(standings, start=1):
    print(f"{i}. {team}: {points} points")

Final League Standings:
1. Liverpool: 82 points
2. Manchester City: 80 points
3. Tottenham Hotspur: 73 points
4. Manchester United: 70 points
5. Chelsea: 61 points
6. West Ham United: 56 points
7. Arsenal: 56 points
8. Southampton: 56 points
9. Leicester City: 54 points
10. Aston Villa: 54 points
11. Leeds United: 54 points
12. Fulham: 50 points
13. Nottingham Forest: 46 points
14. Newcastle United: 42 points
15. Crystal Palace: 40 points
16. Everton: 38 points
17. Bournemouth: 38 points
18. Wolverhampton: 36 points
19. Brentford: 36 points
20. Brighton & Hove Albion: 34 points


In [123]:
# team_ratings.query(f"League_id == {354}")

<br><br><br><br>
<h1 style="color:red;">  Passing (players)</h1>

In [142]:
def rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

In [143]:
pdf = players_df.sort_values(by="Pos_Rank").dropna(subset=["fbref_player_name"])

In [144]:
pdf.Best_Pos.value_counts()

Best_Pos
MC     682
DC     642
GK     463
AML    450
ST     448
DR     352
AMR    325
DL     314
DM     234
AMC    118
ML      99
MR      91
WBR     22
WBL     17
Name: count, dtype: int64

In [127]:
url = "https://fbref.com/en/comps/Big5/2022-2023/passing/players/2022-2023-Big-5-European-Leagues-Stats"
data = pd.read_html(url)
data[0].columns = data[0].columns.get_level_values(1)

In [128]:
passing_data = data[0]

In [129]:
mdf = pd.merge(pdf, passing_data, how="inner", left_on="fbref_player_name", right_on="Player")
mdf = mdf.drop_duplicates(subset=['Player'], keep='first')
mdf = rename_duplicate_columns(mdf)

In [130]:
mdf["Att"] = pd.to_numeric(mdf["Att"], errors='coerce')
mdf["90s"] = pd.to_numeric(mdf["90s"], errors='coerce')
mdf["Cmp"] = pd.to_numeric(mdf["Cmp"], errors='coerce')
mdf["Cmp%"] = pd.to_numeric(mdf["Cmp%"], errors='coerce')

In [131]:
mdf["Att90"] = mdf["Att"].apply(float) / mdf["90s"].apply(float)
mdf["Cmp90"] = mdf["Cmp"].apply(float) / mdf["90s"].apply(float)
mdf["Cmp%"] = mdf["Cmp%"].apply(float)

In [132]:
# mdf.loc[0].to_dict()

In [133]:
mdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2467 entries, 0 to 5109
Columns: 138 entries, UID to Cmp90
dtypes: float64(6), int64(79), object(53)
memory usage: 2.6+ MB


In [134]:
mdf[["Player","Pos_Rank","Best_Pos","tpr","pas","Att90","Cmp90","Cmp%"]].head()

Unnamed: 0,Player,Pos_Rank,Best_Pos,tpr,pas,Att90,Cmp90,Cmp%
0,Vito Mannone,0,GK,71,53,32.569832,24.804469,76.2
2,Léo Jardim,0,GK,68,54,25.666667,20.333333,79.2
3,Wojciech Szczęsny,0,GK,81,57,28.690909,22.727273,79.2
4,Benjamin Lecomte,0,GK,75,62,27.5,17.9,65.1
6,Tobias Mohr,0,GK,38,31,54.408602,33.11828,60.9


In [135]:
mdf.drop_duplicates(subset=['Pos_Rank'], keep='first')[["Pos_Rank","Best_Pos"]]

Unnamed: 0,Pos_Rank,Best_Pos
0,0,GK
1325,1,WBR
1355,2,DR
1887,3,DC
3217,4,DL
3530,5,WBL
3544,6,DM
3777,7,MC
4273,8,MR
4325,9,AMC


In [136]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming mdf is your DataFrame and has the necessary columns
# mdf = pd.read_csv('your_data.csv')  # Load your data here if it's from a CSV file

# Select the relevant columns
df = mdf[["Pos_Rank", "tpr", "pas", "Cmp%"]]

# Drop any rows with missing values
df = df.dropna()
print(df.info())

# Define the independent variables (predictors) and the dependent variable (response)
X = df[["Pos_Rank", "tpr", "pas"]]
y = df["Cmp%"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Print the coefficients
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Print performance metrics
print("Mean squared error (MSE):", mean_absolute_error(y_test, y_pred))
print("Mean squared error (MSE):", mean_squared_error(y_test, y_pred))
print("Coefficient of determination (R^2):", r2_score(y_test, y_pred))

<class 'pandas.core.frame.DataFrame'>
Index: 2449 entries, 0 to 5109
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pos_Rank  2449 non-null   int64  
 1   tpr       2449 non-null   int64  
 2   pas       2449 non-null   int64  
 3   Cmp%      2449 non-null   float64
dtypes: float64(1), int64(3)
memory usage: 95.7 KB
None
Coefficients: [-0.69954708 -0.00830549  0.16318856]
Intercept: 70.08746287631286
Mean squared error (MSE): 7.629352668009643
Mean squared error (MSE): 118.30061407302766
Coefficient of determination (R^2): 0.02352409203901984


In [137]:
def Predict_Pass_Completion_Rate(pos, tpr, pas):
    input_data = {'Pos_Rank': [pos], 'tpr': [tpr], 'pas': [pas]}
    input_df = pd.DataFrame(input_data)
    noise_range = (-5, 10)
    predicted_class = model.predict(input_df) + np.random.uniform(noise_range[0], noise_range[1], input_df.shape)
    return predicted_class

print("Predicted Class:", Predict_Pass_Completion_Rate(12, 100, 84))

Predicted Class: [[82.05791943 79.24671494 71.58590103]]


In [139]:
def Player_Passing_Volume_Share(df):
    df["Att_share"] = df["Att"] / df["Att"].sum()
    df["Cmp_share"] = df["Cmp"] / df["Cmp"].sum()
    return df

mdf = mdf.groupby('Club').apply(Player_Passing_Volume_Share).reset_index(drop=True)

In [138]:
passing_logs = pd.read_csv("match_logs/Big5@22-23@passing.csv")
passing_data = passing_data
passing_df = pd.merge(team_ratings, passing_data, how="inner", left_on="fbref_name", right_on="Squad")