In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings

In [2]:
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))
warnings.simplefilter(action='ignore', category=(FutureWarning))

In [3]:
players_ratings = pd.read_csv("../players_db/fm23/fm23db_processed.csv")
team_ratings = pd.read_csv("../players_db/fm23/team_ratings.csv")

<br><br><br><br>
<h2 style="color:blue;">  Win Probability</h3>

In [4]:
win_df = pd.read_csv("../players_db/fm23/win_probability.csv", index_col=0)
def Win_Probability(win_prob_df, power_x, power_y, is_neutral=False):
    logit_sum = 0
    prob_dict = {}
    for match_result in [1,0,2]:
        tpr_x = win_prob_df.loc["tpr_x", str(match_result)]
        tpr_y = win_prob_df.loc["tpr_y", str(match_result)]
        intercept = win_prob_df.loc["intercept", str(match_result)]
        fx = tpr_y*power_y + tpr_x*power_x + intercept
        logit_sum += np.e ** fx
        prob_dict[match_result] = np.e**fx
    for match_result in [1,0,2]:
        prob_dict[match_result] = prob_dict[match_result] / logit_sum
    prob_sum = sum(prob_dict.values())
    normalized_probs = {k: v / prob_sum for k, v in prob_dict.items()}
    
    if is_neutral and power_x==power_y:
        draw = normalized_probs[0]
        non_draw_side = (1 - draw) / 2
        return {1: non_draw_side, 0: draw, 2: non_draw_side}
    return normalized_probs

In [5]:
Win_Probability(win_df, 50, 60)

{1: 0.15141943498481023, 0: 0.43814483031639917, 2: 0.41043573469879063}

In [6]:
Win_Probability(win_df, 60, 70)

{1: 0.1876793963427024, 0: 0.3740351552084532, 2: 0.43828544844884443}

In [7]:
Win_Probability(win_df, 70, 80)

{1: 0.22807162854246782, 0: 0.3130594683962014, 2: 0.4588689030613309}

<br><br>

In [8]:
Win_Probability(win_df, 75, 75)

{1: 0.36105836634747795, 0: 0.33042638291014814, 2: 0.30851525074237396}

In [9]:
Win_Probability(win_df, 80, 80)

{1: 0.39175485727530857, 0: 0.29753740482911134, 2: 0.31070773789558015}

In [10]:
Win_Probability(win_df, 90, 90)

{1: 0.4532254040572556, 0: 0.23708341827188148, 2: 0.30969117767086296}

In [11]:
Win_Probability(win_df, 92, 79)

{1: 0.6229503239181537, 0: 0.2275294948764984, 2: 0.14952018120534785}

In [12]:
Win_Probability(win_df, 85, 84) == Win_Probability(win_df, 84, 85) 

False

<br><br><br><br>
<h2 style="color:blue;">  League Simuation Test</h3>

In [13]:
team_ratings.League_id.unique()

array([ 185, 1215,  354,  773,  710, 1331, 1017,  363], dtype=int64)

In [50]:
import pandas as pd
import numpy as np
from itertools import product

current_league_id = 1215	
teams_in_league_ratings = team_ratings.query(f"League_id == {current_league_id}")

points_table = {team: 0 for team in teams_in_league_ratings['Club']}
matchups = list(product(teams_in_league_ratings['Club'], repeat=2))
matchups = [match for match in matchups if match[0] != match[1]]

for home_team, away_team in matchups:
    home_tpr = teams_in_league_ratings[teams_in_league_ratings['Club'] == home_team]['tpr'].values[0]
    away_tpr = teams_in_league_ratings[teams_in_league_ratings['Club'] == away_team]['tpr'].values[0]
    
    probs_home = Win_Probability(win_df, home_tpr, away_tpr, is_neutral=True)
    result_home = np.random.choice(list(probs_home.keys()), p=list(probs_home.values()))
    if result_home == 1:
        points_table[home_team] += 3
    elif result_home == 0:
        points_table[home_team] += 1
        points_table[away_team] += 1
    else:
        points_table[away_team] += 3

standings = sorted(points_table.items(), key=lambda x: x[1], reverse=True)

markdown_table = "| Rank | Team | TPR | Points |\n|------|------|-----|-----|\n"
for i, (team, points) in enumerate(standings, start=1):
    tpr = teams_in_league_ratings[teams_in_league_ratings['Club'] == team]['tpr'].values[0]
    markdown_table += f"| {i} | {team} | {tpr} | {points} |\n"
from IPython.display import Markdown, display
display(Markdown(markdown_table))

| Rank | Team | TPR | Points |
|------|------|-----|-----|
| 1 | Valencia | 74 | 84 |
| 2 | Real Madrid | 87 | 79 |
| 3 | FC Barcelona | 87 | 77 |
| 4 | Atlético Madrid | 79 | 65 |
| 5 | Real Sociedad | 74 | 63 |
| 6 | Almería | 67 | 54 |
| 7 | Celta de Vigo | 74 | 54 |
| 8 | Sevilla | 77 | 51 |
| 9 | Villarreal | 77 | 50 |
| 10 | Real Betis | 74 | 48 |
| 11 | Real Valladolid | 67 | 48 |
| 12 | Rayo Vallecano | 69 | 44 |
| 13 | Girona | 67 | 43 |
| 14 | Elche CF | 67 | 42 |
| 15 | Osasuna | 67 | 42 |
| 16 | Athletic Bilbao | 74 | 39 |
| 17 | Getafe | 72 | 39 |
| 18 | Mallorca | 67 | 33 |
| 19 | Cádiz | 67 | 32 |
| 20 | Espanyol | 64 | 32 |


In [46]:
players_ratings.query("Club != '__Manchester City'")[["Name","tpr","tpr_general","tpr_Best"]].sort_values("tpr", ascending=False).head(20)

Unnamed: 0,Name,tpr,tpr_general,tpr_Best
3,Manuel Neuer,92,64,75
4164,Robert Lewandowski,92,72,75
3193,Erling Haaland,90,72,74
3195,Harry Kane,90,73,74
4172,Marc-André ter Stegen,89,51,73
3197,Bernardo Silva,89,73,73
3192,Kevin De Bruyne,89,73,73
3194,Mohamed Salah,89,72,73
4165,Luka Modrić,88,72,72
3213,Bruno Fernandes,88,72,72


<br><br><br><br>
<h1 style="color:red;">  Passing (players)</h1>

In [16]:
def rename_duplicate_columns(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

In [17]:
pdf = players_ratings.sort_values(by="Pos_Rank").dropna(subset=["fbref_player_name"])

In [18]:
pdf.Best_Pos.value_counts()

MC     682
DC     642
GK     463
AML    450
ST     448
DR     352
AMR    325
DL     314
DM     234
AMC    118
ML      99
MR      91
WBR     22
WBL     17
Name: Best_Pos, dtype: int64

In [19]:
url = "https://fbref.com/en/comps/Big5/2022-2023/passing/players/2022-2023-Big-5-European-Leagues-Stats"
data = pd.read_html(url)
data[0].columns = data[0].columns.get_level_values(1)

In [20]:
passing_data = data[0]

In [21]:
mdf = pd.merge(pdf, passing_data, how="inner", left_on="fbref_player_name", right_on="Player")
mdf = mdf.drop_duplicates(subset=['Player'], keep='first')
mdf = rename_duplicate_columns(mdf)

In [22]:
mdf["Att"] = pd.to_numeric(mdf["Att"], errors='coerce')
mdf["90s"] = pd.to_numeric(mdf["90s"], errors='coerce')
mdf["Cmp"] = pd.to_numeric(mdf["Cmp"], errors='coerce')
mdf["Cmp%"] = pd.to_numeric(mdf["Cmp%"], errors='coerce')

In [23]:
mdf["Att90"] = mdf["Att"].apply(float) / mdf["90s"].apply(float)
mdf["Cmp90"] = mdf["Cmp"].apply(float) / mdf["90s"].apply(float)
mdf["Cmp%"] = mdf["Cmp%"].apply(float)

In [24]:
# mdf.loc[0].to_dict()

In [25]:
mdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2460 entries, 0 to 5085
Columns: 138 entries, UID to Cmp90
dtypes: float64(9), int64(76), object(53)
memory usage: 2.6+ MB


In [26]:
mdf[["Player","Pos_Rank","Best_Pos","tpr","Passing","Att90","Cmp90","Cmp%"]].head()

Unnamed: 0,Player,Pos_Rank,Best_Pos,tpr,Passing,Att90,Cmp90,Cmp%
0,Vito Mannone,0,GK,75,49,32.569832,24.804469,76.2
2,Léo Jardim,0,GK,71,49,25.666667,20.333333,79.2
3,Wojciech Szczęsny,0,GK,84,54,28.690909,22.727273,79.2
4,Benjamin Lecomte,0,GK,76,54,27.5,17.9,65.1
6,Tobias Mohr,0,GK,49,44,54.408602,33.11828,60.9


In [27]:
mdf.drop_duplicates(subset=['Pos_Rank'], keep='first')[["Pos_Rank","Best_Pos"]]

Unnamed: 0,Pos_Rank,Best_Pos
0,0,GK
1312,1,WBR
1342,2,DR
1874,3,DC
3203,4,DL
3515,5,WBL
3529,6,DM
3758,7,MC
4250,8,MR
4302,9,AMC


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming mdf is your DataFrame and has the necessary columns
# mdf = pd.read_csv('your_data.csv')  # Load your data here if it's from a CSV file

# Select the relevant columns
df = mdf[["Pos_Rank", "tpr", "pas", "Cmp%"]]

# Drop any rows with missing values
df = df.dropna()
print(df.info())

# Define the independent variables (predictors) and the dependent variable (response)
X = df[["Pos_Rank", "tpr", "pas"]]
y = df["Cmp%"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Print the coefficients
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Print performance metrics
print("Mean squared error (MSE):", mean_absolute_error(y_test, y_pred))
print("Mean squared error (MSE):", mean_squared_error(y_test, y_pred))
print("Coefficient of determination (R^2):", r2_score(y_test, y_pred))

KeyError: "['pas'] not in index"

In [None]:
def Predict_Pass_Completion_Rate(pos, tpr, pas):
    input_data = {'Pos_Rank': [pos], 'tpr': [tpr], 'pas': [pas]}
    input_df = pd.DataFrame(input_data)
    noise_range = (-5, 10)
    predicted_class = model.predict(input_df) + np.random.uniform(noise_range[0], noise_range[1], input_df.shape)
    return predicted_class

print("Predicted Class:", Predict_Pass_Completion_Rate(9, 100, 92))

In [None]:
def Player_Passing_Volume_Share(df):
    df["Att_share"] = df["Att"] / df["Att"].sum()
    df["Cmp_share"] = df["Cmp"] / df["Cmp"].sum()
    return df

mdf = mdf.groupby('Club').apply(Player_Passing_Volume_Share).reset_index(drop=True)

In [None]:
passing_logs = pd.read_csv("../match_logs/Big5@22-23@passing.csv")
passing_data = passing_data
passing_df = pd.merge(team_ratings, passing_data, how="inner", left_on="fbref_name", right_on="Squad")