In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings

In [38]:
import warnings
from pandas.errors import SettingWithCopyWarning
# Silence pandas' chained-assignment warning and all FutureWarnings for the
# rest of the session. The registration order (SettingWithCopyWarning first,
# FutureWarning second) is kept as-is.
for silenced_category in (SettingWithCopyWarning, FutureWarning):
    warnings.simplefilter(action='ignore', category=silenced_category)

In [39]:
# Rating tables used throughout the notebook.
# - players_ratings: later queried by "Club" and read for "Name", "tpr",
#   "Pos_Rank", "Best_Pos" and "fbref_player_name".
# - team_ratings: later filtered by "League_id" and read for "Club" and "tpr".
players_ratings = pd.read_csv("../players_db/fm23/fm23db_processed.csv")
team_ratings = pd.read_csv("../players_db/fm23/team_ratings.csv")

<br><br><br><br>
<h2 style="color:blue;">  Win Probability</h2>

In [160]:
# Coefficient table consumed by Win_Probability. The first CSV column becomes
# the index, which Win_Probability looks up by the row labels "tpr_x", "tpr_y"
# and "intercept"; columns are looked up by the string labels "1", "0" and "2".
win_df = pd.read_csv("../players_db/fm23/win_probability.csv", index_col=0)
def Win_Probability(win_prob_df, power_x, power_y, is_neutral=False):
    """Return match-outcome probabilities from a multinomial-logit coefficient table.

    For each outcome key ``k`` in ``(1, 0, 2)`` a linear score
    ``tpr_x[k] * power_x + tpr_y[k] * power_y + intercept[k]`` is read from
    ``win_prob_df`` and the three scores are converted to probabilities with a
    softmax. Key ``0`` is the draw; the league simulation in this notebook
    awards the win to side ``x`` on ``1`` and to side ``y`` on ``2``.

    Parameters
    ----------
    win_prob_df : pandas.DataFrame
        Coefficients with row labels ``"tpr_x"``, ``"tpr_y"``, ``"intercept"``
        and string column labels ``"1"``, ``"0"``, ``"2"``.
    power_x, power_y : number
        Ratings of side ``x`` and side ``y``.
    is_neutral : bool, default False
        When True *and* ``power_x == power_y``, the non-draw probability mass is
        split evenly between outcomes 1 and 2. It has no effect when the two
        ratings differ.

    Returns
    -------
    dict
        ``{1: p1, 0: p0, 2: p2}`` (in that key order) with values summing to 1.
    """
    outcomes = (1, 0, 2)
    scores = np.array(
        [
            win_prob_df.loc["tpr_y", str(outcome)] * power_y
            + win_prob_df.loc["tpr_x", str(outcome)] * power_x
            + win_prob_df.loc["intercept", str(outcome)]
            for outcome in outcomes
        ],
        dtype=float,
    )

    # Subtracting the largest score before exponentiating leaves the softmax
    # unchanged mathematically but prevents exp() from overflowing to inf
    # (which would turn the probabilities into NaN).
    weights = np.exp(scores - scores.max())
    probabilities = weights / weights.sum()
    normalized_probs = dict(zip(outcomes, probabilities))

    if is_neutral and power_x == power_y:
        draw = normalized_probs[0]
        non_draw_side = (1 - draw) / 2
        return {1: non_draw_side, 0: draw, 2: non_draw_side}
    return normalized_probs

In [161]:
Win_Probability(win_df, 50, 60)

{1: 0.15141943498481023, 0: 0.43814483031639917, 2: 0.41043573469879063}

In [162]:
Win_Probability(win_df, 60, 70)

{1: 0.1876793963427024, 0: 0.3740351552084532, 2: 0.43828544844884443}

In [163]:
Win_Probability(win_df, 70, 80)

{1: 0.22807162854246782, 0: 0.3130594683962014, 2: 0.4588689030613309}

<br><br>

In [164]:
Win_Probability(win_df, 75, 75)

{1: 0.36105836634747795, 0: 0.33042638291014814, 2: 0.30851525074237396}

In [165]:
Win_Probability(win_df, 80, 80)

{1: 0.39175485727530857, 0: 0.29753740482911134, 2: 0.31070773789558015}

In [166]:
Win_Probability(win_df, 90, 90)

{1: 0.4532254040572556, 0: 0.23708341827188148, 2: 0.30969117767086296}

In [167]:
Win_Probability(win_df, 92, 79)

{1: 0.6229503239181537, 0: 0.2275294948764984, 2: 0.14952018120534785}

In [168]:
Win_Probability(win_df, 85, 84) == Win_Probability(win_df, 84, 85) 

False

<br><br><br><br>
<h2 style="color:blue;">  League Simulation Test</h2>

In [169]:
team_ratings.League_id.unique()

array([ 185, 1215,  354,  773,  710, 1331, 1017,  363], dtype=int64)

In [183]:
from itertools import product

from IPython.display import Markdown, display

# Fixed seed so that Restart & Run All reproduces the same simulated table.
SIMULATION_SEED = 42
rng = np.random.default_rng(SIMULATION_SEED)

current_league_id = 1215
teams_in_league_ratings = team_ratings.query(f"League_id == {current_league_id}")

# Club -> tpr lookup built once instead of filtering the frame for every match.
# drop_duplicates keeps the first row per club, matching the `.values[0]`
# lookups this replaces.
tpr_by_club = (
    teams_in_league_ratings.drop_duplicates(subset="Club")
    .set_index("Club")["tpr"]
    .to_dict()
)

points_table = {team: 0 for team in teams_in_league_ratings['Club']}

# Double round-robin: every ordered (home, away) pair of distinct clubs.
matchups = [
    (home_team, away_team)
    for home_team, away_team in product(teams_in_league_ratings['Club'], repeat=2)
    if home_team != away_team
]

for home_team, away_team in matchups:
    home_tpr = tpr_by_club[home_team]
    away_tpr = tpr_by_club[away_team]

    # NOTE(review): is_neutral=True only changes the result when both clubs
    # have the same tpr (the win/loss mass is then split evenly) — confirm that
    # removing the home/away asymmetry for equally rated clubs is intended.
    probs_home = Win_Probability(win_df, home_tpr, away_tpr, is_neutral=True)
    result_home = rng.choice(list(probs_home.keys()), p=list(probs_home.values()))

    # 3 points for a win, 1 point each for a draw.
    if result_home == 1:
        points_table[home_team] += 3
    elif result_home == 0:
        points_table[home_team] += 1
        points_table[away_team] += 1
    else:
        points_table[away_team] += 3

# Rank by points only; ties keep the insertion order of points_table.
standings = sorted(points_table.items(), key=lambda x: x[1], reverse=True)

markdown_table = "| Rank | Team | Points | TPR |\n|------|------|--------|-----|\n"
for i, (team, points) in enumerate(standings, start=1):
    tpr = tpr_by_club[team]
    markdown_table += f"| {i} | {team} | {points} | {tpr} |\n"
display(Markdown(markdown_table))

| Rank | Team | Points | TPR |
|------|------|--------|-----|
| 1 | Real Madrid | 76 | 87 |
| 2 | FC Barcelona | 72 | 87 |
| 3 | Celta de Vigo | 72 | 74 |
| 4 | Villarreal | 69 | 77 |
| 5 | Sevilla | 61 | 77 |
| 6 | Atlético Madrid | 58 | 79 |
| 7 | Valencia | 58 | 74 |
| 8 | Osasuna | 58 | 67 |
| 9 | Getafe | 53 | 72 |
| 10 | Athletic Bilbao | 49 | 74 |
| 11 | Real Sociedad | 47 | 74 |
| 12 | Rayo Vallecano | 44 | 69 |
| 13 | Real Betis | 42 | 74 |
| 14 | Almería | 41 | 67 |
| 15 | Real Valladolid | 41 | 67 |
| 16 | Cádiz | 41 | 67 |
| 17 | Mallorca | 37 | 67 |
| 18 | Elche CF | 37 | 67 |
| 19 | Girona | 35 | 67 |
| 20 | Espanyol | 27 | 64 |


In [184]:
# Sanity check: Manchester City players with their tpr, highest first.
players_ratings.query("Club == 'Manchester City'")[["Name","tpr"]].sort_values("tpr", ascending=False)

Unnamed: 0,Name,tpr
3193,Erling Haaland,90
3192,Kevin De Bruyne,89
3197,Bernardo Silva,89
3199,Ederson,88
3196,Rúben Dias,85
3202,Aymeric Laporte,85
3211,Phil Foden,83
3222,John Stones,83
3221,Jack Grealish,83
3205,İlkay Gündogan,83


<br><br><br><br>
<h1 style="color:red;">  Passing (players)</h1>

In [51]:
def rename_duplicate_columns(df):
    """Make repeated column labels unique by suffixing later occurrences.

    The first occurrence of a label is kept unchanged; the n-th repeat
    (n = 1, 2, ...) becomes ``"<label>_<n>"``. The frame's columns are
    replaced in place and the same frame object is returned.
    """
    occurrences = {}
    unique_labels = []
    for label in df.columns:
        seen_so_far = occurrences.get(label, 0)
        if seen_so_far == 0:
            unique_labels.append(label)
        else:
            unique_labels.append(label + '_' + str(seen_so_far))
        occurrences[label] = seen_so_far + 1
    # Carry the columns' name over, as assigning a Series built from the
    # original Index would.
    df.columns = pd.Index(unique_labels, name=df.columns.name)
    return df

In [52]:
# Players ordered by Pos_Rank, restricted to rows that have an fbref name to
# join on. The row order matters: the later merge/drop_duplicates keeps the
# first row per player.
pdf = players_ratings.sort_values(by="Pos_Rank").dropna(subset=["fbref_player_name"])

In [53]:
pdf.Best_Pos.value_counts()

Best_Pos
MC     682
DC     642
GK     463
AML    450
ST     448
DR     352
AMR    325
DL     314
DM     234
AMC    118
ML      99
MR      91
WBR     22
WBL     17
Name: count, dtype: int64

In [54]:
# Scrape every HTML table on the fbref Big-5 2022-2023 player passing page
# (network access required on every run).
url = "https://fbref.com/en/comps/Big5/2022-2023/passing/players/2022-2023-Big-5-European-Leagues-Stats"
data = pd.read_html(url)
# Flatten the first table's multi-level header to its second level (level 1).
# NOTE(review): this presumably leaves repeated labels such as "Att"/"Cmp" —
# they are made unique later by rename_duplicate_columns; confirm.
data[0].columns = data[0].columns.get_level_values(1)

In [55]:
# Alias (not a copy) of the first scraped table, whose header was flattened above.
passing_data = data[0]

In [56]:
# Join the FM23 players to their fbref passing rows, keep only the first row
# per fbref player, then make any repeated column labels unique.
mdf = (
    pd.merge(pdf, passing_data, how="inner", left_on="fbref_player_name", right_on="Player")
    .drop_duplicates(subset=['Player'], keep='first')
    .pipe(rename_duplicate_columns)
)

In [57]:
# Coerce the passing columns to numeric dtype; cells that cannot be parsed
# become NaN.
for numeric_column in ("Att", "90s", "Cmp", "Cmp%"):
    mdf[numeric_column] = pd.to_numeric(mdf[numeric_column], errors='coerce')

In [58]:
# Per-90 passing volumes (attempted and completed passes divided by the number
# of 90s played), plus a float cast of the completion percentage.
# NOTE(review): rows where "90s" is 0 yield inf/NaN here — confirm whether they
# should be filtered out beforehand.
mdf["Att90"] = mdf["Att"].astype(float).div(mdf["90s"].astype(float))
mdf["Cmp90"] = mdf["Cmp"].astype(float).div(mdf["90s"].astype(float))
mdf["Cmp%"] = mdf["Cmp%"].astype(float)

In [59]:
# mdf.loc[0].to_dict()

In [60]:
mdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2467 entries, 0 to 5109
Columns: 137 entries, UID to Cmp90
dtypes: float64(9), int64(75), object(53)
memory usage: 2.6+ MB


In [61]:
# NOTE(review): the saved output of this cell is
# KeyError: "['pas'] not in index" — mdf has no column literally named "pas".
# Confirm the real name of the passing-attribute column before relying on
# this cell or on the regression cells that also select "pas".
mdf[["Player","Pos_Rank","Best_Pos","tpr","pas","Att90","Cmp90","Cmp%"]].head()

KeyError: "['pas'] not in index"

In [None]:
mdf.drop_duplicates(subset=['Pos_Rank'], keep='first')[["Pos_Rank","Best_Pos"]]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Linear regression of pass completion percentage ("Cmp%") on position rank,
# tpr and the passing attribute.
# NOTE(review): an earlier cell's saved output reports that "pas" is not a
# column of mdf — confirm the column name, otherwise the selection below
# raises KeyError.

# Select the relevant columns
df = mdf[["Pos_Rank", "tpr", "pas", "Cmp%"]]

# Drop any rows with missing values
df = df.dropna()
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print().
df.info()

# Define the independent variables (predictors) and the dependent variable (response)
X = df[["Pos_Rank", "tpr", "pas"]]
y = df["Cmp%"]

# Split the data into training and testing sets (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Print the coefficients (same order as the columns of X)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Print performance metrics on the held-out test split
print("Mean absolute error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean squared error (MSE):", mean_squared_error(y_test, y_pred))
print("Coefficient of determination (R^2):", r2_score(y_test, y_pred))

In [None]:
def Predict_Pass_Completion_Rate(pos, tpr, pas):
    """Predict a pass completion percentage and add uniform random noise.

    Uses the module-level ``model`` (the fitted regression) on a single-row
    frame with the columns ``Pos_Rank``, ``tpr`` and ``pas``.

    Parameters
    ----------
    pos : number
        Value for the ``Pos_Rank`` feature.
    tpr : number
        Value for the ``tpr`` feature.
    pas : number
        Value for the ``pas`` feature.

    Returns
    -------
    numpy.ndarray
        One noisy prediction per input row, i.e. shape ``(1,)``.
    """
    input_data = {'Pos_Rank': [pos], 'tpr': [tpr], 'pas': [pas]}
    input_df = pd.DataFrame(input_data)
    # NOTE(review): the range is asymmetric, so the noise adds +2.5 on average —
    # confirm this upward shift is intended.
    noise_range = (-5, 10)
    prediction = np.asarray(model.predict(input_df))
    # Draw one noise value per prediction. Sizing the noise by input_df.shape
    # (rows x features) would broadcast the single prediction into three
    # differently-noised values.
    noise = np.random.uniform(noise_range[0], noise_range[1], size=prediction.shape)
    predicted_class = prediction + noise
    return predicted_class

print("Predicted Class:", Predict_Pass_Completion_Rate(9, 100, 92))

In [None]:
def Player_Passing_Volume_Share(df):
    """Add each row's share of the frame's total attempted and completed passes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``"Att"`` and ``"Cmp"`` columns (one club's players
        when used with ``groupby('Club').apply``).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``"Att_share"`` = ``Att / sum(Att)`` and
        ``"Cmp_share"`` = ``Cmp / sum(Cmp)`` added. The input frame is left
        untouched, because functions passed to ``groupby.apply`` must not
        mutate the group they receive.
    """
    return df.assign(
        Att_share=df["Att"] / df["Att"].sum(),
        Cmp_share=df["Cmp"] / df["Cmp"].sum(),
    )

# Compute the passing shares club by club. reset_index(drop=True) discards the
# index produced by the groupby-apply, so mdf ends up with a fresh 0..n-1 index.
mdf = mdf.groupby('Club').apply(Player_Passing_Volume_Share).reset_index(drop=True)

In [None]:
passing_logs = pd.read_csv("../match_logs/Big5@22-23@passing.csv")
# NOTE(review): `passing_logs` is loaded but not used in the code visible here.
# The merge below still joins the player-level `passing_data` table scraped
# from fbref. If the match logs were meant to be merged instead, swap the
# right-hand frame — confirm against the CSV's columns.
# (A no-op `passing_data = passing_data` self-assignment was removed.)
passing_df = pd.merge(team_ratings, passing_data, how="inner", left_on="fbref_name", right_on="Squad")