In [311]:
import pandas as pd
import numpy as np
import os
import sys
import math
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from mplsoccer.pitch import Pitch
import matplotlib.patches as patches

In [312]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from pandas.errors import SettingWithCopyWarning
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))
warnings.simplefilter(action='ignore', category=(FutureWarning))
warnings.simplefilter(action='ignore', category=RuntimeWarning)

In [313]:
parent_dir = os.path.abspath(os.path.join(os.path.dirname("./"), '..'))
sys.path.append(parent_dir)
from tools.lineup_tpr import LineupTPR
from tools.pickler import Save_Model, Load_Model
from tools.draw_pitch_plot import Draw_Pitch_Actions
from tools.fm_attributes import Attributes
fm_attributes = Attributes.attribute_list
gk_attributes = Attributes.gk_attributes

In [314]:
players_df = pd.read_csv("../players_db/fm23/fm23db_processed.csv")
team_df = pd.read_csv("../players_db/fm23/team_ratings.csv")

<br><br><br><br><br><br><br><br>
<h2 style="color:beige;background:blue;">  Data Prep --- for Action Player Volume Regression</h2>

<br><br><br><br>
<h3 style="color:yellow;background:pink;">  Passing</h3>

In [397]:
# url = "https://fbref.com/en/comps/Big5/2022-2023/passing/players/2022-2023-Big-5-European-Leagues-Stats"
# dataPASSING = pd.read_html(url)
# # Att, Att, Att -> same column names occurs
# fb = dataPASSING[0]
# #fb.columns
# fb['90s'] = pd.to_numeric(fb[('Unnamed: 8_level_0','90s')], errors='coerce')
# fb['Att'] = pd.to_numeric(fb[('Total','Att')], errors='coerce')
# fb['Pos'] = fb[('Unnamed: 3_level_0','Pos')]
# fb['Squad'] = fb[('Unnamed: 4_level_0','Squad')]
# fb['Player'] = fb[('Unnamed: 1_level_0','Player')]
# fb['Pass/90'] = fb['Att'] / fb['90s']
# fb2 = fb[fb["90s"] > 15]
# fb2.columns = fb2.columns.get_level_values(0)
# fb2.sort_values(by="Pass/90", ascending=False).head(3)[["Pos","Squad","Player","Att","Pass/90"]]

In [398]:
fb2_raw = pd.read_csv("data/Big5@Players@Passing@2022_2023.csv")
fb2 = fb2_raw[fb2_raw["90s"] > 15]
fb_passing = fb2.sort_values(by="Pass/90", ascending=False)[["Pos","Squad","Player","Att","Pass/90"]]
#fb_passing["Pos"] = fb_passing["Pos"].apply(lambda row: row.split(",")[0])
pass_merged_df = pd.merge(players_df, fb_passing, how="inner", left_on="Name", right_on="Player", suffixes=("","_fb"))

In [399]:
passing_positions = pass_merged_df.sort_values(by="Pass/90", ascending=False).head(50)["Pos_Rank_Average"]
print(passing_positions.mean())
passing_ratings = pass_merged_df.sort_values(by="Pass/90", ascending=False).head(50)[fm_attributes].mean().sort_values(ascending=False)

4.24


<br><br><br><br>
<h3 style="color:beige;background:red;">  Shooting</h3>

In [400]:
# url = "https://fbref.com/en/comps/Big5/2022-2023/shooting/players/2022-2023-Big-5-European-Leagues-Stats"
# dataSHOOTING = pd.read_html(url)
# dataSHOOTING[0].columns = dataSHOOTING[0].columns.get_level_values(1)
# fb = dataSHOOTING[0]
# fb.columns
# fb['90s'] = pd.to_numeric(fb['90s'], errors='coerce')
# fb['Sh'] = pd.to_numeric(fb['Sh'], errors='coerce')
# fb['Sh/90'] = pd.to_numeric(fb['Sh/90'], errors='coerce')
# fb2 = fb[fb["90s"] > 15]
# fb2.sort_values(by="Sh/90", ascending=False).head(3)[["Pos","Squad","Player","Sh","Sh/90"]]

In [524]:
fb2_raw = pd.read_csv("data/Big5@Players@Shooting@2022_2023.csv")
fb2 = fb2_raw[fb2_raw["90s"] > 15]
fb_shooters = fb2.sort_values(by="Sh/90", ascending=False)[["Pos","Squad","Player","Sh","Sh/90"]]
fb_shooters["Pos"] = fb_shooters["Pos"].apply(lambda row: row.split(",")[0])
sh_merged_df_raw = pd.merge(players_df, fb_shooters, how="inner", left_on="Name", right_on="Player", suffixes=("","_fb"))
sh_merged_df = sh_merged_df_raw[sh_merged_df_raw["Best_Pos"] != "GK"]

In [525]:
shooters_positions = sh_merged_df.sort_values(by="Sh/90", ascending=False).head(50)["Pos_Rank_Average"]
print(shooters_positions.mean())
shooters_ratings = sh_merged_df.sort_values(by="Sh/90", ascending=False).head(50)[fm_attributes].mean().sort_values(ascending=False)

11.3


In [707]:
def Prepare_Action_Player_Volume_Regression_Data(players_df, csv_file, filter_column, filter_value, target_variable, target_volume=None, topcolsN=None):
    fb2_raw = pd.read_csv(csv_file)
    fb2 = fb2_raw[fb2_raw[filter_column] > filter_value]
    if target_volume:
        fb_actions = fb2.sort_values(by=target_variable, ascending=False)[["Pos","Squad","Player",target_volume,target_variable]]
    else:
        fb_actions = fb2.sort_values(by=target_variable, ascending=False)[["Pos","Squad","Player",target_volume,target_variable]]
    fb_actions["Pos"] = fb_actions["Pos"].apply(lambda row: row.split(",")[0])
    sh_merged_df_raw = pd.merge(players_df, fb_actions, how="inner", left_on="Name", right_on="Player", suffixes=("","_fb"))
    sh_merged_df = sh_merged_df_raw[sh_merged_df_raw["Best_Pos"] != "GK"]
    position_mean = sh_merged_df.sort_values(by=target_variable, ascending=False).head(50)["Pos_Rank_Average"].mean()
    shooters_ratings = sh_merged_df.sort_values(by=target_variable, ascending=False).head(50)[fm_attributes].mean().sort_values(ascending=False)[:topcolsN].index
    return {
        "zero": 0,
        "target": target_variable,
        "df": sh_merged_df,
        "position_mean": position_mean,
        "top_cols": shooters_ratings,
    }

Prepare_Action_Player_Volume_Regression_Data(
    players_df,
    "data/Big5@Players@Shooting@2022_2023.csv",
    "90s",
    15,
    "Sh/90",
    "Sh",
    12
)["zero"]

0

<br><br><br><br>
<h3 style="color:red;background:orange;">  Dribbling</h3>

In [708]:
# url = "https://fbref.com/en/comps/Big5/2022-2023/possession/players/2022-2023-Big-5-European-Leagues-Stats"
# dataDRIBBLING = pd.read_html(url)
# fb = dataDRIBBLING[0]
# #fb.columns
# fb['90s'] = pd.to_numeric(fb[('Unnamed: 8_level_0','90s')], errors='coerce')
# fb['Att'] = pd.to_numeric(fb[('Take-Ons','Att')], errors='coerce')
# fb['Pos'] = fb[('Unnamed: 3_level_0','Pos')]
# fb['Squad'] = fb[('Unnamed: 4_level_0','Squad')]
# fb['Player'] = fb[('Unnamed: 1_level_0','Player')]
# fb['Dribble/90'] = fb['Att'] / fb['90s']
# fb2 = fb[fb["90s"] > 15]
# fb2.columns = fb2.columns.get_level_values(0)
# fb2.sort_values(by="Dribble/90", ascending=False).head(3)[["Pos","Squad","Player","Att","Dribble/90"]]

In [709]:
fb2_raw = pd.read_csv("data/Big5@Players@Dribbling@2022_2023.csv")
fb2 = fb2_raw[fb2_raw["90s"] > 15]
fb_dribbling = fb2.sort_values(by="Dribble/90", ascending=False)[["Pos","Squad","Player","Att","Dribble/90"]]
#fb_dribbling["Pos"] = fb_dribbling["Pos"].apply(lambda row: row.split(",")[0])
dribble_merged_df = pd.merge(players_df, fb_dribbling, how="inner", left_on="Name", right_on="Player", suffixes=("","_fb"))

In [710]:
dribbling_positions = dribble_merged_df.sort_values(by="Dribble/90", ascending=False).head(50)["Pos_Rank_Average"]
print(dribbling_positions.mean())
dribbling_ratings= dribble_merged_df.sort_values(by="Dribble/90", ascending=False).head(50)[fm_attributes].mean().sort_values(ascending=False)

9.92


<br><br>
<h3 style="color:red;background:;"></h3>

In [711]:
# dataPASSING[0].to_csv("data/Big5@Players@Passing@2022_2023.csv",index=False)
# dataSHOOTING[0].to_csv("data/Big5@Players@Shooting@2022_2023.csv",index=False)
# dataDRIBBLING[0].to_csv("data/Big5@Players@Dribbling@2022_2023.csv",index=False)

<br><br><br><br><br><br><br><br>
<h2 style="color:aqua;background:darkblue;">  Action Player Volume Regression</h2>

In [712]:
# pass_merged_df, sh_merged_df, dribble_merged_df

In [572]:
# passing_top_cols, shooting_top_cols, dribbling_top_cols

In [573]:
top_N = 12

In [574]:
shooting_top_cols = shooters_ratings[:top_N].index
shooting_top_cols

Index(['Off_the_Ball', 'Technique', 'Flair', 'Pace', 'Acceleration',
       'First_Touch', 'Finishing', 'Agility', 'Determination', 'Dribbling',
       'Anticipation', 'Natural_Fitness'],
      dtype='object')

In [575]:
passing_top_cols = passing_ratings[:top_N].index
passing_top_cols

Index(['Determination', 'Stamina', 'Natural_Fitness', 'Anticipation',
       'Work_Rate', 'Teamwork', 'Bravery', 'Composure', 'Tackling', 'Balance',
       'Strength', 'Decisions'],
      dtype='object')

In [576]:
dribbling_top_cols = dribbling_ratings[:top_N].index
dribbling_top_cols

Index(['Dribbling', 'Acceleration', 'Flair', 'Pace', 'Technique', 'Agility',
       'First_Touch', 'Determination', 'Natural_Fitness', 'Off_the_Ball',
       'Passing', 'Vision'],
      dtype='object')

<br><br>

In [681]:
mae = 100

for topN in range(1, len(fm_attributes)):
    prep_by = Prepare_Action_Player_Volume_Regression_Data(
        players_df,
        "data/Big5@Players@Passing@2022_2023.csv",
        "90s",
        45,
        "Pass/90",
        "Att",
        topN
    )
    
    chosen_dict = {
        "df": prep_by["df"],
        "top_cols": prep_by["top_cols"],
        "target": prep_by["target"],
    }
    
    for num in range(1,100):
        test_size = round(num/100, 3)
        this_mae = Train_Model(current_df, chosen_dict["top_cols"], chosen_dict["target"], test_size)["mae"]
        if this_mae < mae:
            mae =this_mae
            print(test_size, topN, mae)

0.01 1 11.385355559497983
0.02 1 10.814237710333904
0.04 1 9.879300522431377
0.05 1 8.849401623376748
0.06 1 8.752424216157419
0.08 1 8.636103749233392
0.09 1 8.50324365969474
0.09 2 8.43513964788689
0.09 3 8.325249882697124
0.09 6 8.22663428568669


KeyboardInterrupt: 

In [691]:
prep_by = Prepare_Action_Player_Volume_Regression_Data(
    players_df,
    "data/Big5@Players@Passing@2022_2023.csv",
    "90s",
    15,
    "Pass/90",
    "Att",
    45
)

chosen_dict = {
    "df": prep_by["df"],
    "top_cols": prep_by["top_cols"],
    "target": prep_by["target"],
}

In [692]:
current_df_extra = pd.merge(team_df, players_df, how="inner", left_on="Club", right_on="Club", suffixes=("_club",""))

action_merged_df = chosen_dict["df"].copy()
current_df = pd.merge(team_df, action_merged_df, how="inner", left_on="fbref_name", right_on="Squad", suffixes=("_club",""))

current_cols = np.concatenate((chosen_dict["top_cols"], ["Pos_Rank_Average"], ["tpr","tpr_club"]))
current_target = chosen_dict["target"]

In [693]:
def Train_Model(df, top_cols, target, test_size):
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression 
    from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

    current_cols = np.concatenate((top_cols, ["Pos_Rank_Average"], ["tpr","tpr_club"]))
    X = df[current_cols]
    y = df[target].astype(float)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred_raw = model.predict(X_test)
    y_pred = [max(0, pred) for pred in y_pred_raw]
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return {
        "model": model,
        "mse": mse,
        "mae": mae,
        "r2": r2
    }

In [694]:
model = Train_Model(current_df, chosen_dict["top_cols"], chosen_dict["target"], 0.09)["model"]

In [695]:
# for num in range(1,100):
#     test_size = round(num/100, 3)
#     #print(test_size, Train_Model(current_df, chosen_dict["top_cols"], chosen_dict["target"], test_size)["mae"])

In [696]:
def DictToPredictionDict(d: dict) -> pd.DataFrame:
    rdict = {}
    for k,v in d.items():
        rdict[k] = [v]
    return pd.DataFrame(rdict)

In [697]:
def PlayerPredict(player_name, print_stats=False):
    if print_stats:
        selected_df = current_df 
    else:
        selected_df = current_df_extra
    d0 = selected_df.query(f"Name == '{player_name}'").iloc[0]
    d1 = d0[current_cols].to_dict()
    d2 = DictToPredictionDict(d1)
    
    print(f"Name:\t\t{d0['Name']}")
    print(f"Pos_Rank_Avg:\t{d0['Pos_Rank_Average']}")
    if print_stats:
        print(f"Real:\t\t{d0[current_target]}")
    prediction = max(0, model.predict(d2)[0])
    print(f"Prediction:\t[{prediction}]")
    return prediction

In [698]:
def Model_Predict(row, cols):
    d1 = row[cols].to_dict()
    d2 = DictToPredictionDict(d1)
    prediction = max(0, model.predict(d2)[0])
    return float(round(prediction, 2))

In [699]:
PlayerPredict("Kevin De Bruyne", True)

Name:		Kevin De Bruyne
Pos_Rank_Avg:	8
Real:		60.446096654275095
Prediction:	[67.68254238274645]


67.68254238274645

In [700]:
PlayerPredict("Harry Kane", True)

Name:		Harry Kane
Pos_Rank_Avg:	10
Real:		24.365079365079367
Prediction:	[47.554945148687565]


47.554945148687565

In [701]:
from tools.pickler import Save_Model, Load_Model
#Save_Model("_", "models/XXXXXXXXXX_per_90_model", model)

<br><br>

In [702]:
predicted_df = current_df.copy()
predicted_df["Predicted"] = predicted_df.apply(lambda row: Model_Predict(row, model.feature_names_in_), axis=1)
predicted_df["Error"] = predicted_df["Predicted"] - predicted_df[ chosen_dict["target"] ]
predicted_df[["Predicted",chosen_dict["target"],"Error"]].mean()

Predicted    46.385986
Pass/90      46.476848
Error        -0.090862
dtype: float64

In [703]:
predicted_team_name = "Real Madrid"
team_using_df = predicted_df.query(f"Club == '{predicted_team_name}'") \
            [["Name","Best_Pos","Pos_Rank_Average","Predicted",chosen_dict["target"],"Error"]] \
            .sort_values("Predicted", ascending=False)

<br><br>

In [704]:
only_fbref_df = current_df.copy()
only_fbref_df["Predicted"] = only_fbref_df.apply(lambda row: Model_Predict(row, model.feature_names_in_), axis=1)

all_players_df = current_df_extra.copy()
all_players_df["Predicted"] = all_players_df.apply(lambda row: Model_Predict(row, model.feature_names_in_), axis=1)

tm_name = "Real Madrid"
tm_name = "Manchester City"

try:
    tm = only_fbref_df.copy()
    tm["Error"] = tm["Predicted"] - tm[ chosen_dict["target"] ]
    tm2 = LineupTPR(tm, tm_name)[["Name","Best_Pos","Pos_Rank_Average","Predicted",chosen_dict["target"],"Error"]]
except:   
    tm = all_players_df.copy()
    tm2 = LineupTPR(tm, tm_name)[["Club","Name","Best_Pos","Playing_Position","Pos_Rank_Average","Predicted"]]

print( tm2["Predicted"].sum() )

639.1899999999999


In [705]:
all_players_df[["UID","Club","Name","Best_Pos","Pos_Rank_Average","Predicted"]].sort_values("Predicted", ascending=False).head(5)

Unnamed: 0,UID,Club,Name,Best_Pos,Pos_Rank_Average,Predicted
3,8718372,FC Bayern,Manuel Neuer,GK,0,98.95
45,35017428,FC Barcelona,Marc-André ter Stegen,GK,0,96.69
1831,63002306,Manchester United,Martin Dúbravka,GK,0,95.79
112,67006173,Liverpool,Adrián,GK,0,94.25
196,64016316,Atlético Madrid,Jan Oblak,GK,0,93.7


In [706]:
y_df = pd.merge(players_df, all_players_df[["UID","Predicted"]], how="inner", on="UID")

<br><br><br><br><br><br><br><br>
<h2 style="color:pink;background:purple;">  Data Prep --- for Action Player Accuracy Regressionn</h2>

<br>
<h3 style="color:black;background:white;">  Passing</h3>

In [468]:
fb2_raw = pd.read_csv("data/Big5@Players@Passing@2022_2023.csv")
fb2_raw["Att"] = pd.to_numeric(fb2_raw["Att"], errors="coerce")
filter_value = fb2_raw["Att"].max() / 10
print(filter_value)

322.3


In [469]:
fb2 = fb2_raw[fb2_raw["Att"] > filter_value]
fb2["Pass_Accuracy"] = pd.to_numeric(fb2["Total.2"])
fb_passing_acc = fb2.sort_values(by="Pass_Accuracy", ascending=False)[["Pos","Squad","Player","Pass_Accuracy"]]
#fb_passing_acc["Pos"] = fb_passing_acc["Pos"].apply(lambda row: row.split(",")[0])
pass_merged_df = pd.merge(players_df, fb_passing_acc, how="inner", left_on="Name", right_on="Player", suffixes=("","_fb"))

In [470]:
passing_positions = pass_merged_df.sort_values(by="Pass_Accuracy", ascending=False).head(50)["Pos_Rank_Average"]
print(passing_positions.mean())
passing_ratings = pass_merged_df.sort_values(by="Pass_Accuracy", ascending=False).head(50)[fm_attributes].mean().sort_values(ascending=False)

4.54


<br>
<h3 style="color:black;background:white;">  Shooting</h3>

In [471]:
fb2_raw = pd.read_csv("data/Big5@Players@Shooting@2022_2023.csv")
fb2_raw = fb2_raw[fb2_raw["Pos"] != "GK"]
fb2_raw["Sh"] = pd.to_numeric(fb2_raw["Sh"], errors="coerce")
filter_value = fb2_raw["Sh"].max() / 4
print(filter_value)

36.0


In [472]:
fb2 = fb2_raw[fb2_raw["Sh"] > filter_value]
fb2["Shot_Accuracy"] = pd.to_numeric(fb2["SoT%"])
fb_shooting_acc = fb2.sort_values(by="Shot_Accuracy", ascending=False)[["Pos","Squad","Player","Shot_Accuracy"]]
#fb_shooting_acc["Pos"] = fb_shooting_acc["Pos"].apply(lambda row: row.split(",")[0])
sh_merged_df = pd.merge(players_df, fb_shooting_acc, how="inner", left_on="Name", right_on="Player", suffixes=("","_fb"))

In [473]:
shooters_positions = sh_merged_df.sort_values(by="Shot_Accuracy", ascending=False).head(50)["Pos_Rank_Average"]
print(shooters_positions.mean())
shooters_ratings = sh_merged_df.sort_values(by="Shot_Accuracy", ascending=False).head(50)[fm_attributes].mean().sort_values(ascending=False)

10.62


<br>
<h3 style="color:black;background:white;">  Dribbling</h3>

In [474]:
fb2_raw = pd.read_csv("data/Big5@Players@Dribbling@2022_2023.csv")
fb2_raw = fb2_raw[fb2_raw["Pos"] != "GK"]
fb2_raw["Take-Ons"] = pd.to_numeric(fb2_raw["Take-Ons"], errors="coerce")
filter_value = fb2_raw["Take-Ons"].max() / 3
print(filter_value)

102.0


In [475]:
fb2 = fb2_raw[fb2_raw["Take-Ons"] > filter_value].dropna()
fb2["Dribble_Accuracy"] = pd.to_numeric(fb2["Take-Ons.2"])
fb_dribbling_acc = fb2.sort_values(by="Dribble_Accuracy", ascending=False)[["Pos","Squad","Player","Dribble_Accuracy"]]
#fb_dribbling_acc["Pos"] = fb_dribbling_acc["Pos"].apply(lambda row: row.split(",")[0])
dribble_merged_df = pd.merge(players_df, fb_dribbling_acc, how="inner", left_on="Name", right_on="Player", suffixes=("","_fb"))

In [476]:
dribbling_positions = dribble_merged_df.sort_values(by="Dribble_Accuracy", ascending=False).head(50)["Pos_Rank_Average"]
print(dribbling_positions.mean())
dribbling_ratings= dribble_merged_df.sort_values(by="Dribble_Accuracy", ascending=False).head(50)[fm_attributes].mean().sort_values(ascending=False)

9.12


<br><br><br><br><br><br><br><br>
<h2 style="color:purple;background:pink;">Player Accuracy Regression</h2>

In [477]:
# pass_merged_df, sh_merged_df, dribble_merged_df

In [494]:
# passing_top_cols, shooting_top_cols, dribbling_top_cols

In [495]:
top_N = 12

In [496]:
shooting_top_cols = shooters_ratings[:top_N].index
shooting_top_cols

Index(['Pace', 'Acceleration', 'Technique', 'Off_the_Ball', 'Flair', 'Agility',
       'Natural_Fitness', 'Dribbling', 'First_Touch', 'Determination',
       'Work_Rate', 'Finishing'],
      dtype='object')

In [497]:
passing_top_cols = passing_ratings[:top_N].index
passing_top_cols

Index(['Determination', 'Stamina', 'Natural_Fitness', 'Anticipation',
       'Work_Rate', 'Teamwork', 'Bravery', 'Composure', 'Tackling', 'Balance',
       'Strength', 'Decisions'],
      dtype='object')

In [498]:
dribbling_top_cols = dribbling_ratings[:top_N].index
dribbling_top_cols

Index(['Dribbling', 'Technique', 'Acceleration', 'Pace', 'Flair', 'Agility',
       'Determination', 'First_Touch', 'Natural_Fitness', 'Work_Rate',
       'Stamina', 'Balance'],
      dtype='object')

<br><br>

In [596]:
chosen_dict = {
    "df": sh_merged_df,
    "top_cols": shooting_top_cols,
    "target": 'Shot_Accuracy',
}
chosen_dict = {
    "df": pass_merged_df,
    "top_cols": passing_top_cols,
    "target": 'Pass_Accuracy',
}

In [500]:
current_df_extra = pd.merge(team_df, players_df, how="inner", left_on="Club", right_on="Club", suffixes=("_club",""))

In [505]:
action_merged_df = chosen_dict["df"].copy()
current_df = pd.merge(team_df, action_merged_df, how="inner", left_on="fbref_name", right_on="Squad", suffixes=("_club",""))
current_cols = np.concatenate((chosen_dict["top_cols"], ["Pos_Rank_Average"], ["tpr","tpr_club"]))
current_target = chosen_dict["target"]

<br><br>

In [502]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#df = current_df.query("Best_Pos != 'GK'") # SHOOTING
#df = current_df # PASSING
#df = current_df.query("Best_Pos != 'GK'") # DRIBBLING
df = current_df
X = df[current_cols]
y = df[current_target].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.07, random_state=0)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred_raw = model.predict(X_test)
y_pred = [max(0, pred) for pred in y_pred_raw]

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)

Mean Absolute Error: 5.158896300956627


In [503]:
model.feature_names_in_

array(['Determination', 'Stamina', 'Natural_Fitness', 'Anticipation',
       'Work_Rate', 'Teamwork', 'Bravery', 'Composure', 'Tackling',
       'Balance', 'Strength', 'Decisions', 'Pos_Rank_Average', 'tpr'],
      dtype=object)

In [504]:
def DictToPredictionDict(d: dict) -> pd.DataFrame:
    rdict = {}
    for k,v in d.items():
        rdict[k] = [v]
    return pd.DataFrame(rdict)

In [489]:
def PlayerPredict(player_name, print_stats=False):
    if print_stats:
        selected_df = current_df 
    else:
        selected_df = current_df_extra
    d0 = selected_df.query(f"Name == '{player_name}'").iloc[0]
    d1 = d0[current_cols].to_dict()
    d2 = DictToPredictionDict(d1)
    
    print(f"Name:\t\t{d0['Name']}")
    print(f"Pos_Rank_Avg:\t{d0['Pos_Rank_Average']}")
    if print_stats:
        print(f"Real:\t\t{d0[current_target]}")
    prediction = max(0, model.predict(d2)[0])
    print(f"Prediction:\t[{prediction}]")
    #return prediction

In [490]:
def Model_Predict(row, cols):
    d1 = row[cols].to_dict()
    d2 = DictToPredictionDict(d1)
    prediction = max(0, model.predict(d2)[0])
    return float(round(prediction, 2))

In [491]:
from tools.pickler import Save_Model, Load_Model
#Save_Model("_", "models/xxxxxxxxxxxxxxxx_accuracy_model", model)

<br><br>

In [492]:
predicted_df = current_df.copy()
predicted_df["Predicted"] = predicted_df.apply(lambda row: Model_Predict(row, model.feature_names_in_), axis=1)
predicted_df["Error"] = predicted_df["Predicted"] - predicted_df[ chosen_dict["target"] ]
predicted_df[["Predicted",chosen_dict["target"],"Error"]].mean()

Predicted        78.001787
Pass_Accuracy    77.995926
Error             0.005861
dtype: float64

In [493]:
predicted_team_name = "Real Madrid"
pred_df = predicted_df \
            [["Name","Best_Pos","Pos_Rank_Average","Predicted",chosen_dict["target"],"Error"]] \
            .sort_values("Predicted", ascending=False)
pred_df.head()

Unnamed: 0,Name,Best_Pos,Pos_Rank_Average,Predicted,Pass_Accuracy,Error
40,Virgil van Dijk,DC,3,93.7,90.5,3.2
12,Benjamin Pavard,DR,2,92.65,88.3,4.35
207,Thiago Silva,DC,3,91.69,90.9,0.79
36,Andreas Christensen,DM,5,91.45,94.0,-2.55
93,Mats Hummels,DC,3,91.42,87.5,3.92


<br><br>

In [454]:
only_fbref_df = current_df.copy()
only_fbref_df["Predicted"] = only_fbref_df.apply(lambda row: Model_Predict(row, model.feature_names_in_), axis=1)

all_players_df = current_df_extra.copy()
all_players_df["Predicted"] = all_players_df.apply(lambda row: Model_Predict(row, model.feature_names_in_), axis=1)

tm_name = "Real Madrid"
tm_name = "Manchester City"

try:
    tm = only_fbref_df.copy()
    tm["Error"] = tm["Predicted"] - tm[ chosen_dict["target"] ]
    tm2 = LineupTPR(tm, tm_name)[["Name","Best_Pos","Pos_Rank_Average","Predicted",chosen_dict["target"],"Error"]]
except:   
    tm = all_players_df.copy()
    tm2 = LineupTPR(tm, tm_name)[["Club","Name","Best_Pos","Playing_Position","Pos_Rank_Average","Predicted"]]

print( tm2["Predicted"].sum() )
tm2
all_players_df[["Club","Name","Best_Pos","Pos_Rank_Average","Predicted"]].sort_values("Predicted", ascending=False).head(5)

259.33000000000004


Unnamed: 0,Club,Name,Best_Pos,Pos_Rank_Average,Predicted
3926,Sassuolo,Giovanni Perini,AMR,11,45.91
6140,Vizela,José Rui,AML,13,45.57
4016,Tottenham Hotspur,Heung-Min Son,AML,11,45.56
1843,Manchester United,Alejandro Garnacho,AML,12,45.14
129,Liverpool,Oakley Cannonier,ST,12,44.63


In [455]:
all_players_df[["Club","Name","Best_Pos","Pos_Rank_Average","Predicted"]].sort_values("Predicted", ascending=False).head(5)

Unnamed: 0,Club,Name,Best_Pos,Pos_Rank_Average,Predicted
3926,Sassuolo,Giovanni Perini,AMR,11,45.91
6140,Vizela,José Rui,AML,13,45.57
4016,Tottenham Hotspur,Heung-Min Son,AML,11,45.56
1843,Manchester United,Alejandro Garnacho,AML,12,45.14
129,Liverpool,Oakley Cannonier,ST,12,44.63


In [456]:
predicted_team_name = "Real Madrid"
team_using_df = predicted_df.query(f"Club == '{predicted_team_name}'") \
            [["Name","Best_Pos","Pos_Rank_Average","Predicted",chosen_dict["target"],"Error"]] \
            .sort_values("Predicted", ascending=False)
LineupTPR(players_df, predicted_team_name)[["Name","Best_Pos","Playing_Position","tpr"]]#.sort_values("Playing_Position", ascending=True)

Unnamed: 0,Name,Best_Pos,Playing_Position,tpr
4168,Thibaut Courtois,GK,GK,84
4211,Dani Carvajal,DR,DR,83
4219,Éder Militão,DC,DC,84
4217,Antonio Rüdiger,DC,DC,83
4188,David Alaba,DL,DL,85
4165,Luka Modrić,MC,MC,88
4176,Toni Kroos,MC,MC,83
4183,Federico Valverde,MC,AMR,84
4218,Marco Asensio,AMR,AMC,82
4166,Karim Benzema,ST,ST,87


<br><br><br><br><br><br><br><br>
<h2 style="color:white;background:orange;">  Predict</h2>

In [377]:
location_df = pd.read_csv("data/location_df.csv")
volume_formula_df = pd.read_csv("formulas/Attacking_Touches_volume_formula_df.csv") # Attacking_Touches

<br>

In [378]:
fm_managerial_tactics = {
    'Standard': {"Pass": 1.0, "Shot": 1.0, "Dribble": 1.0},
    'Gegen - Direct': {"Pass": 1.2, "Shot": 1.0, "Dribble": 0.8},
    'Gegen - High Tempo': {"Pass": 1.1, "Shot": 1.1, "Dribble": 0.9},
    'Gegen - Slow Passing': {"Pass": 0.9, "Shot": 1.0, "Dribble": 1.0},
    'Gegen - Wing Play': {"Pass": 1.3, "Shot": 1.0, "Dribble": 0.7},
    
    'Tiki Taka - Direct': {"Pass": 1.5, "Shot": 0.8, "Dribble": 0.8},
    'Tiki Taka - High Tempo': {"Pass": 1.4, "Shot": 0.9, "Dribble": 0.8},
    'Tiki Taka - Slow Passing': {"Pass": 1.6, "Shot": 0.7, "Dribble": 0.9},
    'Tiki Taka - Wing Play': {"Pass": 1.7, "Shot": 0.6, "Dribble": 0.8},
    
    'Control - Direct': {"Pass": 1.1, "Shot": 1.0, "Dribble": 1.0},
    'Control - High Tempo': {"Pass": 1.0, "Shot": 1.1, "Dribble": 1.0},
    'Control - Slow Passing': {"Pass": 1.0, "Shot": 0.9, "Dribble": 1.1},
    'Control - Wing Play': {"Pass": 1.2, "Shot": 1.0, "Dribble": 0.9},
    
    'Counter - Direct': {"Pass": 0.8, "Shot": 1.2, "Dribble": 1.0},
    'Counter - High Tempo': {"Pass": 0.9, "Shot": 1.1, "Dribble": 1.1},
    'Counter - Slow Passing': {"Pass": 0.7, "Shot": 1.0, "Dribble": 1.2},
    'Counter - Wing Play': {"Pass": 0.8, "Shot": 1.3, "Dribble": 1.0},
    
    'Total Football - Direct': {"Pass": 1.2, "Shot": 1.2, "Dribble": 1.0},
    'Total Football - High Tempo': {"Pass": 1.1, "Shot": 1.3, "Dribble": 1.0},
    'Total Football - Slow Passing': {"Pass": 1.3, "Shot": 1.1, "Dribble": 0.9},
    'Total Football - Wing Play': {"Pass": 1.4, "Shot": 1.0, "Dribble": 0.8},
}

selected_tactics = [
    fm_managerial_tactics['Standard'],
    fm_managerial_tactics['Standard'],
    fm_managerial_tactics['Standard'],
    fm_managerial_tactics['Standard'],
    fm_managerial_tactics['Standard'],
    fm_managerial_tactics['Standard'],
    fm_managerial_tactics['Counter - High Tempo'],
    fm_managerial_tactics['Counter - Wing Play'],
]

In [379]:
def create_triple_prob_dict(row, tactical_coef={"Pass": 0, "Shot": 0, "Dribble": 0}):
    prob_dict = {
        "Pass": row["pass_prob"]+row["pass_prob"]*tactical_coef["Pass"],
        "Shot": row["shot_prob"]+row["shot_prob"]*tactical_coef["Shot"],
        "Dribble": row["dribble_prob"]+row["dribble_prob"]*tactical_coef["Dribble"]
    }
    sum_prob_dict = sum(prob_dict.values())
    weighted_prob_dict = {key: prob_dict[key]/sum_prob_dict for key in prob_dict.keys()}
    return weighted_prob_dict

In [380]:
def Action_Volume_Predict_Randomnessed(attribute, formula_df, tpr_x, tpr_y):
    input_df = pd.Series({"tpr_x": tpr_x, "tpr_y": tpr_y})
    custom_cols = list(input_df.index)
    coefficients = sum(formula_df.loc[:, f"{attribute}_coef"] * input_df[custom_cols].values)
    result = (coefficients + formula_df.loc[:, f"{attribute}_intercept"])
    randomnessed_result = result.iloc[0] - result.iloc[0] * (np.random.randint(-10, 10)/100)
    return max(0, int(randomnessed_result / 90))

In [381]:
def Action_Volume_Predict(attribute, formula_df, tpr_x, tpr_y):
    input_df = pd.Series({"tpr_x": tpr_x, "tpr_y": tpr_y})
    custom_cols = list(input_df.index)
    coefficients = sum(formula_df.loc[:, f"{attribute}_coef"] * input_df[custom_cols].values)
    result = (coefficients + formula_df.loc[:, f"{attribute}_intercept"])
    return max(0, int(result.iloc[0] / 90))

In [382]:
Action_Volume_Predict("Attacking_Touches", volume_formula_df, 92, 77)

7

In [383]:
def Generate_Random_Match_Stats_Per_Minute(location_df, formula_df, tpr_x, tpr_y, match_number=1):
    all_matches= []
    shot_map = []
    pass_map = []
    dribble_map = []
    location_df["triple_prob_dict"] = location_df.apply(lambda row: create_triple_prob_dict(row, selected_tactics[2]), axis=1)
    
    for process in range(match_number):
        Action_Counter = {"Pass": 0, "Shot": 0, "Dribble": 0}
        Touches = Action_Volume_Predict_Randomnessed("Attacking_Touches", formula_df, tpr_x, tpr_y)
        for minute in range(90):
            for moment in range(Touches):
                pitch_number = np.random.choice(location_df["Pitch_Number"], p=location_df["pitch_prob"])
                triple_probs = list(location_df[location_df["Pitch_Number"] == pitch_number]["triple_prob_dict"].iloc[0].values())
                triple_actions = list(location_df[location_df["Pitch_Number"] == pitch_number]["triple_prob_dict"].iloc[0].keys())
                action = np.random.choice(triple_actions, p=triple_probs)
                
                if action=="Pass":
                    pass_map.append(pitch_number)
                elif action=="Shot":
                    shot_map.append(pitch_number)
                elif action=="Dribble":
                    dribble_map.append(pitch_number)
                Action_Counter[action]+=1
            
        all_matches.append(Action_Counter)

        global progress
        progress = int((process + 1) / match_number * 100)
                      
    #print("!__completed__!")
    return {
        "match_data": all_matches,
        "pass_map": pass_map, 
        "shot_map": shot_map, 
        "dribble_map": dribble_map, 
    }

In [384]:
touches_df_raw_1 = pd.read_csv("../match_logs/Big5@22-23@attacking_touches.csv")
touches_df_raw_2 = pd.merge(touches_df_raw_1, team_df, left_on="Club", right_on="Club", how="inner") 
touches_df = pd.merge(touches_df_raw_2, team_df, left_on="Opponent", right_on="Club", how="inner")  

In [385]:
t = touches_df[["Club_x","tpr_x","tpr_y","Club_y","Date"]].iloc[25]
tpr_a = LineupTPR(players_df, t.Club_x).query("Best_Pos != 'GK'")["tpr"].mean()
tpr_b = LineupTPR(players_df, t.Club_y).query("Best_Pos != 'GK'")["tpr"].mean()
t

Club_x        Everton
tpr_x              72
tpr_y              64
Club_y    Bournemouth
Date       2023-05-28
Name: 25, dtype: object

In [386]:
st1 = Generate_Random_Match_Stats_Per_Minute(location_df, volume_formula_df, tpr_a, tpr_b)["match_data"]
st2 = Generate_Random_Match_Stats_Per_Minute(location_df, volume_formula_df, tpr_b, tpr_a)["match_data"]
print(tpr_a, t.Club_x, st1)
print(tpr_b, t.Club_y, st2)

77.3 Everton [{'Pass': 513, 'Shot': 13, 'Dribble': 14}]
75.2 Bournemouth [{'Pass': 428, 'Shot': 11, 'Dribble': 11}]


<br><br><br><br><br><br><br><br>
<h2 style="color:white;background:red;">  yyyydffff</h2>

In [387]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Create DataFrame
df = merged_df.copy()

# Normalize the data
# scaler = MinMaxScaler()
# df[['Att', 'Dribbling', 'Pace', "tpr"]] = scaler.fit_transform(df[['Att', 'Dribbling', 'Pace', "tpr"]])

# Separate features and target variable
X = df[['Dribbling', 'Pace', "tpr"]]
y = df['Att']

# Create and train the model
model = LinearRegression()
model.fit(X, y)

# Get the coefficients
coefficients = model.coef_
intercept = model.intercept_

print("Coefficients:", coefficients)
print("Intercept:", intercept)

NameError: name 'merged_df' is not defined

In [None]:
def f(dribbling, pace, tpr):
    return intercept + coefficients[0] * dribbling + coefficients[1] * pace + coefficients[2] * tpr

f(56, 72, 72)

In [None]:
# sort_dribble = df.sort_values("Att", ascending=False)
# sort_dribble[list(set(fm_attributes) - set(gk_attributes))].head(15).mean().sort_values(ascending=False)

In [None]:
pass_df = pd.read_csv("../match_logs/Big5@22-23@passing.csv")[["Att","Club"]].groupby(by="Club").mean().reset_index().sort_values("Att", ascending=False)
shot_df = pd.read_csv("../match_logs/Big5@22-23@shooting.csv")[["Sh","Club"]].groupby(by="Club").mean().reset_index().sort_values("Sh", ascending=False)
dribble_df = pd.read_csv("../match_logs/Big5@22-23@possession.csv")[["Att","Club"]].groupby(by="Club").mean().reset_index().sort_values("Att", ascending=False)
merged_df = pd.merge(
    pass_df.merge(shot_df, on="Club", suffixes=("", "_shot")),
    dribble_df, on="Club", suffixes=("_pass", "_dribble")
)
merged_df.rename(columns={"Sh":"Att_shot"}, inplace=True)
action_columns = merged_df.select_dtypes(exclude="object").columns

In [None]:
merged_df["Total_Actions"] = merged_df.apply(lambda row: row[action_columns].sum(), axis=1)
merged_df["Triple_Prob_Dict"] = merged_df.apply(lambda row: {
    "Pass": (row["Att_pass"]/row["Total_Actions"]),
    "Shot": row["Att_shot"]/row["Total_Actions"],
    "Dribble": row["Att_dribble"]/row["Total_Actions"],
}, axis=1)

In [None]:
exrow = merged_df.sort_values("Att_dribble",ascending=False).iloc[72].Triple_Prob_Dict
avg = location_df[["pass_prob","shot_prob","dribble_prob"]].mean()

In [None]:
d = {
    "p": exrow["Pass"] /  avg["pass_prob"],# * f["pass_prob"],
    "s": exrow["Shot"] / avg["shot_prob"],#  * f["shot_prob"],
    "d": exrow["Dribble"] / avg["dribble_prob"],#  * f["shot_prob"],
}
d