In [442]:
import pandas as pd
import numpy as np
import os
import math
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [443]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from pandas.errors import SettingWithCopyWarning
# Silence warning categories that would otherwise clutter the cell outputs.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
# NOTE(review): hiding SettingWithCopyWarning can mask writes that land on a
# temporary copy — confirm every flagged assignment is intentional.
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))
warnings.simplefilter(action='ignore', category=(FutureWarning))

In [444]:
# Load the match-log tables, one CSV per stat category.
# Prepare_Logs looks these frames up by variable name through globals(),
# so the variable names are part of the notebook's contract.
defense = pd.read_csv("../match_logs/Big5@22-23@defense.csv")
gca = pd.read_csv("../match_logs/Big5@22-23@gca.csv")
keeper = pd.read_csv("../match_logs/Big5@22-23@keeper.csv")
misc = pd.read_csv("../match_logs/Big5@22-23@misc.csv")
passing = pd.read_csv("../match_logs/Big5@22-23@passing.csv")
passing_types = pd.read_csv("../match_logs/Big5@22-23@passing_types.csv")
possession = pd.read_csv("../match_logs/Big5@22-23@possession.csv")
shooting = pd.read_csv("../match_logs/Big5@22-23@shooting.csv")

In [445]:
# Preview: the five rows with the highest "tpr" in the processed player database.
pd.read_csv("../players_db/fm23/fm23db_processed.csv").sort_values(by="tpr",ascending=False)[["Club","Name","Best_Pos","tpr"]].head(5)

Unnamed: 0,Club,Name,Best_Pos,tpr
3,FC Bayern,Manuel Neuer,GK,92
4164,FC Barcelona,Robert Lewandowski,ST,92
3193,Manchester City,Erling Haaland,ST,90
3195,Tottenham Hotspur,Harry Kane,ST,90
4172,FC Barcelona,Marc-André ter Stegen,GK,89


In [446]:
# Per-club rating table; its column list drives the feature names built in the next cell.
team_ratings = pd.read_csv("../players_db/fm23/team_ratings.csv")
team_ratings_cols = team_ratings.columns

In [447]:
# Columns of team_ratings that identify a club instead of rating it.
except_cols = ["Club","Based","Division","Club_id","League_id","fbref_name"]

# Every remaining column, in its original order.
team_rating_cols_only_numeric = [
    col for col in team_ratings_cols if col not in except_cols
]

# Each rating column under both merge suffixes ("_x" then "_y"), matching the
# column names produced when team_ratings is merged in twice.
tpr_cols = [
    f"{col}_{xy}"
    for col in team_rating_cols_only_numeric
    for xy in ["x","y"]
]

In [448]:
# Number of suffixed rating columns (two per non-excluded team_ratings column).
len(tpr_cols)

76

In [449]:
# Preview: the two clubs with the highest "tpr".
team_ratings.sort_values("tpr", ascending=False).head(n=2)

Unnamed: 0,Club,GK,Based,Division,Club_id,League_id,tpr,Anticipation,Stamina,Balance,...,Corners,Leadership,Teamwork,Strength,Determination,Work_Rate,Long_Shots,Passing,First_Touch,fbref_name
0,FC Bayern,92,Germany (Bundesliga),Bundesliga,3704,185,92,85,87,77,...,71,87,86,82,88,85,84,76,73,Bayern Munich
2,Liverpool,86,England (Premier Division),English Premier Division,6518,354,92,83,85,87,...,78,90,88,81,86,85,66,85,88,Liverpool


<br><br><br><br><br><br><br><br>
<h2 style="color:orange;background:blue;">@.  Logs to Regression</h2>

In [450]:
# Start from a copy of the possession log and add a combined action count per row.
touches_df = possession.copy()
# NOTE(review): the three Series are added by index label, so this assumes
# possession, passing and shooting list the same matches in the same row
# order — TODO confirm against the CSVs.
touches_df["Attacking_Touches"] = possession.Att + passing.Att + shooting.Sh
# Gap between the logged "Touches" and the combined count above.
touches_df["Error"] = touches_df["Touches"]-touches_df["Attacking_Touches"]
# numeric_only=True keeps non-numeric columns such as "Club" out of the means.
touches_df[["Club","Touches","Attacking_Touches","Error"]].mean(numeric_only=True)

Touches              596.650246
Attacking_Touches    518.024357
Error                 78.625889
dtype: float64

In [451]:
# Maps the variable name of each log DataFrame to the column of that frame
# used as the regression target.
ATTRIBUTES_DICT = {
    "passing":"Att",
    "shooting":"Sh",
    "gca":"SCA",
    "possession":"Att",
    "touches_df":"Attacking_Touches"
}

In [452]:
def Prepare_Logs(action_title, action_attribute, team_ratings):
    """Join one match-log frame with both teams' ratings and select regression rows.

    Parameters
    ----------
    action_title : str
        Name of a module-level DataFrame holding the match logs; it is looked
        up through ``globals()``.
    action_attribute : str
        Target column name. Accepted for call compatibility; not used here.
    team_ratings : pandas.DataFrame
        Ratings table. The code reads its ``Club``, ``fbref_name`` and ``tpr``
        columns.

    Returns
    -------
    dict
        ``merged_df_1`` -- logs joined to the ratings of the row's own club.
        ``merged_df_2`` -- ``merged_df_1`` joined to the opponent's ratings;
        overlapping columns get ``_x`` (own club) and ``_y`` (opponent).
        ``merged_df`` -- ``merged_df_2`` plus the derived title/difference
        columns, restricted to rows with ``goal_diff > 0 and tpr_diff > 0``.
        ``reg_df`` -- identifying columns plus every non-object column of
        ``merged_df``, with rows containing NaN dropped.

    Raises
    ------
    KeyError
        If there is no module-level variable named ``action_title``.
    """
    if action_title not in globals():
        raise KeyError(f"No module-level DataFrame named {action_title!r}")

    # Work on a copy so the caller's log frame is left untouched.
    logs = globals()[action_title].copy()
    # Give "Unnamed: 0" string content so it has object dtype and is left out
    # of the non-object column selection below.
    logs["Unnamed: 0"] = "itsanobject"

    # Own-club ratings first, then the opponent's ratings matched by fbref name.
    merged_df_1 = pd.merge(logs, team_ratings, on="Club", how="inner")
    merged_df_2 = pd.merge(merged_df_1, team_ratings, left_on="Opponent", right_on="fbref_name", how="inner")

    merged_df = merged_df_2.copy()
    merged_df["Versus_Title"] = merged_df.Club_x + " plays vs " + merged_df.Club_y + " as " + merged_df.Venue + " team"
    merged_df["tpr_diff"] = merged_df.tpr_x - merged_df.tpr_y
    merged_df["tpr_diff_abs"] = abs(merged_df.tpr_x - merged_df.tpr_y)
    merged_df["goal_diff"] = merged_df.GF - merged_df.GA
    merged_df["goal_diff_abs"] = abs(merged_df.GF - merged_df.GA)

    # Keep only rows where the higher-rated side also won.
    # NOTE(review): this restricts the regression sample to one kind of
    # match — confirm that is the intended training population.
    merged_df = merged_df.query("goal_diff > 0 and tpr_diff > 0")

    non_object_cols = list(merged_df.select_dtypes(exclude="object").columns)
    reg_df = merged_df[['Club_x','Club_y',"Versus_Title","Date"] + non_object_cols]
    return {
        "reg_df": reg_df.dropna(),
        "merged_df": merged_df,
        "merged_df_1": merged_df_1,
        "merged_df_2": merged_df_2
    }

In [453]:
# Preview: first two regression-ready rows for the shooting logs.
Prepare_Logs("shooting", "Sh", team_ratings)["reg_df"].head(n=2)

Unnamed: 0,Club_x,Club_y,Versus_Title,Date,GF,GA,Gls,Sh,SoT,SoT%,...,Strength_y,Determination_y,Work_Rate_y,Long_Shots_y,Passing_y,First_Touch_y,tpr_diff,tpr_diff_abs,goal_diff,goal_diff_abs
0,Manchester City,West Ham United,Manchester City plays vs West Ham United as Aw...,2022-08-07,2.0,0.0,2.0,13.0,1.0,7.7,...,84,73,78,73,67,75,12,12,2.0,2.0
1,Manchester City,West Ham United,Manchester City plays vs West Ham United as Ho...,2023-05-03,3.0,0.0,3.0,16.0,7.0,43.8,...,84,73,78,73,67,75,12,12,3.0,3.0


<br><br><br><br>
<h2 style="color:white;background:darkred;">  Regression</h2>

In [454]:
def Logs_to_Regression(action_title, action_attribute, team_ratings, formula_df, tpr_cols):
    """Fit a linear regression of one log column on the rating columns.

    The feature columns are taken from the rows of ``formula_df`` whose index
    label starts with ``"<action_title>_"``; the remainder of each label is
    the column name in the prepared frame.

    Parameters
    ----------
    action_title : str
        Name of the log DataFrame (passed on to ``Prepare_Logs``) and prefix
        of the matching ``formula_df`` index labels.
    action_attribute : str
        Target column of the prepared frame.
    team_ratings : pandas.DataFrame
        Ratings table passed on to ``Prepare_Logs``.
    formula_df : pandas.DataFrame
        Receives the fitted values. It is modified in place: the ``coef`` and
        ``intercept`` columns are written for the matching rows.
    tpr_cols : list
        Accepted for call compatibility; not used here.

    Returns
    -------
    pandas.DataFrame
        The same ``formula_df`` object that was passed in.

    Raises
    ------
    ValueError
        If no ``formula_df`` index label starts with ``"<action_title>_"``.
    """
    attribute = action_attribute
    # Prepare_Logs already drops rows containing NaN from "reg_df".
    reg_df = Prepare_Logs(action_title, action_attribute, team_ratings)["reg_df"]

    # Match on the literal prefix: a substring/regex match could also pick up
    # rows whose label merely contains the title somewhere else.
    prefix = f"{action_title}_"
    formula_indexes = [label for label in formula_df.index if label.startswith(prefix)]
    # Strip only the leading prefix so a column name that happens to contain
    # the same text is kept intact.
    formula_cols = [label[len(prefix):] for label in formula_indexes]
    if not formula_cols:
        raise ValueError(f"formula_df has no rows starting with {prefix!r}")

    X = reg_df[formula_cols]
    Y = reg_df[[attribute]]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    model = LinearRegression()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # One coefficient per feature row; the single intercept is repeated on each row.
    formula_df.loc[formula_indexes, "coef"] = model.coef_.flatten()
    formula_df.loc[formula_indexes, "intercept" ] = model.intercept_[0]

    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print(f'{action_title}.{attribute} --- Mean Squared Error: {mse}')
    print(f'{action_title}.{attribute} --- R^2 Score: {r2}')
    # formula_df.to_excel(f"{action_title.capitalize()}_Coefficients.xlsx")
    return formula_df

In [455]:
# One formula row per (log frame name, suffixed rating column) pair,
# labelled "<frame name>_<rating column>".
attribute_cols = ATTRIBUTES_DICT.keys()
# tpr_cols = ["tpr_x","tpr_y"]
formula_indexes = [
    f"{attribute_title}_{tpr_col}"
    for attribute_title in attribute_cols
    for tpr_col in tpr_cols
]
formula_df = pd.DataFrame(index=formula_indexes)

In [456]:
# Fit one regression per log frame. Logs_to_Regression writes into formula_df
# and returns that same object, so regression_formula_df ends up referring to
# formula_df with all frames filled in.
for title, attribute in ATTRIBUTES_DICT.items():
    regression_formula_df = Logs_to_Regression(title, attribute, team_ratings, formula_df, tpr_cols)

passing.Att --- Mean Squared Error: 9690.45553006582
passing.Att --- R^2 Score: 0.36273520076722643
shooting.Sh --- Mean Squared Error: 28.08377928064441
shooting.Sh --- R^2 Score: -0.09237218833347693
gca.SCA --- Mean Squared Error: 65.30240604341219
gca.SCA --- R^2 Score: 0.01912008651578523
possession.Att --- Mean Squared Error: 34.731064224177565
possession.Att --- R^2 Score: 0.046157997556217745
touches_df.Attacking_Touches --- Mean Squared Error: 10107.03515870864
touches_df.Attacking_Touches --- R^2 Score: 0.3707738647596569


In [457]:
# Fitted coefficient/intercept rows for the two overall-rating features of touches_df.
formula_df[formula_df.index.str.contains("touches_df_tpr", case=True)]

Unnamed: 0,coef,intercept
touches_df_tpr_x,7.535324,387.620021
touches_df_tpr_y,-5.13362,387.620021


<br><br><br><br><br><br><br><br>
<h2 style="color:#33FF49;background:blue;">  Regression for One Attribute</h2>

In [458]:
# Fix the frame/target pair used by the single-attribute walkthrough below.
action_title ="touches_df"
attribute = "Attacking_Touches"
prepared_df = Prepare_Logs(action_title, attribute, team_ratings)
reg_df = prepared_df["reg_df"]

In [459]:
# Only the keys are read in the cells shown here (as column names); the None
# values act as placeholders.
tpr_x_dict = {"tpr_x": None} 
tpr_y_dict = {"tpr_y": None}

In [460]:
# q1 = reg_df.query("Versus_Title == 'Manchester City plays vs Liverpool as Home team'")
# q2 = reg_df.query("Versus_Title == 'Liverpool plays vs Manchester City as Home team'")
# queries = ["", q1, q2]
# query_no = 1
# print(queries[query_no].iloc[0].Date)
# queries[query_no].iloc[0].name
# queries[query_no].iloc[0].name

In [461]:
# https://fbref.com/en/matches/5965a1a5/Bayern-Munich-Bochum-February-11-2023-Bundesliga
# row_index= 3221 # for Bayern
# row_index2222 = 3758 # for Bochum

# NOTE(review): these two positions are not read anywhere in the cells shown here.
row_index333 = 722 # for Liverpool
row_index222 = 343 # for Manchester City

# Positional (iloc) row of reg_df used as the worked example below.
row_index = 25

x_cols = tpr_x_dict.keys()
y_cols = tpr_y_dict.keys()

# Rating values of the chosen row, keyed by column name.
tpr_x_custom_data = reg_df[x_cols].iloc[row_index].to_dict()
tpr_y_custom_data = reg_df[y_cols].iloc[row_index].to_dict()

# Combined feature dict; custom_cols (its keys) is reused later as the feature list.
custom_data = {**tpr_x_custom_data, **tpr_y_custom_data}
custom_cols = custom_data.keys()

In [462]:
# Show which match the chosen row is, followed by its ratings and observed target.
match_info = reg_df.iloc[row_index].Versus_Title + "\n" + reg_df.iloc[row_index].Date + "\n" 
print(match_info)
reg_df[["Club_x","Club_y","tpr_x","tpr_y","Date","Attacking_Touches"]].iloc[row_index]

Brighton & Hove Albion plays vs Bournemouth as Away team
2023-04-04



Club_x               Brighton & Hove Albion
Club_y                          Bournemouth
tpr_x                                    72
tpr_y                                    64
Date                             2023-04-04
Attacking_Touches                     699.0
Name: 71, dtype: object

In [463]:
# Fit a regression of `attribute` on just the columns in custom_cols and store
# the result in special_formula_df (one row per feature).
special_formula_df = pd.DataFrame(index=custom_cols)

df_to_custom = reg_df.copy()
X = df_to_custom[custom_cols]
Y = df_to_custom[[attribute]]

# Note: a 25% hold-out here, while Logs_to_Regression uses 20%.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

coef_column_name = f"{attribute}_coef"
intercept_column_name = f"{attribute}_intercept"
special_formula_df[coef_column_name] = model.coef_.flatten()
# The scalar intercept is broadcast to every feature row.
special_formula_df[intercept_column_name] = model.intercept_[0]

# NOTE(review): mse and r2 are computed but not displayed in this cell.
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

In [464]:
# For comparison: the overall-rating rows from the all-features regression.
formula_df[formula_df.index.str.contains("touches_df_tpr", case=True)]

Unnamed: 0,coef,intercept
touches_df_tpr_x,7.535324,387.620021
touches_df_tpr_y,-5.13362,387.620021


In [465]:
# Coefficients and intercept from the regression restricted to custom_cols.
special_formula_df

Unnamed: 0,Attacking_Touches_coef,Attacking_Touches_intercept
tpr_x,11.001498,227.953607
tpr_y,-7.252917,227.953607


In [466]:
# One row per feature with its value for the chosen match in column "tpr_value".
custom_tpr_df = pd.DataFrame(custom_data, index=["tpr_value"]).T

In [467]:
# Evaluate the fitted line by hand for the chosen match:
# sum(coef * value) + intercept, stored on every row for display.
custom_tpr_df[f"{attribute}_coef"] = special_formula_df[f"{attribute}_coef"][custom_cols].tolist()
custom_tpr_df[f"{attribute}_intercept"] = special_formula_df[f"{attribute}_intercept"][custom_cols].tolist()
custom_tpr_df["coef_result"] = custom_tpr_df[f"{attribute}_coef"] * custom_tpr_df["tpr_value"]

custom_tpr_df["formula_result"] = custom_tpr_df["coef_result"].sum() + custom_tpr_df[f"{attribute}_intercept"].iloc[0]
# NOTE(review): the column already holds one repeated scalar, so this
# reassignment does not change it.
custom_tpr_df["formula_result"] = custom_tpr_df["formula_result"].iloc[0]
# Predicted match total divided by 90.
custom_tpr_df["formula_result_per_minute"] = custom_tpr_df["formula_result"].iloc[0] / 90

In [468]:
# Rounded prediction for the chosen match.
formula_result = round(custom_tpr_df["formula_result"].iloc[0])
# NOTE(review): the value of this expression is discarded (it is neither
# assigned nor the last expression of the cell).
round(custom_tpr_df["formula_result_per_minute"].iloc[0])

formula_result

556

In [469]:
# NOTE(review): despite the name this is the predicted Attacking_Touches total
# divided by 90 (about 6 for the 556 shown above), i.e. a per-minute count and
# not a probability in [0, 1].
prob_of_shot_in_a_minute = custom_tpr_df["formula_result_per_minute"].iloc[0]

def ProbAction(prob_of_shot_in_a_minute):
    """Draw one Bernoulli outcome from NumPy's global random state.

    Parameters
    ----------
    prob_of_shot_in_a_minute : float
        Probability of returning 1; must lie in [0, 1].

    Returns
    -------
    int-like
        1 with the given probability, otherwise 0 (as returned by
        ``np.random.choice``).

    Raises
    ------
    ValueError
        If the argument is not a probability (outside [0, 1] or NaN). A
        per-minute count such as touches/90 is rejected here with a clear
        message rather than deep inside NumPy.
    """
    # The chained comparison is False for NaN as well, so NaN is rejected too.
    if not 0 <= prob_of_shot_in_a_minute <= 1:
        raise ValueError(
            f"prob_of_shot_in_a_minute must be between 0 and 1, got {prob_of_shot_in_a_minute!r}"
        )
    r = np.random.choice([1, 0], p=[prob_of_shot_in_a_minute, 1-prob_of_shot_in_a_minute])
    return r

In [470]:
# counter = 0

# for minute in range(1, 91):
#     result = ProbAction(prob_of_shot_in_a_minute)
#     counter+=result

<br><br><br><br><br><br><br><br>
<h2 style="color:blue;background:aqua;">  Model Test</h2>

In [471]:
def Action_Volume_Predict(attribute, formula_df, tpr_x, tpr_y):
    """Evaluate a fitted two-feature linear formula for one pair of ratings.

    Parameters
    ----------
    attribute : str
        Target name; the function reads the ``"<attribute>_coef"`` and
        ``"<attribute>_intercept"`` columns of ``formula_df``.
    formula_df : pandas.DataFrame
        Indexed by feature name; must contain rows ``"tpr_x"`` and ``"tpr_y"``.
    tpr_x, tpr_y : number
        Values of the two features.

    Returns
    -------
    int
        ``coef[tpr_x] * tpr_x + coef[tpr_y] * tpr_y + intercept``, truncated
        towards zero by ``int()``.

    Raises
    ------
    KeyError
        If a required column or feature row is missing from ``formula_df``.
    """
    coef = formula_df[f"{attribute}_coef"]
    # The intercept is stored repeated on every row; the first entry is used.
    intercept = formula_df[f"{attribute}_intercept"].iloc[0]
    # Look the coefficients up by label so the function does not depend on a
    # notebook-level list of feature names being defined.
    weighted = coef.loc["tpr_x"] * tpr_x + coef.loc["tpr_y"] * tpr_y
    return int(weighted + intercept)

In [472]:
# Example rating pair. NOTE(review): not read by any cell shown here; the next
# call passes the same two numbers directly.
tpr_test_data = pd.Series({"tpr_x": 69, "tpr_y": 67})

In [473]:
# Point prediction for ratings 69 vs 67 using the two-feature formula.
Action_Volume_Predict("Attacking_Touches", special_formula_df, 69, 67)

501

In [474]:
# Slim copy of reg_df for comparing predictions with observed values.
error_df = reg_df.copy()[["Club_x","Club_y","tpr_x","tpr_y","tpr_diff_abs","Date","Attacking_Touches"]]
# Display the row with the smallest absolute rating gap.
error_df.sort_values(by="tpr_diff_abs", ascending=True).head().iloc[0]

Club_x                FSV Mainz
Club_y                  FC Köln
tpr_x                        69
tpr_y                        67
tpr_diff_abs                  2
Date                 2022-10-21
Attacking_Touches         536.0
Name: 3718, dtype: object

In [475]:
# Observed value divided by 90.
error_df["T_90"] = error_df["Attacking_Touches"] / 90
# Row-by-row prediction from the two-feature formula.
error_df["Predicted"] = error_df.apply(lambda row: Action_Volume_Predict(attribute, special_formula_df, row["tpr_x"], row["tpr_y"]), axis=1)
error_df["Predicted_90"] = error_df["Predicted"] / 90
# Residuals (observed minus predicted), in total and divided by 90.
# NOTE(review): error_df is built from reg_df, the same frame the formula was
# fitted on, so these are not purely out-of-sample errors.
error_df["Error"] = error_df["Attacking_Touches"] - error_df["Predicted"]
error_df["Error_min"] = error_df["T_90"] - error_df["Predicted_90"]

In [476]:
# Mean observed value, mean prediction and mean residual.
error_df[["Attacking_Touches","Predicted","Error"]].mean()

Attacking_Touches    578.238329
Predicted            578.786241
Error                 -0.547912
dtype: float64

<br><br><br><br><br><br><br><br>
<h2 style="color:#FFFBDA;background:#E4003A;">  Randomness (ltr)</h2>

In [477]:
# Exploratory check of how far observed Attacking_Touches sit from their mean
# as the minimum absolute rating gap n grows.
# NOTE(review): the column assignments on the dropna() result are the kind of
# write SettingWithCopyWarning reports; that warning is silenced near the top
# of this notebook.
mancity_raw = prepared_df["merged_df_2"].dropna()
mancity_raw["tpr_diff"] = mancity_raw.tpr_x - mancity_raw.tpr_y
mancity_raw["tpr_diff_abs"] = abs(mancity_raw.tpr_x - mancity_raw.tpr_y)
# NOTE(review): this value is replaced on the first loop iteration below.
mancity = mancity_raw.query("tpr_diff_abs > 5")[["Club_x","Club_y","tpr_x","tpr_y","Attacking_Touches"]]

for n in range(30):
    mancity = mancity_raw.query(f"tpr_diff_abs > {n}")[["Club_x","Club_y","tpr_x","tpr_y","Attacking_Touches"]]

    data = mancity.Attacking_Touches
    mancity_att = pd.Series(data)
    mean_val = mancity_att.mean()
    # Signed distance of each match from the mean, as a percentage of the mean.
    distances = mancity_att - mean_val
    percentage_distances = (distances / mean_val) * 100
    # Reseeds NumPy's global generator on every iteration: each n draws from
    # the same uniform stream, and cells run afterwards continue from this
    # seeded state.
    np.random.seed(42)
    # Noise of up to +/-10% of each row's percentage distance.
    randomness = np.random.uniform(-0.1, 0.1, size=percentage_distances.shape) * percentage_distances
    adjusted_values = mancity_att + randomness

    df_comparison = pd.DataFrame({
        'Original': mancity_att,
        'Percentages_Distance': percentage_distances,
        'Randomness': randomness,
        'Adjusted': adjusted_values
    })

    # Only consumed by the commented-out print; after the loop the variables
    # hold the values for n = 29.
    median = df_comparison.Percentages_Distance.median()
    # print(n, median)

In [478]:
def Action_Volume_Predict_Randomnessed(attribute, formula_df, tpr_x, tpr_y):
    """Evaluate the two-feature linear formula and perturb it by a random whole percent.

    Parameters
    ----------
    attribute : str
        Target name; the function reads the ``"<attribute>_coef"`` and
        ``"<attribute>_intercept"`` columns of ``formula_df``.
    formula_df : pandas.DataFrame
        Indexed by feature name; must contain rows ``"tpr_x"`` and ``"tpr_y"``.
    tpr_x, tpr_y : number
        Values of the two features.

    Returns
    -------
    int
        The formula value shifted by a whole percentage drawn uniformly from
        -10..+10 (both ends included) using NumPy's global random state, then
        truncated by ``int()``.

    Raises
    ------
    KeyError
        If a required column or feature row is missing from ``formula_df``.
    """
    coef = formula_df[f"{attribute}_coef"]
    # The intercept is stored repeated on every row; the first entry is used.
    intercept = formula_df[f"{attribute}_intercept"].iloc[0]
    # Coefficients are looked up by label so no notebook-level feature list is needed.
    base = coef.loc["tpr_x"] * tpr_x + coef.loc["tpr_y"] * tpr_y + intercept
    # np.random.randint excludes its upper bound, so 11 is required for +10 to
    # be reachable and the perturbation to be symmetric around zero.
    percent = np.random.randint(-10, 11)
    randomnessed_result = base - base * (percent / 100)
    return int(randomnessed_result)

<br><br><br><br><br><br><br><br>
<h2 style="color:#F3F7EC;background:#399918;">  Locational</h2>

In [479]:
# Per-pitch-zone tables for passes, shots and dribbles. Later cells read the
# "Pitch_Number", "count" and "percentage" columns.
pass_locations = pd.read_csv("../statsbomb/Pass_locations.csv")
shot_locations = pd.read_csv("../statsbomb/Shot_locations.csv")
dribble_locations = pd.read_csv("../statsbomb/Dribble_locations.csv")

# Spot check: the "percentage" value of the row at position 46 in each table.
pass_locations.iloc[46].percentage, shot_locations.iloc[46].percentage, dribble_locations.iloc[46].percentage

(0.214, 12.727, 1.286)

In [480]:
# Combine the three zone tables into one row per Pitch_Number.
# merge() only applies suffixes to column names present on both sides. After
# the first merge the left frame's columns already end in _pass/_shot, so in
# the second merge the dribble columns keep their plain names and are copied
# to *_dribble names explicitly below.
location_df = pass_locations \
.merge(shot_locations, on='Pitch_Number', how='inner', suffixes=("_pass", "_shot")) \
.merge(dribble_locations, on='Pitch_Number', how='inner', suffixes=("", "_dribble"))
location_df["percentage_dribble"] = location_df["percentage"]
location_df["count_dribble"] = location_df["count"]

In [481]:
# Totals per zone across the three action types.
location_df["total_count"] = location_df["count_pass"]+location_df["count_shot"]+location_df["count_dribble"]
location_df["total_percentage"] = location_df["percentage_pass"]+location_df["percentage_shot"]+location_df["percentage_dribble"]
# Share of all recorded actions that fall in this zone (sums to 1 over zones).
location_df["pitch_prob"] = location_df["total_count"] / location_df["total_count"].sum()

# Share of each action type within the zone.
# NOTE(review): a zone whose total_count is 0 yields NaN here.
location_df["pass_prob"] = location_df["count_pass"] / location_df["total_count"]
location_df["shot_prob"] = location_df["count_shot"] / location_df["total_count"]
location_df["dribble_prob"] = location_df["count_dribble"] / location_df["total_count"]

In [482]:
# Column groups used for the preview below.
must_cols = ["Pitch_Number","total_count"]
percentage_cols = ["percentage_pass","percentage_shot","percentage_dribble"]
prob_cols = ["pitch_prob","pass_prob","shot_prob","dribble_prob"]

# The five zones with the most recorded actions.
location_df[must_cols+percentage_cols+prob_cols].sort_values("total_count", ascending=False)[:5]

Unnamed: 0,Pitch_Number,total_count,percentage_pass,percentage_shot,percentage_dribble,pitch_prob,pass_prob,shot_prob,dribble_prob
53,54,840,2.087,0.0,0.214,0.019988,0.997619,0.0,0.002381
41,42,804,1.99,0.107,0.429,0.019131,0.993781,0.001244,0.004975
28,29,785,1.945,0.0,0.429,0.018679,0.994904,0.0,0.005096
67,68,775,1.903,0.107,1.072,0.018441,0.985806,0.00129,0.012903
42,43,766,1.893,0.0,0.643,0.018227,0.992167,0.0,0.007833


In [483]:
# Pack the three per-zone action shares into one comma-separated string,
# ordered pass, shot, dribble; the simulation cell splits it again.
# NOTE(review): this goes through text even though the numeric columns are
# available directly.
location_df["triple_prob"] = (
    location_df["pass_prob"].astype(str) + "," +
    location_df["shot_prob"].astype(str) + "," +
    location_df["dribble_prob"].astype(str)
)

In [498]:
def Simulate_Actions(touches, location_df):
    """Sample a zone and then an action type for each touch and count the actions.

    Parameters
    ----------
    touches : int
        Number of touches to simulate.
    location_df : pandas.DataFrame
        Zone table; the "Pitch_Number", "pitch_prob", "pass_prob",
        "shot_prob" and "dribble_prob" columns are read.

    Returns
    -------
    dict
        Counts under the keys "Pass", "Shot" and "Dribble".
    """
    counter = {"Pass": 0, "Shot": 0, "Dribble": 0}
    pitch_numbers = location_df["Pitch_Number"]
    pitch_probs = location_df["pitch_prob"]
    # Normalised action probabilities per zone, filled in the first time a
    # zone is drawn so the frame is filtered at most once per zone.
    probs_by_zone = {}

    for _ in range(touches):
        # Draw order per touch is zone first, then action.
        pitch_number = np.random.choice(pitch_numbers, p=pitch_probs)
        triple_probs = probs_by_zone.get(pitch_number)
        if triple_probs is None:
            # First matching row wins, read from the numeric columns.
            zone_row = location_df.loc[
                location_df["Pitch_Number"] == pitch_number,
                ["pass_prob", "shot_prob", "dribble_prob"],
            ].iloc[0]
            triple_probs_raw = [float(x) for x in zone_row]
            # Renormalise so rounding in the stored shares cannot make the
            # probabilities fail to sum to 1.
            total = sum(triple_probs_raw)
            triple_probs = [x / total for x in triple_probs_raw]
            probs_by_zone[pitch_number] = triple_probs
        action = np.random.choice(["Pass","Shot","Dribble"], p=triple_probs)
        counter[action]+=1
    return counter


# Ratings of the two sides in this example.
tpr_united = 92
tpr_city = 76
# Predicted touch totals with random noise; the second call swaps the ratings
# to take the other side's point of view.
Home_Touches = Action_Volume_Predict_Randomnessed("Attacking_Touches", special_formula_df, tpr_united, tpr_city)
Away_Touches = Action_Volume_Predict_Randomnessed("Attacking_Touches", special_formula_df, tpr_city, tpr_united)

Action_Counter_Home = Simulate_Actions(Home_Touches, location_df)
Action_Counter_Away = Simulate_Actions(Away_Touches, location_df)

print(tpr_united, Action_Counter_Home)
print(tpr_city, Action_Counter_Away)

92 {'Pass': 725, 'Shot': 20, 'Dribble': 12}
76 {'Pass': 382, 'Shot': 6, 'Dribble': 4}


In [485]:
# Scratch check of the percentage perturbation on a fixed number.
test_touches = 656
# np.random.randint excludes its upper bound, so the draw ranges over -10..9;
# a negative draw increases the value because the term is subtracted.
test_touches = test_touches - test_touches * (np.random.randint(-10, 10)/100)
test_touches

682.24