In [1]:
import pandas as pd
import numpy as np
import os
import sys
import math
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from mplsoccer.pitch import Pitch
import matplotlib.patches as patches
from statsbombpy import sb
from statsbombpy.api_client import NoAuthWarning
from collections import Counter
from IPython.display import Markdown

In [2]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from pandas.errors import SettingWithCopyWarning
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))
warnings.simplefilter(action='ignore', category=(FutureWarning))
warnings.simplefilter(action='ignore', category=NoAuthWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

In [3]:
parent_dir = os.path.abspath(os.path.join(os.path.dirname("./"), '..'))
sys.path.append(parent_dir)
from tools.lineup_tpr import LineupTPR
from tools.draw_pitch_plot import Draw_Pitch_Actions
from tools.pickler import Save_Model, Load_Model
from tools.fm_attributes import Attributes as fm_attributes

<br><br><br><br><br><br><br><br>
<h2 style="color:black;background:white;">Start</h2>

In [4]:
defense = pd.read_csv("../match_logs/Big5@22-23@defense.csv")
gca = pd.read_csv("../match_logs/Big5@22-23@gca.csv")
keeper = pd.read_csv("../match_logs/Big5@22-23@keeper.csv")
misc = pd.read_csv("../match_logs/Big5@22-23@misc.csv")
passing = pd.read_csv("../match_logs/Big5@22-23@passing.csv")
passing_types = pd.read_csv("../match_logs/Big5@22-23@passing_types.csv")
possession = pd.read_csv("../match_logs/Big5@22-23@possession.csv")
shooting = pd.read_csv("../match_logs/Big5@22-23@shooting.csv")

In [5]:
pd.read_csv("../players_db/fm23/fm23db_processed.csv").sort_values(by="tpr",ascending=False)[["Club","Name","Best_Pos","tpr"]].head(5)

Unnamed: 0,Club,Name,Best_Pos,tpr
3,FC Bayern,Manuel Neuer,GK,92
4164,FC Barcelona,Robert Lewandowski,ST,92
3193,Manchester City,Erling Haaland,ST,90
3195,Tottenham Hotspur,Harry Kane,ST,90
4172,FC Barcelona,Marc-André ter Stegen,GK,89


In [6]:
players_df = pd.read_csv("../players_db/fm23/fm23db_processed.csv")

In [7]:
team_df = pd.read_csv("../players_db/fm23/team_ratings.csv")
team_df_cols = team_df.columns

In [8]:
tpr_cols = []
team_rating_cols_only_numeric = []
except_cols = ["Club","Based","Division","Club_id","League_id","fbref_name"]

for col in team_df_cols:
    if col not in except_cols:
        team_rating_cols_only_numeric.append(col)
        for xy in ["x","y"]:
            xy_col = f"{col}_{xy}"
            tpr_cols.append(xy_col)

In [9]:
len(tpr_cols)

76

In [10]:
team_df.sort_values("tpr", ascending=False).head(n=2)

Unnamed: 0,Club,GK,Based,Division,Club_id,League_id,tpr,Anticipation,Stamina,Balance,...,Corners,Leadership,Teamwork,Strength,Determination,Work_Rate,Long_Shots,Passing,First_Touch,fbref_name
0,FC Bayern,92,Germany (Bundesliga),Bundesliga,3704,185,92,85,87,77,...,71,87,86,82,88,85,84,76,73,Bayern Munich
2,Liverpool,86,England (Premier Division),English Premier Division,6518,354,92,83,85,87,...,78,90,88,81,86,85,66,85,88,Liverpool


<br><br><br><br><br><br><br><br>
<h2 style="color:orange;background:blue;">@.  Logs to Regression</h2>

In [11]:
touches_df = possession.copy()
touches_df["Attacking_Touches"] = possession.Att + passing.Att + shooting.Sh
touches_df["Error"] = touches_df["Touches"]-touches_df["Attacking_Touches"]
touches_df[["Club","Touches","Attacking_Touches","Error"]].mean(numeric_only=True)

Touches              596.650246
Attacking_Touches    518.024357
Error                 78.625889
dtype: float64

In [12]:
#touches_df.to_csv("../match_logs/Big5@22-23@attacking_touches.csv")

In [13]:
ATTRIBUTES_DICT = {
    "passing":"Att",
    "shooting":"Sh",
    "gca":"SCA",
    "possession":"Att",
    "touches_df":"Attacking_Touches"
}

In [14]:
def Prepare_Logs(action_title, action_attribute, team_df):
    CURRENT_DATAFRAME_NAME = action_title
    attribute = action_attribute
    CURRENT_DATAFRAME = globals()[CURRENT_DATAFRAME_NAME]
    CURRENT_DATAFRAME["Unnamed: 0"] = "itsanobject"
    DATAFRAME_COLUMNS = list(CURRENT_DATAFRAME.columns)
    NUMERIC_DATAFRAME_COLUMNS = list(CURRENT_DATAFRAME.select_dtypes(exclude="object").columns)
    DESCRIBE = CURRENT_DATAFRAME[NUMERIC_DATAFRAME_COLUMNS].describe().loc[['count','min', 'mean', 'max']]
    
    merged_df_1 = pd.merge(CURRENT_DATAFRAME, team_df, on="Club", how="inner")  
    merged_df_2 = pd.merge(merged_df_1, team_df, left_on="Opponent", right_on="fbref_name", how="inner") 
    merged_df = merged_df_2.copy()
    merged_df["Versus_Title"] = merged_df.Club_x + " plays vs " + merged_df.Club_y + " as " + merged_df.Venue + " team"
    merged_df["tpr_diff"] = merged_df.tpr_x - merged_df.tpr_y
    merged_df["tpr_diff_abs"] = abs(merged_df.tpr_x - merged_df.tpr_y)
    merged_df["goal_diff"] = merged_df.GF - merged_df.GA
    merged_df["goal_diff_abs"] = abs(merged_df.GF - merged_df.GA)
    merged_df = merged_df.query("goal_diff > 0 and tpr_diff > 0")
    MERGED_NUMERIC_DATAFRAME_COLUMNS = list(merged_df.select_dtypes(exclude="object").columns)
    reg_df = merged_df[['Club_x','Club_y',"Versus_Title","Date"] + MERGED_NUMERIC_DATAFRAME_COLUMNS] 
    return {
        "reg_df": reg_df.dropna(),
        "merged_df": merged_df,
        "merged_df_1": merged_df_1,
        "merged_df_2": merged_df_2
    }

In [15]:
Prepare_Logs("shooting", "Sh", team_df)["reg_df"].head(n=254).iloc[25][["tpr_x","tpr_y","Versus_Title","Date"]]

tpr_x                                                          72
tpr_y                                                          64
Versus_Title    Brighton & Hove Albion plays vs Bournemouth as...
Date                                                   2023-04-04
Name: 71, dtype: object

<br><br><br><br><br><br><br><br>
<h2 style="color:white;background:darkred;">  Regression</h2>

In [16]:
def Logs_to_Regression(action_title, action_attribute, team_df, formula_df, tpr_cols):
    attribute = action_attribute
    reg_df = Prepare_Logs(action_title, action_attribute, team_df)["reg_df"].dropna(axis=0)
    
    formula_indexes = formula_df[formula_df.index.str.contains(action_title, case=True)].index.tolist()
    formula_cols = [col.replace(f"{action_title}_", "") for col in formula_indexes] 
    
    NUMERIC_DATAFRAME_COLUMNS = [attribute]
    for attribute in NUMERIC_DATAFRAME_COLUMNS:
        X = reg_df[formula_cols]
        Y = reg_df[[attribute]]
        
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
        model = LinearRegression()
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        
        formula_df.loc[formula_indexes, "coef"] = model.coef_.flatten()
        formula_df.loc[formula_indexes, "intercept" ] = model.intercept_[0]
    
        mse = mean_squared_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)
        print(f'{action_title}.{attribute} --- Mean Squared Error: {mse}')
        print(f'{action_title}.{attribute} --- R^2 Score: {r2}')
        # formula_df.to_excel(f"{action_title.capitalize()}_Coefficients.xlsx")
        return formula_df

In [17]:
attribute_cols = ATTRIBUTES_DICT.keys()
formula_indexes = []
# tpr_cols = ["tpr_x","tpr_y"]
for attribute_title in attribute_cols:
    for tpr_col in tpr_cols:
        action_col = f"{attribute_title}_{tpr_col}"
        formula_indexes.append(action_col)
formula_df = pd.DataFrame(index=formula_indexes)

In [18]:
for title, attribute in ATTRIBUTES_DICT.items():
    regression_formula_df = Logs_to_Regression(title, attribute, team_df, formula_df, tpr_cols)

passing.Att --- Mean Squared Error: 9690.455530065823
passing.Att --- R^2 Score: 0.3627352007672262
shooting.Sh --- Mean Squared Error: 28.08377928064442
shooting.Sh --- R^2 Score: -0.09237218833347738
gca.SCA --- Mean Squared Error: 65.30240604341209
gca.SCA --- R^2 Score: 0.019120086515786783
possession.Att --- Mean Squared Error: 34.73106422417757
possession.Att --- R^2 Score: 0.04615799755621741
touches_df.Attacking_Touches --- Mean Squared Error: 10107.035158708646
touches_df.Attacking_Touches --- R^2 Score: 0.37077386475965646


In [19]:
formula_df[formula_df.index.str.contains("touches_df_tpr", case=True)]

Unnamed: 0,coef,intercept
touches_df_tpr_x,7.535324,387.620021
touches_df_tpr_y,-5.13362,387.620021


<br><br><br><br><br><br><br><br>
<h2 style="color:red;background:aqua;">  Attacking_Touches Regression Hyperparameter Tuning</h2>

In [20]:
action_title ="touches_df"
attribute = "Attacking_Touches"
prepared_df = Prepare_Logs(action_title, attribute, team_df)
reg_df = prepared_df["reg_df"]

In [21]:
pitch_attributes = list(set(fm_attributes.attribute_list) - set(fm_attributes.gk_attributes) | set(["Attacking_Touches","Club"]))
tpr_all_cols = []
tpr_x_cols = []
tpr_y_cols = []
except_cols = ["Club","Based","Division","Club_id","League_id","fbref_name"]

for col in team_df_cols:
    if col not in except_cols:
        for xy in ["x","y"]:
            xy_col = f"{col}_{xy}"
            tpr_all_cols.append(xy_col)
            if xy == "x":
                tpr_x_cols.append(xy_col)
            if xy == "y":
                tpr_y_cols.append(xy_col)      

In [22]:
reg_df.query("Attacking_Touches > 300").shape[0]

812

In [23]:
def Train_Touches_Model(reg_df, filter_value, tpr_all_cols, attribute="Attacking_Touches", test_size=None):
    tpr_all_cols.append("Attacking_Touches")
    touches_tpr_mean = reg_df[tpr_all_cols].query(f"Attacking_Touches > {filter_value}").mean().sort_values(ascending=False)
    pitch_cols = touches_tpr_mean[tpr_all_cols].drop(attribute).sort_values(ascending=False).index
    formula_df = pd.DataFrame(index=list(pitch_cols))
    
    touches_reg_df = reg_df.dropna()
    X = touches_reg_df[pitch_cols]
    Y = touches_reg_df[[attribute]]
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=0)
    model = LinearRegression()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    
    coef_column_name = f"{attribute}_coef"
    abs_coef_column_name = f"{attribute}_abs_coef"
    intercept_column_name = f"{attribute}_intercept"
    formula_df[coef_column_name] = model.coef_.flatten()
    formula_df[abs_coef_column_name] = abs(model.coef_.flatten())
    formula_df[intercept_column_name] = model.intercept_[0]
    
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    return {
        "mae": mae,
        "formula_df": formula_df,
        "model": model,
    }

In [24]:
attribute = "Attacking_Touches"
coef_col = f"{attribute}_coef"
abs_coef_col = f"{attribute}_abs_coef"
filter_value = 300
test_size = 0.02

model = Train_Touches_Model(reg_df, filter_value, tpr_all_cols, attribute, test_size)
formula_df = model["formula_df"].sort_values(abs_coef_col, ascending=False)
formula_df_indexes = formula_df.index

most_effective_formula_cols = list(formula_df_indexes)
print(len(most_effective_formula_cols))
print("mae", model["mae"])

76
mae 76.26124140141302


<br><br><br><br>
#### Tuning Most Effective Cols

In [25]:
mae = 100
for filter_value in range(300, 600, 50):
    for test_size_num in range(1, 5, 1):
        for slice_index in range(10, 25):
            sliced_tpr_all_cols = most_effective_formula_cols[:slice_index]
            test_size = test_size_num/100 
            model = Train_Touches_Model(reg_df, filter_value, sliced_tpr_all_cols, attribute, test_size)
            this_mae = model["mae"]
            if this_mae < mae:
                mae = this_mae
                markdown_table = f"""
| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| {1} | {slice_index} | {test_size} | {filter_value} | {this_mae} |
"""
                display(Markdown(markdown_table))


| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 10 | 0.01 | 300 | 81.43901796856623 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 11 | 0.01 | 300 | 80.10570020420126 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 21 | 0.01 | 300 | 75.0290882318514 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 22 | 0.01 | 300 | 74.69658802299854 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 23 | 0.01 | 300 | 72.38327479188065 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 24 | 0.01 | 300 | 71.721478810156 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 10 | 0.02 | 300 | 58.14724016388676 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 23 | 0.02 | 300 | 56.88330887151897 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 23 | 0.02 | 450 | 56.88330887151891 |


In [26]:
most_effective_formula_cols[:23]

['tpr_x',
 'tpr_y',
 'Crossing_x',
 'Work_Rate_x',
 'Stamina_y',
 'Crossing_y',
 'Finishing_x',
 'Acceleration_y',
 'Technique_y',
 'Strength_y',
 'Corners_x',
 'Flair_x',
 'Stamina_x',
 'Positioning_x',
 'First_Touch_y',
 'Anticipation_x',
 'Composure_y',
 'Determination_x',
 'Tackling_x',
 'Positioning_y',
 'Free_Kick_Taking_x',
 'Heading_y',
 'Work_Rate_y']

<br><br><br><br>
#### Tuning slices out of Most Effective Cols

In [27]:
mae = 100
slice_index = 23
sliced_tpr_all_cols = most_effective_formula_cols[:slice_index]

for filter_value in range(300, 600, 50):
    for test_size_num in range(10, 30, 1):
        test_size = test_size_num/100 
        model = Train_Touches_Model(reg_df, filter_value, sliced_tpr_all_cols, attribute, test_size)
        this_mae = model["mae"]
        if this_mae < mae:
            mae = this_mae
            markdown_table = f"""
| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| {1} | {slice_index} | {test_size} | {filter_value} | {this_mae} |
"""
            display(Markdown(markdown_table))


| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 23 | 0.1 | 300 | 69.8509193067728 |



| HEADER | slice_index | test_size | filter_Value | this_mae |
|-------------|-------------|-----------|--------------|----------|
| 1 | 23 | 0.1 | 450 | 69.85091930677277 |


In [28]:
exrow = reg_df.iloc[0]
print(exrow[["Club_x","tpr_x","tpr_y","Club_y","Attacking_Touches"]])
model = Train_Touches_Model(reg_df, 450, most_effective_formula_cols[:23], "Attacking_Touches", 0.1)
model_written = model["model"]

sum(exrow[model_written.feature_names_in_].values * model_written.coef_.flatten()) + model_written.intercept_

Club_x               Manchester City
tpr_x                             89
tpr_y                             77
Club_y               West Ham United
Attacking_Touches              908.0
Name: 0, dtype: object


array([651.16112787])

<br><br><br><br>

In [29]:
#model["formula_df"].to_csv("formulas/Attacking_Touches_volume_formula_df_hyper.csv", index=True)

<br><br><br><br><br><br><br><br>
<h2 style="color:#33FF49;background:blue;">  Regression for One Atrribute</h2>

In [30]:
action_title ="touches_df"
attribute = "Attacking_Touches"
prepared_df = Prepare_Logs(action_title, attribute, team_df)
reg_df = prepared_df["reg_df"]

In [31]:
tpr_x_dict = {
    "tpr_x": None,
} 
tpr_y_dict = {
    "tpr_y": None,
}

In [32]:
# q1 = reg_df.query("Versus_Title == 'Manchester City plays vs Liverpool as Home team'")
# q2 = reg_df.query("Versus_Title == 'Liverpool plays vs Manchester City as Home team'")
# queries = ["", q1, q2]
# query_no = 1
# print(queries[query_no].iloc[0].Date)
# queries[query_no].iloc[0].name
# queries[query_no].iloc[0].name

In [33]:
# https://fbref.com/en/matches/5965a1a5/Bayern-Munich-Bochum-February-11-2023-Bundesliga
# row_index= 3221 # for Bayern
# row_index2222 = 3758 # for Bochum

row_index333 = 722 # for Liverpool
row_index222 = 343 # for Manchester City

row_index = 0

x_cols = tpr_x_dict.keys()
y_cols = tpr_y_dict.keys()

tpr_x_custom_data = reg_df[x_cols].iloc[row_index].to_dict()
tpr_y_custom_data = reg_df[y_cols].iloc[row_index].to_dict()

custom_data = {**tpr_x_custom_data, **tpr_y_custom_data}
custom_cols = list(custom_data.keys())

In [34]:
custom_cols

['tpr_x', 'tpr_y']

In [35]:
match_info = reg_df.iloc[row_index].Versus_Title + "\n" + reg_df.iloc[row_index].Date + "\n" 
print(match_info)

match_info_tpr = reg_df[["Club_x","Club_y","Date","Attacking_Touches"] + custom_cols].iloc[row_index]
match_info_tpr

Manchester City plays vs West Ham United as Away team
2022-08-07



Club_x               Manchester City
Club_y               West Ham United
Date                      2022-08-07
Attacking_Touches              908.0
tpr_x                             89
tpr_y                             77
Name: 0, dtype: object

In [36]:
special_formula_df = pd.DataFrame(index=custom_cols)

df_to_custom = reg_df.copy()
X = df_to_custom[custom_cols]
Y = df_to_custom[[attribute]]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

coef_column_name = f"{attribute}_coef"
intercept_column_name = f"{attribute}_intercept"
special_formula_df[coef_column_name] = model.coef_.flatten()
special_formula_df[intercept_column_name] = model.intercept_[0]

mse = mean_squared_error(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

In [37]:
formula_df[formula_df.index.str.contains("touches_df_tpr", case=True)]

Unnamed: 0,Attacking_Touches_coef,Attacking_Touches_abs_coef,Attacking_Touches_intercept


In [38]:
special_formula_df

Unnamed: 0,Attacking_Touches_coef,Attacking_Touches_intercept
tpr_x,11.001498,227.953607
tpr_y,-7.252917,227.953607


In [39]:
custom_tpr_df = pd.DataFrame(custom_data, index=["tpr_value"]).T

In [40]:
custom_tpr_df[f"{attribute}_coef"] = special_formula_df[f"{attribute}_coef"][custom_cols].tolist()
custom_tpr_df[f"{attribute}_intercept"] = special_formula_df[f"{attribute}_intercept"][custom_cols].tolist()
custom_tpr_df["coef_result"] = custom_tpr_df[f"{attribute}_coef"] * custom_tpr_df["tpr_value"]

custom_tpr_df["formula_result"] = custom_tpr_df["coef_result"].sum() + custom_tpr_df[f"{attribute}_intercept"].iloc[0]
custom_tpr_df["formula_result"] = custom_tpr_df["formula_result"].iloc[0]
custom_tpr_df["formula_result_per_minute"] = custom_tpr_df["formula_result"].iloc[0] / 90

In [41]:
formula_result = round(custom_tpr_df["formula_result"].iloc[0])
round(custom_tpr_df["formula_result_per_minute"].iloc[0])

formula_result

649

In [42]:
prob_of_shot_in_a_minute = custom_tpr_df["formula_result_per_minute"].iloc[0]

def ProbAction(prob_of_shot_in_a_minute):
    r = np.random.choice([1, 0], p=[prob_of_shot_in_a_minute, 1-prob_of_shot_in_a_minute])
    return r

In [43]:
# special_formula_df.to_csv("formulas/Attacking_Touches_volume_formula_df.csv", index=False)

<br><br><br><br><br><br><br><br>
<h2 style="color:blue;background:aqua;">  Model Test</h2>

In [44]:
def Action_Volume_Predict(attribute, formula_df, tpr_x, tpr_y):
    input_df = pd.Series({"tpr_x": tpr_x, "tpr_y": tpr_y})
    coefficients = sum(formula_df.loc[:, f"{attribute}_coef"] * input_df[custom_cols])
    result = (coefficients + formula_df.loc[:, f"{attribute}_intercept"])
    return int(result.iloc[0])

In [45]:
tpr_test_data = pd.Series({
    "tpr_x": match_info_tpr.tpr_x,
    "tpr_y": match_info_tpr.tpr_y,
})

In [46]:
Action_Volume_Predict("Attacking_Touches", special_formula_df, 69, 67)

501

In [47]:
error_df = reg_df.copy()[["Club_x","Club_y","tpr_x","tpr_y","tpr_diff_abs","Date","Attacking_Touches"]]
error_df.sort_values(by="tpr_diff_abs", ascending=True).head().iloc[0]

Club_x                FSV Mainz
Club_y                  FC Köln
tpr_x                        69
tpr_y                        67
tpr_diff_abs                  2
Date                 2022-10-21
Attacking_Touches         536.0
Name: 3718, dtype: object

In [48]:
error_df["T_90"] = error_df["Attacking_Touches"] / 90
error_df["Predicted"] = error_df.apply(lambda row: Action_Volume_Predict(attribute, special_formula_df, row["tpr_x"], row["tpr_y"]), axis=1)
error_df["Predicted_90"] = error_df["Predicted"] / 90
error_df["Error"] = error_df["Attacking_Touches"] - error_df["Predicted"]
error_df["Error_min"] = error_df["T_90"] - error_df["Predicted_90"]

In [49]:
error_df[["Attacking_Touches","Predicted","Error"]].mean()

Attacking_Touches    578.238329
Predicted            578.786241
Error                 -0.547912
dtype: float64

<br><br><br><br><br><br><br><br>
<h2 style="color:#F3F7EC;background:#399918;">  Locational</h2>

In [50]:
pass_locations = pd.read_csv("../statsbomb/data/position_included_location_384_squared_ligue1_Pass.csv").query("position=='ALL'")
shot_locations = pd.read_csv("../statsbomb/data/position_included_location_384_squared_ligue1_Shot.csv").query("position=='ALL'")
dribble_locations = pd.read_csv("../statsbomb/data/position_included_location_384_squared_ligue1_Dribble.csv").query("position=='ALL'")

pass_locations.iloc[46].percentage, shot_locations.iloc[46].percentage, dribble_locations.iloc[46].percentage

(0.001220504475183, 0.0, 0.0043706293706293)

In [51]:
location_df = pass_locations \
    .merge(shot_locations, on='Pitch_Number', how='inner', suffixes=("_pass", "_shot")) \
    .merge(dribble_locations, on='Pitch_Number', how='inner', suffixes=("", "_dribble"))
location_df["percentage_dribble"] = location_df["percentage"]
location_df["count_dribble"] = location_df["count"]

In [52]:
location_df[["x_interval","y_interval"]][:20]

Unnamed: 0,x_interval,y_interval
0,"(0.0, 5.0)","(0.0, 5.0)"
1,"(0.0, 5.0)","(5.0, 10.0)"
2,"(0.0, 5.0)","(10.0, 15.0)"
3,"(0.0, 5.0)","(15.0, 20.0)"
4,"(0.0, 5.0)","(20.0, 25.0)"
5,"(0.0, 5.0)","(25.0, 30.0)"
6,"(0.0, 5.0)","(30.0, 35.0)"
7,"(0.0, 5.0)","(35.0, 40.0)"
8,"(0.0, 5.0)","(40.0, 45.0)"
9,"(0.0, 5.0)","(45.0, 50.0)"


In [53]:
location_df["total_count"] = location_df["count_pass"]+location_df["count_shot"]+location_df["count_dribble"]
location_df["total_percentage"] = location_df["percentage_pass"]+location_df["percentage_shot"]+location_df["percentage_dribble"]
location_df["pitch_prob"] = location_df["total_count"] / location_df["total_count"].sum()

location_df["pass_prob"] = location_df["count_pass"] / location_df["total_count"]
location_df["shot_prob"] = location_df["count_shot"] / location_df["total_count"]
location_df["dribble_prob"] = location_df["count_dribble"] / location_df["total_count"]

In [54]:
must_cols = ["Pitch_Number","total_count"]
percentage_cols = ["percentage_pass","percentage_shot","percentage_dribble"]
prob_cols = ["pitch_prob","pass_prob","shot_prob","dribble_prob"]

location_df[must_cols+percentage_cols+prob_cols].sort_values("total_count", ascending=False).iloc[311]

Pitch_Number          287.000000
total_count            45.000000
percentage_pass         0.000976
percentage_shot         0.007051
percentage_dribble      0.002622
pitch_prob              0.001158
pass_prob               0.800000
shot_prob               0.133333
dribble_prob            0.066667
Name: 286, dtype: float64

In [55]:
location_df.head()

Unnamed: 0,Pitch_Number,count_pass,x_interval_pass,y_interval_pass,percentage_pass,position_pass,action_pass,prob_pass,count_shot,x_interval_shot,...,action,prob,percentage_dribble,count_dribble,total_count,total_percentage,pitch_prob,pass_prob,shot_prob,dribble_prob
0,1,7.0,"(0.0, 5.0)","(0.0, 5.0)",0.00019,ALL,Pass,,0.0,"(0.0, 5.0)",...,Dribble,,0.0,0.0,7.0,0.00019,0.00018,1.0,0.0,0.0
1,2,19.0,"(0.0, 5.0)","(5.0, 10.0)",0.000515,ALL,Pass,,0.0,"(0.0, 5.0)",...,Dribble,,0.001748,2.0,21.0,0.002264,0.00054,0.904762,0.0,0.095238
2,3,30.0,"(0.0, 5.0)","(10.0, 15.0)",0.000814,ALL,Pass,,0.0,"(0.0, 5.0)",...,Dribble,,0.0,0.0,30.0,0.000814,0.000772,1.0,0.0,0.0
3,4,44.0,"(0.0, 5.0)","(15.0, 20.0)",0.001193,ALL,Pass,,0.0,"(0.0, 5.0)",...,Dribble,,0.000874,1.0,45.0,0.002068,0.001158,0.977778,0.0,0.022222
4,5,45.0,"(0.0, 5.0)","(20.0, 25.0)",0.001221,ALL,Pass,,0.0,"(0.0, 5.0)",...,Dribble,,0.001748,2.0,47.0,0.002969,0.001209,0.957447,0.0,0.042553


In [56]:
#location_df.to_csv("data/location_df.csv", index=False)

In [57]:
fm_managerial_tactics = {
    'Gegen - Direct': {"Pass": 1.2, "Shot": 1.0, "Dribble": 0.8},
    'Gegen - High Tempo': {"Pass": 1.1, "Shot": 1.1, "Dribble": 0.9},
    'Gegen - Slow Passing': {"Pass": 0.9, "Shot": 1.0, "Dribble": 1.0},
    'Gegen - Wing Play': {"Pass": 1.3, "Shot": 1.0, "Dribble": 0.7},
    
    'Tiki Taka - Direct': {"Pass": 1.5, "Shot": 0.8, "Dribble": 0.8},
    'Tiki Taka - High Tempo': {"Pass": 1.4, "Shot": 0.9, "Dribble": 0.8},
    'Tiki Taka - Slow Passing': {"Pass": 1.6, "Shot": 0.7, "Dribble": 0.9},
    'Tiki Taka - Wing Play': {"Pass": 1.7, "Shot": 0.6, "Dribble": 0.8},
    
    'Control - Direct': {"Pass": 1.1, "Shot": 1.0, "Dribble": 1.0},
    'Control - High Tempo': {"Pass": 1.0, "Shot": 1.1, "Dribble": 1.0},
    'Control - Slow Passing': {"Pass": 1.0, "Shot": 0.9, "Dribble": 1.1},
    'Control - Wing Play': {"Pass": 1.2, "Shot": 1.0, "Dribble": 0.9},
    
    'Counter - Direct': {"Pass": 0.8, "Shot": 1.2, "Dribble": 1.0},
    'Counter - High Tempo': {"Pass": 0.9, "Shot": 1.1, "Dribble": 1.1},
    'Counter - Slow Passing': {"Pass": 0.7, "Shot": 1.0, "Dribble": 1.2},
    'Counter - Wing Play': {"Pass": 0.8, "Shot": 1.3, "Dribble": 1.0},
    
    'Total Football - Direct': {"Pass": 1.2, "Shot": 1.2, "Dribble": 1.0},
    'Total Football - High Tempo': {"Pass": 1.1, "Shot": 1.3, "Dribble": 1.0},
    'Total Football - Slow Passing': {"Pass": 1.3, "Shot": 1.1, "Dribble": 0.9},
    'Total Football - Wing Play': {"Pass": 1.4, "Shot": 1.0, "Dribble": 0.8},
}

selected_tactics = [
    fm_managerial_tactics['Tiki Taka - Direct'],
    fm_managerial_tactics['Counter - High Tempo'],
    fm_managerial_tactics['Counter - Wing Play'],
]

In [58]:
def create_triple_prob_dict(row, tactical_coef={"Pass": 0, "Shot": 0, "Dribble": 0}):
    prob_dict = {
        "Pass": row["pass_prob"]+row["pass_prob"]*tactical_coef["Pass"],
        "Shot": row["shot_prob"]+row["shot_prob"]*tactical_coef["Shot"],
        "Dribble": row["dribble_prob"]+row["dribble_prob"]*tactical_coef["Dribble"]
    }
    sum_prob_dict = sum(prob_dict.values())
    weighted_prob_dict = {key: prob_dict[key]/sum_prob_dict for key in prob_dict.keys()}
    return weighted_prob_dict

In [59]:
tpr1 = LineupTPR(players_df, "Liverpool").query("Best_Pos != 'GK'").mean().tpr
tpr2 = LineupTPR(players_df, "FC Bayern").query("Best_Pos != 'GK'").mean().tpr
print(tpr1, tpr2)

84.4 83.6


In [60]:
tpr_a = tpr1
tpr_b = tpr2

Home_Touches = Action_Volume_Predict_Randomnessed("Attacking_Touches", special_formula_df, tpr_a, tpr_b)
Away_Touches = Action_Volume_Predict_Randomnessed("Attacking_Touches", special_formula_df, tpr_b, tpr_a)
Action_Counter_Home = {"Pass": 0, "Shot": 0, "Dribble": 0}
Action_Counter_Away = {"Pass": 0, "Shot": 0, "Dribble": 0}

for moment in range(Home_Touches):
    location_df["triple_prob_dict"] = location_df.apply(lambda row: create_triple_prob_dict(row, selected_tactics[0]), axis=1)
    pitch_number = np.random.choice(location_df["Pitch_Number"], p=location_df["pitch_prob"])
    triple_probs = list(location_df[location_df["Pitch_Number"] == pitch_number]["triple_prob_dict"].iloc[0].values())
    action = np.random.choice(["Pass","Shot","Dribble"], p=triple_probs)
    Action_Counter_Home[action]+=1

for moment in range(Away_Touches):
    location_df["triple_prob_dict"] = location_df.apply(lambda row: create_triple_prob_dict(row, selected_tactics[1]), axis=1)
    pitch_number = np.random.choice(location_df["Pitch_Number"], p=location_df["pitch_prob"])
    triple_probs = list(location_df[location_df["Pitch_Number"] == pitch_number]["triple_prob_dict"].iloc[0].values())
    action = np.random.choice(["Pass","Shot","Dribble"], p=triple_probs)
    Action_Counter_Away[action]+=1

print(selected_tactics, "\n")
print(tpr_a, Action_Counter_Home)
print(tpr_b, Action_Counter_Away)

NameError: name 'Action_Volume_Predict_Randomnessed' is not defined

In [61]:
from IPython.display import display, HTML
loading_bar_html = """
<div id="progress-container" style="width: 50%; border: 1px solid #ccc; margin: 20px 0;">
  <div id="progress-bar" style="width: 0%; height: 20px; background-color: #4CAF50; text-align: center; line-height: 20px; color: white;"></div>
</div>
<script>
function updateProgressBar(progress) {
  var progressBar = document.getElementById('progress-bar');
  progressBar.style.width = progress + '%';
  progressBar.innerHTML = progress + '%';
}
</script>
"""

In [62]:
def Generate_Random_Match_Stats(tpr_a, tpr_b, match_number=1):
    all_matches= []
    shot_map = []
    pass_map = []
    dribble_map = []
    location_df["triple_prob_dict"] = location_df.apply(lambda row: create_triple_prob_dict(row, selected_tactics[2]), axis=1)
    
    for process in range(match_number):
        Action_Counter = {"Pass": 0, "Shot": 0, "Dribble": 0}
        Touches = Action_Volume_Predict_Randomnessed("Attacking_Touches", special_formula_df, tpr_a, tpr_b)
        for moment in range(Touches):
            pitch_number = np.random.choice(location_df["Pitch_Number"], p=location_df["pitch_prob"])
            triple_probs = list(location_df[location_df["Pitch_Number"] == pitch_number]["triple_prob_dict"].iloc[0].values())
            triple_actions = list(location_df[location_df["Pitch_Number"] == pitch_number]["triple_prob_dict"].iloc[0].keys())
            action = np.random.choice(triple_actions, p=triple_probs)
            if action=="Pass":
                pass_map.append(pitch_number)
            elif action=="Shot":
                shot_map.append(pitch_number)
            elif action=="Dribble":
                dribble_map.append(pitch_number)
            Action_Counter[action]+=1
        all_matches.append(Action_Counter)

    print("__completed__!")
    return {
        "all_matches": all_matches,
        "pass_map": pass_map, 
        "shot_map": shot_map, 
        "dribble_map": dribble_map, 
    }

In [63]:
Generate_Random_Match_Stats(72, 64, 5)["all_matches"]

NameError: name 'Action_Volume_Predict_Randomnessed' is not defined

<br><br><br><br><br><br><br><br>
<h2 style="color:black;background:yellow;">  Pitch Plot</h2>

In [None]:
actions_generated = Generate_Random_Match_Stats(82, 71, match_number=1)["shot_map"]
p = Draw_Pitch_Actions(actions_generated, False, "../statsbomb/data/Pitch_Locations_384_squared_ligue1_Pass.csv", "pass_map").plot() \
    .save_fig("./plots_imgs/counts_df_Locations.png")