In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import time
import requests
from bs4 import BeautifulSoup
import lxml
import json
from urllib.parse import quote

from futsim_funcs import TPR,TFR,TPSR,TGKR,TDR

In [3]:
import warnings
warnings.filterwarnings("ignore", message="The default value of numeric_only in DataFrame.mean is deprecated.", category=FutureWarning)

In [4]:
players_df = pd.read_csv("../fifa24_db/pdb_23.csv")

In [5]:
tpr_data = TPR(players_df) # Overall
tfr_data = TFR(players_df) # Shooting
tgkr_data = TGKR(players_df) # Goalkeeping
tdr_data = TDR(players_df) # Defense

In [6]:
tdr_data.sort_values("defense", ascending=False).head(n=5)

Unnamed: 0,league_id,club_team_id,league_name,club_name,power,defense
109,53,243,La Liga,Real Madrid,84,78
101,16,73,Ligue 1,Paris Saint Germain,83,76
30,13,5,Premier League,Chelsea,84,75
70,31,44,Serie A,Inter,83,75
81,13,9,Premier League,Liverpool,84,75


<br><br><br><br>

# FBRef Match Logs URLs

In [7]:
leagues = ["Premier League", "Serie A", "Ligue 1", "LaLiga", "Bundesliga"]
fbref_league_dict = {
    "Premier League": {"name": "Premier League", "league_id": 9},
    "Serie A": {"name": "Serie A", "league_id": 11},
    "Ligue 1": {"name": "Ligue 1", "league_id": 13},
    "LaLiga": {"name": "LaLiga", "league_id": 12},
    "Bundesliga": {"name": "Bundesliga", "league_id": 20},   
}

In [8]:
def GetTeamCredentials(league_id, season="2022-2023"):
    league_url = f"https://fbref.com/en/comps/{league_id}/{season}"
    response = requests.get(league_url)
    soup = BeautifulSoup(response.content, "lxml")

    table = soup.find("table", {"class": "stats_table"})
    team_ids = []
    team_names = []

    for row in table.find_all("tr")[1:]:
        team_name = row.find("td", {"data-stat": "team"}).text
        team_id = row.find("td", {"data-stat": "team"}).a.get("href").split("/")[3]
        team_ids.append(team_id)
        team_names.append(team_name)
    return {
        "team_ids":team_ids,
        "team_names":team_names,
    }

In [9]:
from rapidfuzz import process
def find_best_match(name, choices):
    return process.extractOne(name, choices)

In [10]:
def TFRFromId(tfr_data, club_team_id):
    power = tfr_data.query(f"club_team_id == {club_team_id}")["power"].iloc[0]
    finishing = tfr_data.query(f"club_team_id == {club_team_id}")["finishing"].iloc[0]
    return {
        "power": power,
        "finishing": finishing,
    }

In [11]:
def TeamNameIdConverter(tpr_data, team_name, league_id, print_output=False):
    # TeamNameIdConverter(tpr_data,"chelseas",13,1)
    club_team_names = tpr_data.query(f"league_id == {league_id}")["club_name"].tolist()
    club_team_ids = tpr_data.query(f"league_id == {league_id}")["club_team_id"].tolist()
    club_team_name, score, other = find_best_match(team_name, club_team_names)
    club_team_id = club_team_ids[club_team_names.index(club_team_name)]
    if print_output:
        print(f"ID for {club_team_name}:")
    return club_team_id

In [82]:
def FB_URLS(creds, stat_attribute, fbref_league_id, league_id, season="2022-2023"):
    team_ids = creds["team_ids"]
    team_names = creds["team_names"]
    club_team_names = tpr_data.query(f"league_id == {league_id}")["club_name"].tolist()
    club_team_ids = tpr_data.query(f"league_id == {league_id}")["club_team_id"].tolist()

    for team_id, team_name in zip(team_ids, team_names):
        best_match = find_best_match(team_name, club_team_names)
        if best_match:
            club_team_name, score, other = best_match
            club_team_id = club_team_ids[club_team_names.index(club_team_name)]
            match_logs_url = f"https://fbref.com/en/squads/{team_id}/{season}/matchlogs/c{fbref_league_id}/{stat_attribute}/{team_name.replace(' ', '-')}"
            match_logs_stats_dict[stat_attribute].append({
                "id": team_id,
                "stat": stat_attribute,
                "team": team_name,
                "club_team_id": club_team_id,
                "club_team_name": club_team_name, 
                "url": match_logs_url
            })
        else:
            print(f"No match found for team: {team_name}")

<h1 style="color:orange">🦸‍♂️Must Parameters</h1>

In [79]:
players_df.groupby(by="league_id").mean()

  players_df.groupby(by="league_id").mean()


Unnamed: 0_level_0,player_id,overall,potential,value_eur,wage_eur,height_cm,club_team_id,club_jersey_number,nationality_id,attacking_crossing,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
league_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,241035.631902,66.885481,73.451943,3060573.0,5398.364008,182.184049,18984.801636,17.97546,41.633947,50.374233,...,54.102249,54.102249,54.102249,54.433538,53.570552,52.251534,52.251534,52.251534,53.570552,22.445808
13,226529.99005,73.78607,79.45937,15188930.0,55532.25539,182.386401,389.86733,22.567164,38.557214,57.25539,...,61.9801,61.9801,61.9801,61.538972,60.749585,59.804312,59.804312,59.804312,60.749585,22.945274
16,233313.313975,70.945554,76.479129,6923693.0,20629.764065,181.555354,405.036298,22.756806,47.909256,53.747731,...,57.388385,57.388385,57.388385,57.424682,56.566243,55.37931,55.37931,55.37931,56.566243,23.123412
19,232001.811787,72.064639,77.593156,9600951.0,23243.060837,184.522814,11703.604563,21.019011,34.846008,54.423954,...,58.912548,58.912548,58.912548,58.809886,57.963878,57.093156,57.093156,57.093156,57.963878,23.279468
31,230203.273043,72.695652,77.643478,9767178.0,29984.086957,183.810435,35411.274783,26.64,40.876522,55.297391,...,60.0,60.0,60.0,59.907826,59.227826,58.293913,58.293913,58.293913,59.227826,22.492174
53,231693.8672,72.9472,78.2624,10854250.0,31030.4,181.496,6250.5888,17.6016,50.4992,55.3584,...,59.7216,59.7216,59.7216,59.352,58.4976,57.4432,57.4432,57.4432,58.4976,23.1984
68,229606.451362,67.883268,71.424125,2452481.0,13692.412451,181.552529,72987.733463,27.233463,56.058366,53.519455,...,55.55642,55.55642,55.55642,55.85214,54.964981,53.856031,53.856031,53.856031,54.964981,21.214008
308,243340.683594,68.580078,74.429688,3565283.0,5750.097656,182.001953,25890.546875,30.271484,55.169922,51.384766,...,55.412109,55.412109,55.412109,55.794922,55.019531,53.806641,53.806641,53.806641,55.019531,21.632812


In [85]:
match_logs_stats_dict = {
    "shooting":[],
    "passing":[],
    "defense":[],
    "keeper":[],
    "passing_types":[],
    "gca":[],
    "possession":[],
    "misc":[],
}

attributes = match_logs_stats_dict.keys()
league_id = 13
league_name = "Premier League"
creds = GetTeamCredentials(fbref_league_dict[league_name]["league_id"], "2022-2023")
fbref_league_id = fbref_league_dict[league_name]["league_id"]

for stat_attribute in attributes:
    FB_URLS(creds, stat_attribute, fbref_league_id, league_id)

In [86]:
match_logs_stats_dict

{'shooting': [{'id': 'b8fd03ef',
   'stat': 'shooting',
   'team': ' Manchester City',
   'club_team_id': 10,
   'club_team_name': 'Manchester City',
   'url': 'https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City'},
  {'id': '18bb7c10',
   'stat': 'shooting',
   'team': ' Arsenal',
   'club_team_id': 1,
   'club_team_name': 'Arsenal',
   'url': 'https://fbref.com/en/squads/18bb7c10/2022-2023/matchlogs/c9/shooting/-Arsenal'},
  {'id': '19538871',
   'stat': 'shooting',
   'team': ' Manchester Utd',
   'club_team_id': 11,
   'club_team_name': 'Manchester United',
   'url': 'https://fbref.com/en/squads/19538871/2022-2023/matchlogs/c9/shooting/-Manchester-Utd'},
  {'id': 'b2b47a98',
   'stat': 'shooting',
   'team': ' Newcastle Utd',
   'club_team_id': 13,
   'club_team_name': 'Newcastle United',
   'url': 'https://fbref.com/en/squads/b2b47a98/2022-2023/matchlogs/c9/shooting/-Newcastle-Utd'},
  {'id': '822bd0ba',
   'stat': 'shooting',
   'team': ' Liverpo

<br><br><br><br>

<h1 style="color:green">Scraping FBRef</h1>

In [180]:
df = pd.read_html("https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City")[1][:38]
df.sort_values(('Against Manchester City', 'Date')).head()
df.columns

MultiIndex([('Against Manchester City',         'Date'),
            ('Against Manchester City',         'Time'),
            ('Against Manchester City',        'Round'),
            ('Against Manchester City',          'Day'),
            ('Against Manchester City',        'Venue'),
            ('Against Manchester City',       'Result'),
            ('Against Manchester City',           'GF'),
            ('Against Manchester City',           'GA'),
            ('Against Manchester City',     'Opponent'),
            (               'Standard',          'Gls'),
            (               'Standard',           'Sh'),
            (               'Standard',          'SoT'),
            (               'Standard',         'SoT%'),
            (               'Standard',         'G/Sh'),
            (               'Standard',        'G/SoT'),
            (               'Standard',         'Dist'),
            (               'Standard',           'FK'),
            (               'St

In [181]:
def FB_MatchLogs(url, last_matchweek=38):
    encoded_url = quote(url, safe=':/')
    df=pd.read_html(encoded_url)
    return {
        "for": df[0][:last_matchweek],
        "against": df[1][:last_matchweek],
    }

In [182]:
def MatchLogs_MultiIndexColumnDict(df, stat_attribute=None):
    stat_columns_dict = {}
    for item in df.columns:
        right_side = item[1]
        stat_columns_dict[right_side] = item
    if stat_attribute:
        return stat_columns_dict[stat_attribute]
    return stat_columns_dict

In [183]:
MatchLogs_MultiIndexColumnDict(df)

{'Date': ('Against Manchester City', 'Date'),
 'Time': ('Against Manchester City', 'Time'),
 'Round': ('Against Manchester City', 'Round'),
 'Day': ('Against Manchester City', 'Day'),
 'Venue': ('Against Manchester City', 'Venue'),
 'Result': ('Against Manchester City', 'Result'),
 'GF': ('Against Manchester City', 'GF'),
 'GA': ('Against Manchester City', 'GA'),
 'Opponent': ('Against Manchester City', 'Opponent'),
 'Gls': ('Standard', 'Gls'),
 'Sh': ('Standard', 'Sh'),
 'SoT': ('Standard', 'SoT'),
 'SoT%': ('Standard', 'SoT%'),
 'G/Sh': ('Standard', 'G/Sh'),
 'G/SoT': ('Standard', 'G/SoT'),
 'Dist': ('Standard', 'Dist'),
 'FK': ('Standard', 'FK'),
 'PK': ('Standard', 'PK'),
 'PKatt': ('Standard', 'PKatt'),
 'xG': ('Expected', 'xG'),
 'npxG': ('Expected', 'npxG'),
 'npxG/Sh': ('Expected', 'npxG/Sh'),
 'G-xG': ('Expected', 'G-xG'),
 'np:G-xG': ('Expected', 'np:G-xG'),
 'Match Report': ('Unnamed: 24_level_0', 'Match Report')}

In [184]:
match_logs_stats_dict["shooting"][0]

{'id': 'b8fd03ef',
 'stat': 'shooting',
 'team': ' Manchester City',
 'club_team_id': 10,
 'club_team_name': 'Manchester City',
 'url': 'https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City'}

<br><br><br><br>
<h1 style="color:black">Match Logs DataFrame</h1>

In [20]:
df_log_array = []
current_stat = "shooting"

# for tindex in range(20):
#     print(tindex)
#     time.sleep(5)
#     current_team = match_logs_stats_dict[current_stat][tindex]

#     url = current_team["url"]
#     print(url)
    
#     df = FB_MatchLogs(url, last_matchweek=38)
#     df_for = df["for"]
#     df_against = df["against"]
#     logs_cols_for = MatchLogs_MultiIndexColumnDict(df_for)
#     logs_cols_against = MatchLogs_MultiIndexColumnDict(df_against)

#     team_names = df_for[logs_cols_for["Opponent"]].unique().tolist()
#     team_ids = [TeamNameIdConverter(tpr_data, team_name, league_id) for team_name in team_names]
#     team_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])

#     for i in range(38):
#         x = df_for.iloc[i]
#         y = df_against.iloc[i]
#         x_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])
#         y_rating_data = TFRFromId(tfr_data, TeamNameIdConverter(tpr_data, y[logs_cols_against["Opponent"]], league_id))
#         lab_df_dict = {
#             "team_name": current_team["club_team_name"],
#             "team_id": current_team["club_team_id"],
#             "team_sh": x[logs_cols_for["Sh"]],
#             "team_sot": x[logs_cols_for["SoT"]],
#             "team_scored": x[logs_cols_for["Gls"]],
#             "opponent_name": y[logs_cols_against["Opponent"]],
#             "opponent_id": TeamNameIdConverter(tpr_data, y[logs_cols_against["Opponent"]], league_id),
#             "opponent_sh": y[logs_cols_against["Sh"]],
#             "opponent_sot": y[logs_cols_against["SoT"]],
#             "opponent_scored": y[logs_cols_against["Gls"]],
#             "team_power": x_rating_data["power"],
#             "team_finishing": x_rating_data["finishing"],
#             "opponent_power": y_rating_data["power"],
#             "opponent_finishing": y_rating_data["finishing"],
#             "team_sh_share": x[logs_cols_for["Sh"]] / ( x[logs_cols_for["Sh"]] + y[logs_cols_against["Sh"]] ),
#             "opponent_sh_share": y[logs_cols_against["Sh"]] / ( x[logs_cols_for["Sh"]] + y[logs_cols_against["Sh"]] ),
#         }
#         df_log_array.append(lab_df_dict)
        
# df_log = pd.DataFrame(df_log_array)
league_name_for_csv_file = league_name
# df_log.to_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name_for_csv_file}.csv")

In [21]:
df_log = pd.read_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name_for_csv_file}.csv")

<br><br><br><br>
<h1 style="color:aqua">Regression Boilerplate</h1>

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train_linear_regression(df, features, target_variable, apply_dtype, test_size=0.19, random_state=42):
    """
    Train a linear regression model and evaluate its performance.

    Args:
    - df (DataFrame): Input DataFrame containing the dataset.
    - features (list): List of feature columns.
    - target_variable (str): Name of the target variable column.
    - test_size (float): Proportion of the dataset to include in the test split.
    - random_state (int): Random seed for reproducibility.

    Returns:
    - model (LinearRegression): Trained linear regression model.
    - mse (float): Mean squared error of the model.
    - coefficients (dict): Coefficients of the trained model.
    """

    X = df[features]
    y = df[target_variable]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    df['predicted'] = model.predict(df[features])
    df['predicted'] = df['predicted'].apply(apply_dtype)
    
    mse = mean_squared_error(y_test, y_pred)
    coefficients = dict(zip(features, model.coef_))
    intercept = model.intercept_
    return model, mse, coefficients, intercept

def model_error(df, target_variable):
    df['error'] = df[target_variable] - df['predicted']
    return df['error'].apply(abs)

def model_head(df, features, target_variable, sort_parameter="team_power", sort_ascending=False, n=5):
    return df[features+[target_variable,"predicted","error"]].sort_values(sort_parameter, ascending=sort_ascending).head(n=n)

def model_equation(coefficients, intercept):
    equation = " + ".join([f"{coef:.2e} * {feat}" for feat, coef in coefficients.items()])
    equation = equation + f" + {intercept:.2e}"
    equation = equation.replace("+ -", "- ")
    return equation

def model_predict(**args):
    pass

def plot_predicted_vs_actual(df, target_variable, model=None, features=None):
    """
    Plot the actual values and predicted values from a DataFrame.

    Args:
    - df (DataFrame): Input DataFrame containing the dataset.
    - target_variable (str): Name of the target variable column.
    - model (LinearRegression): Trained linear regression model.
    - features (list): List of feature columns used in the model (optional).

    Returns:
    - None
    """

    plt.figure(figsize=(10, 6))
    plt.plot(df.index, df[target_variable], marker='o', linestyle='-', color="b", label='Actual')
    
    if model and features:
        plt.plot(df.index, df['predicted'], marker='o', linestyle='-', color="r", label='Predicted')
    plt.title('Line Plot of Predicted Values')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

In [23]:
def convert_equation_to_json(features, values, intercept):
    # Usage: convert_equation_to_json(features, coefficients.values(), intercept)
    equation_dict = {}
    for feature, value in zip(features, values):
        equation_dict[feature] = {
            "name": feature,
            "value": value
        }
    equation_dict["intercept"] = {"name":"intercept", "value":intercept}
    return equation_dict

<br>
<h1 style="color:aqua">Randomness Factor</h1>

In [24]:
def randomness_proportional(df, target_variable):
    target_mean = df[target_variable].mean()
    target_std = df[target_variable].std()
    target_min = df[target_variable].min()
    target_max = df[target_variable].max()
    formula = (target_min) / (target_mean) 
    print("Min:",target_min, ",", "Max:",target_max)
    return formula

def randomness_volume(df, target_variable):
    target_mean = df[target_variable].mean()
    target_std = df[target_variable].std()
    target_min = df[target_variable].min()
    target_max = df[target_variable].max()
    formula = (target_mean) / (target_std)
    print("Min:",target_min, ",", "Max:",target_max)
    return formula

In [237]:
def interval_proportional(value, step=0.05):
    lower_bound = value - step
    upper_bound = value + step
    numbers = np.arange(-upper_bound*3, upper_bound + step*3, step)
    return numbers

def interval_volume(value, step=1):
    lower_bound = int(value - step)
    upper_bound = int(value + step)
    numbers = np.arange(-upper_bound*3, upper_bound + step*3, step)
    return numbers

In [26]:
def probability_protector(value):
    if value < 0:
        return 0
    elif value > 1:
        return 1
    else:
        return value

In [27]:
def random_choice(array):
    return np.random.choice(array)

<br><br><br><br>
<h1 style="color:red">Total Shot Volume</h1>

In [28]:
columns_to_drop = ['team_name','team_id','opponent_name','opponent_id']
df = df_log
df["total_sh"] =  df["team_sh"] + df["opponent_sh"]

In [29]:
features = ['team_power', 'team_finishing', 'opponent_power', 'opponent_finishing', "team_sh", "opponent_sh"]
target_variable = 'total_sh'

In [30]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 8.682910377467137e-30
Error Mean: 0.18552631578947368


In [31]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [32]:
model_equation(coefficients, intercept)

'-1.68e-17 * team_power - 8.07e-17 * team_finishing + 1.91e-16 * opponent_power - 5.76e-16 * opponent_finishing + 1.00e+00 * team_sh + 1.00e+00 * opponent_sh + 4.26e-14'

In [33]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=True, n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,opponent_finishing,team_sh,opponent_sh,team_name,team_id,opponent_name,opponent_id,total_sh,predicted,error
296,78,74,81,75,6,3,Athletic Club,448,Sevilla,2,9,9,0
448,81,75,78,74,3,6,Sevilla,481,Athletic Club,5,9,9,0
333,74,72,76,76,8,3,Mallorca,453,Getafe,10,11,11,0
561,76,76,74,72,3,8,Getafe,1860,Mallorca,10,11,11,0
751,74,74,75,71,7,5,Elche,468,Valencia,10,12,12,0


In [34]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=1)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 9 , Max: 47
Result: 4.4231151718134
Interval: [-5 -4 -3 -2 -1  0  1  2  3  4  5  6  7]


2

In [35]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: value + random_choice(interval))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].head(n=5).sort_values("randomness")

Unnamed: 0,team_power,team_finishing,opponent_power,opponent_finishing,team_sh,opponent_sh,team_name,team_id,opponent_name,opponent_id,total_sh,predicted,randomness
4,83,74,75,73,16,6,FC Barcelona,241,Cádiz,10,22,22,18
1,83,74,78,72,15,10,FC Barcelona,241,Real Sociedad,1,25,25,25
0,83,74,75,71,21,4,FC Barcelona,241,Rayo Vallecano,2,25,25,30
2,83,74,74,71,24,8,FC Barcelona,241,Valladolid,2,32,31,31
3,83,74,81,75,18,9,FC Barcelona,241,Sevilla,2,27,27,34


<br><br><br><br>
<h1 style="color:red">Shot Share</h1>

In [36]:
features = ['team_power', 'opponent_power']
target_variable = 'team_sh_share'

In [37]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, float)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 0.023256066852908472
Error Mean: 0.12653189274319573


In [38]:
model_equation(coefficients, intercept)

'1.34e-02 * team_power - 1.04e-02 * opponent_power + 2.72e-01'

In [39]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [40]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=False, n=5)

Unnamed: 0,team_power,opponent_power,team_name,team_id,opponent_name,opponent_id,team_sh_share,predicted,error
49,84,73,Real Madrid,243,Girona,1808,0.730769,0.636166,0.094603
38,84,73,Real Madrid,243,Almería,110,0.74359,0.636166,0.107424
68,84,73,Real Madrid,243,Girona,1808,0.580645,0.636166,-0.055521
69,84,73,Real Madrid,243,Almería,110,0.608696,0.636166,-0.02747
57,84,74,Real Madrid,243,Mallorca,10,0.826087,0.625733,0.200354


In [41]:
randomness_func, interval_func = randomness_proportional, interval_proportional
result = randomness_func(df, target_variable)
interval = interval_func(result, step=0.01)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 0.0384615384615384 , Max: 0.9615384615384616
Result: 0.0769230769230768
Interval: [-0.08692308 -0.07692308 -0.06692308 -0.05692308 -0.04692308 -0.03692308
 -0.02692308 -0.01692308 -0.00692308  0.00307692  0.01307692  0.02307692
  0.03307692  0.04307692  0.05307692  0.06307692  0.07307692  0.08307692
  0.09307692  0.10307692  0.11307692]


0.07307692307692312

In [42]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: value + random_choice(interval))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].head(n=5).sort_values("randomness")

Unnamed: 0,team_power,opponent_power,team_name,team_id,opponent_name,opponent_id,team_sh_share,predicted,randomness
3,83,81,FC Barcelona,241,Sevilla,2,0.666667,0.539295,0.452372
2,83,74,FC Barcelona,241,Valladolid,2,0.75,0.612326,0.555403
1,83,78,FC Barcelona,241,Real Sociedad,1,0.6,0.570594,0.573671
0,83,75,FC Barcelona,241,Rayo Vallecano,2,0.84,0.601893,0.62497
4,83,75,FC Barcelona,241,Cádiz,10,0.727273,0.601893,0.63497


In [43]:
team_power = 82
opponent_power = 80

y = 1.34e-02 * team_power - 1.04e-02 * opponent_power + 2.72e-01
probability_protector(y+random_choice(interval)) # MODEL IS DONE!

0.6018769230769232

<br><br><br><br>
<h1 style="color:red">Shot on Target</h1>

In [58]:
features = ['team_power', 'team_finishing', 'opponent_power', 'team_sh']
target_variable = 'team_sot'

In [59]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 2.948568123815928
Error Mean: 1.375


In [60]:
model_equation(coefficients, intercept)

'5.83e-04 * team_power + 3.81e-02 * team_finishing - 4.25e-02 * opponent_power + 2.99e-01 * team_sh + 7.52e-01'

In [61]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [62]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=False, n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,team_sh,team_name,team_id,opponent_name,opponent_id,team_sot,predicted,error
89,84,80,75,35,Real Madrid,243,Cádiz,10,11,11,0
612,84,80,73,29,Real Madrid,243,Almería,110,15,9,6
240,84,80,74,29,Real Madrid,243,Valladolid,2,17,9,8
461,80,82,75,30,Villarreal,483,Espanyol,1,12,9,3
83,83,74,74,28,FC Barcelona,241,Mallorca,10,8,8,0


In [63]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=1)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 0 , Max: 17
Result: 1.7009399024455396
Interval: [-2 -1  0  1  2  3  4]


2

In [64]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: value + random_choice(interval))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].head(n=5).sort_values("randomness")

Unnamed: 0,team_power,team_finishing,opponent_power,team_sh,team_name,team_id,opponent_name,opponent_id,team_sot,predicted,randomness
3,83,74,76,9,FC Barcelona,241,Celta Vigo,5,2,3,3
1,83,74,78,10,FC Barcelona,241,Athletic Club,5,7,3,4
4,84,80,76,14,Real Madrid,243,Celta Vigo,5,5,4,6
2,83,74,78,10,FC Barcelona,241,Athletic Club,5,2,3,7
0,83,74,76,13,FC Barcelona,241,Celta Vigo,5,5,4,8


<br><br><br><br>
<h1 style="color:purple">Shot Models Test</h1>

<br><br><br><br>
<h1 style="color:black">Lineup Test</h1>

In [65]:
lineup_test_ids = [230621, 235212, 155862, 207865, 252145, 199556, 234153, 230767, 231747, 158023, 190871]
test_team_df = []
for player_id in lineup_test_ids:
    player_df = players_df[players_df['player_id'] == player_id]
    test_team_df.append(player_df)
result_df = pd.concat(test_team_df, ignore_index=True)

In [68]:
TFR(result_df)

Unnamed: 0,league_id,club_team_id,league_name,club_name,power,finishing
0,16,73,Ligue 1,Paris Saint Germain,85,79


<br><br><br><br>
<h1 style="color:red">Pass Total Attempts</h1>

In [243]:
df = pd.read_html("https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/passing/-Manchester-City")[0][:38]
df.sort_values(('For Manchester City', 'Date')).head()
df.columns = df.columns.map('.'.join)

In [244]:
df.columns

Index(['For Manchester City.Date', 'For Manchester City.Time',
       'For Manchester City.Round', 'For Manchester City.Day',
       'For Manchester City.Venue', 'For Manchester City.Result',
       'For Manchester City.GF', 'For Manchester City.GA',
       'For Manchester City.Opponent', 'Total.Cmp', 'Total.Att', 'Total.Cmp%',
       'Total.TotDist', 'Total.PrgDist', 'Short.Cmp', 'Short.Att',
       'Short.Cmp%', 'Medium.Cmp', 'Medium.Att', 'Medium.Cmp%', 'Long.Cmp',
       'Long.Att', 'Long.Cmp%', 'Unnamed: 23_level_0.Ast',
       'Unnamed: 24_level_0.xAG', 'Unnamed: 25_level_0.xA',
       'Unnamed: 26_level_0.KP', 'Unnamed: 27_level_0.1/3',
       'Unnamed: 28_level_0.PPA', 'Unnamed: 29_level_0.CrsPA',
       'Unnamed: 30_level_0.PrgP', 'Unnamed: 31_level_0.Match Report'],
      dtype='object')

In [246]:
# Example DataFrame columns
columns = ['For Manchester City.Date', 'For Manchester City.Time',
           'For Manchester City.Round', 'For Manchester City.Day',
           'For Manchester City.Venue', 'For Manchester City.Result',
           'For Manchester City.GF', 'For Manchester City.GA',
           'For Manchester City.Opponent', 'Total.Cmp', 'Total.Att', 'Total.Cmp%',
           'Total.TotDist', 'Total.PrgDist', 'Short.Cmp', 'Short.Att',
           'Short.Cmp%', 'Medium.Cmp', 'Medium.Att', 'Medium.Cmp%', 'Long.Cmp',
           'Long.Att', 'Long.Cmp%', 'Unnamed: 23_level_0.Ast',
           'Unnamed: 24_level_0.xAG', 'Unnamed: 25_level_0.xA',
           'Unnamed: 26_level_0.KP', 'Unnamed: 27_level_0.1/3',
           'Unnamed: 28_level_0.PPA', 'Unnamed: 29_level_0.CrsPA',
           'Unnamed: 30_level_0.PrgP', 'Unnamed: 31_level_0.Match Report']

# Rename columns
new_columns = [col.split('.')[1] if 'For' in col else col for col in columns]

# Assign new column names to DataFrame
df.columns = new_columns

# Display the DataFrame with updated column names
df


Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,Total.Cmp,...,Long.Cmp%,Unnamed: 23_level_0.Ast,Unnamed: 24_level_0.xAG,Unnamed: 25_level_0.xA,Unnamed: 26_level_0.KP,Unnamed: 27_level_0.1/3,Unnamed: 28_level_0.PPA,Unnamed: 29_level_0.CrsPA,Unnamed: 30_level_0.PrgP,Unnamed: 31_level_0.Match Report
0,2022-08-07,16:30,Matchweek 1,Sun,Away,W,2,0,West Ham,792,...,69.4,1,1.3,1.6,11,52,9,0,51,Match Report
1,2022-08-13,15:00,Matchweek 2,Sat,Home,W,4,0,Bournemouth,672,...,74.1,3,1.7,1.8,17,76,13,3,73,Match Report
2,2022-08-21,16:30,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,554,...,69.5,3,1.8,2.0,17,48,9,2,60,Match Report
3,2022-08-27,15:00,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,734,...,78.3,4,2.3,1.8,15,82,22,8,81,Match Report
4,2022-08-31,19:30,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,741,...,74.2,3,1.8,1.6,13,60,13,1,63,Match Report
5,2022-09-03,17:30,Matchweek 6,Sat,Away,D,1,1,Aston Villa,695,...,70.6,1,1.7,1.2,10,34,13,3,55,Match Report
6,2022-09-17,12:30,Matchweek 8,Sat,Away,W,3,0,Wolves,586,...,73.2,3,0.9,1.3,14,44,12,2,50,Match Report
7,2022-10-02,14:00,Matchweek 9,Sun,Home,W,6,3,Manchester Utd,506,...,78.9,6,2.3,2.4,16,25,13,2,49,Match Report
8,2022-10-08,15:00,Matchweek 10,Sat,Home,W,4,0,Southampton,687,...,71.7,4,1.9,1.1,16,46,14,3,66,Match Report
9,2022-10-16,16:30,Matchweek 11,Sun,Away,L,0,1,Liverpool,646,...,69.8,0,0.9,1.2,12,53,17,4,76,Match Report


In [210]:
def Passing_Mapper(side, team_name):
    side = side.capitalize()
    mapper = {
        "Opponent": (f'{side} {team_name}',     'Opponent'),
        "Total_Cmp": (              'Total',          'Cmp'),
        "Total_Att": (              'Total',          'Att'),
    }
    return mapper

In [211]:
df_log_array = []
current_stat = "passing"
last_matchweek = 38

for tindex in range(20):
    print(tindex)
    time.sleep(1)
    current_team = match_logs_stats_dict[current_stat][tindex]

    url = current_team["url"]
    print(url)
    
    df = FB_MatchLogs(url, last_matchweek=38)
    df_for = df["for"]
    df_against = df["against"]
    logs_cols_for = Passing_Mapper("for", current_team["club_team_name"])
    logs_cols_against = Passing_Mapper("against", current_team["club_team_name"])

    team_names = df_for[logs_cols_for["Opponent"]].unique().tolist()
    team_ids = [TeamNameIdConverter(tpr_data, team_name, league_id) for team_name in team_names]
    team_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])

    for i in range(38):
        x = df_for.iloc[i]
        y = df_against.iloc[i]
        x_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])
        y_rating_data = TFRFromId(tfr_data, TeamNameIdConverter(tpr_data, y[logs_cols_against["Opponent"]], league_id))
        lab_df_dict = {
            "team_name": current_team["club_team_name"],
            "team_id": current_team["club_team_id"],
            "opponent_name": y[logs_cols_against["Opponent"]],
            "opponent_id": TeamNameIdConverter(tpr_data, y[logs_cols_against["Opponent"]], league_id),
            
            "team_cmp_pass": x[logs_cols_for["Total_Cmp"]],
            "team_att_pass": x[logs_cols_for["Total_Att"]],
            
            
            "team_power": x_rating_data["power"],
            "opponent_power": y_rating_data["power"],
        }
        df_log_array.append(lab_df_dict)
        
df_log = pd.DataFrame(df_log_array)
league_name_for_csv_file = league_name
df_log.to_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name_for_csv_file}.csv")

0
https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/passing/-Manchester-City
1
https://fbref.com/en/squads/18bb7c10/2022-2023/matchlogs/c9/passing/-Arsenal
2
https://fbref.com/en/squads/19538871/2022-2023/matchlogs/c9/passing/-Manchester-Utd
3
https://fbref.com/en/squads/b2b47a98/2022-2023/matchlogs/c9/passing/-Newcastle-Utd
4
https://fbref.com/en/squads/822bd0ba/2022-2023/matchlogs/c9/passing/-Liverpool
5
https://fbref.com/en/squads/d07537b9/2022-2023/matchlogs/c9/passing/-Brighton
6
https://fbref.com/en/squads/8602292d/2022-2023/matchlogs/c9/passing/-Aston-Villa
7
https://fbref.com/en/squads/361ca564/2022-2023/matchlogs/c9/passing/-Tottenham
8
https://fbref.com/en/squads/cd051869/2022-2023/matchlogs/c9/passing/-Brentford
9
https://fbref.com/en/squads/fd962109/2022-2023/matchlogs/c9/passing/-Fulham
10
https://fbref.com/en/squads/47c64c55/2022-2023/matchlogs/c9/passing/-Crystal-Palace
11
https://fbref.com/en/squads/cff3d9bb/2022-2023/matchlogs/c9/passing/-Chelsea
12
https://f

KeyError: ('For AFC Bournemouth', 'Opponent')

In [229]:
df = pd.DataFrame(df_log_array)

In [230]:
features = ['team_power', 'opponent_power', 'team_cmp_pass']
target_variable = 'team_att_pass'

In [231]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 205.44532718959235
Error Mean: 12.781954887218046


In [232]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [233]:
model_equation(coefficients, intercept)

'-5.31e-01 * team_power - 1.28e+00 * opponent_power + 9.96e-01 * team_cmp_pass + 2.45e+02'

In [234]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=True, n=5)

Unnamed: 0,team_power,opponent_power,team_cmp_pass,team_name,team_id,opponent_name,opponent_id,team_att_pass,predicted,error
318,75,85,135,Brentford,1925,Manchester City,10,237,231,6
321,75,84,135,Brentford,1925,Liverpool,9,274,232,42
522,79,76,145,West Ham United,19,Fulham,144,235,250,-15
309,75,76,162,Brentford,1925,Leeds United,8,252,269,-17
345,76,79,166,Fulham,144,Arsenal,1,259,269,-10


In [238]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=12)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 235 , Max: 917
Result: 3.904932163637664
Interval: [-45 -33 -21  -9   3  15  27  39]


3

In [241]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: value + random_choice(interval))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].head(n=5).sort_values("randomness")

Unnamed: 0,team_power,opponent_power,team_cmp_pass,team_name,team_id,opponent_name,opponent_id,team_att_pass,predicted,randomness
2,85,78,554,Manchester City,10,Newcastle Utd,13,660,652,643
4,85,76,741,Manchester City,10,Nott'ham Forest,14,831,840,807
1,85,74,672,Manchester City,10,Bournemouth,1943,739,774,813
3,85,76,734,Manchester City,10,Crystal Palace,1799,822,834,825
0,85,79,792,Manchester City,10,West Ham,19,869,887,842
