In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import time
import requests
from bs4 import BeautifulSoup
import lxml
import json
from urllib.parse import quote

from futsim_funcs import TPR,TFR,TPSR,TGKR,TDR

In [2]:
import warnings
warnings.filterwarnings("ignore", message="The default value of numeric_only in DataFrame.mean is deprecated.", category=FutureWarning)

In [3]:
players_df = pd.read_csv("../fifa24_db/pdb_23.csv")

In [4]:
tpr_data = TPR(players_df) # Overall
tfr_data = TFR(players_df) # Shooting
tgkr_data = TGKR(players_df) # Goalkeeping
tdr_data = TDR(players_df) # Defense

In [5]:
tdr_data.sort_values("defense", ascending=False).head(n=5)

Unnamed: 0,league_id,club_team_id,league_name,club_name,power,defense
109,53,243,La Liga,Real Madrid,84,78
101,16,73,Ligue 1,Paris Saint Germain,83,76
30,13,5,Premier League,Chelsea,84,75
70,31,44,Serie A,Inter,83,75
81,13,9,Premier League,Liverpool,84,75


<br><br><br><br>

# FBRef Match Logs URLs

In [6]:
leagues = ["Premier League", "Serie A", "Ligue 1", "LaLiga", "Bundesliga"]
fbref_league_dict = {
    "Premier League": {"name": "Premier League", "fbref_league_id": 9, "league_id": 13},
    "Serie A": {"name": "Serie A", "fbref_league_id": 11, "league_id": 31},
    "Ligue 1": {"name": "Ligue 1", "fbref_league_id": 13, "league_id": 16},
    "LaLiga": {"name": "LaLiga", "fbref_league_id": 12, "league_id": 53},
    "Bundesliga": {"name": "Bundesliga", "fbref_league_id": 20, "league_id": 19},   
}

In [7]:
def GetTeamCredentials(league_id, season="2022-2023"):
    league_url = f"https://fbref.com/en/comps/{league_id}/{season}"
    response = requests.get(league_url)
    soup = BeautifulSoup(response.content, "lxml")

    table = soup.find("table", {"class": "stats_table"})
    team_ids = []
    team_names = []

    for row in table.find_all("tr")[1:]:
        team_name = row.find("td", {"data-stat": "team"}).text
        team_id = row.find("td", {"data-stat": "team"}).a.get("href").split("/")[3]
        team_ids.append(team_id)
        team_names.append(team_name)
    return {
        "team_ids":team_ids,
        "team_names":team_names,
    }

In [8]:
from rapidfuzz import process
def find_best_match(name, choices):
    return process.extractOne(name, choices)

In [9]:
def TPRFromId(tpr_data, club_team_id):
    power = tpr_data.query(f"club_team_id == {club_team_id}")["power"].iloc[0]
    return {
        "power": power,
    }

In [10]:
def TFRFromId(tfr_data, club_team_id):
    tfr_data = tfr_data["data"]
    power = tfr_data.query(f"club_team_id == {club_team_id}")["power"].iloc[0]
    finishing = tfr_data.query(f"club_team_id == {club_team_id}")["finishing"].iloc[0]
    return {
        "power": power,
        "finishing": finishing,
    }

In [11]:
def TeamNameIdConverter(tpr_data, team_name, league_id, print_output=False):
    # TeamNameIdConverter(tpr_data,"chelseas",13,1)
    club_team_names = tpr_data.query(f"league_id == {league_id}")["club_name"].tolist()
    club_team_ids = tpr_data.query(f"league_id == {league_id}")["club_team_id"].tolist()
    club_team_name, score, other = find_best_match(team_name, club_team_names)
    club_team_id = club_team_ids[club_team_names.index(club_team_name)]
    if print_output:
        print(f"ID for {club_team_name}:")
    return club_team_id

In [15]:
def FB_URLS(creds, tpr_data, stat_attribute, fbref_league_id, league_id, season="2022-2023"):
    tpr_data = tpr_data["data"]
    team_ids = creds["team_ids"]
    team_names = creds["team_names"]
    club_team_names = tpr_data.query(f"league_id == {league_id}")["club_name"].tolist()
    club_team_ids = tpr_data.query(f"league_id == {league_id}")["club_team_id"].tolist()

    for team_id, team_name in zip(team_ids, team_names):
        best_match = find_best_match(team_name, club_team_names)
        if best_match:
            club_team_name, score, other = best_match
            club_team_id = club_team_ids[club_team_names.index(club_team_name)]
            match_logs_url = f"https://fbref.com/en/squads/{team_id}/{season}/matchlogs/c{fbref_league_id}/{stat_attribute}/{team_name.replace(' ', '-')}"
            match_logs_stats_dict[stat_attribute].append({
                "id": team_id,
                "league_id": league_id,
                "fbref_league_id": fbref_league_id,
                "stat": stat_attribute,
                "team": team_name,
                "club_team_id": club_team_id,
                "club_team_name": club_team_name, 
                "url": match_logs_url
            })
        else:
            print(f"No match found for team: {team_name}")

<h1 style="color:orange">ü¶∏‚Äç‚ôÇÔ∏èMust Parameters</h1>

In [16]:
match_logs_stats_dict = {
    "shooting":[],
    "passing":[],
    "defense":[],
    "keeper":[],
    "passing_types":[],
    "gca":[],
    "possession":[],
    "misc":[],
}

attributes = match_logs_stats_dict.keys()
league_name = "Premier League"
league_id = fbref_league_dict[league_name]["league_id"]
fbref_league_id = fbref_league_dict[league_name]["fbref_league_id"]
season = "2022-2023"
team_number_league = 20
last_matchweek = 38
creds = GetTeamCredentials(fbref_league_id, season)

for stat_attribute in attributes:
    FB_URLS(creds, tpr_data, stat_attribute, fbref_league_id, league_id)

<br><br><br><br>
<h1 style="color:green">Scraping FBRef</h1>

In [17]:
df = pd.read_html("https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City")[1][:38]
df.sort_values(('Against Manchester City', 'Date')).head()
df.columns

MultiIndex([('Against Manchester City',         'Date'),
            ('Against Manchester City',         'Time'),
            ('Against Manchester City',        'Round'),
            ('Against Manchester City',          'Day'),
            ('Against Manchester City',        'Venue'),
            ('Against Manchester City',       'Result'),
            ('Against Manchester City',           'GF'),
            ('Against Manchester City',           'GA'),
            ('Against Manchester City',     'Opponent'),
            (               'Standard',          'Gls'),
            (               'Standard',           'Sh'),
            (               'Standard',          'SoT'),
            (               'Standard',         'SoT%'),
            (               'Standard',         'G/Sh'),
            (               'Standard',        'G/SoT'),
            (               'Standard',         'Dist'),
            (               'Standard',           'FK'),
            (               'St

In [18]:
def FB_MatchLogs(url, last_matchweek=38):
    encoded_url = quote(url, safe=':/')
    df=pd.read_html(encoded_url)
    return {
        "for": df[0][:last_matchweek],
        "against": df[1][:last_matchweek],
    }

In [19]:
def MatchLogs_MultiIndexColumnDict(df, stat_attribute=None):
    stat_columns_dict = {}
    for item in df.columns:
        right_side = item[1]
        stat_columns_dict[right_side] = item
    if stat_attribute:
        return stat_columns_dict[stat_attribute]
    return stat_columns_dict

In [20]:
match_logs_stats_dict["shooting"][0]

{'id': 'b8fd03ef',
 'league_id': 13,
 'fbref_league_id': 9,
 'stat': 'shooting',
 'team': ' Manchester City',
 'club_team_id': 10,
 'club_team_name': 'Manchester City',
 'url': 'https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City'}

<br><br><br><br>
<h1 style="color:aqua">Regression Boilerplate</h1>

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train_linear_regression(df, features, target_variable, apply_dtype, test_size=0.19, random_state=42):
    """
    Train a linear regression model and evaluate its performance.

    Args:
    - df (DataFrame): Input DataFrame containing the dataset.
    - features (list): List of feature columns.
    - target_variable (str): Name of the target variable column.
    - test_size (float): Proportion of the dataset to include in the test split.
    - random_state (int): Random seed for reproducibility.

    Returns:
    - model (LinearRegression): Trained linear regression model.
    - mse (float): Mean squared error of the model.
    - coefficients (dict): Coefficients of the trained model.
    """

    X = df[features]
    y = df[target_variable]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    df['predicted'] = model.predict(df[features])
    df['predicted'] = df['predicted'].apply(apply_dtype)
    
    mse = mean_squared_error(y_test, y_pred)
    coefficients = dict(zip(features, model.coef_))
    intercept = model.intercept_
    return model, mse, coefficients, intercept

def model_error(df, target_variable):
    df['error'] = df[target_variable] - df['predicted']
    return df['error'].apply(abs)

def model_head(df, features, target_variable, sort_parameter="team_power", sort_ascending=False, n=5):
    return df[features+[target_variable,"predicted","error"]].sort_values(sort_parameter, ascending=sort_ascending).head(n=n)

def model_equation(coefficients, intercept):
    equation = " + ".join([f"{coef:.2e} * {feat}" for feat, coef in coefficients.items()])
    equation = equation + f" + {intercept:.2e}"
    equation = equation.replace("+ -", "- ")
    return equation

def model_predict(**args):
    pass

def plot_predicted_vs_actual(df, target_variable, model=None, features=None):
    """
    Plot the actual values and predicted values from a DataFrame.

    Args:
    - df (DataFrame): Input DataFrame containing the dataset.
    - target_variable (str): Name of the target variable column.
    - model (LinearRegression): Trained linear regression model.
    - features (list): List of feature columns used in the model (optional).

    Returns:
    - None
    """

    plt.figure(figsize=(10, 6))
    plt.plot(df.index, df[target_variable], marker='o', linestyle='-', color="b", label='Actual')
    
    if model and features:
        plt.plot(df.index, df['predicted'], marker='o', linestyle='-', color="r", label='Predicted')
    plt.title('Line Plot of Predicted Values')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

In [22]:
def convert_equation_to_json(features, values, intercept):
    # Usage: convert_equation_to_json(features, coefficients.values(), intercept)
    equation_dict = {}
    for feature, value in zip(features, values):
        equation_dict[feature] = {
            "name": feature,
            "value": value
        }
    equation_dict["intercept"] = {"name":"intercept", "value":intercept}
    return equation_dict

<br>
<h1 style="color:aqua">Randomness Factor</h1>

In [23]:
def randomness_proportional(df, target_variable):
    target_mean = df[target_variable].mean()
    target_std = df[target_variable].std()
    target_min = df[target_variable].min()
    target_max = df[target_variable].max()
    formula = (target_min) / (target_mean) 
    print("Min:",target_min, ",", "Max:",target_max)
    return formula

def randomness_volume(df, target_variable):
    target_mean = df[target_variable].mean()
    target_std = df[target_variable].std()
    target_min = df[target_variable].min()
    target_max = df[target_variable].max()
    formula = (target_mean) / (target_std)
    print("Min:",target_min, ",", "Max:",target_max)
    return formula

In [24]:
def interval_proportional(value, step=0.05, expand_multiplier=1):
    lower_bound = round(value - step, 3)
    upper_bound = round(value + step, 3)
    numbers = np.arange(-upper_bound*expand_multiplier, upper_bound + step*expand_multiplier, step)
    return numbers

def interval_volume(value, step=1, expand_multiplier=1):
    lower_bound = int(value - step)
    upper_bound = int(value + step)
    numbers = np.arange(-upper_bound*expand_multiplier, upper_bound + step*expand_multiplier, step)
    return numbers

In [25]:
def probability_protector(value):
    if value < 0:
        return 0
    elif value > 1:
        return 1
    else:
        return value
    
def negativity_protector(value, raw_value):
    if value < 0:
        return raw_value
    else:
        return value

In [26]:
def random_choice(array):
    return np.random.choice(array)

<br><br><br><br>
<h1 style="color:pink">Match Logs DataFrame</h1>

In [27]:
def ColumnConverter(df, keyword):
    df_cols = df.columns.map('.'.join)
    new_cols = [col.split('.')[1] if keyword in col else col for col in df_cols]
    return new_cols

In [28]:
index_values = ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
                'Opponent', 'Total.Cmp', 'Total.Att', 'Total.Cmp%', 'Total.TotDist',
                'Total.PrgDist', 'Short.Cmp', 'Short.Att', 'Short.Cmp%', 'Medium.Cmp',
                'Medium.Att', 'Medium.Cmp%', 'Long.Cmp', 'Long.Att', 'Long.Cmp%',
                'Unnamed: 23_level_0.Ast', 'Unnamed: 24_level_0.xAG',
                'Unnamed: 25_level_0.xA', 'Unnamed: 26_level_0.KP',
                'Unnamed: 27_level_0.1/3', 'Unnamed: 28_level_0.PPA',
                'Unnamed: 29_level_0.CrsPA', 'Unnamed: 30_level_0.PrgP',
                'Unnamed: 31_level_0.Match Report']

terms = ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
         'Opponent', 'Total_Completion', 'Total_Attempts', 'Total_Completion_Rate',
         'Total_Total_Distance', 'Total_Progressed_Distance', 'Short_Completion',
         'Short_Attempts', 'Short_Completion_Rate', 'Medium_Completion',
         'Medium_Attempts', 'Medium_Completion_Rate', 'Long_Completion',
         'Long_Attempts', 'Long_Completion_Rate', 'Assists', 'Expected_Goals_Against',
         'Expected_Assists', 'Key_Passes', 'Final_Third_Passes', 'Passes_Progressive',
         'Crosses_Penalty_Area', 'Progressive_Passes',
         'Match_Report']

print(len(index_values))
print(len(terms))

test_index_dict = dict(zip(terms, index_values))
test_index_dict

32
32


{'Date': 'Date',
 'Time': 'Time',
 'Round': 'Round',
 'Day': 'Day',
 'Venue': 'Venue',
 'Result': 'Result',
 'GF': 'GF',
 'GA': 'GA',
 'Opponent': 'Opponent',
 'Total_Completion': 'Total.Cmp',
 'Total_Attempts': 'Total.Att',
 'Total_Completion_Rate': 'Total.Cmp%',
 'Total_Total_Distance': 'Total.TotDist',
 'Total_Progressed_Distance': 'Total.PrgDist',
 'Short_Completion': 'Short.Cmp',
 'Short_Attempts': 'Short.Att',
 'Short_Completion_Rate': 'Short.Cmp%',
 'Medium_Completion': 'Medium.Cmp',
 'Medium_Attempts': 'Medium.Att',
 'Medium_Completion_Rate': 'Medium.Cmp%',
 'Long_Completion': 'Long.Cmp',
 'Long_Attempts': 'Long.Att',
 'Long_Completion_Rate': 'Long.Cmp%',
 'Assists': 'Unnamed: 23_level_0.Ast',
 'Expected_Goals_Against': 'Unnamed: 24_level_0.xAG',
 'Expected_Assists': 'Unnamed: 25_level_0.xA',
 'Key_Passes': 'Unnamed: 26_level_0.KP',
 'Final_Third_Passes': 'Unnamed: 27_level_0.1/3',
 'Passes_Progressive': 'Unnamed: 28_level_0.PPA',
 'Crosses_Penalty_Area': 'Unnamed: 29_level_0.C

In [113]:
def prepare_expand_dict(column_dict, x, y):
    keys = column_dict.keys()
    values = column_dict.values()
    expanded_dict = {}
    for key, value in zip(keys, values):
        expanded_dict["team_"+key.lower()] = x[value]
        expanded_dict["opponent_"+key.lower()] = y[value]
    return expanded_dict

In [114]:
prepared_rating_data = [tpr_data, tfr_data]

In [115]:
def prepare_rating_dict(ratings_array, team_id, suffix):
    dict_ = {}
    for rating in ratings_array:
        metric_data = rating["data"].query(f"club_team_id == {team_id}")[rating["name"]].iloc[0]
        dict_[suffix+"_"+rating["name"]] = metric_data
    return dict_

In [116]:
prepare_rating_dict(prepared_rating_data, 1, "team")

{'team_power': 79, 'team_finishing': 78}

In [117]:
# expand_dict(test_index_dict, x, y=None)

In [137]:
def generate_fbref_match_logs_array \
(match_logs_stats_dict, columns_dict, tpr_data, other_rating_data, league_id, team_number_league, last_matchweek, current_stat, opponent=True):
    
    df_log_array = []
    tpr_data = tpr_data["data"]
    
    for tindex in range(team_number_league):
        time.sleep(0.1)
        current_team = match_logs_stats_dict[current_stat][tindex]

        url = current_team["url"]
        print(url, tindex)

        match_logs = FB_MatchLogs(url, last_matchweek=last_matchweek)
        df0 = match_logs["for"]
        df0.columns = ColumnConverter(df0, "For")
        
        df1 = match_logs["against"]
        df1.columns = ColumnConverter(df1, "Against")
        
        
        team_names = df0["Opponent"].unique().tolist()
        team_ids = [TeamNameIdConverter(tpr_data, team_name, league_id) for team_name in team_names]
        team_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])

        for i in range(last_matchweek):
            x = df0.iloc[i]
            y = df1.iloc[i]
            x_rating_data = prepare_rating_dict(other_rating_data, current_team["club_team_id"], suffix="team")
            y_rating_data = prepare_rating_dict(other_rating_data, TeamNameIdConverter(tpr_data, x["Opponent"], league_id), suffix="opponent")
            lab_df_dict = {
                "league_id": league_id,
                "team_name": current_team["club_team_name"],
                "team_id": current_team["club_team_id"],
                "opponent_name": x["Opponent"],
                "opponent_id": TeamNameIdConverter(tpr_data, x["Opponent"], league_id),
                "team_power": x_rating_data["team_power"],
                "opponent_power": y_rating_data["opponent_power"],
            }
            merged_dict = {**lab_df_dict, **prepare_expand_dict(columns_dict, x, y), **x_rating_data, **y_rating_data}
            df_log_array.append(merged_dict)
    Print("Data collecting has been completed! ‚úÖ")
    return df_log_array

<br><br><br><br>
<h1 style="color:red">"shooting"</h1>

In [138]:
current_stat = "shooting"

In [139]:
column_dict = {
    'date': 'Date',
    'time': 'Time',
    'round': 'Round',
    'day': 'Day',
    'venue': 'Venue',
    'result': 'Result',
    'goals_for': 'GF',
    'goals_against': 'GA',
    'opponent': 'Opponent',
    'standard_goals': 'Standard.Gls',
    'standard_shots': 'Standard.Sh',
    'standard_shots_on_target': 'Standard.SoT',
    'standard_shots_on_target_percentage': 'Standard.SoT%',
    'standard_goals_per_shot': 'Standard.G/Sh',
    'standard_goals_per_shot_on_target': 'Standard.G/SoT',
    'standard_distance': 'Standard.Dist',
    'standard_free_kicks': 'Standard.FK',
    'standard_penalties': 'Standard.PK',
    'standard_penalty_attempts': 'Standard.PKatt',
    'expected_goals': 'Expected.xG',
    'expected_non_penalty_expected_goals': 'Expected.npxG',
    'expected_non_penalty_expected_goals_per_shot': 'Expected.npxG/Sh',
    'expected_goals_minus_expected_goals': 'Expected.G-xG',
    'expected_non_penalty_goals_minus_expected_goals': 'Expected.np:G-xG',
    'match_report': 'Unnamed: 24_level_0.Match Report'
}
prepared_index_dict = column_dict

In [140]:
df_log_array = generate_fbref_match_logs_array \
(match_logs_stats_dict, prepared_index_dict, tpr_data, prepared_rating_data, league_id, team_number_league, last_matchweek, current_stat)
df_log = pd.DataFrame(df_log_array)
df_log.to_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name}@{current_stat}.csv")

https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City 0
https://fbref.com/en/squads/18bb7c10/2022-2023/matchlogs/c9/shooting/-Arsenal 1
https://fbref.com/en/squads/19538871/2022-2023/matchlogs/c9/shooting/-Manchester-Utd 2
https://fbref.com/en/squads/b2b47a98/2022-2023/matchlogs/c9/shooting/-Newcastle-Utd 3
https://fbref.com/en/squads/822bd0ba/2022-2023/matchlogs/c9/shooting/-Liverpool 4
https://fbref.com/en/squads/d07537b9/2022-2023/matchlogs/c9/shooting/-Brighton 5
https://fbref.com/en/squads/8602292d/2022-2023/matchlogs/c9/shooting/-Aston-Villa 6
https://fbref.com/en/squads/361ca564/2022-2023/matchlogs/c9/shooting/-Tottenham 7
https://fbref.com/en/squads/cd051869/2022-2023/matchlogs/c9/shooting/-Brentford 8
https://fbref.com/en/squads/fd962109/2022-2023/matchlogs/c9/shooting/-Fulham 9
https://fbref.com/en/squads/47c64c55/2022-2023/matchlogs/c9/shooting/-Crystal-Palace 10
https://fbref.com/en/squads/cff3d9bb/2022-2023/matchlogs/c9/shooting/-Chelsea 11


In [141]:
df_log = pd.read_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name}@{current_stat}.csv")

In [142]:
df_log.loc[0]

Unnamed: 0                                                                0
league_id                                                                13
team_name                                                   Manchester City
team_id                                                                  10
opponent_name                                                      West Ham
opponent_id                                                              19
team_power                                                               85
opponent_power                                                           79
team_date                                                        2022-08-07
opponent_date                                                    2022-08-07
team_time                                                             16:30
opponent_time                                                         16:30
team_round                                                      Matchweek 1
opponent_rou

<br><br><br><br>
<h1 style="color:red">Total Shot Volume</h1>

In [144]:
columns_to_drop = ['team_name','team_id','opponent_name','opponent_id']
df = df_log
df["total_sh"] =  df["team_standard_shots"] + df["opponent_standard_shots"]

In [145]:
features = ['team_power', 'team_finishing', 'opponent_power', 'opponent_finishing', "team_standard_shots", "opponent_standard_shots"]
target_variable = 'total_sh'

In [146]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 5.055238297457684e-29
Error Mean: 0.3144736842105263


In [147]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [148]:
model_equation(coefficients, intercept)

'-2.33e-16 * team_power + 3.04e-16 * team_finishing + 8.69e-16 * opponent_power + 4.69e-16 * opponent_finishing + 1.00e+00 * team_standard_shots + 1.00e+00 * opponent_standard_shots - 8.53e-14'

In [149]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=True, n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,opponent_finishing,team_standard_shots,opponent_standard_shots,team_name,team_id,opponent_name,opponent_id,total_sh,predicted,error
252,79,72,76,70,8,3,Aston Villa,2,Crystal Palace,1799,11,11,0
61,79,78,78,77,10,1,Arsenal,1,Leicester City,95,11,11,0
404,76,70,79,72,3,8,Crystal Palace,1799,Aston Villa,2,11,11,0
669,78,77,79,78,1,10,Leicester City,95,Arsenal,1,11,11,0
301,81,75,79,72,4,8,Tottenham Hotspur,18,Aston Villa,2,12,12,0


In [150]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=1)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 11 , Max: 45
Result: 4.227159496149049
Interval: [-5 -4 -3 -2 -1  0  1  2  3  4  5]


3

In [151]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: negativity_protector(value + random_choice(interval), value))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=False).head(n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,opponent_finishing,team_standard_shots,opponent_standard_shots,team_name,team_id,opponent_name,opponent_id,total_sh,predicted,randomness
668,78,77,82,81,19,26,Leicester City,95,Manchester Utd,11,45,44,48
189,84,79,75,70,30,15,Liverpool,9,Southampton,17,45,44,44
104,82,81,77,77,29,15,Manchester United,11,Everton,7,44,43,42
637,77,77,82,81,15,29,Everton,7,Manchester Utd,11,44,43,42
99,82,81,78,77,26,19,Manchester United,11,Leicester City,95,45,44,42


<br><br><br><br>
<h1 style="color:red">Shot Share</h1>

In [153]:
df["team_sh_share"] = df["team_standard_shots"] / df["total_sh"]

In [154]:
features = ['team_power', 'opponent_power']
target_variable = 'team_sh_share'

In [155]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, float)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 0.026681810590791683
Error Mean: 0.12985605019186


In [156]:
model_equation(coefficients, intercept)

'1.60e-02 * team_power - 1.45e-02 * opponent_power + 3.78e-01'

In [157]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [158]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=False, n=5)

Unnamed: 0,team_power,opponent_power,team_name,team_id,opponent_name,opponent_id,team_sh_share,predicted,error
13,85,74,Manchester City,10,Brentford,1925,0.74359,0.667946,0.075644
1,85,74,Manchester City,10,Bournemouth,1943,0.863636,0.667946,0.195691
37,85,74,Manchester City,10,Brentford,1925,0.607143,0.667946,-0.060803
24,85,74,Manchester City,10,Bournemouth,1943,0.606061,0.667946,-0.061885
36,85,75,Manchester City,10,Brighton,1808,0.393939,0.653459,-0.25952


In [159]:
randomness_func, interval_func = randomness_proportional, interval_proportional
result = randomness_func(df, target_variable)
interval = interval_func(result, step=0.01)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 0.041666666666666664 , Max: 0.9583333333333334
Result: 0.08333333333333333
Interval: [-0.093 -0.083 -0.073 -0.063 -0.053 -0.043 -0.033 -0.023 -0.013 -0.003
  0.007  0.017  0.027  0.037  0.047  0.057  0.067  0.077  0.087  0.097]


-0.03300000000000003

In [160]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: probability_protector(value + random_choice(interval)))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=False).head(n=5)

Unnamed: 0,team_power,opponent_power,team_name,team_id,opponent_name,opponent_id,team_sh_share,predicted,randomness
37,85,74,Manchester City,10,Brentford,1925,0.607143,0.667946,0.744946
26,85,76,Manchester City,10,Crystal Palace,1799,0.764706,0.638973,0.735973
449,84,74,Chelsea,5,Brentford,1925,0.681818,0.651927,0.728927
31,85,76,Manchester City,10,Fulham,144,0.733333,0.638973,0.725973
163,84,76,Liverpool,9,Leeds United,8,0.611111,0.622955,0.719955


In [161]:
team_power = 82
opponent_power = 76

y = 1.34e-02 * team_power - 1.04e-02 * opponent_power + 2.72e-01
probability_protector(y+random_choice(interval)) # MODEL IS DONE!

0.5374

<br><br><br><br>
<h1 style="color:red">Shot on Target</h1>

In [164]:
features = ['team_power', 'team_finishing', 'opponent_power', 'team_standard_shots']
target_variable = 'team_standard_shots_on_target'

In [165]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 2.706647120396336
Error Mean: 1.3657894736842104


In [166]:
model_equation(coefficients, intercept)

'3.19e-03 * team_power - 1.82e-04 * team_finishing + 1.42e-03 * opponent_power + 2.90e-01 * team_standard_shots + 1.61e-01'

In [167]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [168]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=False, n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,team_standard_shots,team_name,team_id,opponent_name,opponent_id,team_standard_shots_on_target,predicted,error
189,84,79,75,30,Liverpool,9,Southampton,17,8,9,-1
408,76,70,78,31,Crystal Palace,1799,Leicester City,95,9,9,0
63,79,78,74,31,Arsenal,1,Bournemouth,1943,9,9,0
215,75,70,74,32,Brighton & Hove Albion,1808,Brentford,1925,13,9,4
14,85,82,76,26,Manchester City,10,Leeds United,8,9,8,1


In [169]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=1)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 0 , Max: 13
Result: 1.773270511652842
Interval: [-2 -1  0  1  2]


-1

In [170]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: negativity_protector(value + random_choice(interval), value))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=False).head(n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,team_standard_shots,team_name,team_id,opponent_name,opponent_id,team_standard_shots_on_target,predicted,randomness
215,75,70,74,32,Brighton & Hove Albion,1808,Brentford,1925,13,9,11
63,79,78,74,31,Arsenal,1,Bournemouth,1943,9,9,11
445,84,73,79,27,Chelsea,5,Aston Villa,2,7,8,10
85,82,81,81,28,Manchester United,11,Tottenham,18,10,8,9
357,76,76,76,23,Fulham,144,Crystal Palace,1799,10,7,9


<br><br><br><br>
<h1 style="color:purple">Shot Models Test</h1>

In [178]:
t = df.sort_values("team_standard_shots_on_target", ascending=False).iloc[0]

for column_name, cell_value in t.items():
    print(f"Column Name: {column_name}, Value: {cell_value}")


Column Name: Unnamed: 0, Value: 215
Column Name: league_id, Value: 13
Column Name: team_name, Value: Brighton & Hove Albion
Column Name: team_id, Value: 1808
Column Name: opponent_name, Value: Brentford
Column Name: opponent_id, Value: 1925
Column Name: team_power, Value: 75
Column Name: opponent_power, Value: 74
Column Name: team_date, Value: 2023-04-01
Column Name: opponent_date, Value: 2023-04-01
Column Name: team_time, Value: 15:00
Column Name: opponent_time, Value: 15:00
Column Name: team_round, Value: Matchweek 29
Column Name: opponent_round, Value: Matchweek 29
Column Name: team_day, Value: Sat
Column Name: opponent_day, Value: Sat
Column Name: team_venue, Value: Home
Column Name: opponent_venue, Value: Away
Column Name: team_result, Value: D
Column Name: opponent_result, Value: D
Column Name: team_goals_for, Value: 3
Column Name: opponent_goals_for, Value: 3
Column Name: team_goals_against, Value: 3
Column Name: opponent_goals_against, Value: 3
Column Name: team_opponent, Value

<br><br><br><br>
<h1 style="color:black">Lineup Test</h1>

In [None]:
lineup_test_ids = [230621, 235212, 155862, 207865, 252145, 199556, 234153, 230767, 231747, 158023, 190871]
test_team_df = []
for player_id in lineup_test_ids:
    player_df = players_df[players_df['player_id'] == player_id]
    test_team_df.append(player_df)
result_df = pd.concat(test_team_df, ignore_index=True)

In [None]:
TFR(result_df)

<br><br><br><br>
<h1 style="color:red">Pass Total Attempts</h1>

In [None]:
df = pd.read_html("https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/passing/-Manchester-City")[0][:38]
df.sort_values(('For Manchester City', 'Date')).head()
df_cols = df.columns.map('.'.join)
new_cols = [col.split('.')[1] if 'For' in col else col for col in df_cols]
df.columns = new_cols
df.columns

In [None]:
df_log_array = []
current_stat = "passing"
last_matchweek = 38

for tindex in range(20):
    print(tindex)
    time.sleep(0.1)
    current_team = match_logs_stats_dict[current_stat][tindex]

    url = current_team["url"]
    print(url)
    
    df = FB_MatchLogs(url, last_matchweek=last_matchweek)["for"]
    df.columns = ColumnConverter(df, "For")
    
    team_names = df["Opponent"].unique().tolist()
    team_ids = [TeamNameIdConverter(tpr_data, team_name, league_id) for team_name in team_names]
    team_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])

    for i in range(last_matchweek):
        x = df.iloc[i]
        x_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])
        y_rating_data = TFRFromId(tfr_data, TeamNameIdConverter(tpr_data, x["Opponent"], league_id))
        lab_df_dict = {
            "team_name": current_team["club_team_name"],
            "team_id": current_team["club_team_id"],
            "opponent_name": x["Opponent"],
            "opponent_id": TeamNameIdConverter(tpr_data, x["Opponent"], league_id),
            
            "team_att_pass": x["Total.Att"],
            "team_cmp_pass": x["Total.Cmp"],
            
            "team_power": x_rating_data["power"],
            "opponent_power": y_rating_data["power"],
        }
        
        df_log_array.append(lab_df_dict)
        
df_log = pd.DataFrame(df_log_array)
league_name_for_csv_file = league_name
df_log.to_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name_for_csv_file}@{current_stat}.csv")

In [None]:
df = df_log
features = ['team_power', 'opponent_power', 'team_att_pass']
target_variable = 'team_cmp_pass'

In [None]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

In [None]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [None]:
model_equation(coefficients, intercept)

In [None]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=True, n=5)

In [None]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=12, expand_multiplier=3)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

In [None]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: negativity_protector(value + random_choice(interval), value))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=False).head(n=5)