In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import time
import requests
from bs4 import BeautifulSoup
import lxml
import json
from urllib.parse import quote

from futsim_funcs import TPR, TFR, TPSR, TGKR, TDR

In [2]:
import warnings
warnings.filterwarnings("ignore", message="The default value of numeric_only in DataFrame.mean is deprecated.", category=FutureWarning)

In [3]:
players_df = pd.read_csv("../fifa24_db/pdb_23.csv")

In [4]:
tpr_data = TPR(players_df) # Overall
tfr_data = TFR(players_df) # Shooting
tgkr_data = TGKR(players_df) # Goalkeeping
tdr_data = TDR(players_df) # Defense

In [5]:
tpr_data["data"].sort_values("power", ascending=False).head(n=5)

Unnamed: 0,league_id,club_team_id,league_name,club_name,power,mean_goalkeeping_diving,mean_goalkeeping_handling,mean_goalkeeping_kicking,mean_goalkeeping_positioning,mean_goalkeeping_reflexes,...,mean_power_long_shots,mean_mentality_aggression,mean_mentality_interceptions,mean_mentality_positioning,mean_mentality_vision,mean_mentality_penalties,mean_mentality_composure,mean_defending_marking_awareness,mean_defending_standing_tackle,mean_defending_sliding_tackle
84,13,10,Premier League,Manchester City,85,81.0,78.0,89.0,84.0,86.0,...,69,76,68,71,77,61,83,70,71,63
109,53,243,La Liga,Real Madrid,84,84.0,89.0,75.0,89.0,90.0,...,69,71,67,73,76,63,81,66,66,62
81,13,9,Premier League,Liverpool,84,86.0,85.0,85.0,90.0,89.0,...,66,78,71,70,75,64,83,68,72,66
30,13,5,Premier League,Chelsea,84,81.0,80.0,83.0,81.0,85.0,...,65,71,67,73,77,62,81,66,69,65
43,53,241,La Liga,FC Barcelona,83,86.0,85.0,87.0,85.0,90.0,...,62,72,68,72,76,63,81,66,67,63


<br><br><br><br>

# FBRef Match Logs URLs

In [6]:
leagues = ["Premier League", "Serie A", "Ligue 1", "LaLiga", "Bundesliga"]
fbref_league_dict = {
    "Premier League": {"name": "Premier League", "fbref_league_id": 9, "league_id": 13},
    "Serie A": {"name": "Serie A", "fbref_league_id": 11, "league_id": 31},
    "Ligue 1": {"name": "Ligue 1", "fbref_league_id": 13, "league_id": 16},
    "LaLiga": {"name": "LaLiga", "fbref_league_id": 12, "league_id": 53},
    "Bundesliga": {"name": "Bundesliga", "fbref_league_id": 20, "league_id": 19},   
}

In [7]:
def GetTeamCredentials(league_id, season="2022-2023"):
    league_url = f"https://fbref.com/en/comps/{league_id}/{season}"
    response = requests.get(league_url)
    soup = BeautifulSoup(response.content, "lxml")

    table = soup.find("table", {"class": "stats_table"})
    team_ids = []
    team_names = []

    for row in table.find_all("tr")[1:]:
        team_name = row.find("td", {"data-stat": "team"}).text
        team_id = row.find("td", {"data-stat": "team"}).a.get("href").split("/")[3]
        team_ids.append(team_id)
        team_names.append(team_name)
    return {
        "team_ids":team_ids,
        "team_names":team_names,
    }

In [8]:
from rapidfuzz import process
def find_best_match(name, choices):
    return process.extractOne(name, choices)

In [9]:
def TPRFromId(tpr_data, club_team_id):
    power = tpr_data.query(f"club_team_id == {club_team_id}")["power"].iloc[0]
    return {
        "power": power,
    }

In [10]:
def TFRFromId(tfr_data, club_team_id):
    tfr_data = tfr_data["data"]
    power = tfr_data.query(f"club_team_id == {club_team_id}")["power"].iloc[0]
    finishing = tfr_data.query(f"club_team_id == {club_team_id}")["finishing"].iloc[0]
    return {
        "power": power,
        "finishing": finishing,
    }

In [11]:
def TeamNameIdConverter(tpr_data, team_name, league_id, print_output=False):
    # TeamNameIdConverter(tpr_data,"chelseas",13,1)
    club_team_names = tpr_data.query(f"league_id == {league_id}")["club_name"].tolist()
    club_team_ids = tpr_data.query(f"league_id == {league_id}")["club_team_id"].tolist()
    club_team_name, score, other = find_best_match(team_name, club_team_names)
    club_team_id = club_team_ids[club_team_names.index(club_team_name)]
    if print_output:
        print(f"ID for {club_team_name}:")
    return club_team_id

In [12]:
def FB_URLS(creds, tpr_data, stat_attribute, fbref_league_id, league_id, season="2022-2023"):
    tpr_data = tpr_data["data"]
    team_ids = creds["team_ids"]
    team_names = creds["team_names"]
    club_team_names = tpr_data.query(f"league_id == {league_id}")["club_name"].tolist()
    club_team_ids = tpr_data.query(f"league_id == {league_id}")["club_team_id"].tolist()

    for team_id, team_name in zip(team_ids, team_names):
        best_match = find_best_match(team_name, club_team_names)
        if best_match:
            club_team_name, score, other = best_match
            club_team_id = club_team_ids[club_team_names.index(club_team_name)]
            match_logs_url = f"https://fbref.com/en/squads/{team_id}/{season}/matchlogs/c{fbref_league_id}/{stat_attribute}/{team_name.replace(' ', '-')}"
            match_logs_stats_dict[stat_attribute].append({
                "id": team_id,
                "league_id": league_id,
                "fbref_league_id": fbref_league_id,
                "stat": stat_attribute,
                "team": team_name,
                "club_team_id": club_team_id,
                "club_team_name": club_team_name, 
                "url": match_logs_url
            })
        else:
            print(f"No match found for team: {team_name}")

<h1 style="color:orange">🦸‍♂️Must Parameters</h1>

In [13]:
match_logs_stats_dict = {
    "shooting":[],
    "passing":[],
    "defense":[],
    "keeper":[],
    "passing_types":[],
    "gca":[],
    "possession":[],
    "misc":[],
}

attributes = match_logs_stats_dict.keys()
league_name = "Premier League"
league_id = fbref_league_dict[league_name]["league_id"]
fbref_league_id = fbref_league_dict[league_name]["fbref_league_id"]
season = "2022-2023"
team_number_league = 20
last_matchweek = 38
creds = GetTeamCredentials(fbref_league_id, season)

for stat_attribute in attributes:
    FB_URLS(creds, tpr_data, stat_attribute, fbref_league_id, league_id)

<br><br><br><br>
<h1 style="color:green">Scraping FBRef</h1>

In [14]:
df = pd.read_html("https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City")[1][:38]
df.sort_values(('Against Manchester City', 'Date')).head()
df.columns

MultiIndex([('Against Manchester City',         'Date'),
            ('Against Manchester City',         'Time'),
            ('Against Manchester City',        'Round'),
            ('Against Manchester City',          'Day'),
            ('Against Manchester City',        'Venue'),
            ('Against Manchester City',       'Result'),
            ('Against Manchester City',           'GF'),
            ('Against Manchester City',           'GA'),
            ('Against Manchester City',     'Opponent'),
            (               'Standard',          'Gls'),
            (               'Standard',           'Sh'),
            (               'Standard',          'SoT'),
            (               'Standard',         'SoT%'),
            (               'Standard',         'G/Sh'),
            (               'Standard',        'G/SoT'),
            (               'Standard',         'Dist'),
            (               'Standard',           'FK'),
            (               'St

In [15]:
def test_stat_df(stat_attribute):
    df = pd.read_html(f"https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/{stat_attribute}/-Manchester-City")[0][:38]
    url = f"https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/{stat_attribute}/-Manchester-City"
    return url

In [16]:
def FB_MatchLogs(url, last_matchweek=38):
    encoded_url = quote(url, safe=':/')
    df=pd.read_html(encoded_url)
    return {
        "for": df[0][:last_matchweek],
        "against": df[1][:last_matchweek],
    }

In [17]:
def MatchLogs_MultiIndexColumnDict(df, stat_attribute=None):
    stat_columns_dict = {}
    for item in df.columns:
        right_side = item[1]
        stat_columns_dict[right_side] = item
    if stat_attribute:
        return stat_columns_dict[stat_attribute]
    return stat_columns_dict

In [18]:
match_logs_stats_dict["shooting"][0]

{'id': 'b8fd03ef',
 'league_id': 13,
 'fbref_league_id': 9,
 'stat': 'shooting',
 'team': ' Manchester City',
 'club_team_id': 10,
 'club_team_name': 'Manchester City',
 'url': 'https://fbref.com/en/squads/b8fd03ef/2022-2023/matchlogs/c9/shooting/-Manchester-City'}

<br><br><br><br>
<h1 style="color:aqua">Regression Boilerplate</h1>

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train_linear_regression(df, features, target_variable, apply_dtype, test_size=0.19, random_state=42):
    X = df[features]
    y = df[target_variable]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    df['predicted'] = model.predict(df[features])
    df['predicted'] = df['predicted'].apply(apply_dtype)
    
    mse = mean_squared_error(y_test, y_pred)
    coefficients = dict(zip(features, model.coef_))
    intercept = model.intercept_
    return model, mse, coefficients, intercept

def model_error(df, target_variable):
    df['error'] = df[target_variable] - df['predicted']
    return df['error'].apply(abs)

def model_head(df, features, target_variable, sort_parameter="team_power", sort_ascending=False, n=5):
    return df[features+[target_variable,"predicted","error"]].sort_values(sort_parameter, ascending=sort_ascending).head(n=n)

def model_equation(coefficients, intercept):
    equation = " + ".join([f"{coef:.2e} * {feat}" for feat, coef in coefficients.items()])
    equation = equation + f" + {intercept:.2e}"
    equation = equation.replace("+ -", "- ")
    return equation

def model_predict(**args):
    pass

def plot_predicted_vs_actual(df, target_variable, model=None, features=None):
    plt.figure(figsize=(10, 6))
    plt.plot(df.index, df[target_variable], marker='o', linestyle='-', color="b", label='Actual')
    
    if model and features:
        plt.plot(df.index, df['predicted'], marker='o', linestyle='-', color="r", label='Predicted')
    plt.title('Line Plot of Predicted Values')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    plt.show()

In [20]:
def convert_equation_to_json(features, values, intercept):
    # Usage: convert_equation_to_json(features, coefficients.values(), intercept)
    equation_dict = {}
    for feature, value in zip(features, values):
        equation_dict[feature] = {
            "name": feature,
            "value": value
        }
    equation_dict["intercept"] = {"name":"intercept", "value":intercept}
    return equation_dict

<br>
<h1 style="color:aqua">Randomness Factor</h1>

In [21]:
def randomness_proportional(df, target_variable):
    target_mean = df[target_variable].mean()
    target_std = df[target_variable].std()
    target_min = df[target_variable].min()
    target_max = df[target_variable].max()
    formula = (target_min) / (target_mean) 
    print("Min:",target_min, ",", "Max:",target_max)
    return formula

def randomness_volume(df, target_variable):
    target_mean = df[target_variable].mean()
    target_std = df[target_variable].std()
    target_min = df[target_variable].min()
    target_max = df[target_variable].max()
    formula = (target_mean) / (target_std)
    print("Min:",target_min, ",", "Max:",target_max)
    return formula

In [22]:
def interval_proportional(value, step=0.05, expand_multiplier=1):
    lower_bound = round(value - step, 3)
    upper_bound = round(value + step, 3)
    numbers = np.arange(-upper_bound*expand_multiplier, upper_bound + step*expand_multiplier, step)
    return numbers

def interval_volume(value, step=1, expand_multiplier=1):
    lower_bound = int(value - step)
    upper_bound = int(value + step)
    numbers = np.arange(-upper_bound*expand_multiplier, upper_bound + step*expand_multiplier, step)
    return numbers

In [23]:
def probability_protector(value):
    if value < 0:
        return 0
    elif value > 1:
        return 1
    else:
        return value
    
def negativity_protector(value, raw_value):
    if value < 0:
        return raw_value
    else:
        return value

In [24]:
def random_choice(array):
    return np.random.choice(array)

<br><br><br><br>
<h1 style="color:pink">Match Logs DataFrame</h1>

In [25]:
def ColumnConverter(df, keyword):
    df_cols = df.columns.map('.'.join)
    new_cols = [col.split('.')[1] if keyword in col else col for col in df_cols]
    return new_cols

In [26]:
def prepare_expand_dict(column_dict, x, y):
    # example: {"unnamed.0.st: "}
    keys = column_dict.keys()
    values = column_dict.values()
    expanded_dict = {}
    for key, value in zip(keys, values):
        expanded_dict["team_"+key.lower()] = x[value]
        expanded_dict["opponent_"+key.lower()] = y[value]
    return expanded_dict

In [27]:
def prepare_rating_dict(ratings_array, team_id, suffix):
    dict_ = {}
    for rating in ratings_array:
        metric_data = rating["data"].query(f"club_team_id == {team_id}")[rating["name"]].iloc[0]
        dict_[suffix+"_"+rating["name"]] = metric_data
    return dict_

In [28]:
def generate_fbref_match_logs_array \
(match_logs_stats_dict, column_dict, tpr_data, \
 other_rating_data, league_id, team_number_league, last_matchweek, current_stat, opponent=True):
    
    df_log_array = []
    tpr_data = tpr_data["data"]
    
    for tindex in range(team_number_league):
        time.sleep(0.01)
        current_team = match_logs_stats_dict[current_stat][tindex]

        url = current_team["url"]
        print(url, tindex)

        match_logs = FB_MatchLogs(url, last_matchweek=last_matchweek)
        df0 = match_logs["for"]
        df0.columns = ColumnConverter(df0, "For")
        
        df1 = match_logs["against"]
        df1.columns = ColumnConverter(df1, "Against")
        
        team_names = df0["Opponent"].unique().tolist()
        team_ids = [TeamNameIdConverter(tpr_data, team_name, league_id) for team_name in team_names]
        team_rating_data = TFRFromId(tfr_data, current_team["club_team_id"])

        for i in range(last_matchweek):
            x = df0.iloc[i]
            y = df1.iloc[i]
            x_rating_data = prepare_rating_dict(other_rating_data, current_team["club_team_id"], suffix="team")
            y_rating_data = prepare_rating_dict(other_rating_data, TeamNameIdConverter(tpr_data, x["Opponent"], league_id), suffix="opponent")
            lab_df_dict = {
                "league_id": league_id,
                "team_name": current_team["club_team_name"],
                "team_id": current_team["club_team_id"],
                "opponent_name": x["Opponent"],
                "opponent_id": TeamNameIdConverter(tpr_data, x["Opponent"], league_id),
                "team_power": x_rating_data["team_power"],
                "opponent_power": y_rating_data["opponent_power"],
            }
            merged_dict = {**lab_df_dict, **prepare_expand_dict(column_dict, x, y), **x_rating_data, **y_rating_data}
            df_log_array.append(merged_dict)
    print("Data collecting has been completed! ✅")
    return df_log_array

<br><br><br><br>
<h1 style="color:yellow">"shooting"</h1>

In [29]:
current_stat = "shooting"

In [30]:
proper_terms = ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
                'Opponent', 'Goals', 'Shots', 'Shots on Target',
                'Shots on Target Percentage', 'Goals per Shot', 'Goals per Shot on Target',
                'Distance Covered', 'Free Kicks', 'Penalties', 'Penalty Attempts',
                'Expected Goals', 'Non-Penalty Expected Goals', 'Expected Goals per Shot',
                'Goals minus Expected Goals', 'Non-Penalty Goals minus Expected Goals',
                'Match Report']
column_names = ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
                'Opponent', 'Standard.Gls', 'Standard.Sh', 'Standard.SoT',
                'Standard.SoT%', 'Standard.G/Sh', 'Standard.G/SoT', 'Standard.Dist',
                'Standard.FK', 'Standard.PK', 'Standard.PKatt', 'Expected.xG',
                'Expected.npxG', 'Expected.npxG/Sh', 'Expected.G-xG',
                'Expected.np:G-xG', 'Unnamed: 24_level_0.Match Report']
proper_terms = [term.lower().replace(" ", "_") for term in proper_terms]

column_dict = dict(zip(proper_terms, column_names))

In [31]:
prepared_index_dict = column_dict
prepared_rating_data = [tpr_data, tfr_data]

In [32]:
# df_log_array = generate_fbref_match_logs_array \
# (match_logs_stats_dict, prepared_index_dict, tpr_data,  \
#  prepared_rating_data, league_id, team_number_league, last_matchweek, current_stat)
# df_log = pd.DataFrame(df_log_array)
# df_log.to_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name}@{current_stat}.csv")

df_log = pd.read_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name}@{current_stat}.csv")

<br><br><br><br>
<h1 style="color:red">Total Shot Volume</h1>

In [33]:
df_log.columns

Index(['Unnamed: 0', 'league_id', 'team_name', 'team_id', 'opponent_name',
       'opponent_id', 'team_power', 'opponent_power', 'team_date',
       'opponent_date', 'team_time', 'opponent_time', 'team_round',
       'opponent_round', 'team_day', 'opponent_day', 'team_venue',
       'opponent_venue', 'team_result', 'opponent_result', 'team_gf',
       'opponent_gf', 'team_ga', 'opponent_ga', 'team_opponent',
       'opponent_opponent', 'team_goals', 'opponent_goals', 'team_shots',
       'opponent_shots', 'team_shots_on_target', 'opponent_shots_on_target',
       'team_shots_on_target_percentage',
       'opponent_shots_on_target_percentage', 'team_goals_per_shot',
       'opponent_goals_per_shot', 'team_goals_per_shot_on_target',
       'opponent_goals_per_shot_on_target', 'team_distance_covered',
       'opponent_distance_covered', 'team_free_kicks', 'opponent_free_kicks',
       'team_penalties', 'opponent_penalties', 'team_penalty_attempts',
       'opponent_penalty_attempts', 'tea

In [34]:
columns_to_drop = ['team_name','team_id','opponent_name','opponent_id']
df = df_log
df["total_shots"] = df["team_shots"] + df["opponent_shots"]

In [35]:
features = ['team_power', 'team_finishing', 'opponent_power', 'opponent_finishing', "team_shots", "opponent_shots"]
target_variable = 'total_shots'

In [36]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 1.9977222372217622e-29
Error Mean: 0.2657894736842105


In [37]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [38]:
model_equation(coefficients, intercept)

'1.00e-16 * team_power - 4.82e-19 * team_finishing + 2.66e-16 * opponent_power + 2.51e-16 * opponent_finishing + 1.00e+00 * team_shots + 1.00e+00 * opponent_shots - 6.04e-14'

In [39]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=True, n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,opponent_finishing,team_shots,opponent_shots,team_name,team_id,opponent_name,opponent_id,total_shots,predicted,error
252,79,72,76,70,8,3,Aston Villa,2,Crystal Palace,1799,11,10,1
404,76,70,79,72,3,8,Crystal Palace,1799,Aston Villa,2,11,10,1
669,78,77,79,78,1,10,Leicester City,95,Arsenal,1,11,10,1
61,79,78,78,77,10,1,Arsenal,1,Leicester City,95,11,10,1
301,81,75,79,72,4,8,Tottenham Hotspur,18,Aston Villa,2,12,11,1


In [40]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=1)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 11 , Max: 45
Result: 4.227159496149049
Interval: [-5 -4 -3 -2 -1  0  1  2  3  4  5]


-1

In [41]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: negativity_protector(value + random_choice(interval), value))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=False).head(n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,opponent_finishing,team_shots,opponent_shots,team_name,team_id,opponent_name,opponent_id,total_shots,predicted,randomness
759,75,70,84,79,15,30,Southampton,17,Liverpool,9,45,45,50
189,84,79,75,70,30,15,Liverpool,9,Southampton,17,45,45,50
637,77,77,82,81,15,29,Everton,7,Manchester Utd,11,44,44,48
668,78,77,82,81,19,26,Leicester City,95,Manchester Utd,11,45,45,48
331,74,67,75,70,7,32,Brentford,1925,Brighton,1808,39,39,44


<br><br><br><br>
<h1 style="color:red">Shot Share</h1>

In [42]:
df["team_sh_share"] = df["team_shots"] / df["total_shots"]

In [43]:
features = ['team_power', 'opponent_power']
target_variable = 'team_sh_share'

In [44]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, float)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 0.026681810590791683
Error Mean: 0.12985605019186


In [45]:
model_equation(coefficients, intercept)

'1.60e-02 * team_power - 1.45e-02 * opponent_power + 3.78e-01'

In [46]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [47]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=False, n=5)

Unnamed: 0,team_power,opponent_power,team_name,team_id,opponent_name,opponent_id,team_sh_share,predicted,error
13,85,74,Manchester City,10,Brentford,1925,0.74359,0.667946,0.075644
1,85,74,Manchester City,10,Bournemouth,1943,0.863636,0.667946,0.195691
37,85,74,Manchester City,10,Brentford,1925,0.607143,0.667946,-0.060803
24,85,74,Manchester City,10,Bournemouth,1943,0.606061,0.667946,-0.061885
36,85,75,Manchester City,10,Brighton,1808,0.393939,0.653459,-0.25952


In [48]:
randomness_func, interval_func = randomness_proportional, interval_proportional
result = randomness_func(df, target_variable)
interval = interval_func(result, step=0.01)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 0.041666666666666664 , Max: 0.9583333333333334
Result: 0.08333333333333333
Interval: [-0.093 -0.083 -0.073 -0.063 -0.053 -0.043 -0.033 -0.023 -0.013 -0.003
  0.007  0.017  0.027  0.037  0.047  0.057  0.067  0.077  0.087  0.097]


-0.03300000000000003

In [49]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: probability_protector(value + random_choice(interval)))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=False).head(n=5)

Unnamed: 0,team_power,opponent_power,team_name,team_id,opponent_name,opponent_id,team_sh_share,predicted,randomness
24,85,74,Manchester City,10,Bournemouth,1943,0.606061,0.667946,0.764946
36,85,75,Manchester City,10,Brighton,1808,0.393939,0.653459,0.750459
432,84,74,Chelsea,5,Bournemouth,1943,0.625,0.651927,0.748927
3,85,76,Manchester City,10,Crystal Palace,1799,0.9,0.638973,0.725973
13,85,74,Manchester City,10,Brentford,1925,0.74359,0.667946,0.724946


In [50]:
team_power = 82
opponent_power = 70

y = 1.60e-02 * team_power - 1.45e-02 * opponent_power + 3.78e-01
probability_protector(y+random_choice(interval)) # MODEL IS DONE!

0.7619999999999998

<br><br><br><br>
<h1 style="color:red">Shot on Target</h1>

In [51]:
features = ['team_power', 'team_finishing', 'opponent_power', 'team_shots']
target_variable = 'team_shots_on_target'

In [52]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 2.706647120396336
Error Mean: 1.3657894736842104


In [53]:
model_equation(coefficients, intercept)

'3.19e-03 * team_power - 1.82e-04 * team_finishing + 1.42e-03 * opponent_power + 2.90e-01 * team_shots + 1.61e-01'

In [54]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [55]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=False, n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,team_shots,team_name,team_id,opponent_name,opponent_id,team_shots_on_target,predicted,error
189,84,79,75,30,Liverpool,9,Southampton,17,8,9,-1
408,76,70,78,31,Crystal Palace,1799,Leicester City,95,9,9,0
63,79,78,74,31,Arsenal,1,Bournemouth,1943,9,9,0
215,75,70,74,32,Brighton & Hove Albion,1808,Brentford,1925,13,9,4
14,85,82,76,26,Manchester City,10,Leeds United,8,9,8,1


In [56]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=1)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 0 , Max: 13
Result: 1.773270511652842
Interval: [-2 -1  0  1  2]


1

In [57]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: negativity_protector(value + random_choice(interval), value))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=False).head(n=5)

Unnamed: 0,team_power,team_finishing,opponent_power,team_shots,team_name,team_id,opponent_name,opponent_id,team_shots_on_target,predicted,randomness
408,76,70,78,31,Crystal Palace,1799,Leicester City,95,9,9,11
189,84,79,75,30,Liverpool,9,Southampton,17,8,9,11
63,79,78,74,31,Arsenal,1,Bournemouth,1943,9,9,11
215,75,70,74,32,Brighton & Hove Albion,1808,Brentford,1925,13,9,10
296,81,75,74,24,Tottenham Hotspur,18,Bournemouth,1943,8,7,9


<br><br><br><br>
<h1 style="color:purple">Shot Models Test</h1>

<br><br><br><br>
<h1 style="color:black">Lineup Test</h1>

In [58]:
lineup_test_ids = [230621, 235212, 155862, 207865, 252145, 199556, 234153, 230767, 231747, 158023, 190871]
test_team_df = []
for player_id in lineup_test_ids:
    player_df = players_df[players_df['player_id'] == player_id]
    test_team_df.append(player_df)
result_df = pd.concat(test_team_df, ignore_index=True)

In [59]:
TFR(result_df)["data"]

Unnamed: 0,league_id,club_team_id,league_name,club_name,power,finishing
0,16,73,Ligue 1,Paris Saint Germain,85,79


<br><br><br><br>
<h1 style="color:yellow">"passing"</h1>

In [60]:
current_stat = "passing"

In [61]:
cols = FB_MatchLogs(test_stat_df(current_stat))["for"]
ColumnConverter(cols, "For")

['Date',
 'Time',
 'Round',
 'Day',
 'Venue',
 'Result',
 'GF',
 'GA',
 'Opponent',
 'Total.Cmp',
 'Total.Att',
 'Total.Cmp%',
 'Total.TotDist',
 'Total.PrgDist',
 'Short.Cmp',
 'Short.Att',
 'Short.Cmp%',
 'Medium.Cmp',
 'Medium.Att',
 'Medium.Cmp%',
 'Long.Cmp',
 'Long.Att',
 'Long.Cmp%',
 'Unnamed: 23_level_0.Ast',
 'Unnamed: 24_level_0.xAG',
 'Unnamed: 25_level_0.xA',
 'Unnamed: 26_level_0.KP',
 'Unnamed: 27_level_0.1/3',
 'Unnamed: 28_level_0.PPA',
 'Unnamed: 29_level_0.CrsPA',
 'Unnamed: 30_level_0.PrgP',
 'Unnamed: 31_level_0.Match Report']

In [62]:
column_names = ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
                'Opponent', 'Total.Cmp', 'Total.Att', 'Total.Cmp%', 'Total.TotDist',
                'Total.PrgDist', 'Short.Cmp', 'Short.Att', 'Short.Cmp%', 'Medium.Cmp',
                'Medium.Att', 'Medium.Cmp%', 'Long.Cmp', 'Long.Att', 'Long.Cmp%',
                'Unnamed: 23_level_0.Ast', 'Unnamed: 24_level_0.xAG',
                'Unnamed: 25_level_0.xA', 'Unnamed: 26_level_0.KP',
                'Unnamed: 27_level_0.1/3', 'Unnamed: 28_level_0.PPA',
                'Unnamed: 29_level_0.CrsPA', 'Unnamed: 30_level_0.PrgP',
                'Unnamed: 31_level_0.Match Report']

proper_terms = ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
                'Opponent', 'Total_Completion', 'Total_Attempts', 'Total_Completion_Rate',
                'Total_Total_Distance', 'Total_Progressed_Distance', 'Short_Completion',
                'Short_Attempts', 'Short_Completion_Rate', 'Medium_Completion',
                'Medium_Attempts', 'Medium_Completion_Rate', 'Long_Completion',
                'Long_Attempts', 'Long_Completion_Rate', 'Assists', 'Expected_Goals_Against',
                'Expected_Assists', 'Key_Passes', 'Final_Third_Passes', 'Passes_Progressive',
                'Passes_Into_Final_Third', 'Crosses_And_Corners_Completion', 'Progressive_Passes',
                'Match_Report']

proper_terms = [term.lower().replace(" ", "_") for term in proper_terms]
column_dict = dict(zip(proper_terms, column_names))

prepared_index_dict = column_dict
prepared_rating_data = [tpr_data, tfr_data]

In [63]:
# df_log_array = generate_fbref_match_logs_array \
# (match_logs_stats_dict, prepared_index_dict, tpr_data, \
#  prepared_rating_data, league_id, team_number_league, last_matchweek, current_stat)
# df_log = pd.DataFrame(df_log_array)
# df_log.to_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name}@{current_stat}.csv")

df_log = pd.read_csv(f"../@blacksmith/match_logs/fbref_match_logs_{league_name}@{current_stat}.csv")

In [64]:
df_log.columns

Index(['Unnamed: 0', 'league_id', 'team_name', 'team_id', 'opponent_name',
       'opponent_id', 'team_power', 'opponent_power', 'team_date',
       'opponent_date', 'team_time', 'opponent_time', 'team_round',
       'opponent_round', 'team_day', 'opponent_day', 'team_venue',
       'opponent_venue', 'team_result', 'opponent_result', 'team_gf',
       'opponent_gf', 'team_ga', 'opponent_ga', 'team_opponent',
       'opponent_opponent', 'team_total_completion',
       'opponent_total_completion', 'team_total_attempts',
       'opponent_total_attempts', 'team_total_completion_rate',
       'opponent_total_completion_rate', 'team_total_total_distance',
       'opponent_total_total_distance', 'team_total_progressed_distance',
       'opponent_total_progressed_distance', 'team_short_completion',
       'opponent_short_completion', 'team_short_attempts',
       'opponent_short_attempts', 'team_short_completion_rate',
       'opponent_short_completion_rate', 'team_medium_completion',
       '

<br><br><br><br>
<h1 style="color:red">Pass Total Attempts</h1>

In [65]:
df = df_log
features = ['team_power', 'opponent_power', 'team_total_attempts']
target_variable = 'team_total_completion'

In [66]:
model, mse, coefficients, intercept = train_linear_regression(df, features, target_variable, int)
mean_error = model_error(df, target_variable).mean()
std_error = model_error(df, target_variable).std()
print("Mean Squared Error:", mse)
print("Error Mean:", mean_error)

Mean Squared Error: 250.68588265269466
Error Mean: 12.889473684210527


In [67]:
# plot_predicted_vs_actual(df, target_variable, model=model, features=features)

In [68]:
model_equation(coefficients, intercept)

'1.21e+00 * team_power + 1.07e+00 * opponent_power + 9.78e-01 * team_total_attempts - 2.70e+02'

In [69]:
model_head(df, features+columns_to_drop, target_variable, sort_parameter="predicted", sort_ascending=True, n=5)

Unnamed: 0,team_power,opponent_power,team_total_attempts,team_name,team_id,opponent_name,opponent_id,team_total_completion,predicted,error
606,76,79,183,Nottingham Forest,14,Arsenal,1,84,85,-1
601,76,84,181,Nottingham Forest,14,Liverpool,9,93,88,5
556,74,79,207,AFC Bournemouth,1943,Arsenal,1,119,106,13
682,78,78,225,Leicester City,95,Newcastle Utd,13,133,127,6
642,77,75,234,Everton,7,Brighton,1808,149,131,18


In [70]:
randomness_func, interval_func = randomness_volume, interval_volume
result = randomness_func(df, target_variable)
interval = interval_func(result, step=15, expand_multiplier=3)

print("Result:", result)
print("Interval:", interval)
random_choice(interval)

Min: 84 , Max: 817
Result: 2.920027908663988
Interval: [-51 -36 -21  -6   9  24  39  54]


-21

In [71]:
df["randomness"] = df["predicted"].copy()
df["randomness"] = df["predicted"].apply(lambda value: negativity_protector(value + random_choice(interval), value))

df[features+columns_to_drop+[target_variable,"predicted","randomness"]].sort_values("randomness", ascending=True).head(n=5)

Unnamed: 0,team_power,opponent_power,team_total_attempts,team_name,team_id,opponent_name,opponent_id,team_total_completion,predicted,randomness
601,76,84,181,Nottingham Forest,14,Liverpool,9,93,88,37
606,76,79,183,Nottingham Forest,14,Arsenal,1,84,85,109
642,77,75,234,Everton,7,Brighton,1808,149,131,110
753,75,79,257,Southampton,17,Arsenal,1,162,156,120
522,79,76,235,West Ham United,19,Fulham,144,145,136,130
