In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from futsim_funcs import TPR,TFR,TPSR

In [51]:
import warnings
# Silence only the pandas FutureWarning about the changing `numeric_only` default of
# DataFrame.mean (matched by message text and category); all other warnings still surface.
warnings.filterwarnings("ignore", message="The default value of numeric_only in DataFrame.mean is deprecated.", category=FutureWarning)

In [52]:
# Player-level table that is handed to the futsim_funcs helpers (TFR, TPSR) further down.
# The path is relative to the notebook's working directory.
players_df = pd.read_csv("../fifa24_db/pdb_23.csv")

<br><br><br><br>

# Matchday Goal Count

In [53]:
# 'eng' league: one row per total-goals-in-a-match bucket.
# Each tuple is (goals in the match, share of matches in %, number of matches);
# the counts add up to 380 matches.
_eng_rows = [
    (2, 22.9, 87),
    (3, 20.8, 79),
    (1, 18.4, 70),
    (4, 15.0, 57),
    (5, 8.2, 31),
    (0, 6.1, 23),
    (6, 4.7, 18),
    (7, 2.6, 10),
    (8, 0.8, 3),
    (9, 0.5, 2),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from.
data_eng = dict(zip(('Goals', 'Percentage', 'Count'), map(list, zip(*_eng_rows))))
data_eng['League'] = ['eng'] * len(_eng_rows)  # every row carries the same league tag

df_eng = pd.DataFrame(data_eng)

In [54]:
# 'ita' league: one row per total-goals-in-a-match bucket.
# Each tuple is (goals in the match, share of matches in %, number of matches).
_ita_rows = [
    (2, 24.9, 72),
    (3, 22.5, 65),
    (1, 18.7, 54),
    (4, 14.5, 42),
    (5, 7.3, 21),
    (0, 7.3, 21),
    (6, 3.5, 10),
    (7, 1.4, 4),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from.
data_ita = dict(zip(('Goals', 'Percentage', 'Count'), map(list, zip(*_ita_rows))))
data_ita['League'] = ['ita'] * len(_ita_rows)  # every row carries the same league tag

df_ita = pd.DataFrame(data_ita)

In [55]:
# 'esp' league: one row per total-goals-in-a-match bucket.
# Each tuple is (goals in the match, share of matches in %, number of matches).
_esp_rows = [
    (2, 26.3, 76),
    (1, 19.7, 57),
    (3, 18.3, 53),
    (4, 11.1, 32),
    (0, 8.7, 25),
    (5, 7.6, 22),
    (6, 4.8, 14),
    (7, 2.4, 7),
    (8, 1.0, 3),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from.
data_esp = dict(zip(('Goals', 'Percentage', 'Count'), map(list, zip(*_esp_rows))))
data_esp['League'] = ['esp'] * len(_esp_rows)  # every row carries the same league tag

df_esp = pd.DataFrame(data_esp)

In [56]:
# 'ger' league: one row per total-goals-in-a-match bucket.
# Each tuple is (goals in the match, share of matches in %, number of matches).
_ger_rows = [
    (2, 22.6, 53),
    (3, 21.4, 50),
    (4, 20.5, 48),
    (1, 11.1, 26),
    (5, 9.4, 22),
    (6, 7.7, 18),
    (0, 3.8, 9),
    (7, 2.1, 5),
    (8, 0.9, 2),
    (9, 0.4, 1),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from.
data_ger = dict(zip(('Goals', 'Percentage', 'Count'), map(list, zip(*_ger_rows))))
data_ger['League'] = ['ger'] * len(_ger_rows)  # every row carries the same league tag

df_ger = pd.DataFrame(data_ger)

In [57]:
# 'fra' league: one row per total-goals-in-a-match bucket.
# Each tuple is (goals in the match, share of matches in %, number of matches).
_fra_rows = [
    (3, 23.5, 55),
    (2, 23.1, 54),
    (4, 16.2, 38),
    (1, 16.2, 38),
    (0, 10.3, 24),
    (5, 6.4, 15),
    (6, 3.0, 7),
    (8, 0.9, 2),
    (7, 0.4, 1),
]

# Transpose the rows into the column-oriented dict the DataFrame is built from.
data_fra = dict(zip(('Goals', 'Percentage', 'Count'), map(list, zip(*_fra_rows))))
data_fra['League'] = ['fra'] * len(_fra_rows)  # every row carries the same league tag

df_fra = pd.DataFrame(data_fra)

In [58]:
# The five per-league tables that get pooled into one distribution.
dfs = [df_eng, df_ita, df_esp, df_ger, df_fra]

# Stack the leagues, discard the per-league percentages (they are recomputed on the
# pooled counts in the next cell) and add up the match counts per goal total.
df_merged = (
    pd.concat(dfs)
    .drop(columns=["Percentage"])
    .groupby('Goals')
    .agg({'Count': 'sum'})
    .reset_index()
)

# Total number of matches across all five leagues.
total_count = df_merged['Count'].sum()

In [None]:
# Share (in percent) of all pooled matches that ended with each goal total.
df_merged['Percentage'] = df_merged['Count'] / total_count * 100

In [None]:
# Independent copy under a descriptive name; the cells below read this frame, not df_merged.
goals_per_match = df_merged.copy()

In [None]:
# Display the pooled table (columns: Goals, Count, Percentage).
goals_per_match

In [None]:
# Histogram of goals per match, each goal total weighted by its match count, with a KDE overlay.
ax = sns.histplot(
    data=goals_per_match,
    x="Goals",
    weights="Count",
    kde=True,
    bins=range(11),
    discrete=True,
)

# Count-weighted mean number of goals per match, drawn as a dashed red reference line.
total_count = goals_per_match["Count"].sum()
average_goals = (goals_per_match["Goals"] * goals_per_match["Count"]).sum() / total_count
ax.axvline(average_goals, color='red', linestyle='--', label=f'Average: {average_goals:.2f}')
ax.legend()

# Label every bar with its share of all matches, centred just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{(bar_height / total_count * 100):.2f}%', (bar_center, bar_height),
                ha='center', va='bottom')

# One tick per goal total, 0 through 9.
plt.xticks(range(10))
plt.show()

In [None]:
# Possible goal totals and their empirical probabilities (the percentage column divided by 100),
# kept as plain lists in matching order for np.random.choice inside MGC.
goals_count = goals_per_match['Goals'].tolist()
goals_count_probabilities = goals_per_match['Percentage'].div(100).tolist()  # Convert percentage to probability

def MGC(goals_count, goals_count_probabilities):
    """Matchday Goal Count: sample one goal total.

    Parameters
    ----------
    goals_count : sequence
        Candidate goal totals to draw from.
    goals_count_probabilities : sequence of float
        Probability of each candidate, in the same order as ``goals_count``;
        passed straight to ``np.random.choice`` as ``p``.

    Returns
    -------
    A single element of ``goals_count`` drawn with NumPy's global random state.
    """
    return np.random.choice(goals_count, p=goals_count_probabilities)

# Smoke test: draw and print one simulated match total. Raise the range to print more draws.
for i in range(1):
    random_goal = MGC(goals_count, goals_count_probabilities)
    print("-----",random_goal,"-----")

<br><br><br><br>

# Matchday Actions Count

<h1 style="color:Blue">Shot</h1>
<h3 style="color:orange">Goal/SoT</h3>

In [None]:
# Hand-entered shooting statistics, one entry per league in the order given by
# 'Competition Name'. The cell below uses Sh (shots), SoT (shots on target) and Gls (goals);
# the remaining columns are not read elsewhere in this notebook.
# NOTE(review): values look like per-match league averages taken from FBref — confirm the source.
data = {
    'Competition Name': ['de Bundesliga', 'es La Liga', 'fr Ligue 1', 'eng Premier League', 'it Serie A'],
    'Gls': [1.55, 1.22, 1.35, 1.37, 1.24],
    'Sh': [12.58, 12.17, 12.06, 12.51, 12.56],
    'SoT': [4.42, 3.98, 4.26, 4.15, 3.92],
    'SoT%': [35.1, 32.7, 35.3, 33.2, 31.2],
    'G/Sh': [0.11, 0.09, 0.10, 0.10, 0.09],
    'G/SoT': [0.32, 0.28, 0.28, 0.31, 0.29],
    'Dist': [17.4, 17.9, 17.4, 16.9, 17.9],
    'FK': [0.41, 0.43, 0.41, 0.40, 0.45],
    'PK': [0.14, 0.11, 0.15, 0.10, 0.11],
    'PKatt': [0.18, 0.15, 0.19, 0.13, 0.14],
    'xG': [1.45, 1.30, 1.41, 1.42, 1.26],
    'npxG': [1.31, 1.18, 1.27, 1.32, 1.15],
    'npxG/Sh': [0.11, 0.10, 0.11, 0.11, 0.09],
    # The two difference columns are stored as signed strings, not floats.
    'G-xG': ['+0.07', '-0.11', '-0.09', '-0.09', '-0.05'],
    'np:G-xG': ['+0.07', '-0.10', '-0.09', '-0.08', '-0.05']
}

df_shooting = pd.DataFrame(data)
df_shooting

In [None]:
# League-level baselines for the shot model. Despite the *_percentage names, the two
# ratios are fractions in [0, 1], not percentages.
x = df_shooting
shots_per_match = x["Sh"].mean()                   # mean of the five league Sh values
sot_percentage = x["SoT"].sum() / x["Sh"].sum()    # shots on target per shot
goal_percentage = x["Gls"].sum() / x["SoT"].sum()  # goals per shot on target

In [None]:
# Print the three baselines as a two-line table with left-aligned, fixed-width columns.
column_width = 12
_headers = ("Sh/Match", "SoT/Sh", "Gls/SoT")
_values = (shots_per_match, sot_percentage, goal_percentage)
print(" ".join(f"{header:<{column_width}}" for header in _headers))
print(" ".join(f"{value:<{column_width}.2f}" for value in _values))

In [None]:
def transform_rating(rating, average_rating=70, advantage=1, disadvantage=-1.5):
    """Turn a player rating into a small additive probability adjustment.

    The distance of ``rating`` from ``average_rating`` is stretched by
    ``1 + advantage`` when the rating is above average and by
    ``1 - disadvantage`` otherwise. The stretched rating is mapped linearly so
    that 60 -> 0 and 80 -> 1, capped at 1 from above, and divided by 20.

    The result is therefore at most 0.05. There is no lower cap, so ratings far
    below average give negative adjustments (e.g. 60 -> -0.0375 with defaults).

    Parameters
    ----------
    rating : number
        Rating to transform (scalar).
    average_rating : number, default 70
        Rating treated as the neutral point.
    advantage : number, default 1
        Extra stretch applied to above-average differences.
    disadvantage : number, default -1.5
        Subtracted from 1 to get the stretch for at-or-below-average differences.

    Returns
    -------
    float
        ``min(scaled, 1) / 20``.
    """
    diff = rating - average_rating

    # Above-average ratings use the advantage stretch; everything else the disadvantage one.
    stretch = (1 + advantage) if diff > 0 else (1 - disadvantage)
    transformed_rating = average_rating + diff * stretch

    # Linear map with fixed anchors: 60 -> 0.0 and 80 -> 1.0.
    scaled_rating = (transformed_rating - 60) / (80 - 60)

    # Upper cap only; negative values pass through unchanged.
    return min(scaled_rating, 1) / 20

# Spot-check the adjustment for a rating above (80) and below (60) the default average of 70.
rating_80 = transform_rating(80)
rating_60 = transform_rating(60)

print("Transformed rating for 80:", rating_80)
print("Transformed rating for 60:", rating_60)

In [None]:
# Finishing rating of the simulated shooter; a later cell rebinds `finishing`.
finishing = 80

# Per-shot probabilities = fixed base rate + rating-dependent adjustment.
# NOTE(review): 0.33 and 0.30 are presumably rounded from the SoT/Sh and Gls/SoT
# baselines printed above — confirm.
target_p = 0.33 + transform_rating(finishing)
goal_p = 0.30 + transform_rating(finishing)

In [None]:
# Simulate one batch of shots as two chained Bernoulli trials per shot:
# first "is the shot on target?" (probability target_p), then, only for on-target
# shots, "is it a goal?" (probability goal_p).
# NOTE(review): 13 is presumably the rounded-up Sh/Match baseline — confirm.
shot_count = 13

sot_count_trials = []   # 1 per shot that was on target, else 0
goal_count_trials = []  # 1 per shot that ended in a goal, else 0
for _ in range(shot_count):
    shot_on_target = np.random.choice([1, 0], p=[target_p, 1 - target_p])
    # The goal trial is drawn only for on-target shots, so off-target shots
    # consume no extra random numbers.
    goal_attempt = np.random.choice([1, 0], p=[goal_p, 1 - goal_p]) if shot_on_target else 0
    sot_count_trials.append(int(shot_on_target))
    goal_count_trials.append(int(goal_attempt))

sot_total = sum(sot_count_trials)
goal_total = sum(goal_count_trials)

print(shot_count, "Shots")
print(sot_total, "Shot on Target")
print(goal_total, "Goals")

# Goals per shot on target. Check the denominator BEFORE dividing: when no shot
# is on target the ratio is undefined and is reported as 0 instead of raising
# ZeroDivisionError.
goal_per_sot = goal_total / sot_total if sot_total else 0
print("G/SoT:", goal_per_sot)

In [None]:
# Total number of goals across all pooled matches (goal total x number of matches, summed).
(goals_per_match["Goals"] * goals_per_match["Count"]).sum()

In [None]:
# TFR comes from futsim_funcs. Show the five rows with the highest `power`.
# NOTE(review): the second argument (18) is presumably an id filter — confirm against TFR's signature.
TFR(players_df, 18).sort_values("power", ascending=False).head()

In [None]:
# Case-insensitive substring match on the club name. `na=False` turns rows with a
# missing club_name into False; without it the mask contains NaN and boolean
# indexing raises "Cannot mask with non-boolean array containing NA / NaN values".
q = players_df["club_name"].str.contains("chelsea", case=False, na=False)

# Three highest-rated matching players.
players_df[q].sort_values(by="overall", ascending=False).head(n=3)

<br><br><br><br>

# REGRESSION📈 

### Importing Data

In [None]:
# Squad-level shooting table for the 2022-2023 Big-5 leagues, scraped from FBref.
# header=1 takes the second header row of the HTML table as column names.
url = "https://fbref.com/en/comps/Big5/2022-2023/shooting/squads/2022-2023-Big-5-European-Leagues-Stats"
fbref_data = pd.read_html( url, header=1)[0]

fifa_data = TFR(players_df)

# NOTE(review): presumably the ids of the five Big-5 leagues in the FIFA data set — confirm.
top5_leagues_id = [str(i) for i in [13,53,19,31,16]]

# Exact membership test on the numeric id. The previous regex substring match
# ("13|53|19|31|16") also accepted any id that merely contains one of these digit
# pairs (e.g. 353, 319, 131), which pulled clubs from other leagues into the
# fuzzy-matching pool.
_league_ids = pd.to_numeric(fifa_data["league_id"], errors="coerce")
q = _league_ids.isin([int(i) for i in top5_leagues_id])

# Kept for compatibility: league_id is still exposed as a string column.
fifa_data["league_id"] = fifa_data["league_id"].astype(str)

fifa_data = fifa_data[q]

### Merging FIFA & FBRef

In [None]:
from rapidfuzz import process
def find_best_match(name, choices):
    """Return rapidfuzz's single best fuzzy match for ``name`` among ``choices``.

    Thin wrapper around ``process.extractOne`` with its default scorer and no
    score cutoff; the result of that call is returned unchanged.
    """
    return process.extractOne(name, choices)

def mapped_data(fifa_data, fbref_data):
    """Join FBref squad rows to FIFA club rows via fuzzy club-name matching.

    Each distinct ``fbref_data['Squad']`` value is mapped to the closest entry
    of ``fifa_data['club_name']`` (see ``find_best_match``). The mapping is
    written to a new ``'Mapped_club_name'`` column on ``fbref_data`` and used as
    the key of an inner merge.

    Parameters
    ----------
    fifa_data : pandas.DataFrame
        Must contain a ``'club_name'`` column.
    fbref_data : pandas.DataFrame
        Must contain a ``'Squad'`` column. Modified in place: gains the
        ``'Mapped_club_name'`` column (NaN where no match was found).

    Returns
    -------
    pandas.DataFrame
        Inner merge of the two frames; squads without a match are dropped.

    Notes
    -----
    No similarity threshold is applied, so a poor best match is still accepted.
    """
    club_names = fifa_data['club_name']

    fbref_to_fifa_mapping = {}
    for fb_name in fbref_data['Squad'].unique():
        best = find_best_match(fb_name, club_names)
        # extractOne returns None when nothing can be matched (e.g. no choices);
        # indexing it unconditionally raised TypeError before this check.
        if best is not None:
            fbref_to_fifa_mapping[fb_name] = best[0]

    fbref_data['Mapped_club_name'] = fbref_data['Squad'].map(fbref_to_fifa_mapping)

    # Drop unmapped squads explicitly so a NaN key can never pair with a NaN club_name.
    merged_data = pd.merge(
        fbref_data.dropna(subset=['Mapped_club_name']),
        fifa_data,
        how='inner',
        left_on='Mapped_club_name',
        right_on='club_name',
    )
    return merged_data

# NOTE(review): `all_team_ids` is never read in the visible notebook.
all_team_ids = []
# Disabled debugging aid: prints each (Squad, club_name, club_team_id) triple so the
# fuzzy mapping can be checked by eye. `merged_ids` below is not defined anywhere visible.
# lst = mapped_data(fifa_data, fbref_data)[["Squad","club_name","club_team_id"]]
# print(len(merged_ids))
# for i in zip(lst.Squad, lst.club_name, lst.club_team_id):
#     print(i)

In [None]:
# Merged FBref + FIFA squad table used by the shooting regression below.
# Note: this rebinds `data`, which held the raw shooting dict earlier in the notebook.
data = mapped_data(fifa_data, fbref_data).copy()
data.columns

### Linear Model

In [None]:
# Fit a Ridge regression predicting a squad's G/SoT from its `power` and `finishing` columns.
# The names defined here (X, y, model, ...) are read by the "Test & Error" cell below.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

feature_columns = ["power","finishing"]
target_variable = "G/SoT"

X = data[feature_columns]
y = data[target_variable]

# 80/20 split with a fixed seed so the reported MSE is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Held-out error plus the fitted parameters (coefficients follow feature_columns order).
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

### Test & Error

In [None]:
# In-sample diagnostics: predict G/SoT for every squad (train and test rows alike)
# and store prediction and residual next to the observed value.
predicted_g_sot_all = model.predict(X)
data['Predicted'] = predicted_g_sot_all
data["Error"] = data['G/SoT'] - predicted_g_sot_all

# `err` is the signed mean residual (bias); positive and negative errors cancel,
# so the mean absolute error is reported as well to show the typical miss.
err = data["Error"].mean()
mae = data["Error"].abs().mean()
# Fixed-point formatting instead of str(err)[:6]: slicing the string cut the
# exponent off values printed in scientific notation, making a near-zero mean
# residual look like a large number.
print("Error for","Finishing:", f"{err:.4f}")
print("Mean absolute error for","Finishing:", f"{mae:.4f}")

# Single what-if prediction. Note that this rebinds the notebook-level `finishing`.
finishing = 75
power = 84

input_data = pd.DataFrame({
    "power": [power],
    "finishing": [finishing]
})
predicted_g_sot = model.predict(input_data)
print("predicted_g_sot", predicted_g_sot)

# Five squads with the highest predicted G/SoT.
data[["club_team_id","Squad","G/SoT","Predicted","power"]].sort_values("Predicted",ascending=False).head(n=5)

<br><br><br><br>

<h1 style="color:Blue">Pass</h1>
<h3 style="color:orange">Pass Accuracy</h3>

In [None]:
# Preview the TPSR output (helper from futsim_funcs); show only the first rows
# instead of dumping the whole table into the notebook.
TPSR(players_df).head()

In [None]:
# Hand-entered passing statistics, one entry per league in the order given by
# "Competition Name". The *_Total / *_Short / *_Medium / *_Long suffixes group the
# completed (Cmp), attempted (Att) and completion-rate (Cmp%) columns.
# df_passing is only inspected via .columns here; the models below use the scraped squad data.
# NOTE(review): values look like per-match league averages taken from FBref — confirm the source.
data = {
    "Competition Name": ["de Bundesliga", "es La Liga", "fr Ligue 1", "eng Premier League", "it Serie A"],
    "Cmp_Total": [371.1, 372.5, 402.6, 392.8, 384.1],
    "Att_Total": [484.7, 471.5, 498.9, 495.5, 483.8],
    "Cmp%_Total": [76.6, 79.0, 80.7, 79.3, 79.4],
    "TotDist_Total": [6823, 6665, 7167, 6833, 6882],
    "PrgDist_Total": [2620, 2455, 2556, 2425, 2502],
    "Cmp_Short": [156.4, 170.4, 173.7, 182.8, 170.8],
    "Att_Short": [183.2, 192.2, 199.7, 207.4, 193.8],
    "Cmp%_Short": [85.4, 88.7, 87.0, 88.1, 88.1],
    "Cmp_Medium": [163.1, 150.3, 172.3, 162.3, 160.5],
    "Att_Medium": [193.9, 176.5, 198.9, 189.3, 187.2],
    "Cmp%_Medium": [84.1, 85.2, 86.6, 85.7, 85.7],
    "Cmp_Long": [42.4, 41.6, 42.3, 37.5, 42.4],
    "Att_Long": [80.1, 76.1, 72.9, 71.4, 76.0],
    "Cmp%_Long": [52.9, 54.7, 58.0, 52.5, 55.8],
    "Ast": [1.09, 0.86, 0.89, 0.96, 0.89],
    "xAG": [1.03, 0.94, 0.97, 1.03, 0.92],
    "xA": [1.11, 0.90, 0.99, 0.88, 0.80],
    "A-xAG": [0.05, -0.09, -0.09, -0.09, -0.04],
    "KP": [9.3, 9.2, 9.0, 9.4, 9.6],
    "1/3": [27.9, 30.0, 32.8, 30.3, 28.4],
    "PPA": [7.8, 7.4, 8.0, 8.1, 7.7],
    "CrsPA": [2.1, 2.2, 2.0, 1.9, 2.2],
    "PrgP": [35.9, 36.8, 41.5, 37.8, 35.7]
}

df_passing = pd.DataFrame(data)
df_passing.columns

### Importing Data

In [None]:
# Squad-level passing table for the 2022-2023 Big-5 leagues, scraped from FBref.
# header=1 takes the second header row of the HTML table as column names.
url = "https://fbref.com/en/comps/Big5/2022-2023/passing/squads/2022-2023-Big-5-European-Leagues-Stats"
fbref_data = pd.read_html( url, header=1)[0]

fifa_data = TPSR(players_df)

# NOTE(review): presumably the ids of the five Big-5 leagues in the FIFA data set — confirm.
top5_leagues_id = [str(i) for i in [13,53,19,31,16]]

# Exact membership test on the numeric id. The previous regex substring match
# ("13|53|19|31|16") also accepted any id that merely contains one of these digit
# pairs (e.g. 353, 319, 131), which pulled clubs from other leagues into the
# fuzzy-matching pool.
_league_ids = pd.to_numeric(fifa_data["league_id"], errors="coerce")
q = _league_ids.isin([int(i) for i in top5_leagues_id])

# Kept for compatibility: league_id is still exposed as a string column.
fifa_data["league_id"] = fifa_data["league_id"].astype(str)

fifa_data = fifa_data[q]

### Merging FIFA & FBRef

In [None]:
from rapidfuzz import process
# NOTE(review): second, identical definition of find_best_match in this notebook;
# it rebinds the earlier one.
def find_best_match(name, choices):
    """Return rapidfuzz's single best fuzzy match for ``name`` among ``choices``.

    Thin wrapper around ``process.extractOne`` with its default scorer and no
    score cutoff; the result of that call is returned unchanged.
    """
    return process.extractOne(name, choices)

# NOTE(review): mapped_data is defined twice in this notebook; this later
# definition is the one in effect for the passing section.
def mapped_data(fifa_data, fbref_data):
    """Join FBref squad rows to FIFA club rows via fuzzy club-name matching.

    Each distinct ``fbref_data['Squad']`` value is mapped to the closest entry
    of ``fifa_data['club_name']`` (see ``find_best_match``). The mapping is
    written to a new ``'Mapped_club_name'`` column on ``fbref_data`` and used as
    the key of an inner merge.

    Parameters
    ----------
    fifa_data : pandas.DataFrame
        Must contain a ``'club_name'`` column.
    fbref_data : pandas.DataFrame
        Must contain a ``'Squad'`` column. Modified in place: gains the
        ``'Mapped_club_name'`` column (NaN where no match was found).

    Returns
    -------
    pandas.DataFrame
        Inner merge of the two frames; squads without a match are dropped.

    Notes
    -----
    No similarity threshold is applied, so a poor best match is still accepted.
    """
    club_names = fifa_data['club_name']

    fbref_to_fifa_mapping = {}
    for fb_name in fbref_data['Squad'].unique():
        best = find_best_match(fb_name, club_names)
        # extractOne returns None when nothing can be matched (e.g. no choices);
        # indexing it unconditionally raised TypeError before this check.
        if best is not None:
            fbref_to_fifa_mapping[fb_name] = best[0]

    fbref_data['Mapped_club_name'] = fbref_data['Squad'].map(fbref_to_fifa_mapping)

    # Drop unmapped squads explicitly so a NaN key can never pair with a NaN club_name.
    merged_data = pd.merge(
        fbref_data.dropna(subset=['Mapped_club_name']),
        fifa_data,
        how='inner',
        left_on='Mapped_club_name',
        right_on='club_name',
    )
    return merged_data

# NOTE(review): `all_team_ids` is never read in the visible notebook.
all_team_ids = []
# Disabled debugging aid: prints each (Squad, club_name, club_team_id) triple so the
# fuzzy mapping can be checked by eye.
# lst = mapped_data(fifa_data, fbref_data)[["Squad","club_name","club_team_id"]]
# for i in zip(lst.Squad, lst.club_name, lst.club_team_id):
#     print(i)

In [None]:
# Merged FBref + FIFA squad table used by the passing regressions below.
# Note: this rebinds `data`, which held the raw passing dict in an earlier cell.
data = mapped_data(fifa_data, fbref_data).copy()
data.columns

In [None]:
# Regression targets derived from the FBref columns:
#   pass_accuracy — completion percentage rescaled to a 0-1 fraction
#   pass_attempts — attempted passes per 90 minutes, truncated to a whole number
data["pass_accuracy"] = data["Cmp%"].div(100)
data["pass_attempts"] = data["Att"].div(data["90s"]).astype("int64")

### Linear Model

In [None]:
# Fit a Ridge regression predicting a squad's pass_accuracy from its `power` and `passing` columns.
# The names defined here (X, y, model, ...) are read by the "Error" and "Test" cells below.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

feature_columns = ["power","passing"]
target_variable = "pass_accuracy"

X = data[feature_columns]
y = data[target_variable]

# 80/20 split with a fixed seed so the reported MSE is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Held-out error plus the fitted parameters (coefficients follow feature_columns order).
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

### Error

In [None]:
# In-sample diagnostics for the pass-accuracy model: predict every squad (train and
# test rows alike) and store prediction and residual next to the observed value.
# The prediction variable is named for what it holds (it was previously a
# copy-pasted `predicted_g_sot_all`).
predicted_pass_accuracy = model.predict(X)
data['Predicted'] = predicted_pass_accuracy
data["Error"] = data['pass_accuracy'] - predicted_pass_accuracy

# `err` is the signed mean residual (bias); positive and negative errors cancel,
# so the mean absolute error is printed as well to show the typical miss.
err = data["Error"].mean()
print(err)
print("Mean absolute error:", data["Error"].abs().mean())

### Test

In [None]:
# Single what-if input for the pass-accuracy model.
passing = 77
power = 79

input_data = pd.DataFrame({
    "power": [power],
    "passing": [passing],
})

# NOTE(review): `predicted` is computed but not displayed; only the table below is the cell output.
predicted = model.predict(input_data)
# Five squads with the highest predicted pass accuracy.
data[["Squad", "pass_accuracy", "Predicted", "passing"]].sort_values("Predicted", ascending=False).head(n=5)

<br><br><br><br>

<h3 style="color:orange">Pass Attempts</h3>

In [None]:
# Fit a Ridge regression predicting a squad's pass_attempts from its `power` and `passing` columns.
# This rebinds X, y and model, replacing the pass-accuracy model from the previous section.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

feature_columns = ["power","passing"]
target_variable = "pass_attempts"

X = data[feature_columns]
y = data[target_variable]

# 80/20 split with a fixed seed so the reported MSE is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Held-out error plus the fitted parameters (coefficients follow feature_columns order).
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

In [None]:
# In-sample predictions for every squad; astype(int) truncates toward zero (no rounding).
predicted = model.predict(X).astype(int)
data['Predicted'] = predicted
# Signed residual, stored in lower-case "error" (the earlier sections wrote "Error").
data["error"] = data['pass_attempts'] - predicted

# Mean absolute error, printed as a truncated whole number.
error = data["error"].apply(abs)
error_mean = error.mean()
print("Error for","Passing.pass_attempts:", int(error_mean))

# Single what-if prediction.
power = 84
passing = 80

input_data = pd.DataFrame({ 
    "power": [power],
    "passing": [passing],
})
# Rebinds `predicted` to the one-row result; the per-squad values live on in data['Predicted'].
predicted = model.predict(input_data)
print("predicted", predicted)

### Randomness Factor

In [None]:
# Integer noise candidates -45 ... 45 inclusive (91 values). arange produces exact
# integers directly instead of casting linspace floats.
random_interval = np.arange(-45, 46, dtype="int64")

# One independent draw per squad. The sample size follows the frame instead of a
# hard-coded 98, so the column assignment cannot fail when the merge yields a
# different number of rows.
random_choices = np.random.choice(random_interval, size=len(data))

data['choice'] = random_choices
# Simulated pass attempts = model prediction minus the random offset.
data['generated'] = data['Predicted'] - data['choice']

In [None]:
# Compare observed, predicted and noise-perturbed pass attempts for the five squads with the highest `power`.
data[["club_team_id","Squad","power","passing","pass_attempts","Predicted", "error", "choice", "generated"]] \
.sort_values("power",ascending=False).head(n=5)

In [None]:
# Absolute pass-attempt residual per squad.
f = data["error"].abs()

In [None]:
# Summary statistics of the absolute residuals.
f.describe()

# Next Goal Probability

In [None]:
def NGP(home_rating, away_rating, min_rating=70, max_rating=84):
    """Next Goal Probability for the home side.

    Both ratings are min-max normalised with ``min_rating`` and ``max_rating``;
    the difference of the normalised ratings is passed through the logistic
    function ``1 / (1 + exp(-x))``. Equal ratings therefore give 0.5, and
    ``NGP(a, b) + NGP(b, a) == 1``.

    Parameters
    ----------
    home_rating, away_rating : number
        Team ratings.
    min_rating : number, default 70
        Lower bound of the normalisation. It cancels out of the difference, so
        only ``max_rating - min_rating`` affects the result.
    max_rating : number, default 84
        Upper bound of the normalisation (78 and 80 were also tried).

    Returns
    -------
    float
        Probability in (0, 1) attributed to the home side.

    Raises
    ------
    ValueError
        If ``max_rating`` equals ``min_rating`` (the normalisation would divide by zero).
    """
    rating_span = max_rating - min_rating
    if rating_span == 0:
        raise ValueError("max_rating must differ from min_rating")

    # Min-max normalisation
    normalized_home_rating = (home_rating - min_rating) / rating_span
    normalized_away_rating = (away_rating - min_rating) / rating_span

    rating_diff = normalized_home_rating - normalized_away_rating
    ngp_prob = 1 / (1 + np.exp(-rating_diff))
    return ngp_prob

#Test
# Example: home side rated 83 against an away side rated 79.
home_rating = 83
away_rating = 79
probability = NGP(home_rating, away_rating)
print("Next Goal Probability:", probability)