In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings

In [76]:
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))
warnings.simplefilter(action='ignore', category=(FutureWarning))

In [77]:
fm_df = pd.read_csv("players_db/fm23/fm23db_processed.csv")

In [78]:
fm_df[["Club","Based","Division","Club_id","League_id"] + list(fm_df.select_dtypes(exclude="object").columns)].head()

Unnamed: 0,Club,Based,Division,Club_id,League_id,UID,Height,Age,Caps,Acceleration,...,tpr_ML,tpr_MR,tpr_WBR,tpr_WBL,tpr_without_shifting,tpr_normalized,tpr_general,tpr_AVERAGE,tpr_Best,Error_tpr
0,FC Bayern,Germany (Bundesliga),Bundesliga,3704,185,92039023,177,27,68,64,...,46,46,99,49,99,95,72,92,99,0
1,FC Bayern,Germany (Bundesliga),Bundesliga,3704,185,85100467,174,30,87,79,...,81,58,55,70,99,93,70,91,98,1
2,FC Bayern,Germany (Bundesliga),Bundesliga,3704,185,35011448,185,32,115,59,...,69,96,47,46,99,95,71,92,99,0
3,FC Bayern,Germany (Bundesliga),Bundesliga,3704,185,8718372,193,36,112,54,...,41,41,41,41,99,95,64,96,99,4
4,FC Bayern,Germany (Bundesliga),Bundesliga,3704,185,91104807,189,27,44,59,...,61,55,50,58,93,91,69,89,97,1


In [79]:
fm_df.Best_Pos

0        DM
1       AML
2       AMC
3        GK
4        MC
       ... 
6417    AMR
6418     GK
6419    AMR
6420     DC
6421     GK
Name: Best_Pos, Length: 6422, dtype: object

In [80]:
fm_df.Club.value_counts(sort=True)

Club
Hellas Verona          74
Sassuolo               73
Internazionale         71
Benfica                71
Vizela                 68
                       ..
Sparta                 24
FC Volendam            23
Paris Saint-Germain    23
Ajax                   22
Clermont               22
Name: count, Length: 153, dtype: int64

In [81]:
fm_df.iloc[0].to_dict()

{'UID': 92039023,
 'Inf': nan,
 'Name': 'Joshua Kimmich',
 'DoB': '1995-02-08',
 'Nat': 'GER',
 'Division': 'Bundesliga',
 'Club': 'FC Bayern',
 'Based': 'Germany (Bundesliga)',
 'Preferred Foot': 'Right',
 'Right Foot': 'Very Strong',
 'Left Foot': 'Fairly Strong',
 'Position': 'DR,WBR,DM,MC',
 'Height': 177,
 'Weight': '75 kg',
 'Age': 27,
 'Transfer Value': '€96M - €119M',
 'Wage': '€1,496,000 p/m',
 'AT Apps': '253',
 'AT Gls': '23',
 'Team': nan,
 'Caps': 68,
 'Yth Apps': '14',
 'Style': 'Leader',
 'Rc Injury': '-',
 'Best_Role': 'Deep Lying Playmaker',
 'Best Duty': 'Defend',
 'Best_Pos': 'DM',
 'Acceleration': 64,
 'Aerial_Reach': 4,
 'Aggression': 74,
 'Agility': 59,
 'Anticipation': 74,
 'Balance': 64,
 'Bravery': 74,
 'Command_of_Area': 9,
 'Communication': 9,
 'Composure': 84,
 'Concentration': 69,
 'Corners': 74,
 'Crossing': 84,
 'Decisions': 79,
 'Determination': 99,
 'Dribbling': 59,
 'Eccentricity': 14,
 'Finishing': 54,
 'First_Touch': 74,
 'Flair': 64,
 'Free_Kick_Tak

In [82]:
rating_attrs = ["Club","gk","def","pas","dri","fin","sta","str","hed","men","iq"]

# Formations

In [83]:
def Formation_Dict(formation="4-3-3"):
    # test_formation = { 'GK': 1, 'DC': 2, 'DL': 1, 'DR': 1, 'DM': 1, 'MC': 2, 'AML': 1, 'AMR': 1, 'ST': 1 }
    test_formation = { 'GK': 1, 'DC': 3, 'WBL': 1, 'WBR': 1, 'DM': 0, 'MC': 3, 'AML': 0, 'AMR': 0, 'ST': 2 }
    return test_formation

<br><br><br><br>
# Rating Functions

In [84]:
def Quantile(n=16, index=0):
    data_numeric = pd.DataFrame(list(range(n)))
    quantile_intervals = [0, 0.25, 0.40, 0.65, 0.75, 1.0]
    qval = data_numeric.quantile(quantile_intervals)
    return int(qval.iloc[index]) 

In [85]:
def Quarter_Rating(df, n, index, current_attribute):
    quantile = Quantile(n=n, index=index)
    output = df.nlargest(quantile, current_attribute)[current_attribute].mean()
    return output

<br><br><br><br>
# Club Powers

In [86]:
def TPR(df, n, lineup=False, print_club=False, coef_dict={}):
    '''
    Team Power Rating
    '''
    # Choose Attribute
    current_attribute = "tpr"
    zoom_df = df
    
    # Group by club
    club_groups = df.groupby('Club')
    club_rating_dict = {}

    if not lineup:
        for club, group in club_groups:
            top_players = group.nlargest(n, 'tpr')
            p1 = Quarter_Rating(top_players, n, 1, current_attribute)
            p2 = Quarter_Rating(top_players, n, 2, current_attribute)
            p3 = Quarter_Rating(top_players, n, 3, current_attribute)
            p4 = Quarter_Rating(top_players, n, 4, current_attribute)
            p5 = Quarter_Rating(top_players, n, 5, current_attribute)
            club_rating_dict[club] = int(p1*coef_dict["p1"] + p2*coef_dict["p2"] + p3*coef_dict["p3"] + p4*coef_dict["p4"] + p5*coef_dict["p5"])
            
    else: # Lineup_df is active
        for club, group in club_groups:
            positions = Formation_Dict(formation="4-3-3")
            
            selected_players = []
            used_players = set()

            for position, count in positions.items():
                # position_group = group[group['Best_Pos'] == position] # This variable selects only Best_Pos
                position_group = group[group['Position'].apply(lambda x: any(position in x.split(",") for i in x.split(",") if i == position))]
                
                if position_group.empty:
                    # position_group = group[group['Position'].apply(lambda x: position in x)]
                    position_group = group[group['Position'].apply(lambda x: any(position in x.split(",") for i in x.split(",") if i == position))]
                
                position_group = position_group[~position_group.index.isin(used_players)]
                top_position_players = position_group.nlargest(count, f'tpr_{position}')
                
                selected_players.extend(top_position_players.index.tolist())
                used_players.update(top_position_players.index.tolist())

            # Ensure we have exactly 11 players 
            while len(selected_players) < 11:
                remaining_players = group[~group.index.isin(used_players)]
                if remaining_players.empty:
                    break
                next_best_player = remaining_players.nlargest(1, f'tpr_{position}')
                selected_players.extend(next_best_player.index.tolist())
                used_players.update(next_best_player.index.tolist())

            lineup_players = group.loc[selected_players].sort_values(by="Pos_Rank")
            p1 = Quarter_Rating(lineup_players, n, 1, current_attribute)
            p2 = Quarter_Rating(lineup_players, n, 2, current_attribute)
            p3 = Quarter_Rating(lineup_players, n, 3, current_attribute)
            p4 = Quarter_Rating(lineup_players, n, 4, current_attribute)
            p5 = Quarter_Rating(lineup_players, n, 5, current_attribute)
            club_rating_dict[club] = int(p1*coef_dict["p1"] + p2*coef_dict["p2"] + p3*coef_dict["p3"] + p4*coef_dict["p4"] + p5*coef_dict["p5"])

            if club == print_club:
                print(club, "XI =", len(lineup_players))
                print(lineup_players[["Name","Best_Pos","tpr"]])
                
    club_df = pd.DataFrame(list(club_rating_dict.items()), columns=['Club', current_attribute])
    return club_df.sort_values(current_attribute, ascending=False)

In [None]:
example_club="Liverpool"
output = TPR(fm_df, n=16, lineup=True, print_club=example_club,
             coef_dict={ 'p1': 0.30, 'p2': 0.35, 'p3': 0.15, 'p4': 0.15, 'p5': 0.05 }).head(n=10)
output

In [88]:
example_pos="tpr_ST"
fm_df.query(f"Club == '{example_club}'").sort_values(by=example_pos,ascending=False)[["Name","Best_Pos","Position","tpr",example_pos]] \
.head(n=5)

Unnamed: 0,Name,Best_Pos,Position,tpr,tpr_ST
3194,Mohamed Salah,AMR,"AMR,AML,ST",92,95
3220,Roberto Firmino,ST,"AMR,AML,AMC,ST",84,92
3451,Darwin Núñez,ST,ST,80,92
3249,Diogo Jota,AML,"AMR,AML,ST",82,90
3577,Fábio Carvalho,AML,"AMR,AML,AMC,ST",77,85


<h1 style="color:white;background:green;">  Goalkeeper</h1>

In [89]:
def TGK(df, n=1):
    '''
    Team Goalkeeping Rating
    '''
    # Choose Attribute
    current_attribute = "gk"
    zoom_df = df[df['Best_Pos'] == 'GK'].copy()
    
    # Group by club
    club_groups = zoom_df.groupby('Club')
    club_rating_dict = {}

    for club, group in club_groups:
        top_players = group.nlargest(n, current_attribute)
        average_rating = top_players[current_attribute].mean()
        club_rating_dict[club] = int(average_rating)
        
    club_df = pd.DataFrame(list(club_rating_dict.items()), columns=['Club', "GK"])
    return club_df.sort_values("GK", ascending=False)

<h1 style="color:aqua;background:blue;">  Pitcher</h1>

In [90]:
gk_attributes = [ "Aerial_Reach","Command_of_Area", "Communication","Eccentricity", "First_Touch", "Handling", "Kicking", "One_on_Ones", "Punching", "Reflexes", "Rushing_Out", "Throwing", "Passing" ]
all_attributes = ['Acceleration','Aerial_Reach','Aggression','Agility','Anticipation','Balance','Bravery','Command_of_Area','Communication','Composure','Concentration','Corners','Crossing','Decisions','Determination','Dribbling','Eccentricity','Finishing','First_Touch','Flair','Free_Kick_Taking','Handling','Heading','Jumping_Reach','Kicking','Leadership','Long_Shots','Long_Throws','Marking','Natural_Fitness','Off_the_Ball','One_on_Ones','Pace','Passing','Penalty_Taking','Positioning','Punching','Reflexes','Rushing_Out','Stamina','Strength','Tackling','Teamwork','Technique','Throwing','Vision','Work_Rate']
non_gk_attributes = list(set(all_attributes) - set(gk_attributes))
gk_pitcher_common_attributes = ["Passing","First_Touch"]
other_attributes = [ "tpr" ]
# mean_compiled_attributes = ["def","pas","dri","fin","sta","str","hed","men","iq"]
pitcher_attributes = np.concatenate((other_attributes, non_gk_attributes, gk_pitcher_common_attributes))

In [91]:
len(pitcher_attributes)

37

In [92]:
def TPR_MACHINE(current_attribute, df, n=16, coef_dict={}, storage_df=None):
    '''
    Team Defending Rating
    current_attribute: chosen attribute
    df: Main dataframe we get data from
    n: best n players in the club
    coef_dict: coefficients
    storage_dict: storage dictionary
    '''
    
    # Group by club
    club_groups = df.groupby('Club')
    club_rating_dict = {}

    for club, group in club_groups:
        top_players = group.nlargest(n, current_attribute)
        p1 = Quarter_Rating(top_players, n, 1, current_attribute)
        p2 = Quarter_Rating(top_players, n, 2, current_attribute)
        p3 = Quarter_Rating(top_players, n, 3, current_attribute)
        p4 = Quarter_Rating(top_players, n, 4, current_attribute)
        p5 = Quarter_Rating(top_players, n, 5, current_attribute)
        rating_value = int(p1*coef_dict["p1"] + p2*coef_dict["p2"] + p3*coef_dict["p3"] + p4*coef_dict["p4"] + p5*coef_dict["p5"])
        where_id = storage_df[storage_df.Club==club].iloc[0].name
        storage_df.at[where_id, current_attribute] = rating_value
    return None

In [93]:
pitcher_attributes

array(['tpr', 'Anticipation', 'Stamina', 'Balance', 'Bravery', 'Marking',
       'Pace', 'Off_the_Ball', 'Heading', 'Crossing', 'Vision',
       'Tackling', 'Decisions', 'Natural_Fitness', 'Positioning',
       'Jumping_Reach', 'Acceleration', 'Dribbling', 'Finishing', 'Flair',
       'Free_Kick_Taking', 'Agility', 'Penalty_Taking', 'Aggression',
       'Long_Throws', 'Concentration', 'Composure', 'Technique',
       'Corners', 'Leadership', 'Teamwork', 'Strength', 'Determination',
       'Work_Rate', 'Long_Shots', 'Passing', 'First_Touch'], dtype='<U16')

In [94]:
# rating_coefficients = { 'p1': 0.35, 'p2': 0.25, 'p3': 0.20, 'p4': 0.15, 'p5': 0.05 }
# rating_coefficients = { 'p1': 0.35, 'p2': 0.25, 'p3': 0.15, 'p4': 0.15, 'p5': 0.10 }
rating_coefficients = { 'p1': 0.20, 'p2': 0.20, 'p3': 0.35, 'p4': 0.15, 'p5': 0.10 }

In [95]:
teams_df = fm_df[["Club","Based","Division","Club_id","League_id"]].drop_duplicates()
teams_df = pd.merge(TGK(fm_df, n=1), teams_df, how="inner", on="Club")

for attribute in pitcher_attributes:
    TPR_MACHINE(attribute, fm_df, 16, rating_coefficients, teams_df)
    print(f"TPR Machine operational. Attribute: ---{attribute}--- integrated.")

TPR Machine operational. Attribute: ---tpr--- integrated.
TPR Machine operational. Attribute: ---Anticipation--- integrated.
TPR Machine operational. Attribute: ---Stamina--- integrated.
TPR Machine operational. Attribute: ---Balance--- integrated.
TPR Machine operational. Attribute: ---Bravery--- integrated.
TPR Machine operational. Attribute: ---Marking--- integrated.
TPR Machine operational. Attribute: ---Pace--- integrated.
TPR Machine operational. Attribute: ---Off_the_Ball--- integrated.
TPR Machine operational. Attribute: ---Heading--- integrated.
TPR Machine operational. Attribute: ---Crossing--- integrated.
TPR Machine operational. Attribute: ---Vision--- integrated.
TPR Machine operational. Attribute: ---Tackling--- integrated.
TPR Machine operational. Attribute: ---Decisions--- integrated.
TPR Machine operational. Attribute: ---Natural_Fitness--- integrated.
TPR Machine operational. Attribute: ---Positioning--- integrated.
TPR Machine operational. Attribute: ---Jumping_Reach

<h1 style="color:white;background:purple;">  Normalization</h1>

In [96]:
def Normalize(X, min_val, max_val):
    min_col = X.min()
    max_col = X.max()
    value = (X - min_col) / (max_col - min_col) * (max_val - min_val) + min_val
    return value.astype(int)

In [97]:
attributes_to_standardize = np.concatenate((pitcher_attributes,["GK"]))
for attribute in attributes_to_standardize:
    teams_df[attribute] = Normalize(teams_df[attribute], 50, 92)

In [98]:
teams_df.sort_values(by="tpr",ascending=False)[["Club","tpr"]].head(n=10)

Unnamed: 0,Club,tpr
0,FC Bayern,92
2,Liverpool,92
3,Manchester City,89
51,Paris Saint-Germain,89
6,Real Madrid,87
1,FC Barcelona,87
13,Chelsea,84
26,AC Milan,82
20,Internazionale,82
9,Juventus,79


<br>
<h4 style="color:green;">  Export as CSV</h4>

In [99]:
teams_df.to_csv("players_db/fm23/team_ratings.csv", index=False)

<br><br><br><br>
# Matching FBref Names

In [8]:
import pandas as pd
from rapidfuzz import process
def find_best_match(name, choices):
    return process.extractOne(name, choices)

In [9]:
top5_leagues = ['Ligue 1 Uber Eats', 'English Premier Division', 'Italian Serie A',
       'Spanish First Division', 'Bundesliga'] 
top5_url = "https://fbref.com/en/comps/Big5/2022-2023/shooting/squads/2022-2023-Big-5-European-Leagues-Stats"

In [10]:
data = pd.read_html(top5_url)
teams_df_2 = pd.read_csv("players_db/fm23/team_ratings.csv")

In [11]:
fbref_teams = data[0][('Unnamed: 1_level_0', 'Squad')].tolist()
fm_teams = teams_df_2[teams_df_2.Division.isin(top5_leagues)]["Club"].tolist()

In [12]:
for club_name in fm_teams:
    answer, score, other = find_best_match(club_name, fbref_teams)
    where_id = teams_df_2[teams_df_2.Club==club_name].iloc[0].name
    teams_df_2.at[where_id, 'fbref_name'] = answer
teams_df_2.to_csv("players_db/fm23/team_ratings.csv", index=False)