In [37]:
import numpy as np
import re
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

In [38]:
df_23 = pd.read_csv('./Dataset/FIFA23_official_data.csv')
df_22 = pd.read_csv('./Dataset/FIFA22_official_data.csv')


In [39]:
# Filter df_22 and df_23 to keep only the common club names
common_clubs = set(df_22['Club']).intersection(set(df_23['Club']))
df_22_common = df_22[df_22['Club'].isin(common_clubs)]
df_23_common = df_23[df_23['Club'].isin(common_clubs)]

# Merge the filtered DataFrames
merged = pd.merge(df_22_common, df_23_common, on='ID', how='inner', suffixes=('_22', '_23'))

In [40]:
transfers = merged[merged['Club_22'] != merged['Club_23']][['ID', 'Name_22','Club_22','Club_23',"Best Position","BestPosition"]]

In [41]:
transfers

Unnamed: 0,ID,Name_22,Club_22,Club_23,Best Position,BestPosition
2,176580,L. Suárez,Atlético de Madrid,Club Nacional de Football,ST,ST
6,181291,G. Wijnaldum,Paris Saint-Germain,Roma,CM,CM
15,212462,Alex Telles,Manchester United,Sevilla FC,LB,LB
21,200458,L. Digne,Everton,Aston Villa,LB,LB
22,208574,F. Kostić,Eintracht Frankfurt,Juventus,LM,LM
...,...,...,...,...,...,...
9823,258928,V. Sinisalo,Aston Villa,Burton Albion,GK,GK
9840,257138,B. Nna Noukeu,Crawley Town,Stoke City,GK,GK
9845,263373,J. Searle,Swansea City,Barnsley,GK,GK
9846,259718,F. Gebhardt,FC Basel 1893,Hallescher FC,GK,GK


In [42]:
transfers.to_csv('./rec_extended_data/ground_truth.csv')

## Approach 1: KNN

In [43]:
Unimportant_features  = ["Photo", "Flag", "Club Logo", "Wage", "Special", "International Reputation", "Work Rate", "Body Type", "Real Face", "Jersey Number", "Contract Valid Until", "Best Overall Rating","Joined","Loaned From"]
df_22.drop(Unimportant_features, axis = 1, inplace = True)


In [44]:
#One - Hot Encoding the Preferred Foot
df_22 = pd.get_dummies(df_22,columns = ["Preferred Foot"])

In [45]:
detailed_features = ['Age','Overall', 'Potential', 'Weak Foot', 'Skill Moves', 'Height', 'Weight', 'Crossing', 'Finishing',
       'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
       'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Preferred Foot_Left', 'Preferred Foot_Right']

In [46]:
df_22[detailed_features] = df_22[detailed_features].fillna(0)

In [47]:
# Clean height column
feet_inches_re = re.compile(r"(\d)'(\d+)")

# Define a function to convert feet'inches to cm
def feet_inches_to_cm(s):
    match = feet_inches_re.match(s)
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2))
        return round((feet*12 + inches) * 2.54, 2)
    else:
        return float(s.replace('cm', ''))
df_22['Height'] = pd.to_numeric(df_22['Height'].apply(feet_inches_to_cm))

In [48]:
# Clean Weight Column
def convert_weight(weight_str):
    if 'kg' in weight_str:
        # If weight is in kg, remove the 'kg' suffix and return as float
        return float(weight_str.replace('kg', ''))
    elif 'lbs' in weight_str:
        # If weight is in lbs, convert to kg and return as float
        return float(weight_str.replace('lbs', '')) * 0.453592
    else:
        print("oops")
        # Return NaN if weight format is not recognized
        return pd.np.nan

# Apply the function to the Weight column and create a new column 'Weight_kg'
df_22['Weight'] = df_22['Weight'].apply(convert_weight)

In [49]:
df = df_22.select_dtypes(include = 'number')
df.shape

(16710, 45)

In [50]:
# Compute the correlation matrix
corr = df.corr()

In [51]:
df = df.fillna(0)

In [52]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Separate the ID column and scale the data
X = df.drop('ID', axis=1).fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create the KNN model
knn = NearestNeighbors() # don't specify the number of neighbors yet, we will set it in the function
knn.fit(X_scaled)

# Define a function to get the top k closest neighbors of a player
def get_top_k_neighbors(player_id, k=5):
    knn.set_params(n_neighbors=k+1) # add 1 to include the input player in the neighbors
    player_row = df[df['ID'] == player_id].drop('ID', axis=1).fillna(0)
    player_row_scaled = scaler.transform(player_row)
    distances, indices = knn.kneighbors(player_row_scaled)
    neighbor_ids = df.iloc[indices[0]][1:]['ID'].values # exclude the first index which is the input player
    return df[df['ID'].isin(neighbor_ids)].reset_index(drop=True)

In [53]:
real_transfers = pd.read_csv('./rec_extended_data/ground_truth.csv')
real_transfers

Unnamed: 0.1,Unnamed: 0,ID,Name_22,Club_22,Club_23,Best Position,BestPosition
0,2,176580,L. Suárez,Atlético de Madrid,Club Nacional de Football,ST,ST
1,6,181291,G. Wijnaldum,Paris Saint-Germain,Roma,CM,CM
2,15,212462,Alex Telles,Manchester United,Sevilla FC,LB,LB
3,21,200458,L. Digne,Everton,Aston Villa,LB,LB
4,22,208574,F. Kostić,Eintracht Frankfurt,Juventus,LM,LM
...,...,...,...,...,...,...,...
3562,9823,258928,V. Sinisalo,Aston Villa,Burton Albion,GK,GK
3563,9840,257138,B. Nna Noukeu,Crawley Town,Stoke City,GK,GK
3564,9845,263373,J. Searle,Swansea City,Barnsley,GK,GK
3565,9846,259718,F. Gebhardt,FC Basel 1893,Hallescher FC,GK,GK


In [54]:
def find_top_players(df_22, real_transfers):
    top_players_dict = {}
    id_list = []

    for index, row in real_transfers.iterrows():
        club_23 = row["Club_23"]
        ID = row["ID"]
        best_position = row["BestPosition"]
        club_df = df_22[(df_22["Club"] == club_23) & (df_22["Best Position"] == best_position)]
        if club_df.empty:
            # print(f"No players found for {club_23} in {best_position} for {row['Name_22']}")
            # real_transfers = real_transfers.drop(index=index)
            pass
        else:
            # Sort the filtered dataframe by the Overall column in descending order
            sorted_df = club_df.sort_values(by="Overall", ascending=False)

            # Take the top 2 players and store their IDs in the dictionary
            top_players_dict[(club_23, best_position,ID)] = list(sorted_df.head(2)["ID"])
            id_list.append(ID)
    return top_players_dict,id_list

In [55]:
top_players_dict,id_list= find_top_players(df_22, real_transfers)

In [56]:
len(id_list)

3029

In [57]:
len(real_transfers)

3567

In [58]:
len(top_players_dict)

3001

In [59]:
final_real_transgers_df = real_transfers[real_transfers["ID"].isin(id_list)]
final_real_transgers_df

Unnamed: 0.1,Unnamed: 0,ID,Name_22,Club_22,Club_23,Best Position,BestPosition
0,2,176580,L. Suárez,Atlético de Madrid,Club Nacional de Football,ST,ST
1,6,181291,G. Wijnaldum,Paris Saint-Germain,Roma,CM,CM
2,15,212462,Alex Telles,Manchester United,Sevilla FC,LB,LB
3,21,200458,L. Digne,Everton,Aston Villa,LB,LB
4,22,208574,F. Kostić,Eintracht Frankfurt,Juventus,LM,LM
...,...,...,...,...,...,...,...
3562,9823,258928,V. Sinisalo,Aston Villa,Burton Albion,GK,GK
3563,9840,257138,B. Nna Noukeu,Crawley Town,Stoke City,GK,GK
3564,9845,263373,J. Searle,Swansea City,Barnsley,GK,GK
3565,9846,259718,F. Gebhardt,FC Basel 1893,Hallescher FC,GK,GK


### Top 5

In [60]:
pred_5 =0 
total_5 =0
for key, value in top_players_dict.items():
    club_23 = key[0]
    best_position = key[1]
    ID = key[2]
    top_player_ids = value
    total_5 = total_5 +1

    for id in top_player_ids:
        top_5_neighbors = get_top_k_neighbors(id)["ID"].tolist()
        if ID in top_5_neighbors:
            pred_5 = pred_5 +1
            print("Player has matched",ID,df_22[df_22["ID"]==ID]["Name"])


    

Player has matched 230658 111    Arthur
Name: Name, dtype: object
Player has matched 233419 261    Raphinha
Name: Name, dtype: object
Player has matched 222028 838    J. Weigl
Name: Name, dtype: object
Player has matched 231887 890    Y. Yazıcı
Name: Name, dtype: object
Player has matched 204876 905    Clerc
Name: Name, dtype: object
Player has matched 226166 972    N. Mukiele
Name: Name, dtype: object
Player has matched 193290 1309    M. Braithwaite
Name: Name, dtype: object
Player has matched 213813 1466    A. Fransson
Name: Name, dtype: object
Player has matched 237512 2409    G. Togni
Name: Name, dtype: object
Player has matched 243631 2800    M. Camara
Name: Name, dtype: object
Player has matched 202695 3121    J. Tarkowski
Name: Name, dtype: object
Player has matched 221705 3394    K. Dempsey
Name: Name, dtype: object
Player has matched 215353 4314    L. Alario
Name: Name, dtype: object
Player has matched 245247 4654    F. Domínguez
Name: Name, dtype: object
Player has matched 25

### Top 10

In [61]:
pred_10 =0 
total_10 =0
for key, value in top_players_dict.items():
    club_23 = key[0]
    best_position = key[1]
    ID = key[2]
    top_player_ids = value
    total_10 = total_10 +1

    for id in top_player_ids:
        top_5_neighbors = get_top_k_neighbors(id,10)["ID"].tolist()

        if ID in top_5_neighbors:
            pred_10 = pred_10 +1
            print("Player has matched",ID,df_22[df_22["ID"]==ID]["Name"])

Player has matched 230658 111    Arthur
Name: Name, dtype: object
Player has matched 239231 234    Cucurella
Name: Name, dtype: object
Player has matched 233419 261    Raphinha
Name: Name, dtype: object
Player has matched 245279 581    Reguilón
Name: Name, dtype: object
Player has matched 212678 825    L. Augustinsson
Name: Name, dtype: object
Player has matched 222028 838    J. Weigl
Name: Name, dtype: object
Player has matched 231887 890    Y. Yazıcı
Name: Name, dtype: object
Player has matched 204876 905    Clerc
Name: Name, dtype: object
Player has matched 226166 972    N. Mukiele
Name: Name, dtype: object
Player has matched 211575 984    André Gomes
Name: Name, dtype: object
Player has matched 193290 1309    M. Braithwaite
Name: Name, dtype: object
Player has matched 200601 1351    Yoon Bit Garam
Name: Name, dtype: object
Player has matched 213813 1466    A. Fransson
Name: Name, dtype: object
Player has matched 229237 1756    M. Akanji
Name: Name, dtype: object
Player has matched 

### Top 25

In [62]:
pred_25 =0 
total_25 =0
for key, value in top_players_dict.items():
    club_23 = key[0]
    best_position = key[1]
    ID = key[2]
    top_player_ids = value
    total_25 = total_25 +1

    for id in top_player_ids:
        top_5_neighbors = get_top_k_neighbors(id,25)["ID"].tolist()
        if ID in top_5_neighbors:
            pred_25 = pred_25 +1
            print("Player has matched",ID,df_22[df_22["ID"]==ID]["Name"])

Player has matched 212462 16    Alex Telles
Name: Name, dtype: object
Player has matched 208574 23    F. Kostić
Name: Name, dtype: object
Player has matched 188545 33    R. Lewandowski
Name: Name, dtype: object
Player has matched 207439 81    L. Paredes
Name: Name, dtype: object
Player has matched 230658 111    Arthur
Name: Name, dtype: object
Player has matched 231943 182    Richarlison
Name: Name, dtype: object
Player has matched 239231 234    Cucurella
Name: Name, dtype: object
Player has matched 236480 240    Y. Bissouma
Name: Name, dtype: object
Player has matched 227535 258    R. Bentancur
Name: Name, dtype: object
Player has matched 233419 261    Raphinha
Name: Name, dtype: object
Player has matched 224081 296    K. Phillips
Name: Name, dtype: object
Player has matched 247394 297    D. Kulusevski
Name: Name, dtype: object
Player has matched 192679 519    Escudero
Name: Name, dtype: object
Player has matched 242577 545    R. Faivre
Name: Name, dtype: object
Player has matched 245

### Top 50

In [63]:
pred_50 =0 
total_50 =0
for key, value in top_players_dict.items():
    club_23 = key[0]
    best_position = key[1]
    ID = key[2]
    top_player_ids = value
    total_50 = total_50 +1

    for id in top_player_ids:
        top_5_neighbors = get_top_k_neighbors(id,50)["ID"].tolist()
        # print(top_5_neighbors)
        if ID in top_5_neighbors:
            pred_50 = pred_50 +1
            print("Player has matched",ID,df_22[df_22["ID"]==ID]["Name"])

Player has matched 212462 16    Alex Telles
Name: Name, dtype: object
Player has matched 208574 23    F. Kostić
Name: Name, dtype: object
Player has matched 188545 33    R. Lewandowski
Name: Name, dtype: object
Player has matched 207439 81    L. Paredes
Name: Name, dtype: object
Player has matched 230658 111    Arthur
Name: Name, dtype: object
Player has matched 231943 182    Richarlison
Name: Name, dtype: object
Player has matched 239231 234    Cucurella
Name: Name, dtype: object
Player has matched 236480 240    Y. Bissouma
Name: Name, dtype: object
Player has matched 193474 247    I. Gueye
Name: Name, dtype: object
Player has matched 227535 258    R. Bentancur
Name: Name, dtype: object
Player has matched 233419 261    Raphinha
Name: Name, dtype: object
Player has matched 241637 280    A. Tchouaméni
Name: Name, dtype: object
Player has matched 224081 296    K. Phillips
Name: Name, dtype: object
Player has matched 224081 296    K. Phillips
Name: Name, dtype: object
Player has matched 2

### Evaulations

In [64]:
(pred_5/total_5)*100

0.9996667777407532

In [65]:
(pred_10/total_10)*100

1.7660779740086638

In [66]:
(pred_25/total_25)*100

4.165278240586471

In [67]:
(pred_50/total_50)*100

7.830723092302566