In [20]:
import pandas as pd
import streamlit as st
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler

# Read the player table from `data_file`, discard rows with no ValueEUR,
# keep one row per FullName (drop_duplicates keeps the first occurrence),
# and index the frame by FullName so players can be looked up by name.
data_file = 'fifa22data.csv'
df = (
    pd.read_csv(data_file)
    .dropna(subset=["ValueEUR"])
    .drop_duplicates(subset=["FullName"])
    .set_index('FullName')
)

In [73]:
# Display every row whose ValueEUR equals the maximum ValueEUR in the frame.
df.loc[df['ValueEUR'].eq(df['ValueEUR'].max())]

Unnamed: 0_level_0,Unnamed: 0,ID,Name,Age,Height,Weight,PhotoUrl,Nationality,Flag,Club,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
FullName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kylian Mbappé,3,231747,K. Mbappé,22,182,73,https://cdn.sofifa.net/players/231/747/22_240.png,France,https://cdn.sofifa.net/teams/1335/180.png,Paris Saint-Germain,...,92,84,92,70,66,70,66,57,66,21


In [68]:
import numpy as np
# 90th percentile of ValueEUR (displayed as the cell's output).
np.quantile(df['ValueEUR'], q=0.90)

5000000.0

In [22]:
# --- Player selection and candidate filtering -------------------------------
# In the Streamlit front end these two inputs are meant to come from widgets
# (st.selectbox for the player, st.slider for the value cap). They are pinned
# to fixed values here so the notebook gives the same result on every run.

# Player to find recommendations for; must be a label in df's FullName index.
player = "Lionel Messi"

# Upper bound on ValueEUR for recommendation candidates: the mean ValueEUR,
# truncated to an int.
max_value = int(df['ValueEUR'].mean())

# Number of leading rows of the filtered frame that are fed to the KNN model.
MAX_CANDIDATES = 1500

# Keep only players whose value is at or below the cap.
df_filtered = df[df['ValueEUR'] <= max_value]

# The selected player has to be inside the rows that survive the
# MAX_CANDIDATES truncation below, otherwise no recommendations can be looked
# up for them. Prepend the player's row when it is missing from that window:
# either because the player is priced above the cap, or because the player
# passed the filter but sits beyond the first MAX_CANDIDATES rows.
if player not in df_filtered.index[:MAX_CANDIDATES]:
    df_filtered = pd.concat(
        [df.loc[[player]], df_filtered.drop(index=player, errors='ignore')]
    )

# Numeric feature columns used to measure player similarity.
df_columns = ['Age', 'Height', 'Weight', 'Overall', 'Potential', 'Growth', 'TotalStats',
                'BaseStats', 'ValueEUR', 'WageEUR', 'ReleaseClause', 'IntReputation',
                'WeakFoot', 'SkillMoves', 'PaceTotal', 'ShootingTotal', 'PassingTotal', 'DribblingTotal',
                'DefendingTotal', 'PhysicalityTotal', 'Crossing', 'Finishing', 'HeadingAccuracy',
                'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
                'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
                'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
                'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
                'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
                'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
                'STRating', 'LWRating', 'LFRating', 'CFRating', 'RFRating', 'RWRating',
                'CAMRating', 'LMRating', 'CMRating', 'RMRating', 'LWBRating',
                'CDMRating', 'RWBRating', 'LBRating', 'CBRating', 'RBRating', 'GKRating']

# Feature table for the KNN model: first MAX_CANDIDATES rows, feature columns only.
knn_df = df_filtered.iloc[:MAX_CANDIDATES][df_columns]

In [23]:
# Step 1: Standardize the features.
# StandardScaler rescales each column of knn_df to zero mean and unit variance.
scaler = StandardScaler()
knn_scaled = scaler.fit_transform(knn_df)

# Step 2: Put the scaled array back into a DataFrame.
# Reusing knn_df's index and columns keeps player names and feature names
# attached to the scaled values.
knn_final = pd.DataFrame(knn_scaled, index=knn_df.index, columns=knn_df.columns)

# Step 3: Build a CSR (compressed sparse row) matrix from the scaled values.
# NOTE(review): after mean-centering most entries are probably non-zero, so
# the sparse format may not save memory or time here; confirm before relying
# on it as an optimization.
feature_matrix = csr_matrix(knn_final.to_numpy())

# Step 4: Create the nearest-neighbour model and fit it in one expression.
# Cosine distance with a brute-force search; both are tunable.
knn_model = NearestNeighbors(metric='cosine', algorithm='brute').fit(feature_matrix)

# Accumulators filled by the neighbour-search loop.
player_list, rec_list = [], []

In [24]:
# Build the recommendation table: for every player in knn_final, query the
# fitted KNN model and record that player's nearest neighbours.
#
# Outputs:
#   player_list: one [search_player] entry per processed player
#   rec_list:    one [search_player, rec_number, rec_player, distance_score]
#                entry per recommendation, rec_number starting at 1

# Reset the accumulators inside this cell so that re-running it does not
# append a second copy of every row.
player_list = []
rec_list = []

# Recommendations wanted per player; one extra neighbour is requested because
# the query player is normally returned as its own closest match.
N_RECS = 10
n_neighbors = min(N_RECS + 1, knn_model.n_samples_fit_)  # cannot exceed fitted rows

# Width of a single player's feature vector, taken from the data.
n_features = knn_final.shape[1]

# `row_pos` is the player's row position in knn_final, which is also its row
# in the matrix the model was fitted on. A named loop variable is used rather
# than `_`, which IPython reserves for the last cell output.
for row_pos, search_player in enumerate(knn_final.index):
    player_data = knn_final.loc[search_player, :].values.reshape(1, -1)

    # A label that matches several rows yields a wider vector; skip it.
    if player_data.shape[1] != n_features:
        print(f"Skipping {search_player} due to unexpected shape: {player_data.shape}")
        continue

    distances, indices = knn_model.kneighbors(player_data, n_neighbors=n_neighbors)

    # Record the player this query was run for.
    player_list.append([search_player])

    # Exclude the query player by row position instead of assuming it is
    # always the first neighbour (ties at equal distance can reorder it),
    # then keep at most N_RECS neighbours.
    neighbours = [
        (idx, dist)
        for idx, dist in zip(indices.ravel(), distances.ravel())
        if idx != row_pos
    ][:N_RECS]

    for rec_number, (idx, dist) in enumerate(neighbours, start=1):
        rec_list.append([search_player, rec_number, knn_final.index[idx], dist])

In [25]:
# Display the currently selected player (the name recommendations are looked up for).
player

'Lionel Messi'

In [45]:
# Turn the accumulated recommendation rows into a table, one row per
# (search player, recommended player) pair.
rec_columns = ['search_player', 'rec_number', 'rec_player', 'distance_score']
rec_df = pd.DataFrame(rec_list, columns=rec_columns)

# Names recommended for the selected player, in the order they were recorded.
top_recs = rec_df.loc[rec_df['search_player'] == player, 'rec_player'].tolist()

In [46]:
# Display the list of players recommended for the selected player.
top_recs

['Romain Hamouma',
 'Ryan Babel',
 'Scott Arfield',
 'José Sosa',
 'Pierrick Capelle',
 'Steven Davis',
 'Elías Hernández',
 'Daniel Didavi',
 'Marco Benassi',
 'Domenico Criscito']