# Player Recommendation System

## Data Cleaning and Preprocessing

In [1]:
#pd.set_option("display.max_columns", 100)

The libraries used in this notebook are imported.

In [2]:
import numpy as np
import pandas as pd

The CSV file containing the football player data is loaded into a DataFrame.

In [3]:
# Read the player data set; the relative path is resolved against the
# notebook's working directory.
csv_path = "FM 2023.csv"
df = pd.read_csv(csv_path)

The dataset consists of 8452 players and contains 98 columns. These columns provide information about the player's general and football-related characteristics. There are many features under headings such as technical, physical and mental.

In [4]:
df.shape

(8452, 98)

In [5]:
df.columns

Index(['Name', 'Position', 'Age', 'ca', 'pa', 'Nationality', 'Club', 'Corners',
       'Crossing', 'Dribbling', 'Finishing', 'First Touch', 'Free Kick Taking',
       'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Passing',
       'Penalty Taking', 'Tackling', 'Technique', 'Aggressiion',
       'Anticipation', 'Bravery', 'Composure', 'Concentration', 'Vision',
       'Decision', 'Determination', 'Flair', 'Leadership', 'Off The Ball',
       'Position.1', 'Teamwork', 'Work Rate', 'Acceleration', 'Agility',
       'Balance', 'Jumping Reach', 'Natural Fitness', 'Pace', 'Stamina',
       'Strength', 'Stability', 'Foul', 'Contest performance', 'Injury',
       'diversity', 'Aerial Reach', 'Command Of Area', 'Communication',
       'Eccentricity', 'Handling', 'Kicking', 'One On Ones', 'Reflexes',
       'Rushing Out', 'Punching', 'Throwing', 'Adaptation', 'Ambition',
       'Argue', 'Loyal', 'Resistant to stress', 'Professional',
       'Sportsmanship', 'Emotional control', 'GK', 'DL', 

Columns that will not be used are removed from the data set.

In [6]:
# Columns that play no part in the rest of the notebook.
unused_columns = [
    "Race",
    "Colour of skin",
    "UID",
    "Current reputation",
    "Domestic reputation",
    "World reputation",
]
df.drop(columns=unused_columns, inplace=True)

Check which columns have empty values.

In [7]:
# Names of the columns that contain at least one missing value.
df.columns[df.isna().any()]

Index(['Club', 'Salary', 'Rental club'], dtype='object')

Players without a club are identified, and their Club value is set to "Free Player".

In [8]:
# Players with no club on record are labelled as free players.
df["Club"] = df["Club"].fillna("Free Player")

It appears that the players whose salary column is empty are free agents. A prediction model will be developed for this situation.

In [9]:
# Show the club recorded for every player whose salary is missing.
salary_missing = df["Salary"].isna()
df.loc[salary_missing, "Club"]

822     Free Player
1219    Free Player
1257    Free Player
1325    Free Player
1514    Free Player
           ...     
7209    Free Player
7817    Free Player
7868    Free Player
8196    Free Player
8252    Free Player
Name: Club, Length: 107, dtype: object

For players who are not on loan (their Rental club value is missing), the Rental club column is filled with "absent".

In [10]:
# A missing rental club is recorded as the placeholder "absent".
df["Rental club"] = df["Rental club"].fillna("absent")

Unnecessarily long column names are shortened.

In [11]:
# Verbose column headers mapped to the shorter names used from here on.
shorter_names = {
    "Free Kick Taking": "Free_kick",
    "Penalty Taking": "Penalty",
    "Number of national team appearances": "match_nat_team",
    "Goals scored for the national team": "Gol_nat_team",
}
df.rename(columns=shorter_names, inplace=True)

In [12]:
df

Unnamed: 0,Name,Position,Age,ca,pa,Nationality,Club,Corners,Crossing,Dribbling,...,Weight,Left Foot,Right Foot,Values,RCA,Date of birth,match_nat_team,Gol_nat_team,Salary,Rental club
0,Kevin De Bruyne,M/AM RLC,31,189,189,Belgium,Manchester City,14,19,15,...,68,16,20,347975206,181,1991/6/28,91,24,394372.0,absent
1,Kylian Mbappé,AM/S RL,23,188,197,France,Paris Saint-Germain,13,13,18,...,73,10,20,347975206,172,1998/12/20,57,27,1035616.0,absent
2,Robert Lewandowski,S,33,186,190,Poland,Barcelona,3,8,13,...,81,13,20,347975206,183,1988/8/21,132,76,345204.0,absent
3,Erling Haaland,S,22,185,195,"Norway,England",Manchester City,7,10,14,...,88,20,11,347975206,185,2000/7/21,21,20,394372.0,absent
4,Mohamed Salah,AM/S RL,30,185,187,Egypt,Liverpool,12,14,17,...,72,20,8,347975206,181,1992/6/15,85,47,405971.0,absent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8447,Joe Ashton,D L,16,45,135,England,Burnley,3,4,5,...,0,20,7,124343,41,2006/6/26,0,0,203.0,absent
8448,River Ries,S,17,45,135,Germany,Karlsruher SC,2,2,11,...,0,8,20,42749,33,2005/4/3,0,0,402.0,absent
8449,Halilcan Doğan,D C,23,45,-75,Turkey,Osmaniyespor Futbol Kulübü,1,1,1,...,0,8,20,5199,41,1999/2/11,0,0,82.0,absent
8450,Adijat Sefer,S,17,45,135,Germany,TSG Hoffenheim,3,4,12,...,0,7,20,70999,40,2005/1/20,0,0,646.0,absent


## Model Training

In [13]:
# Break every Position string into its whitespace-separated tokens and keep
# the distinct tokens in order of first appearance.
position_tokens = df["Position"].str.split().explode().dropna()
unique_positions = position_tokens.unique()

print(unique_positions)

['M/AM' 'RLC' 'AM/S' 'RL' 'S' 'C' 'GK' 'D' 'RC' 'M' 'D/DM/M' 'LC' 'DM'
 'AM' 'L' 'DM/M/AM' 'DM/M' 'D/WB' 'WB/M/AM' 'D/WB/M' 'R' 'D/DM' 'M/AM/S'
 'D/WB/M/AM' 'WB/AM' 'WB/DM/M/AM' 'D/WB/DM/M' 'D/WB/DM' 'D/WB/DM/M/AM'
 'D/WB/AM' 'WB/DM/M' 'D/M/AM/S' 'D/M/AM' 'WB/M' 'M/S' 'D/M' 'DM/M/AM/S'
 'WB/M/AM/S' 'WB' 'D/AM' 'WB/AM/S' 'DM/AM' 'D/S']


Explanations of the position abbreviations

**GK** - Goalkeeper  
**D** - Defender  
**WB** - Wingback   
**DM** - Defensive Midfielder   
**M** - Midfielder  
**AM** - Attacking Midfielder   
**S** - Striker   
 
**C** - Center  
**L** - Left  
**R** - Right  
**RC** - Right Center  
**LC** - Left Center  
**RL** - Right Left  
**RLC** - Right Left Center

**GK** - Goalkeeper  
**DL** - Left Fullback (Defender)   
**DC** - Center Back (Defender)  
**DR** - Right Fullback (Defender)  
**WBL** - Left Wingback (Defender/Midfielder)   
**WBR** - Right Wingback (Defender/Midfielder)  
**DM** - Defensive Midfielder  
**ML** - Left Midfielder  
**MC** - Center Midfielder   
**MR** - Right Midfielder  
**AML** - Left Attacking Midfielder   
**AMC** - Center Attacking Midfielder   
**AMR** - Right Attacking Midfielder  
**ST** - Striker  

The features are divided into 5 parts: technical, mental, physical, goalkeeping and other.

In [14]:
# Attribute groups. Every entry is a column name of the player DataFrame and
# is spelled exactly as the column is (including 'Aggressiion').

# Ball skills of outfield players.
Technical = [
    "Corners", "Crossing", "Dribbling", "Finishing",
    "First Touch", "Free_kick", "Heading", "Long Shots",
    "Long Throws", "Marking", "Passing", "Penalty",
    "Tackling", "Technique",
]

# Mental attributes. 'Position.1' is the second column headed 'Position' in
# the file (pandas adds the '.1' suffix to a repeated header); presumably it
# is the Positioning attribute -- TODO confirm.
Mental = [
    "Aggressiion", "Anticipation", "Bravery", "Composure",
    "Concentration", "Vision", "Decision", "Determination",
    "Flair", "Leadership", "Off The Ball", "Position.1",
    "Teamwork", "Work Rate",
]

# Physical attributes.
Physical = [
    "Acceleration", "Agility", "Balance", "Jumping Reach",
    "Natural Fitness", "Pace", "Stamina", "Strength",
]

# Goalkeeper-specific attributes (the variable name keeps its original
# spelling because other cells refer to it).
Goolkeeping = [
    "Aerial Reach", "Command Of Area", "Communication", "Eccentricity",
    "Handling", "Kicking", "One On Ones", "Reflexes",
    "Rushing Out", "Punching", "Throwing",
]

# Remaining attributes that are in none of the groups above.
Other_Feature = [
    "Stability", "Foul", "Contest performance", "Injury",
    "diversity", "Adaptation", "Ambition", "Argue",
    "Loyal", "Resistant to stress", "Professional", "Sportsmanship",
    "Emotional control",
]

Using the Position column, two new columns are added to the data: the player's primary position (the first role listed in Position) and the side of the pitch given for it ("-" when no side is given).

In [15]:
def _first_listed_role(position):
    """Return the first role named in a Position string.

    When the string contains '/', the text before the first '/' is returned
    ('M/AM RLC' -> 'M'). Otherwise the text before the first space is
    returned ('D L' -> 'D'); a string with neither is returned unchanged.
    """
    if "/" in position:
        return position.split("/")[0]
    return position.split(" ")[0]


best_position = df["Position"].apply(_first_listed_role)
df.insert(2, "Best_Position", best_position)

In [16]:
def _side_token(position):
    """Return the last whitespace-separated token of a Position string.

    'M/AM RLC' -> 'RLC'. A string that contains no space has no side part,
    so the placeholder '-' is returned for it.
    """
    if " " not in position:
        return "-"
    return position.split()[-1]


side = df["Position"].apply(_side_token)
df.insert(3, "Side", side)

In [17]:
df

Unnamed: 0,Name,Position,Best_Position,Side,Age,ca,pa,Nationality,Club,Corners,...,Weight,Left Foot,Right Foot,Values,RCA,Date of birth,match_nat_team,Gol_nat_team,Salary,Rental club
0,Kevin De Bruyne,M/AM RLC,M,RLC,31,189,189,Belgium,Manchester City,14,...,68,16,20,347975206,181,1991/6/28,91,24,394372.0,absent
1,Kylian Mbappé,AM/S RL,AM,RL,23,188,197,France,Paris Saint-Germain,13,...,73,10,20,347975206,172,1998/12/20,57,27,1035616.0,absent
2,Robert Lewandowski,S,S,-,33,186,190,Poland,Barcelona,3,...,81,13,20,347975206,183,1988/8/21,132,76,345204.0,absent
3,Erling Haaland,S,S,-,22,185,195,"Norway,England",Manchester City,7,...,88,20,11,347975206,185,2000/7/21,21,20,394372.0,absent
4,Mohamed Salah,AM/S RL,AM,RL,30,185,187,Egypt,Liverpool,12,...,72,20,8,347975206,181,1992/6/15,85,47,405971.0,absent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8447,Joe Ashton,D L,D,L,16,45,135,England,Burnley,3,...,0,20,7,124343,41,2006/6/26,0,0,203.0,absent
8448,River Ries,S,S,-,17,45,135,Germany,Karlsruher SC,2,...,0,8,20,42749,33,2005/4/3,0,0,402.0,absent
8449,Halilcan Doğan,D C,D,C,23,45,-75,Turkey,Osmaniyespor Futbol Kulübü,1,...,0,8,20,5199,41,1999/2/11,0,0,82.0,absent
8450,Adijat Sefer,S,S,-,17,45,135,Germany,TSG Hoffenheim,3,...,0,7,20,70999,40,2005/1/20,0,0,646.0,absent


In [18]:
from mlxtend.frequent_patterns import apriori, association_rules

The Apriori algorithm is used to find which attributes are frequently high together among the players of a given position.

In [19]:
def feature_selection(position, side, threshold=10, min_support=0.5):
    """Mine frequent sets of high attributes for one position/side group.

    Rows of the global ``df`` whose ``Best_Position`` and ``Side`` match the
    arguments are selected. Each attribute of those rows is turned into a
    boolean flag ("value is greater than ``threshold``") and the Apriori
    algorithm is run on the flags.

    Parameters
    ----------
    position : str
        Value to match in the ``Best_Position`` column, e.g. ``"GK"``.
    side : str
        Value to match in the ``Side`` column, e.g. ``"-"``.
    threshold : int or float, default 10
        An attribute counts as present for a player when its value is
        strictly greater than this number.
    min_support : float, default 0.5
        Passed to ``apriori`` as its ``min_support`` argument.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``apriori`` (columns ``support`` and
        ``itemsets``), with attribute names as the items.
    """
    # For "GK" the goalkeeping attributes take the place of the technical ones.
    skill_group = Goolkeeping if position == "GK" else Technical
    features = skill_group + Mental + Physical + Other_Feature

    in_group = (df["Best_Position"] == position) & (df["Side"] == side)
    is_high = df.loc[in_group, features] > threshold

    return apriori(is_high, min_support=min_support, use_colnames=True)


frequent_itemsets = feature_selection("GK", "-")

In [20]:
# Keep only the itemsets that contain more than 10 attributes.
itemset_sizes = frequent_itemsets["itemsets"].map(len)
frequent_itemsets = frequent_itemsets[itemset_sizes > 10]

# NOTE(review): the sort key is the itemset column itself. If the items are
# frozensets (as mlxtend documents), '<' between them is a subset test and
# not a total order, so the resulting row order may not be meaningful --
# confirm whether sorting by 'support' or by itemset size was intended.
frequent_itemsets = frequent_itemsets.sort_values(by="itemsets", ascending=False)
frequent_itemsets

Unnamed: 0,support,itemsets
174690,0.502152,"(Bravery, Aerial Reach, Handling, Agility, Pos..."
174691,0.503587,"(Bravery, Aerial Reach, Handling, Strength, Ag..."
174692,0.503587,"(Bravery, Aerial Reach, Handling, Agility, Pos..."
174693,0.515065,"(Bravery, Aerial Reach, Handling, Agility, Pro..."
174694,0.500717,"(Bravery, Aerial Reach, Handling, Agility, Pro..."
174695,0.500717,"(Bravery, Aerial Reach, Handling, Agility, Pro..."
174696,0.500717,"(Bravery, Aerial Reach, Handling, Strength, Ag..."
174697,0.506456,"(Bravery, Aerial Reach, Handling, Agility, Pro..."
