In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import random

In [2]:
# Load the raw player export, keeping only the columns at positions 2-25
# (the first two columns of the file are skipped).
cols = list(range(2,26))
temp_df = pd.read_csv(r'My_Players.csv', usecols=cols)

## data cleaning part 🧹🚯

In [3]:
# Discard the columns that are not used in the rest of the analysis.
df = temp_df.drop(
    columns=[
        'Version', 'Price', 'WR', 'Game_Stats', 'Base_Stats',
        'Popularity', 'Rat', 'Weak_Foot', 'Skils',
    ]
)

In [4]:
# Remove rows that are exact duplicates of an earlier row, then display the table.
df = df.drop_duplicates()
df

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Height,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing
0,CF,93,89,86,96,27,62,"169cm | 5'6""",94.0,95.0,25.0,92.0,90.0,90.0,94.0
1,LW,93,93,81,91,32,79,"185cm | 6'0""",93.0,63.0,22.0,91.0,94.0,81.0,95.0
2,ST,76,91,81,86,34,86,"195cm | 6'4""",86.0,41.0,25.0,86.0,77.0,83.0,91.0
3,RM,93,86,83,92,32,64,"180cm | 5'10""",93.0,91.0,29.0,89.0,93.0,84.0,85.0
4,GK,87,85,92,86,58,90,"193cm | 6'3""",43.0,35.0,25.0,25.0,61.0,20.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7822,RM,81,66,68,82,39,56,"173cm | 5'8""\nLean (67kg)",86.0,83.0,40.0,68.0,74.0,72.0,70.0
7823,CDM,87,58,66,72,74,81,"183cm | 6'0""\nLean (70kg)",81.0,81.0,77.0,68.0,88.0,70.0,62.0
7824,RM,87,71,61,80,52,68,"189cm | 6'2""\nHigh & Average+ (79kg)",84.0,73.0,53.0,80.0,89.0,58.0,80.0
7825,LB,89,60,69,75,69,69,"176cm | 5'9""\nLean (69kg)",81.0,74.0,71.0,70.0,89.0,65.0,60.0


In [5]:
# Preview only: cut every "Height" string at the first "cm" so that the centimetre
# value ends up in its own column. rename() returns a new frame that is displayed
# here; `split_height` itself keeps the default 0/1 column labels.
split_height = df["Height"].str.split(pat="cm", n=1, expand=True)
split_height.rename(columns={0: 'Player_Height', 1: 'Delete'})

Unnamed: 0,Player_Height,Delete
0,169,"| 5'6"""
1,185,"| 6'0"""
2,195,"| 6'4"""
3,180,"| 5'10"""
4,193,"| 6'3"""
...,...,...
7822,173,"| 5'8""\nLean (67kg)"
7823,183,"| 6'0""\nLean (70kg)"
7824,189,"| 6'2""\nHigh & Average+ (79kg)"
7825,176,"| 5'9""\nLean (69kg)"


In [6]:
# Split each "Height" string at the first "cm": the left part is the height in
# centimetres, the right part (imperial height / body type) is not needed.
df[['Player_Height', 'Delete']] = df['Height'].str.split("cm", n=1, expand=True)
df = df.drop(['Delete', 'Height'], axis=1)

# Strip every non-digit character and convert to int.
# regex=True must be explicit: with regex=False (the default since pandas 2.0)
# r'\D' is matched as a literal two-character string and nothing is removed.
df['Player_Height'] = df['Player_Height'].str.replace(r'\D', '', regex=True).astype(int)

# Remove players listed as shorter than 150 cm.
df.drop(df.loc[df["Player_Height"] < 150].index, inplace=True)

  df['Player_Height'] = df['Player_Height'].str.replace(r'\D', '').astype(int) # delete the characters that not number


In [7]:
# Drop the rows whose Agility value is the string '-' (i.e. not a number).
is_dash = df["Agility"] == '-'
df.drop(index=df[is_dash].index, inplace=True)

In [8]:
# Goalkeepers are excluded: drop every row whose Position is 'GK'.
is_goalkeeper = df["Position"] == 'GK'
df.drop(index=df[is_goalkeeper].index, inplace=True)

In [9]:
# Collapse the detailed position labels into five numeric groups (1-5),
# then store the Position column as integers.
position_dict = {
    "ST": 1, "CF": 1,
    "RF": 2, "RW": 2, "LF": 2, "LW": 2, "LM": 2, "RM": 2, "CAM": 2,
    "CM": 3, "CDM": 3,
    "LB": 4, "RB": 4, "LWB": 4, "RWB": 4,
    "CB": 5,
}
df['Position'] = (
    df['Position']
    .replace(position_dict)
    .astype(int)
)

In [10]:
# Drop rows that still contain missing values, then store the listed
# attribute columns as integers.
df.dropna(inplace=True)
for column in ('Marking', 'Balance', 'Agility', 'Positioning',
               'Player_Height', 'Sprint_Speed', 'Vision', 'Finishing'):
    df[column] = df[column].astype(int)

In [11]:
# Display the cleaned table.
df

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing,Player_Height
0,1,93,89,86,96,27,62,94,95,25,92,90,90,94,169
1,2,93,93,81,91,32,79,93,63,22,91,94,81,95,185
2,1,76,91,81,86,34,86,86,41,25,86,77,83,91,195
3,2,93,86,83,92,32,64,93,91,29,89,93,84,85,180
5,1,83,87,79,88,42,79,86,60,30,88,79,84,91,181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7822,2,81,66,68,82,39,56,86,83,40,68,74,72,70,173
7823,3,87,58,66,72,74,81,81,81,77,68,88,70,62,183
7824,2,87,71,61,80,52,68,84,73,53,80,89,58,80,189
7825,4,89,60,69,75,69,69,81,74,71,70,89,65,60,176


## Player positions were encoded as follows:
 * ST = 1
 * CF = 1
 * RF = 2
 * RW = 2
 * LF = 2
 * LW = 2
 * LM = 2
 * RM = 2
 * CAM = 2
 * CM = 3
 * CDM = 3
 * LB = 4
 * RB = 4
 * LWB = 4
 * RWB = 4
 * CB = 5

In [12]:
# Save the cleaned table; the DataFrame index is written too, as the first CSV column.
df.to_csv("Players_2.csv")

In [13]:
# Reload the cleaned table, reading the columns at positions 1-15
# (column 0 of the file is skipped).
cols1 = list(range(1,16))
df_normalize1 = pd.read_csv(r'Players_2.csv', usecols=cols1)

In [14]:
# Down-sample the larger position groups so that all five groups end up with a
# similar number of rows (group 4 is left untouched).
# A fixed random_state makes the sample - and every statistic computed from it -
# reproducible across "Restart & Run All".
DOWNSAMPLE_SEED = 42
DOWNSAMPLE_FRACTIONS = {2: 0.58, 1: 0.31, 3: 0.33, 5: 0.24}  # position code -> fraction of rows removed

for position_code, drop_fraction in DOWNSAMPLE_FRACTIONS.items():
    removed = df_normalize1[df_normalize1['Position'] == position_code].sample(
        frac=drop_fraction, random_state=DOWNSAMPLE_SEED
    )
    df_normalize1 = df_normalize1.drop(removed.index)

In [15]:
# Check how many rows each position group has after down-sampling.
df_normalize1['Position'].value_counts()

3    822
1    807
4    799
2    798
5    796
Name: Position, dtype: int64

In [16]:
def normalize_column(values):
    """Rescale ``values`` with a shifted min-max formula.

    The lower bound used is ``min(values) - 1`` rather than ``min(values)``,
    so the smallest input maps to ``1 / (max - min + 1)`` instead of 0, while
    the largest input maps to 1.

    Parameters
    ----------
    values : numeric array-like (the notebook passes DataFrame columns).

    Returns
    -------
    pandas.DataFrame
        The rescaled values wrapped in a DataFrame.
    """
    shifted_floor = np.min(values - 1)
    ceiling = np.max(values)
    span = ceiling - shifted_floor
    return pd.DataFrame((values - shifted_floor) / span)

In [17]:
# Rescale the secondary attributes with normalize_column (shifted min-max).
for column in ['Player_Height', 'Agility', 'Balance', 'Marking',
               'Positioning', 'Sprint_Speed', 'Vision', 'Finishing']:
    # normalize_column returns a one-column DataFrame; take that column as a Series.
    df_normalize1[column] = normalize_column(df_normalize1[column]).iloc[:, 0]

# Columns at positions 1-6 are the six main stats. Replace each value by its share
# of the row's six-stat total. Every row is processed in one vectorised step, so
# the cell does not depend on the table having a particular number of rows.
main_cols = df_normalize1.columns[1:7]
main_stats = df_normalize1[main_cols]
df_normalize1[main_cols] = main_stats.div(main_stats.sum(axis=1), axis=0)

# Attack contribution = columns 1, 2, 4, 3 and 7 added together, minus columns 5 and 9
# (by position in the table; the summation order matches the original formula).
attack_raw = (
    (df_normalize1.iloc[:, 1] + df_normalize1.iloc[:, 2] + df_normalize1.iloc[:, 4]
     + df_normalize1.iloc[:, 3] + df_normalize1.iloc[:, 7])
    - (df_normalize1.iloc[:, 5] + df_normalize1.iloc[:, 9])
)
attack_contribution = attack_raw.tolist()  # kept as a plain list, one value per row

df_normalize1['attack_contribution'] = attack_contribution
# Rescale the attack contribution like the other derived columns.
df_normalize1['attack_contribution'] = normalize_column(
    df_normalize1['attack_contribution']
).iloc[:, 0]

In [18]:
# Summary statistics of every column after normalization.
df_normalize1.describe()

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing,Player_Height,attack_contribution
count,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0
mean,2.994779,0.179327,0.15561,0.166976,0.177177,0.145805,0.175104,0.675509,0.621329,0.592684,0.668247,0.703673,0.66498,0.620989,0.507425,0.673953
std,1.411299,0.025667,0.030891,0.015894,0.017493,0.048815,0.022455,0.165015,0.198625,0.267559,0.195425,0.160536,0.154072,0.200972,0.156291,0.147413
min,1.0,0.079903,0.048257,0.105572,0.105263,0.044226,0.106888,0.012987,0.014286,0.011765,0.0125,0.014286,0.0125,0.011628,0.022727,0.307618
25%,2.0,0.16372,0.135135,0.157895,0.166667,0.095455,0.160094,0.584416,0.485714,0.329412,0.5875,0.6,0.5875,0.488372,0.386364,0.561537
50%,3.0,0.180139,0.158817,0.166667,0.176339,0.161798,0.174364,0.701299,0.657143,0.717647,0.73125,0.714286,0.6875,0.662791,0.5,0.647599
75%,4.0,0.197332,0.180693,0.176339,0.189573,0.183245,0.189443,0.792208,0.757143,0.811765,0.8,0.828571,0.775,0.77907,0.636364,0.809357
max,5.0,0.255556,0.246106,0.248603,0.225722,0.262687,0.253086,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Pairwise correlation between all columns, including attack_contribution.
df_normalize1.corr()

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing,Player_Height,attack_contribution
Position,1.0,-0.296155,-0.865156,-0.382717,-0.713657,0.899438,0.400236,-0.467854,-0.344122,0.847388,-0.765669,-0.244775,-0.508957,-0.834525,0.271747,-0.83623
Pace,-0.296155,1.0,0.104596,-0.186188,0.389042,-0.504925,-0.360555,0.575614,0.426804,-0.472308,0.178224,0.897245,-0.122946,0.147731,-0.387008,0.581681
Shooting,-0.865156,0.104596,1.0,0.462778,0.639331,-0.855145,-0.461837,0.397037,0.29338,-0.787433,0.804112,0.095734,0.57128,0.942424,-0.239474,0.772231
Passing,-0.382717,-0.186188,0.462778,1.0,0.607997,-0.424502,-0.68244,0.388426,0.410158,-0.341529,0.519319,-0.15292,0.817602,0.4435,-0.390266,0.442904
Dribbling,-0.713657,0.389042,0.639331,0.607997,1.0,-0.816303,-0.759,0.729003,0.629889,-0.735266,0.674698,0.305788,0.592903,0.637819,-0.552753,0.863427
Defending,0.899438,-0.504925,-0.855145,-0.424502,-0.816303,1.0,0.516023,-0.587077,-0.431072,0.948766,-0.74344,-0.413214,-0.475806,-0.812522,0.364845,-0.9598
Physicality,0.400236,-0.360555,-0.461837,-0.68244,-0.759,0.516023,1.0,-0.770735,-0.735351,0.375108,-0.586928,-0.388974,-0.6516,-0.509772,0.685508,-0.626825
Agility,-0.467854,0.575614,0.397037,0.388426,0.729003,-0.587077,-0.770735,1.0,0.828716,-0.450984,0.592354,0.610467,0.488602,0.495794,-0.735743,0.751869
Balance,-0.344122,0.426804,0.29338,0.410158,0.629889,-0.431072,-0.735351,0.828716,1.0,-0.292067,0.499727,0.460127,0.470607,0.377248,-0.857193,0.570718
Marking,0.847388,-0.472308,-0.787433,-0.341529,-0.735266,0.948766,0.375108,-0.450984,-0.292067,1.0,-0.603369,-0.311817,-0.331047,-0.70935,0.252933,-0.924501


In [20]:
# Save the down-sampled, normalized table; the index is written as the first CSV column.
df_normalize1.to_csv("Players_2_Normalized.csv")