In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import random

In [2]:
# Read only the columns at positions 2..25 of the raw player export.
cols = [*range(2, 26)]
temp_df = pd.read_csv(r'My_Players.csv', usecols=cols)

## data cleaning part 🧹🚯

In [3]:
# Discard the columns that are not used in the rest of the notebook.
unused_columns = ['Version', 'Price', 'WR', 'Game_Stats', 'Base_Stats',
                  'Popularity', 'Rat', 'Weak_Foot', 'Skils']
df = temp_df.drop(columns=unused_columns)

In [4]:
# Remove rows that are exact duplicates of an earlier row, then display the frame.
df = df.drop_duplicates()
df

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Height,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing
0,CF,93,89,86,96,27,62,"169cm | 5'6""",94.0,95.0,25.0,92.0,90.0,90.0,94.0
1,LW,93,93,81,91,32,79,"185cm | 6'0""",93.0,63.0,22.0,91.0,94.0,81.0,95.0
2,ST,76,91,81,86,34,86,"195cm | 6'4""",86.0,41.0,25.0,86.0,77.0,83.0,91.0
3,RM,93,86,83,92,32,64,"180cm | 5'10""",93.0,91.0,29.0,89.0,93.0,84.0,85.0
4,GK,87,85,92,86,58,90,"193cm | 6'3""",43.0,35.0,25.0,25.0,61.0,20.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7822,RM,81,66,68,82,39,56,"173cm | 5'8""\nLean (67kg)",86.0,83.0,40.0,68.0,74.0,72.0,70.0
7823,CDM,87,58,66,72,74,81,"183cm | 6'0""\nLean (70kg)",81.0,81.0,77.0,68.0,88.0,70.0,62.0
7824,RM,87,71,61,80,52,68,"189cm | 6'2""\nHigh & Average+ (79kg)",84.0,73.0,53.0,80.0,89.0,58.0,80.0
7825,LB,89,60,69,75,69,69,"176cm | 5'9""\nLean (69kg)",81.0,74.0,71.0,70.0,89.0,65.0,60.0


In [5]:
# Preview of the height parsing: split each "Height" string at the first "cm",
# giving the centimetre value (column 0) and the remaining text (column 1).
split_height = df["Height"].str.split(pat="cm", n=1, expand=True)

# The renamed copy is only displayed; split_height itself keeps columns 0 and 1.
preview_names = {0: 'Player_Height', 1: 'Delete'}
split_height.rename(columns=preview_names)

Unnamed: 0,Player_Height,Delete
0,169,"| 5'6"""
1,185,"| 6'0"""
2,195,"| 6'4"""
3,180,"| 5'10"""
4,193,"| 6'3"""
...,...,...
7822,173,"| 5'8""\nLean (67kg)"
7823,183,"| 6'0""\nLean (70kg)"
7824,189,"| 6'2""\nHigh & Average+ (79kg)"
7825,176,"| 5'9""\nLean (69kg)"


In [6]:
# Extract the height in centimetres from strings such as "169cm | 5'6\"":
# everything before the first "cm" is the height.
height_cm = df['Height'].str.split("cm", n=1, expand=True)[0]

# Strip any non-digit characters before converting to int. regex=True is passed
# explicitly: relying on the implicit default raised a FutureWarning, and since
# pandas 2.0 the default treats the pattern as a literal string, which would
# leave r'\D' unmatched and nothing would be removed.
df['Player_Height'] = height_cm.str.replace(r'\D', '', regex=True).astype(int)

# The raw text column is no longer needed once the number has been extracted.
df = df.drop(columns=['Height'])

# Remove players listed as shorter than 150 cm.
too_short = df['Player_Height'] < 150
df = df.drop(index=df.loc[too_short].index)

  df['Player_Height'] = df['Player_Height'].str.replace(r'\D', '').astype(int) # delete the characters that not number


In [7]:
# Drop every row whose Agility value is the placeholder string '-'.
placeholder_rows = df[df["Agility"] == '-'].index
df.drop(index=placeholder_rows, inplace=True)

In [8]:
# Goalkeepers are excluded: drop every row whose Position is 'GK'.
goalkeeper_rows = df[df["Position"] == 'GK'].index
df.drop(index=goalkeeper_rows, inplace=True)

In [9]:
# Several positions are merged into five numeric groups. Each key below is the
# group code and each tuple lists the position labels that receive that code.
position_groups = {
    1: ("ST", "CF"),
    2: ("RF", "RW", "LF", "LW", "LM", "RM", "CAM"),
    3: ("CM", "CDM"),
    4: ("LB", "RB", "LWB", "RWB"),
    5: ("CB",),
}
position_dict = {
    label: code
    for code, labels in position_groups.items()
    for label in labels
}

# Replace the position labels with their group code and store them as integers.
df['Position'] = df['Position'].replace(position_dict).astype(int)

In [10]:
# Rows with any missing value are removed first, so the integer casts below cannot hit NaN.
df.dropna(inplace=True)

# Convert the remaining attribute columns to int, one column at a time.
int_columns = ('Marking', 'Balance', 'Agility', 'Positioning',
               'Player_Height', 'Sprint_Speed', 'Vision', 'Finishing')
for column in int_columns:
    df[column] = df[column].astype(int)

In [11]:
# Display the frame after the cleaning steps.
df

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing,Player_Height
0,1,93,89,86,96,27,62,94,95,25,92,90,90,94,169
1,2,93,93,81,91,32,79,93,63,22,91,94,81,95,185
2,1,76,91,81,86,34,86,86,41,25,86,77,83,91,195
3,2,93,86,83,92,32,64,93,91,29,89,93,84,85,180
5,1,83,87,79,88,42,79,86,60,30,88,79,84,91,181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7822,2,81,66,68,82,39,56,86,83,40,68,74,72,70,173
7823,3,87,58,66,72,74,81,81,81,77,68,88,70,62,183
7824,2,87,71,61,80,52,68,84,73,53,80,89,58,80,189
7825,4,89,60,69,75,69,69,81,74,71,70,89,65,60,176


## Player positions were encoded as:
 * ST = 1
 * CF = 1
 * RF = 2
 * RW = 2
 * LF = 2
 * LW = 2
 * LM = 2
 * RM = 2
 * CAM = 2
 * CM = 3
 * CDM = 3
 * LB = 4
 * RB = 4
 * LWB = 4
 * RWB = 4
 * CB = 5

In [12]:
# Save the frame to CSV; with the default settings the index is written as the first column.
df.to_csv("Players_2.csv")

In [13]:
# Reload the saved file, reading the columns at positions 1..15 and skipping column 0.
cols1 = [*range(1, 16)]
df_normalize1 = pd.read_csv(r'Players_2.csv', usecols=cols1)

In [14]:
# Undersample some position groups by removing a random fraction of their rows.
# Each entry maps a Position code to the fraction of that group's rows to REMOVE;
# Position 4 is left untouched.
drop_fractions = {2: 0.58, 1: 0.31, 3: 0.33, 5: 0.24}

# A fixed seed makes the removed rows (and every statistic computed afterwards)
# reproducible across kernel restarts; without it each run drops different rows.
SAMPLING_SEED = 42

for position_code, fraction in drop_fractions.items():
    group_rows = df_normalize1[df_normalize1['Position'] == position_code]
    rows_to_drop = group_rows.sample(frac=fraction, random_state=SAMPLING_SEED).index
    df_normalize1 = df_normalize1.drop(rows_to_drop)

In [15]:
# Count the rows remaining for each Position code.
df_normalize1['Position'].value_counts()

3    822
1    807
4    799
2    798
5    796
Name: Position, dtype: int64

In [16]:
def normalize_column(values):
    """Rescale a column with a min-max style formula.

    The lower bound used is ``min(values) - 1`` rather than the plain minimum,
    so the smallest input maps to ``1 / (max - min + 1)`` instead of 0, the
    largest maps to 1, and a constant column maps to 1 everywhere.

    Parameters
    ----------
    values : array-like supporting arithmetic (e.g. a pandas Series)
        The numbers to rescale.

    Returns
    -------
    pandas.DataFrame
        The rescaled values wrapped in a DataFrame.
    """
    floor = np.min(values - 1)
    span = np.max(values) - floor
    scaled = (values - floor) / span
    return pd.DataFrame(scaled)

In [17]:
# Rescale the secondary attributes column by column with normalize_column.
scaled_columns = ['Player_Height', 'Agility', 'Balance', 'Marking',
                  'Positioning', 'Sprint_Speed', 'Vision', 'Finishing']
for column in scaled_columns:
    df_normalize1[column] = normalize_column(df_normalize1[column])

# Express each of the six main stats as its share of the player's main-stat
# total, so the six shares of every row sum to 1.
# This is done on whole columns instead of cell by cell: it works for any number
# of rows (no hard-coded row count) and replaces each integer column with a
# float column in one step rather than writing floats into int cells.
main_stats = ['Pace', 'Shooting', 'Passing', 'Dribbling', 'Defending', 'Physicality']
main_stat_total = df_normalize1[main_stats].sum(axis=1, skipna=False)
for stat in main_stats:
    df_normalize1[stat] = df_normalize1[stat] / main_stat_total

# Attack contribution: attacking shares plus scaled Agility, minus the defending
# share and scaled Marking. It is computed after the shares above are in place.
raw_contribution = (
    (df_normalize1['Pace'] + df_normalize1['Shooting'] + df_normalize1['Dribbling']
     + df_normalize1['Passing'] + df_normalize1['Agility'])
    - (df_normalize1['Defending'] + df_normalize1['Marking'])
)

# The un-normalized values stay available as a plain list under the original name.
attack_contribution = raw_contribution.tolist()

df_normalize1['attack_contribution'] = raw_contribution
# Rescale the new column the same way as the other attributes.
df_normalize1['attack_contribution'] = normalize_column(df_normalize1['attack_contribution'])

In [18]:
# Summary statistics for the columns of the normalized frame.
df_normalize1.describe()

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing,Player_Height,attack_contribution
count,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0,4022.0
mean,2.994779,0.178958,0.155478,0.166758,0.177138,0.146261,0.175407,0.69208,0.613805,0.59362,0.667423,0.700767,0.663718,0.620032,0.508538,0.676342
std,1.411299,0.025441,0.030889,0.016057,0.017636,0.048848,0.022783,0.171906,0.203396,0.267899,0.196645,0.160811,0.155278,0.201801,0.156396,0.148009
min,1.0,0.0775,0.050667,0.105714,0.105263,0.04401,0.104762,0.013333,0.014493,0.011765,0.0125,0.014286,0.0125,0.011628,0.022727,0.307075
25%,2.0,0.163542,0.134956,0.157623,0.166667,0.096244,0.160088,0.6,0.478261,0.341176,0.5875,0.6,0.5875,0.476744,0.386364,0.565712
50%,3.0,0.179775,0.158011,0.166667,0.176471,0.161981,0.174583,0.72,0.652174,0.717647,0.725,0.714286,0.6875,0.662791,0.5,0.650891
75%,4.0,0.196764,0.180905,0.176039,0.189725,0.183623,0.190045,0.813333,0.753623,0.811765,0.8,0.814286,0.775,0.77907,0.636364,0.81013
max,5.0,0.241206,0.246106,0.249258,0.240356,0.271186,0.256798,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Pairwise correlation between the columns of the normalized frame.
df_normalize1.corr()

Unnamed: 0,Position,Pace,Shooting,Passing,Dribbling,Defending,Physicality,Agility,Balance,Marking,Positioning,Sprint_Speed,Vision,Finishing,Player_Height,attack_contribution
Position,1.0,-0.291591,-0.866351,-0.380277,-0.714768,0.897729,0.39671,-0.456513,-0.335282,0.847496,-0.76586,-0.237896,-0.50329,-0.836893,0.26455,-0.834579
Pace,-0.291591,1.0,0.113362,-0.180652,0.376035,-0.499817,-0.362494,0.569345,0.418313,-0.468015,0.194613,0.89709,-0.11016,0.159071,-0.375184,0.581631
Shooting,-0.866351,0.113362,1.0,0.458184,0.643825,-0.86115,-0.457306,0.390073,0.288676,-0.793786,0.799966,0.101263,0.562056,0.94171,-0.231324,0.776481
Passing,-0.380277,-0.180652,0.458184,1.0,0.615665,-0.42468,-0.690268,0.405213,0.429125,-0.326385,0.521484,-0.144632,0.816662,0.445862,-0.403193,0.444637
Dribbling,-0.714768,0.376035,0.643825,0.615665,1.0,-0.814137,-0.755216,0.721158,0.632865,-0.725488,0.684534,0.291781,0.60096,0.647584,-0.548994,0.861156
Defending,0.897729,-0.499817,-0.86115,-0.42468,-0.814137,1.0,0.511118,-0.572314,-0.422301,0.948603,-0.748643,-0.405849,-0.475317,-0.818505,0.349948,-0.958215
Physicality,0.39671,-0.362494,-0.457306,-0.690268,-0.755216,0.511118,1.0,-0.781363,-0.745381,0.356578,-0.594175,-0.392813,-0.660651,-0.514978,0.691394,-0.627735
Agility,-0.456513,0.569345,0.390073,0.405213,0.721158,-0.572314,-0.781363,1.0,0.833176,-0.426758,0.599632,0.605277,0.50288,0.496594,-0.740014,0.746789
Balance,-0.335282,0.418313,0.288676,0.429125,0.632865,-0.422301,-0.745381,0.833176,1.0,-0.275961,0.508146,0.450269,0.484814,0.38261,-0.860544,0.571369
Marking,0.847496,-0.468015,-0.793786,-0.326385,-0.725488,0.948603,0.356578,-0.426758,-0.275961,1.0,-0.603703,-0.30356,-0.319403,-0.71321,0.231251,-0.917145


In [20]:
# Save the normalized frame to CSV.
df_normalize1.to_csv("Players_2_Normalized.csv")