# Clean dataset

In [190]:
import time
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import preprocessing


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

def ignore_warnings():
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=Warning)

ignore_warnings()

In [191]:
# define the columns that will be dropped
dropped_columns = ['player_url', 'short_name', 'long_name', 'player_positions', 'dob', 'club_name', 'league_name',
                  'club_position', 'club_loaned_from', 'club_joined', 'nationality_name', 'nation_position',
                  'preferred_foot', 'work_rate', 'body_type', 'real_face', 'player_tags', 'player_traits',
                  'player_face_url', 'club_logo_url', 'club_flag_url', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf',
                  'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
                  'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'nation_logo_url', 'nation_flag_url']

# define the columns that will be used
selected_columns = ['sofifa_id', 'overall', 'age', 'height_cm', 'weight_kg',
                    'weak_foot', 'skill_moves', 'pace', 'shooting', 'passing', 'dribbling',
                    'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
                    'attacking_short_passing', 'attacking_volleys', 'skill_dribbling','skill_curve',
                    'skill_fk_accuracy', 'skill_long_passing' , 'skill_ball_control', 'movement_acceleration',
                    'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance',
                    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
                    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision',
                    'mentality_penalties', 'mentality_composure', 'defending_marking_awareness',
                    'defending_standing_tackle', 'defending_sliding_tackle']

In [192]:
def make_injury_prone_array(x_data):
    res = np.zeros((len(x_data),), dtype=int)
    for i in range(len(x_data)):
        if np.isin('Injury Prone',x_data[i]): res[i] = 0
        else: res [i] = 1
    return res

In [193]:
t0 = time.time()

# read the 2017 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_17.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
data = data.dropna()     #drop all rows that have any NaN values

X_2017 = data.iloc[:, :-2].values
#data = data.assign(injury_prone = make_injury_prone_array(X_2017))
#data = data.drop('player_traits', axis = 1)
#X_2017 = data.iloc[:, :-2].values


# create a vector with zeros (same length as X_2017)
# these are the labels for the 2017 data
Y_2017 = np.zeros((len(X_2017),), dtype=int)

# read the 2018 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_18.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
data = data.dropna()     #drop all rows that have any NaN values
X_2018 = data.iloc[:, :-2].values
#data = data.assign(injury_prone = make_injury_prone_array(X_2018))
#data = data.drop('player_traits', axis = 1)
#X_2018 = data.iloc[:, :-2].values

# read the 2019 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_19.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
data = data.dropna()     #drop all rows that have any NaN values
X_2019 = data.iloc[:, :-2].values
#data = data.assign(injury_prone = make_injury_prone_array(X_2019))
#data = data.drop('player_traits', axis = 1)
#X_2019 = data.iloc[:, :-2].values

# read the 2020 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_20.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
data = data.dropna()     #drop all rows that have any NaN values
X_2020 = data.iloc[:, :-2].values
#data = data.assign(injury_prone = make_injury_prone_array(X_2020))
#data = data.drop('player_traits', axis = 1)
#X_2020 = data.iloc[:, :-2].values

# read the 2021 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_21.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
data = data.dropna()     #drop all rows that have any NaN values
X_2021 = data.iloc[:, :-2].values
#data = data.assign(injury_prone = make_injury_prone_array(X_2021))
#data = data.drop('player_traits', axis = 1)
#X_2021 = data.iloc[:, :-2].values

# read the 2022 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_22.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
data = data.dropna()     #drop all rows that have any NaN values
X_2022 = data.iloc[:, :-2].values
#data = data.assign(injury_prone = make_injury_prone_array(X_2022))
#data = data.drop('player_traits', axis = 1)
#X_2022 = data.iloc[:, :-2].values

t1 = time.time()
print("Selection-time: " + str(round(t1 - t0, 2)) + " seconds")

Selection-time: 1.36 seconds


In [194]:
# select the column with the average rating
AVG = X_2022[:, 1]

# highest AVG in the game
max_AVG = np.amax(AVG)

# rating such that 95% of data is lower than this rating 
rating_highest_percent = np.percentile(AVG, 90)
# rating such that 10% of data is lower than this rating 
rating_lowest_percent = np.percentile(AVG, 10)

print(rating_highest_percent)

75.0


In [195]:
# loop over every row in the 2022 data
# if a player has an average rating that is higher or equal to the rating such that 95% of data 
# is lower than this rating , then mark this player as a positive example in the 2017 data
# the first element of a row is a unique id (for a given player, this id is the same in every version of fifa)
t0 = time.time()

for i in range(0, len(X_2018)):
    if (int(X_2018[i][1]) >= rating_highest_percent):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2018[i][0]):
                Y_2017[j] = 1

for i in range(0, len(X_2019)):
    if (int(X_2019[i][1]) >= rating_highest_percent):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2019[i][0]):
                Y_2017[j] = 1

for i in range(0, len(X_2020)):
    if (int(X_2020[i][1]) >= rating_highest_percent):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2020[i][0]):
                Y_2017[j] = 1
                
for i in range(0, len(X_2021)):
    if (int(X_2021[i][1]) >= rating_highest_percent):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2021[i][0]):
                Y_2017[j] = 1
                
for i in range(0, len(X_2022)):
    if (int(X_2022[i][1]) >= rating_highest_percent):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2022[i][0]):
                Y_2017[j] = 1
                
t1 = time.time()
print("Labelling-time: " + str(round(t1-t0, 2)) + " seconds")
                
# print the number of positive and negative examples
print(f"Number of positive examples: {Counter(Y_2017)[1]}")
print(f"Number of unlabeled examples: {Counter(Y_2017)[0]}")

Labelling-time: 104.64 seconds
Number of positive examples: 2680
Number of unlabeled examples: 12923


In [196]:
# select sofifa_id_column
sofifa_id = X_2017[:, 0]

# remove sofifa_id column
X_2017 = np.delete(X_2017, 0, 1)

In [197]:
# normalize values
min_max_scaler = preprocessing.MinMaxScaler()
X_2017 = min_max_scaler.fit_transform(X_2017)
print("Normalized values:")
print(X_2017)

'''
# select most important features based on chi-squared method
number_of_features = 30
chi_selector = SelectKBest(chi2, k=number_of_features)
chi_selector.fit(X_2017, Y_2017)
chi_support = chi_selector.get_support()

# keep only relevent features in X_2017
column_indices_to_drop = np.where(chi_support == False)
new_X_2017 = np.zeros((X_2017.shape[0],number_of_features))
for i in range(len(X_2017)):
    new_X_2017[i] = np.delete(X_2017[i], column_indices_to_drop)
X_2017 = new_X_2017
'''

# add sofifa_id column
X_2017 = np.hstack((sofifa_id[:, np.newaxis], X_2017))

# the labels (that indicate if an instance is positive or negative (unlabeled in this case)) 
# are stacked together into one matrix
X_2017_array = np.array(X_2017)
Y_2017_array = np.array(Y_2017)
result = np.column_stack((X_2017_array, Y_2017_array))

# every occurence of NaN is replaced by zero
result[np.isnan(result)] = 0

# the data is cleaned and written to a file
np.savetxt("clean_data/players_17_clean.csv", result, delimiter=",")

Normalized values:
[[1.         0.55555556 0.6122449  ... 0.87058824 0.89473684 0.14634146]
 [0.97959184 0.48148148 0.30612245 ... 0.74117647 1.         0.03658537]
 [0.95918367 0.48148148 0.55102041 ... 0.87058824 0.85526316 0.24390244]
 ...
 [0.         0.07407407 0.57142857 ... 0.4        0.42105263 0.32926829]
 [0.         0.11111111 0.20408163 ... 0.17647059 0.43421053 0.34146341]
 [0.         0.07407407 0.30612245 ... 0.10588235 0.35526316 0.06097561]]
False
True
[ True False False False False  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
  True  True False  True False  True False  True  True  True  True  True
  True  True  True]
39
39
(array([ 1,  2,  3,  4,  6, 22, 26, 28, 30]),)
[[1.         1.         1.         ... 0.87058824 0.89473684 0.14634146]
 [0.97959184 0.66666667 0.97435897 ... 0.74117647 1.         0.03658537]
 [0.95918367 0.66666667 0.97435897 ... 0.87058824 0.85526316 0.24390244]
 ...
 [0.      

In [198]:
# creating a train and a test set

# cleaned data that was created in the previous block is read and randomly devided into two classes: train and test
# 2/3 of the examples are in the training set
# 1/3 of the examples are in the test set
clean_data = pd.read_csv('clean_data/players_17_clean.csv')
training_data = clean_data.sample(frac=0.66, random_state=25)
testing_data = clean_data.drop(training_data.index)

testing_data = testing_data.values
training_data = training_data.values

# because negative labeled examples are in fact unlabeled examples in our case, we can't be sure that they are 
# indeed negative
# if a player doesn't appear in fifa 2022, we can label him as negative
# if a player belongs to the top 10% worst players in fifa 2022, we can also label him as negative
before = len(testing_data)
t0 = time.time()

index = []

# rating_lowest_percent is divided by 100
for i in range(0, len(testing_data)):
    if (int(testing_data[i][-1]) == 0):
        for j in range(0, len(X_2022)):
            if (X_2022[j][0] == testing_data[i][0]) and (X_2022[j][1] > rating_lowest_percent):
                index.append(i)
                break
                
testing_data = np.delete(testing_data, index, 0)

t1 = time.time()
print("Create-test-set-time: " + str(round(t1-t0, 2)) + " seconds")

after = len(testing_data)
print(str(before - after) + " elements were removed from the test set")

# remove sofifa_id column
training_data = np.delete(training_data, 0, 1)
testing_data = np.delete(testing_data, 0, 1)

# print the number of training and test examples
print(f"Number of training examples: {training_data.shape[0]}")
print(f"Number of testing examples: {testing_data.shape[0]}")

# write the training and test examples to two seperate files
np.savetxt("clean_data/players_17_clean_train.csv", training_data, delimiter=",")
np.savetxt("clean_data/players_17_clean_test.csv", testing_data, delimiter=",")

Create-test-set-time: 43.9 seconds
1490 elements were removed from the test set
Number of training examples: 10297
Number of testing examples: 3815
