# Clean dataset

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import preprocessing
import time

def ignore_warnings():
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=Warning)

ignore_warnings()

In [2]:
# define the columns that will be dropped
dropped_columns = ['player_url', 'short_name', 'long_name', 'player_positions', 'dob', 'club_name', 'league_name',
                  'club_position', 'club_loaned_from', 'club_joined', 'nationality_name', 'nation_position',
                  'preferred_foot', 'work_rate', 'body_type', 'real_face', 'player_tags', 'player_traits',
                  'player_face_url', 'club_logo_url', 'club_flag_url', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf',
                  'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
                  'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'nation_logo_url', 'nation_flag_url']

selected_columns = ['sofifa_id', 'overall', 'height_cm', 'league_level', 'nationality_id', 'weak_foot', 'skill_moves',
                    'pace', 'shooting', 'passing', 'dribbling',
                    'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 
                    'skill_dribbling','skill_curve', 'movement_acceleration','movement_agility',
                    'power_shot_power', 'power_jumping','mentality_aggression', 'mentality_interceptions',
                    'mentality_positioning', 'mentality_vision']

t0 = time.time()

# read the 2017 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_17.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
X_2017 = data.iloc[:, :-2].values

# create a vector with zeros (same length as X_2017)
# these are the labels for the 2017 data
Y_2017 = np.zeros((len(X_2017),), dtype=int)

# read the 2018 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_18.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
X_2018 = data.iloc[:, :-2].values

# read the 2019 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_19.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
X_2019 = data.iloc[:, :-2].values

# read the 2020 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_20.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
X_2020 = data.iloc[:, :-2].values

# read the 2021 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_21.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
X_2021 = data.iloc[:, :-2].values

# read the 2022 data from the .csv file and drop the columns defined above
data = pd.read_csv('original_data/players_22.csv')
#data = data.drop(dropped_columns, axis = 1)
data = data[selected_columns]
X_2022 = data.iloc[:, :-2].values

t1 = time.time()
print("Selection-time: " + str(t1-t0))

# loop over every row in the 2022 data
# if a player has an average rating that is higher or equal to 80, then mark this player 
#     as a positive example in the 2017 data
# the first element of a row is a unique id (for a given player, this id is the same in every version of fifa)
rating = 80

t0 = time.time()

for i in range(0, len(X_2018)):
    if (int(X_2018[i][1]) >= rating):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2018[i][0]):
                Y_2017[j] = 1

for i in range(0, len(X_2019)):
    if (int(X_2019[i][1]) >= rating):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2019[i][0]):
                Y_2017[j] = 1

for i in range(0, len(X_2020)):
    if (int(X_2020[i][1]) >= rating):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2020[i][0]):
                Y_2017[j] = 1
                
for i in range(0, len(X_2021)):
    if (int(X_2021[i][1]) >= rating):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2021[i][0]):
                Y_2017[j] = 1
                
for i in range(0, len(X_2022)):
    if (int(X_2022[i][1]) >= rating):
        for j in range(0, len(X_2017)):
            if (X_2017[j][0] == X_2022[i][0]):
                Y_2017[j] = 1
t1 = time.time()
print("Labelling-time: " + str(t1-t0))
                
# print the number of positive and negative examples
print(f"Number of positive examples: {Counter(Y_2017)[1]}")
print(f"Number of unlabeled examples: {Counter(Y_2017)[0]}")

# remove sofifa_id column
X_2017 = np.delete(X_2017, 0, 1)

# normalize values
min_max_scaler = preprocessing.MinMaxScaler()
X_2017 = min_max_scaler.fit_transform(X_2017)
print("Normalized values:")
print(X_2017)

# the labels (that indicate if an instance is positive or negative (unlabeled in this case)) are 
#    stacked together into one matrix
X_2017_array = np.array(X_2017)
Y_2017_array = np.array(Y_2017)
result = np.column_stack((X_2017_array, Y_2017_array))

# every occurence of NaN is replaced by zero
result[np.isnan(result)] = 0

# the data is cleaned and written to a file
np.savetxt("clean_data/players_17_clean.csv", result, delimiter=",")

Selection-time: 1.5498151779174805
Labelling-time: 27.159194946289062
Number of positive examples: 862
Number of unlabeled examples: 16734
Normalized values:
[[1.         0.57692308 0.         ... 0.97560976 0.64893617 0.28888889]
 [0.97959184 0.28846154 0.         ... 0.64634146 0.4893617  0.21111111]
 [0.95918367 0.73076923 0.         ... 0.76829268 0.28723404 0.3       ]
 ...
 [0.         0.53846154 1.         ... 0.43902439 0.5212766  0.47777778]
 [0.         0.19230769 1.         ... 0.47560976 0.34042553 0.27777778]
 [0.         0.28846154 1.         ... 0.40243902 0.27659574 0.16666667]]


In [3]:
# creating a train and a test set

# cleaned data that was created in the previous block is read and randomly devided into two classes: train and test
# 2/3 of the examples are in the training set
# 1/3 of the examples are in the test set
clean_data = pd.read_csv('clean_data/players_17_clean.csv')
training_data = clean_data.sample(frac=0.66, random_state=25)
testing_data = clean_data.drop(training_data.index)

# because negative labeled examples are in fact unlabeled examples in our case, we can't be sure
#     that they are indeed negative, so they are deleted from the test set
# if the algorithm would indicate that an example it has never seen is a positive example, we can only be sure
#     if that is indeed a positive example, if it is an unlabeled one, we are not sure

# --> ik ben niet helemaal meer zeker of deze redenering nog klopt als we de spelers die in de jaren
#        2018, 2019, 2020 of 2021 boven de 80 halen ook als positief labelen
#test = []
#for i in range(0, len(testing_data)):
#    if (int(testing_data.iloc[i][-1]) == 0):
#        test.append(testing_data.index[i])
#testing_data = testing_data.drop(test)

# print the number of training and test examples
print(f"Number of training examples: {training_data.shape[0]}")
print(f"Number of testing examples: {testing_data.shape[0]}")

# write the training and test examples to two seperate files
training_data.to_csv("clean_data/players_17_clean_train.csv", index=False)
testing_data.to_csv("clean_data/players_17_clean_test.csv", index=False)

Number of training examples: 11613
Number of testing examples: 5982
