# Table of contents

* [Cleaning](#cleaning)
* [X-y split](#split)
* [Linear Regression](#lg)
* [Model Validation - Metrics](#metrics)

# Cleaning <a class="anchor" id="cleaning"></a>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data. The path is relative, so it is resolved against the
# kernel's working directory; the last recorded run of this cell failed with
# FileNotFoundError because the CSV was not found there.
fifa = pd.read_csv("fifa21_train.csv")
#fifa.head()
# Show the first 50 column labels (the remaining ones are shown in the next cell).
fifa.columns[:50]

FileNotFoundError: [Errno 2] No such file or directory: 'fifa21_train.csv'

In [None]:
# Column labels from position 50 onward, as loaded (headers not yet cleaned).
fifa.columns[50:]

In [None]:
# (rows, columns) of the loaded frame; uncomment the next line to list dtypes.
#fifa.dtypes
fifa.shape

In [None]:
# standardizing header names: lower-case, spaces replaced by underscores

cols_1 = [label.lower() for label in fifa.columns]
fifa.columns = [label.replace(" ", "_") for label in cols_1]
fifa.head()

In [None]:
# First 50 column labels after the lower-case / underscore clean-up.
fifa.columns[:50]

In [None]:
# Remaining column labels (position 50 onward) after the header clean-up.
fifa.columns[50:]

In [None]:
# Position columns: keep only the text before the first "+" in each value and
# convert it to float.
position_cols = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
    'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]

# Selecting all columns first means a missing column fails before any
# column has been modified.
for pos in fifa[position_cols].columns:
    base_part = fifa[pos].str.split("+", n=1).str[0]
    fifa[pos] = base_part.astype("float")
  

In [None]:
# checking: preview the first rows after the position columns were converted

fifa.head()


In [None]:
# Fill missing composure values with the column mean.
# (Despite its name, mean_median_composure holds the mean, not the median.)
# NOTE(review): "composure" is not among the columns kept by the column
# selection in the "dropping unnecessary columns" cell, so this fill has no
# effect on the final feature set.

composure = fifa["composure"]
mean_median_composure = composure.mean()
fifa["composure"] = composure.fillna(value=mean_median_composure)

In [None]:
# Dropping unnecessary columns.
# None of these appear in the column selection below, so the selection alone
# would exclude them; the explicit drop records why each one is left out and
# fails loudly if an expected column is missing.
fifa = fifa.drop(
    columns=[
        "loan_date_end",    # too many NaNs
        "id",
        "growth",           # low correlation with ova
        "team_&_contract",
        "contract",
        "w/f",              # unknown feature
        "sm",               # unknown feature
        "ir",               # unknown feature
        "height",           # imperial measurement, not used
    ]
)

# Keep only the modelling columns. The detail columns "attacking" through
# "goalkeeping" are left out (per the original note they are summed up in
# total_stats).
# .copy() makes the selection an independent frame, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
fifa = fifa[["name","age","nationality","club","bp","position",
             "weight","foot","joined","value","wage","release_clause","marking",
             "total_stats","base_stats","a/w","d/w","pac","sho","pas","dri",
             "def","phy","hits",'ls','st','rs','lw','lf','cf','rf','rw','lam',
             'cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm','cdm','rdm','rwb',
             'lb','lcb','cb','rcb','rb','gk',"ova"]].copy()
fifa.head()

In [None]:
# wage: strip the euro sign, apply the K / M suffix as a multiplier, convert to float.
# Multiplying by the suffix (instead of appending zeros to the text) keeps
# decimal amounts such as "1.5K" correct and also covers an "M" suffix.
wage_raw = fifa["wage"].str.replace("€", "", regex=False)
# Last character -> multiplier; values without a unit letter get 1.
wage_scale = wage_raw.str[-1].map({"K": 1e3, "M": 1e6}).fillna(1.0)
fifa["wage"] = wage_raw.str.rstrip("KM").astype("float") * wage_scale
fifa["wage"]

In [None]:
# release_clause: strip the euro sign, apply the K / M suffix as a multiplier,
# convert to float.
#
# The decimal point must be kept: deleting "." and then appending zeros would
# turn "1.5M" into 15,000,000 instead of 1,500,000.

release_raw = fifa["release_clause"].str.replace("€", "", regex=False)
# Last character -> multiplier; values without a unit letter (e.g. "0") get 1.
release_scale = release_raw.str[-1].map({"K": 1e3, "M": 1e6}).fillna(1.0)
fifa["release_clause"] = release_raw.str.rstrip("KM").astype("float") * release_scale
fifa["release_clause"]

In [None]:
# value: strip the euro sign, apply the K / M suffix as a multiplier,
# convert to float.
#
# The decimal point must be kept: deleting "." and then appending zeros would
# turn "1.5M" into 15,000,000 instead of 1,500,000.

value_raw = fifa["value"].str.replace("€", "", regex=False)
# Last character -> multiplier; values without a unit letter (e.g. "0") get 1.
value_scale = value_raw.str[-1].map({"K": 1e3, "M": 1e6}).fillna(1.0)
fifa["value"] = value_raw.str.rstrip("KM").astype("float") * value_scale
fifa["value"]

In [None]:
# weight: remove the "lbs" unit text, then convert to float

weight_text = fifa["weight"].map(lambda raw: raw.replace("lbs", ""))
fifa["weight"] = weight_text.astype("float")
fifa["weight"]


In [None]:
# converting hits to numeric; anything that cannot be parsed becomes NaN
# NOTE(review): if hits contains abbreviated values (e.g. with a "K" suffix)
# they are coerced to NaN here and those rows are removed by the NaN-row
# filter further down — confirm this is intended.
hits_numeric = pd.to_numeric(fifa["hits"], errors="coerce")
fifa["hits"] = hits_numeric
# number of hits values that are NaN after the conversion
hits_numeric.isna().sum()

In [None]:
# checking for NaNs: show only the columns that contain at least one NaN

has_nan = fifa.isna().any()
fifa.loc[:, has_nan]

In [None]:
# Dropping ROWS that have a NaN in any of: club, joined, position, a/w, d/w, hits
# (per the original note this affects only a very small percentage of the data)
print("Before:", fifa.shape)

required_cols = ["club", "joined", "position", "a/w", "d/w", "hits"]
fifa = fifa.dropna(subset=required_cols)

print("After: ", fifa.shape)


In [None]:
# checking again for NaNs: columns that still contain at least one NaN

still_nan = fifa.isna().any()
fifa.loc[:, still_nan]

In [None]:
# (rows, columns) after the NaN rows were removed.
fifa.shape

In [None]:
# Preview the first rows of the cleaned frame.
fifa.head()

In [None]:
# # visualizing
# for col in fifa:
#     num_all = sns.histplot(data=fifa,x=col,kde=True)
#     plt.title(col)
#     plt.show(num_all)

In [None]:
# scaling down value, wage and release clause

def log_transfom_clean1(x):
    """Return the base-10 logarithm of ``x``, or 0 when it is not finite.

    Parameters
    ----------
    x : number
        Value to transform. Zero, negative and NaN inputs are accepted.

    Returns
    -------
    number
        ``np.log10(x)`` when that result is finite; otherwise ``0``
        (this covers ``-inf`` for zero and ``nan`` for negative or NaN input).
        Note that an input of 1 also yields 0, so 0 and 1 are not
        distinguishable after the transform.
    """
    # Zero and negative inputs are handled deliberately below, so NumPy's
    # divide-by-zero / invalid-value RuntimeWarnings are silenced here instead
    # of flooding the cell output once per affected row.
    with np.errstate(divide="ignore", invalid="ignore"):
        logged = np.log10(x)
    return logged if np.isfinite(logged) else 0

# Log-scale the three monetary columns (log_transfom_clean1 maps non-finite
# results to 0).
value_log = fifa["value"].apply(log_transfom_clean1)
wage_log = fifa["wage"].apply(log_transfom_clean1)
release_clause_log = fifa["release_clause"].apply(log_transfom_clean1)

# checking new scaling on a plot (one figure per transformed column)
for logged in (value_log, wage_log, release_clause_log):
    sns.displot(logged)
plt.show()

# Append the *_log columns and remove the original columns.
fifa = fifa.assign(
    value_log=value_log,
    wage_log=wage_log,
    release_clause_log=release_clause_log,
).drop(columns=["value", "wage", "release_clause"])

fifa.head()

In [None]:
# placeholder cell for bonus

In [None]:
# saving cleaned data

#fifa.to_csv("fifa_cleaned.csv")

 #### end of cleaning
 
 

# X-y split <a class="anchor" id="split"></a>

In [None]:
# Split the cleaned frame into target (y) and features (X).
# (To restart from a saved file instead: fifa = pd.read_csv("fifa_cleaned.csv"))

target_col = "ova"
X = fifa.drop(columns=target_col)  # features
y = fifa[target_col]               # target

# Separate the feature columns by dtype for the scaling / encoding steps.
fifa_num = X.select_dtypes(include=np.number)
fifa_cat = X.select_dtypes(include="object")

### Normalizing numerical data

In [None]:
# Numeric feature columns selected in the split cell (before scaling).
fifa_num

In [None]:
# scaling numerical features with a min-max scaler
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so the test rows influence the scaling.

from sklearn.preprocessing import MinMaxScaler

transformer = MinMaxScaler()
X_normalized = transformer.fit_transform(fifa_num)

# Wrap the scaled values back into a labelled frame. No index is passed, so
# the new frame gets a fresh default index rather than fifa_num's index.
fifa_num_minmax = pd.DataFrame(data=X_normalized, columns=fifa_num.columns)
# checking
fifa_num_minmax.describe().T

### Encoding categorical data

In [None]:
# Summary statistics of the object-dtype feature columns, one row per column
# (transposed).
fifa_cat.describe().T

In [None]:
# dropping columns with too many unique values

high_cardinality = ["name", "nationality", "club", "position", "joined"]
fifa_cat = fifa_cat.drop(columns=high_cardinality)

fifa_cat

In [None]:
# encoding all columns of fifa_cat
from sklearn.preprocessing import OneHotEncoder


# drop='first' drops the first category of every feature
encoder = OneHotEncoder(drop='first').fit(fifa_cat)
encoded = encoder.transform(fifa_cat).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(encoder, "get_feature_names_out"):
    cols = encoder.get_feature_names_out(fifa_cat.columns)
else:
    cols = encoder.get_feature_names(input_features=fifa_cat.columns)
#cols
# No index is passed, so the encoded frame gets a fresh default index.
onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded.head()

In [None]:
# concat both fifa_num and fifa_cat: fifa_num_minmax + onehot_encoded
# Both frames were built from arrays without an explicit index, so they share
# the same default index and are combined column-wise row by row.

fifa_whole = pd.concat([fifa_num_minmax, onehot_encoded], axis=1)

#### end of normalizing

# Linear Regression <a class="anchor" id="lg"></a>

## Train-test split

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    fifa_whole, y, test_size=0.2, random_state=42
)

# Report the shape of each split.
for label, part in (
    ("X_train ", X_train),
    ("X_test: ", X_test),
    ("y_train:", y_train),
    ("y_test: ", y_test),
):
    print(label, part.shape)

In [None]:
# train/fit the model

# fit() returns the estimator itself, so construction and fitting can be chained.
lm = linear_model.LinearRegression().fit(X_train, y_train)
lm

#### end of regression

# Model Validation - Metrics <a class="anchor" id="metrics"></a>

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Predictions for each split (reused by every metric below).
predictions_test = lm.predict(X_test)
predictions_train = lm.predict(X_train)
# Predictions for all rows, kept under the original name; they are no longer
# used for the error metrics because they mix training and test rows.
y_pred = lm.predict(fifa_whole)

# r2 score
print("r2 Score of test Data: ",round(r2_score(y_test, predictions_test),3))
print("r2 Score of train Data: ",round(r2_score(y_train, predictions_train),3), "\n")

# mse — reported per split, like r2 and mae, so the test figure reflects
# unseen data only; arguments are in scikit-learn's (y_true, y_pred) order.
mse_test = mean_squared_error(y_test, predictions_test)
mse_train = mean_squared_error(y_train, predictions_train)
print("mean squared error of test data: ",round(mse_test,3))
print("mean squared error of train data: ",round(mse_train,3))

# rmse
print("rooted mean squared error of test data: ", round(np.sqrt(mse_test),3))
print("rooted mean squared error of train data: ", round(np.sqrt(mse_train),3),"\n")

# mae
mae_test = mean_absolute_error(y_test, predictions_test)
print("mean absolute error of test data:",round(mae_test,3))
mae_train = mean_absolute_error(y_train, predictions_train)
print("mean absolute error of train data:", round(mae_train,3))

#### end of metrics

In [None]:
# New Data