# Regression Models
## FIFA 19 Player Dataset

This dataset, obtained from Kaggle (https://www.kaggle.com/karangadiya/fifa19), contains information about all of the soccer players registered in the game FIFA 19. Aside from their names, age, nationality and club, there are other specific attributes such as their overall score, potential, stamina, and specific abilities. Before cleaning, this set contains 18,207 rows (corresponding to individual players) and 89 columns (representing their attributes). A cleaned version of the dataset is used in this notebook to build regression models that predict a soccer player's salary.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# @hidden_cell
import types
from botocore.client import Config
import ibm_boto3

import os

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY environment variable so that no
# credential is stored in (or shared with) the notebook; os.environ[...] raises
# KeyError if the variable is not set.
# NOTE(review): the key that was previously hard-coded here has been exposed and
# should be revoked and replaced.
client_eaec5b590319418ab3d2d1b79f362c89 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=os.environ['IBM_COS_API_KEY'],
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Fetch the cleaned CSV as a streaming body from the bucket.
body = client_eaec5b590319418ab3d2d1b79f362c89.get_object(Bucket='ibmmlprofessionalcertification-donotdelete-pr-6wah3vm91cbjum',Key='cleaned_fifa19.csv')['Body']

# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Player names become the index of the frame.
data_lr = pd.read_csv(body,index_col='Name')

In [3]:
# Preview the first rows of the loaded frame (indexed by player name).
data_lr.head()

Unnamed: 0_level_0,Age,Nationality,Overall,Potential,Club,Value (in Euros),Wage (in Euros),Preferred Foot,Skill Moves,Position,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L. Messi,31,Argentina,94,94,FC Barcelona,110500000.0,565000.0,Left,4,RF,...,75,96,33,28,26,6,11,15,14,8
Cristiano Ronaldo,33,Portugal,94,94,Juventus,77000000.0,405000.0,Right,5,ST,...,85,95,28,31,23,7,11,15,14,11
Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,118500000.0,290000.0,Right,5,LW,...,81,94,27,24,33,9,9,15,15,11
De Gea,27,Spain,91,93,Manchester United,72000000.0,260000.0,Right,1,GK,...,40,68,15,21,13,90,85,87,88,94
K. De Bruyne,27,Belgium,91,92,Manchester City,102000000.0,355000.0,Right,4,RCM,...,79,88,68,58,51,15,13,5,10,13


In [3]:
# Columns excluded from the modelling frame.
excluded_cols = ['Nationality', 'Club', 'Value (in Euros)', 'Overall']
data_lr = data_lr.drop(columns=excluded_cols)

# Keep only the players whose wage is strictly positive.
has_wage = data_lr['Wage (in Euros)'] > 0
data_lr = data_lr.loc[has_wage]

data_lr.shape

(17912, 42)

### Encoding the data

In [4]:
# Object-dtype columns are the categorical ones that need encoding.
mask = data_lr.dtypes == 'O'
obj_cols = data_lr.columns[mask]

# One-hot encoding with the first category dropped adds (n_categories - 1) columns
# per encoded column. Sum over every object column rather than assuming there are
# exactly two; dropna=False counts NaN as a category, matching unique().
enc_cols = sum(data_lr[col].nunique(dropna=False) - 1 for col in obj_cols)

print('Columns to be encoded: ', obj_cols.values)
print('Num of new columns: ', enc_cols)

Columns to be encoded:  ['Preferred Foot' 'Position']
Num of new columns:  27


In [5]:
from sklearn.preprocessing import OneHotEncoder

# Object-dtype columns are the ones to one-hot encode.
mask = data_lr.dtypes == 'O'
obj_cols = data_lr.columns[mask]

ohe = OneHotEncoder()

# Start from an empty frame sharing data_lr's index values; each encoded block is
# collected here and joined column-wise once the loop is done.
encoded_parts = [pd.DataFrame(index=data_lr.index.values)]

for col in obj_cols:
    # Fit the encoder on this single column and densify the sparse result.
    dense = ohe.fit_transform(data_lr[[col]]).toarray()

    # Name each indicator column "<column>_<category>".
    names = ['_'.join([col, cat]) for cat in ohe.categories_[0]]
    indicators = pd.DataFrame(dense, columns=names, index=data_lr.index.values)

    # Drop the first indicator of every encoded column to avoid multicollinearity.
    first_indicator = indicators.columns.values[0]
    print('The column '+ first_indicator + ' was dropped')
    encoded_parts.append(indicators.iloc[:, 1:])

data_enc = pd.concat(encoded_parts, axis=1)


The column Preferred Foot_Left was dropped
The column Position_CAM was dropped


In [6]:
# Preview the one-hot indicator columns produced by the encoding step.
data_enc.head()

Unnamed: 0,Preferred Foot_Right,Position_CB,Position_CDM,Position_CF,Position_CM,Position_GK,Position_LAM,Position_LB,Position_LCB,Position_LCM,...,Position_RB,Position_RCB,Position_RCM,Position_RDM,Position_RF,Position_RM,Position_RS,Position_RW,Position_RWB,Position_ST
L. Messi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Cristiano Ronaldo,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Neymar Jr,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
De Gea,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
K. De Bruyne,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train-Test-Split

In [7]:
# Put the non-object columns of data_lr side by side with the one-hot indicators.
non_obj_cols = data_lr.columns[data_lr.dtypes != 'O']
numeric_part = data_lr[non_obj_cols]
data = pd.concat([numeric_part, data_enc], axis=1)

In [8]:
# Column layout: player attributes first, then the one-hot indicators, and the
# target ('Wage (in Euros)') as the last column.
player_attrs = [
    'Age', 'Potential', 'Skill Moves', 'Height (cm)', 'Weight (kg)',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]
encoded_cols = [
    'Preferred Foot_Right',
    'Position_CB', 'Position_CDM', 'Position_CF', 'Position_CM',
    'Position_GK', 'Position_LAM', 'Position_LB', 'Position_LCB',
    'Position_LCM', 'Position_LDM', 'Position_LF', 'Position_LM',
    'Position_LS', 'Position_LW', 'Position_LWB', 'Position_RAM',
    'Position_RB', 'Position_RCB', 'Position_RCM', 'Position_RDM',
    'Position_RF', 'Position_RM', 'Position_RS', 'Position_RW',
    'Position_RWB', 'Position_ST',
]
new_order = player_attrs + encoded_cols + ['Wage (in Euros)']

data = data[new_order]
data.head()

Unnamed: 0_level_0,Age,Potential,Skill Moves,Height (cm),Weight (kg),Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,...,Position_RCB,Position_RCM,Position_RDM,Position_RF,Position_RM,Position_RS,Position_RW,Position_RWB,Position_ST,Wage (in Euros)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L. Messi,31,94,4,170,71.55,84,95,70,90,86,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,565000.0
Cristiano Ronaldo,33,94,5,187,82.35,84,94,89,81,87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,405000.0
Neymar Jr,26,93,5,175,67.5,79,87,62,84,84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,290000.0
De Gea,27,93,1,193,75.6,17,13,21,50,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,260000.0
K. De Bruyne,27,92,4,180,69.3,93,82,55,92,82,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,355000.0


In [9]:
from sklearn.model_selection import train_test_split

# Separate the target (wage) from the feature columns.
y = data['Wage (in Euros)']
X = data.drop(columns='Wage (in Euros)')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

### Y_train Distribution

Check the distribution for y_train

In [10]:
from scipy.stats.mstats import normaltest

def is_normal(data, alpha=0.05):
    """Label a sample as normally distributed or not.

    Runs ``normaltest`` on ``data`` and compares the resulting p-value
    with the significance level.

    Parameters
    ----------
    data : array-like
        Sample to test.
    alpha : float, default 0.05
        Significance level. The sample is labelled ``'Normal'`` only when
        the p-value is strictly greater than ``alpha``.

    Returns
    -------
    str
        ``'Normal'`` or ``'Not-normal'``.
    """
    # normaltest returns (statistic, p-value); only the p-value is needed.
    p_value = normaltest(data)[1]
    return 'Normal' if p_value > alpha else 'Not-normal'

In [11]:
# Normality check on the untransformed training target.
is_normal(y_train)

'Not-normal'

In [12]:
from numpy import log1p

# log(1 + y) transform of the training target, followed by the normality check.
y_train_log = np.log1p(y_train)
is_normal(y_train_log)

'Not-normal'

In [13]:
from scipy.stats import boxcox

# boxcox returns the transformed values together with the fitted lambda.
result = boxcox(y_train)
y_train_bc, lam = result[0], result[1]

is_normal(y_train_bc)

'Not-normal'

In [14]:
# Square-root transform of the training target, followed by the normality check.
y_train_sqrt = np.sqrt(y_train)
is_normal(y_train_sqrt)

'Not-normal'

In [15]:
# Reciprocal transform of the training target (rows with a non-positive wage were
# filtered out when the frame was prepared), followed by the normality check.
y_train_inv = 1 / y_train
is_normal(y_train_inv)

'Not-normal'

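None of the transformations tried above (log1p, Box-Cox, square root, inverse) made `y_train` pass the normality test, so the models below are fitted on the untransformed wages.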

### Linear Regression

In [16]:
from sklearn.metrics import mean_squared_error
def rmse(ytrue, ypredicted):
    """Return the root-mean-squared error between true and predicted values."""
    mse = mean_squared_error(ytrue, ypredicted)
    return np.sqrt(mse)

In [17]:
#Linear
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Standardise the features, then fit ordinary least squares on the training split.
s = StandardScaler()
lr = LinearRegression()
estimator = Pipeline(steps=[('scaler', s), ('regression', lr)])
estimator.fit(X_train, y_train)

# Evaluate the fitted pipeline on the held-out test split.
y_pred_lr = estimator.predict(X_test)
lr_rmse = rmse(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print('R2: %0.3f' %lr_r2)
print('RMSE: %0.2f' %lr_rmse)

R2: 0.345
RMSE: 17893.36


### Polynomial Regression

In [24]:
#Polynomial
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict

# Shuffled 5-fold split with a fixed seed so every degree sees the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

degrees = [1, 2]

s = StandardScaler()
lr = LinearRegression()

# One [R2, RMSE] pair per degree, computed from out-of-fold predictions on the
# training split.
scores = []
for degree in degrees:
    pf = PolynomialFeatures(degree)
    estimator = Pipeline(steps=[
        ("scaler", s),
        ('poly_features', pf),
        ("linear_regression", lr),
    ])

    predictions = cross_val_predict(estimator, X_train, y_train, cv=kf)
    scores.append([
        r2_score(y_train, predictions),
        rmse(y_train, predictions),
    ])
    

In [25]:
# Pair each polynomial degree with its cross-validated [R2, RMSE] scores.
list(zip(degrees,scores))

[(1, [0.341154225033098, 17985.54431709678]),
 (2, [-6.359946681660528e+18, 55880262899296.7])]

In [22]:
#Poly-2
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial regression: scale, expand to degree-2 terms, then ordinary
# least squares, all fitted on the training split.
s = StandardScaler()
pf = PolynomialFeatures(degree=2)
lr = LinearRegression()

estimator = Pipeline(steps=[
    ('scaler', s),
    ('poly_features', pf),
    ('regression', lr),
])
estimator.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_pr = estimator.predict(X_test)
pr_rmse = rmse(y_test, y_pred_pr)
pr_r2 = r2_score(y_test, y_pred_pr)

print('R2: %0.3f' %pr_r2)
print('RMSE: %0.2f' %pr_rmse)

R2: -957006370230998528.000
RMSE: 21635659084992.98


### LASSO

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Scale -> degree-2 polynomial expansion -> Lasso.
lasso_steps = [
    ("scaler", StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2)),
    ("lasso_regression", Lasso()),
]
estimator = Pipeline(lasso_steps)

# Candidate regularisation strengths: ten evenly spaced values from 100 to 1000.
params = {'lasso_regression__alpha': np.linspace(100, 1000, num=10)}

# Cross-validated search over alpha on the training split.
grid = GridSearchCV(estimator, params, cv=kf)
grid.fit(X_train, y_train)

print(grid.best_score_, grid.best_params_)

0.5978865482302449 {'lasso_regression__alpha': 400.0}


In [55]:
#Find the error
# Score the Lasso grid search on the held-out test split.
y_pred_ls = grid.predict(X_test)
ls_rmse = rmse(y_test, y_pred_ls)
ls_r2 = r2_score(y_test, y_pred_ls)

print('R2: %0.3f' %ls_r2)
print('RMSE: %0.2f' %ls_rmse)

R2: 0.617
RMSE: 13684.82


### Ridge

In [62]:
 # Preview of the alpha values used for the Ridge grid search in the next cell.
 np.linspace(4600,5500, num=10)

array([4600., 4700., 4800., 4900., 5000., 5100., 5200., 5300., 5400.,
       5500.])

In [63]:
#from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Scale -> degree-2 polynomial expansion -> Ridge.
ridge_steps = [
    ("scaler", StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2)),
    ("ridge_regression", Ridge()),
]
estimator = Pipeline(ridge_steps)

# Candidate regularisation strengths: ten evenly spaced values from 4600 to 5500.
# NOTE(review): the recorded best alpha (4600.0) is the smallest value in this
# grid, so the optimum may lie below the searched range - consider widening it.
params = {'ridge_regression__alpha': np.linspace(4600, 5500, num=10)}

# Cross-validated search over alpha on the training split.
grid = GridSearchCV(estimator, params, cv=kf)
grid.fit(X_train, y_train)

print(grid.best_score_, grid.best_params_)

0.5792348193884709 {'ridge_regression__alpha': 4600.0}


In [64]:
#Find the error
# Score the Ridge grid search on the held-out test split.
y_pred_rr = grid.predict(X_test)
rr_rmse = rmse(y_test, y_pred_rr)
rr_r2 = r2_score(y_test, y_pred_rr)

print('R2: %0.3f' %rr_r2)
print('RMSE: %0.2f' %rr_rmse)

R2: 0.578
RMSE: 14373.35


### Scores

In [98]:
# Test-set RMSE of each model, truncated to an integer.
rmse_vals = [lr_rmse, pr_rmse, ls_rmse, rr_rmse]
rmse_vals = [int(num) for num in rmse_vals]

labels = ['Linear','Polynomial (deg=2)' , 'Lasso', 'Ridge']

# Name the column when the Series is built instead of calling
# rename(..., inplace=1): `inplace` is documented as a bool, and building the
# frame in one expression keeps the cell idempotent.
rmse_df = pd.Series(rmse_vals, index=labels, name='RMSE').to_frame()

# Show values above 100000 as a capped label so they do not dominate the table.
rmse_df['RMSE']=rmse_df['RMSE'].apply(lambda x: '%.0f'%x if x<=100000 else '> 100000')

In [94]:
# Test-set R^2 of each model, in the same order as `labels`.
r2_vals = [lr_r2, pr_r2, ls_r2, rr_r2]

# Name the column when the Series is built instead of calling
# rename(..., inplace=1): `inplace` is documented as a bool.
r2_df = pd.Series(r2_vals, index=labels, name='R^2').to_frame()

# Show negative scores as a capped label; otherwise round to two decimals.
r2_df['R^2']=r2_df['R^2'].apply(lambda x: '%.2f'%x if x>=0 else '< 0')

In [100]:
# Place the RMSE and R^2 columns side by side, one row per model.
score_df = pd.concat([rmse_df, r2_df], axis='columns')
score_df

Unnamed: 0,RMSE,R^2
Linear,17893,0.35
Polynomial (deg=2),> 100000,< 0
Lasso,13684,0.62
Ridge,14373,0.58
