# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor
import pickle
from sklearn.ensemble import RandomForestRegressor
import json


In [4]:
# load the dataset
data = pd.read_csv('male_players+%28legacy%29.csv', low_memory=False)
data.head(2)

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,158023,/player/158023/lionel-messi/150002,15,2,2014-09-18,L. Messi,Lionel Andrés Messi Cuccittini,CF,93,95,...,62+3,62+3,62+3,54+3,45+3,45+3,45+3,54+3,15+3,https://cdn.sofifa.net/players/158/023/15_120.png
1,20801,/player/20801/c-ronaldo-dos-santos-aveiro/150002,15,2,2014-09-18,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"LW, LM",92,92,...,63+3,63+3,63+3,57+3,52+3,52+3,52+3,57+3,16+3,https://cdn.sofifa.net/players/020/801/15_120.png


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161583 entries, 0 to 161582
Columns: 110 entries, player_id to player_face_url
dtypes: float64(18), int64(45), object(47)
memory usage: 135.6+ MB


In [6]:
data.describe()

Unnamed: 0,player_id,fifa_version,fifa_update,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
count,161583.0,161583.0,161583.0,161583.0,161583.0,159530.0,159822.0,161583.0,161583.0,161583.0,...,128722.0,161583.0,161583.0,161583.0,161583.0,161583.0,161583.0,161583.0,161583.0,17969.0
mean,214484.722353,19.125514,2.0,65.699071,70.744008,2326770.0,10855.409768,25.123181,181.240205,75.235031,...,57.816892,45.757957,47.669996,45.698588,16.52961,16.274918,16.140374,16.288861,16.636973,39.14909
std,34928.608856,2.559318,0.0,7.040855,6.259121,6005746.0,21941.656285,4.670207,6.750148,7.000456,...,12.329739,20.453699,21.336404,20.935273,17.67047,16.834294,16.476466,16.998697,17.980143,10.503788
min,2.0,15.0,2.0,40.0,40.0,1000.0,500.0,16.0,154.0,49.0,...,3.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,12.0
25%,199159.0,17.0,2.0,61.0,66.0,325000.0,2000.0,21.0,176.0,70.0,...,50.0,26.0,27.0,25.0,8.0,8.0,8.0,8.0,8.0,31.0
50%,220621.0,19.0,2.0,66.0,70.0,725000.0,4000.0,25.0,181.0,75.0,...,59.0,50.0,54.0,52.0,11.0,11.0,11.0,11.0,11.0,41.0
75%,236958.0,21.0,2.0,70.0,75.0,1800000.0,10000.0,28.0,186.0,80.0,...,66.0,63.0,66.0,64.0,14.0,14.0,14.0,14.0,14.0,46.0
max,271817.0,23.0,2.0,94.0,95.0,194000000.0,575000.0,54.0,208.0,110.0,...,96.0,94.0,94.0,95.0,91.0,92.0,95.0,92.0,94.0,68.0


# Data Preprocessing


### Remove identifier columns

In [2]:
# DEFINE A FUNCTION TO REMOVE IDENTIFIER COLUMNS

def remove_identifiers(data, identifier_attributes):
    data.drop(columns= identifier_attributes, inplace = True)



In [8]:
# create a list of identifier columns, names or links to be removed
identifier_columns = [
    "player_id",
    "player_url",
    "fifa_version",
    "fifa_update",
    "fifa_update_date",
    "short_name",
    "long_name",
    "league_id",
    "league_name",
    "club_team_id",
    "club_name",
    "club_position",
    "club_jersey_number",
    "club_loaned_from",
    "club_joined_date",
    "nationality_id",
    "nationality_name",
    "nation_team_id",
    "nation_position",
    "nation_jersey_number",
    "player_face_url"
]

# remove the identifiers
remove_identifiers(data, identifier_columns)

### Remove columns with more 30% missing data

In [3]:
# DEFINE A FUNCTION TO REMOVE MORE THAN 30% MISSING DATA
def remove_empty_columns(data):
    # set the threshold to 30%
    threshold = int(len(data) * 0.3)

    # get the number of missing values per column
    null_values_per_column = pd.DataFrame(data[data.columns].isnull().sum(), columns=['null_value_count'], index=data.columns)

    # get the columns with missing values that exceed the threshold
    columns_exceed_threshold = null_values_per_column[null_values_per_column['null_value_count']>threshold]

    # drop the columns with missing values that exceed the threshold
    data.drop(columns= columns_exceed_threshold.index, inplace=True)


In [10]:
# remove the columns with more than 30% missing data
remove_empty_columns(data)

In [14]:
data.shape

(161583, 85)

### Impute missing data

In [4]:
def impute_missing_data(data):
    # separate the data into categorical and numerical data types
    categorical_columns =  data.select_dtypes(['category', 'object']).columns
    numerical_columns = data.select_dtypes('number').columns


    # create an imputer for categorical data and fill in missing data with the most frequent value
    cat_impute = SimpleImputer(strategy='most_frequent')
    data[categorical_columns] = pd.DataFrame(cat_impute.fit_transform(data[categorical_columns]), columns=categorical_columns, index = data.index)


    # create an instance of the SimpleImputer for numerical data: fill in missing data with the average value
    num_impute = SimpleImputer(strategy='mean')
    data[numerical_columns] = pd.DataFrame(cat_impute.fit_transform(data[numerical_columns]), columns = numerical_columns, index=data.index)




In [16]:
# call the function for imputing numerical data
impute_missing_data(data)

In [17]:
data.shape

(161583, 85)

# Feature Engineering

### Calculate effective positional rating

In [5]:
# define a function that calcutates the effective rating for each position and change the data type for the ratings from object to integers
def calculate_effective_positional_rating(rating):
    return eval(rating) # calculate the effective score for each positional rating

def positional_rating(data, positions):
    # calculate the effective rating for each columns in positions
    for pos in positions:
        data[pos] = data[pos].apply(calculate_effective_positional_rating) # pass in columns with the positional rating


In [19]:
# select the columns with ratings for each position
position_ratings = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']

# call the function to calculate effective ratings
positional_rating(data, position_ratings)

In [20]:
# view the calculated effective ratings
data[position_ratings].head()

Unnamed: 0,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,92,92,92,95,93,93,93,95,95,95,...,65,65,65,65,57,48,48,48,57,18
1,92,92,92,92,92,92,92,92,92,92,...,66,66,66,66,60,55,55,55,60,19
2,87,87,87,90,90,90,90,90,90,90,...,67,67,67,67,58,49,49,49,58,17
3,90,90,90,87,89,89,89,87,89,89,...,68,68,68,64,59,58,58,58,59,20
4,41,41,41,39,40,40,40,39,39,39,...,43,43,43,39,39,41,41,41,39,90


In [21]:
data.shape

(161583, 85)

### Encode Categorical Data

In [6]:

def encode_categorical_data(data):
    # Extract the categorical features
    categorical_col = data.select_dtypes(['object', 'category']).columns

    # Encode the categorical values with OneHotEncoder
    ohe = OneHotEncoder(sparse_output=True)  # Use sparse=True for memory efficiency
    encoded_data = ohe.fit_transform(data[categorical_col])

    # Convert encoded_data to DataFrame with the proper column names
    encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded_data,
                                                   columns=ohe.get_feature_names_out(categorical_col),
                                                   index=data.index)

    # Drop the categorical features from data
    data.drop(columns=categorical_col, inplace=True)

    # Concatenate encoded_df with data
    comb_encoded_data = pd.concat([encoded_df, data], axis=1)

    print('...Encoding categorical variables')

    return comb_encoded_data


In [25]:
# call the function for encoding data
data = encode_categorical_data(data)

---Done preprocessing data




In [26]:
data.shape

(161583, 79)

# Feature Subset

In [7]:
def create_feature_subset(data):
  # get the correlation coeffients between all variables in a dataframe
  corr_matrix = data.corr()

  # extract the correlation coeffients between features and the target(overall rating)
  correlation = pd.DataFrame(corr_matrix['overall'])
  correlation = correlation.rename(columns={'overall': 'corr_coeff'}) # change the name of the column to corr_coeff
  print("\nCorrelation coeffients:\n ", correlation['corr_coeff'].sort_values())

  # set a threshold for the correlation value to 40%
  threshold = 0.4

  # select features that are above the threshold
  important_features = correlation[abs(correlation['corr_coeff']) > threshold] # variables with stronger positive correlation

  # get the array of variables
  important_features = np.array(important_features.index)
  print("\n Features with the strongest correlation:\n", important_features)

  return data[important_features] # return the dataframe with the strongest correlation to the target


In [43]:
data = create_feature_subset(data) # new dataframe with the features showing the features with the greatest correlation to the target
data.columns # view the feature subset with the most important features

league_level           -0.222646
goalkeeping_kicking    -0.015521
goalkeeping_diving     -0.013603
goalkeeping_reflexes   -0.011783
goalkeeping_handling   -0.010862
                          ...   
wage_eur                0.606763
passing                 0.626631
potential               0.695362
movement_reactions      0.845753
overall                 1.000000
Name: corr_coeff, Length: 79, dtype: float64


Index(['overall', 'potential', 'value_eur', 'wage_eur', 'age',
       'international_reputation', 'shooting', 'passing', 'dribbling',
       'physic', 'attacking_short_passing', 'skill_curve',
       'skill_long_passing', 'skill_ball_control', 'movement_reactions',
       'power_shot_power', 'power_long_shots', 'mentality_vision',
       'mentality_composure', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
       'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm',
       'cdm', 'rdm', 'rwb', 'lb', 'rb'],
      dtype='object')

# Create Data Preprocessing Pipeline

In [31]:
# create the function for preprocessing data
def preprocess_data(data, identifier_columns, positions):

  # remove identifier columns
  remove_identifiers(data, identifier_columns)

  # remove attributes with more than 50% missing data
  remove_empty_columns(data)

  # impute missing data
  print('...imputing missing data')
  impute_missing_data(data)

  # calculate effective positional rating
  print('...calculating effective positional ratings')
  positional_rating(data, positions)

  # encode categorical data
  data = encode_categorical_data(data)

  # extract the feature subset
  print('...create feature subset')
  data = create_feature_subset(data)

  print('...Data processing complete')

  return data


# Feature Scaling

In [45]:
# separate the target from the features
Y = data['overall'] # get the target
X = data.drop(columns = ['overall']) # extract the feautres

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform the training data
X = scaler.fit_transform(X)

In [46]:
X.shape

(161583, 41)

In [47]:
Y.shape

(161583,)

In [48]:
X

array([[ 3.87531516, 16.44970076, 24.68933283, ...,  0.72434133,
         0.17779242,  0.17779242],
       [ 3.39601316, 12.84777768, 16.67689972, ...,  0.79820405,
         0.39441525,  0.39441525],
       [ 3.0764785 ,  8.74326069, 12.09836652, ...,  0.87206677,
         0.25000003,  0.25000003],
       ...,
       [-2.0360761 , -0.36876705, -0.46970713, ..., -0.45746223,
        -0.39986848, -0.39986848],
       [-0.11886813, -0.3620658 , -0.46970713, ..., -1.41767762,
        -1.48298267, -1.48298267],
       [-1.23723945, -0.36876705, -0.46970713, ..., -1.04836401,
        -0.83311416, -0.83311416]])

# Model Training

In [52]:
# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Model Selection

In [63]:

def model_training(x_train, x_test, y_train, y_test):
    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Elastic Net': ElasticNet(),
        'Decision Tree': DecisionTreeRegressor(),
        'Bayesian Ridge': BayesianRidge(),
        'SGD Regressor': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
    }

    # Define cross-validation strategy
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    # Use MAE as the scoring metric
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=True)

    # Dictionary to store the mean scores for each model
    model_scores = {}


    # Train and evaluate each model with cross-validation
    for name, model in models.items():
        print(f"Model: {name}")
        scores = cross_val_score(model, x_train, y_train, cv=cv, scoring=mae_scorer)
        mean_score = np.mean(scores) # caluculate the average mean absolute error for all folds
        model_scores[name] = mean_score # add the mae for each model to the dict
        print(f"MAE: Mean = {mean_score:.4f}, Std = {scores.std():.4f}")  # print the mean score and the standard deviation
        print("-" * 32) # print a new line


    # Identify the best model based on the mean MAE score
    best_model_name = min(model_scores, key=model_scores.get)  # Choose model with a lower MAE score
    best_model_score = model_scores[best_model_name]

    print(f"Best Model: {best_model_name}")
    print(f"Best Model Score (MAE): {best_model_score:.4f}")  

    return models[best_model_name] # return the best model



In [64]:
# train the different models
best_model = model_training(x_train, x_test, y_train, y_test)

Model: Linear Regression
MAE: Mean = 1.5168, Std = 0.0085
------------------------------
Model: Ridge Regression
MAE: Mean = 1.5168, Std = 0.0085
------------------------------
Model: Lasso Regression
MAE: Mean = 2.1307, Std = 0.0049
------------------------------
Model: Elastic Net
MAE: Mean = 2.1751, Std = 0.0076
------------------------------
Model: Decision Tree
MAE: Mean = 0.5219, Std = 0.0048
------------------------------
Model: Bayesian Ridge
MAE: Mean = 1.5168, Std = 0.0085
------------------------------
Model: SGD Regressor
MAE: Mean = 1.5211, Std = 0.0068
------------------------------
Best Model: Decision Tree
Best Model Score (MAE): 0.5219


### Train Best Model

In [68]:
def train_best_model(best_model):
  # Train the best model on the entire training set and evaluate the performance on the test set
  best_model.fit(x_train, y_train)
  y_pred = best_model.predict(x_test)
  test_mae = mean_absolute_error(y_test, y_pred)
  print(f"Test MAE of Best Model: {test_mae:.4f}")

  return test_mae # return the mean absolute error for evaluation


In [69]:
# train the best model and measure its performance
base_mae = train_best_model(best_model)

Test MAE of Best Model: 0.4989


# Hyperameter Tuning

In [83]:

def tune_hyperparameter(x_train, x_test, y_train, y_test):

    # Define XGBoost regressor
    xgb = XGBRegressor(random_state=42)

    # Define parameters grid for Grid Search
    param_grid = {
    'n_estimators': [100, 200],           # Number of boosting rounds or trees to build
    'max_depth': [3, 5, 7],               # Maximum depth of a tree
    'learning_rate': [0.01, 0.05, 0.1],   # Step size shrinkage used to prevent overfitting
    'subsample': [0.8, 1.0],              # Fraction of samples used to train each tree
    'colsample_bytree': [0.8, 1.0]        # Fraction of features used to train each tree
    }

    # Perform Grid Search CV
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1)
    grid_search.fit(x_train, y_train)

    # Print best parameters 
    best_params = grid_search.best_params_
    print(f"Best Parameters: {best_params}")

    # Save the best XGBoost model using pickle
    with open('best_xgb_model.pkl', 'wb') as f:
        pickle.dump(grid_search.best_estimator_, f)

    # Save the best parameters using pickle
    with open('best_xgb_params.pkl', 'wb') as f:
        pickle.dump(best_params, f)

    # Evaluate the model performance on test
    y_pred = grid_search.best_estimator_.predict(x_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Final XGBoost Model Performance (MAE): {mae:.4f}")

    return grid_search.best_estimator_, best_params

best_xgb_model, best_params = tune_hyperparameter(x_train, x_test, y_train, y_test)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Final XGBoost Model Performance (MAE): 0.5108


In [90]:
def extract_important_features(best_xgb_model):

  # Extract feature importances
  feature_importances = best_xgb_model.feature_importances_

  # Create a DataFrame to display feature importances
  features = data.drop(columns=['overall']).columns
  importance_df = pd.DataFrame({
      'Feature': features,
      'Importance': feature_importances
  })

  # Sort features by importance
  importance_df = importance_df.sort_values(by='Importance', ascending=False)

  # Display the top features
  print(importance_df.head())  # Displaying the top 5 features

  return importance_df

In [92]:
# call the function for extracting the feature importances
important_features = extract_important_features(best_xgb_model)

                     Feature  Importance
1                  value_eur    0.662888
13        movement_reactions    0.072866
3                        age    0.064341
4   international_reputation    0.048114
2                   wage_eur    0.035468


In [93]:
# display the features ranked by their importances
important_features

Unnamed: 0,Feature,Importance
1,value_eur,0.662888
13,movement_reactions,0.072866
3,age,0.064341
4,international_reputation,0.048114
2,wage_eur,0.035468
0,potential,0.033334
39,lb,0.01392
17,mentality_composure,0.011245
22,lf,0.010406
8,physic,0.00956


# Test With New Data Set

In [32]:
# import the new dataset
data = pd.read_csv('players_22.csv', low_memory=False)

# create the identifier columns from the new data set
identifier_col = [
    'sofifa_id',
    'player_url',
    'short_name',
    'long_name',
    'dob',
    'player_face_url',
    'club_logo_url',
    'club_flag_url',
    'nation_logo_url',
    'nation_flag_url'
]

# select the columns with ratings for each position
position_rate = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']


# extract the feature subset for uniformity
feature_subset = ['potential', 'value_eur', 'wage_eur', 'age',
       'international_reputation', 'shooting', 'passing', 'dribbling',
       'physic', 'attacking_short_passing', 'skill_curve',
       'skill_long_passing', 'skill_ball_control', 'movement_reactions',
       'power_shot_power', 'power_long_shots', 'mentality_vision',
       'mentality_composure', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
       'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm',
       'cdm', 'rdm', 'rwb', 'lb', 'rb']

In [33]:
# preprocess that data
data = preprocess_data(data, identifier_col, position_rate)

...imputing missing data
...calculating effective positional ratings
...Encoding categorical variables
...create feature subset

Correlation coeffients:
  real_face_No              -0.435513
club_position_RES         -0.339250
work_rate_Medium/Medium   -0.228235
club_team_id              -0.227651
nationality_name_India    -0.189739
                             ...   
potential                  0.644275
passing                    0.654773
mentality_composure        0.708867
movement_reactions         0.871823
overall                    1.000000
Name: corr_coeff, Length: 3573, dtype: float64

 Features with the strongest correlation:
 ['real_face_No' 'real_face_Yes' 'overall' 'potential' 'value_eur'
 'wage_eur' 'age' 'international_reputation' 'release_clause_eur'
 'shooting' 'passing' 'dribbling' 'physic' 'attacking_crossing'
 'attacking_short_passing' 'skill_curve' 'skill_long_passing'
 'skill_ball_control' 'movement_reactions' 'power_shot_power'
 'power_long_shots' 'mentality_aggress

In [34]:
data.shape

(19239, 50)

In [35]:

# Function to load the saved model
def load_model(model_path):
    with open(model_path, 'rb') as f:
        loaded_model = pickle.load(f)
    return loaded_model

# Function to test and evaluate the model
def test_model(data):
    # Separate the target from the features
    Y = data['overall']  # Get the target
    X = data[feature_subset]  # Extract the features

    # Initialize the scaler
    scaler = StandardScaler()

    # Fit the scaler on the training data and transform the training data
    X_scaled = scaler.fit_transform(X)

    # Save the scaler to a pickle file
    with open('scaler.pkl', 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    print('...Scaler saved')

    # Paths to saved model and parameters
    model_path = 'best_xgb_model.pkl'
    params_path = 'best_xgb_params.pkl'

    # Load the model and parameters
    best_xgb_model = load_model(model_path)
    best_params = load_model(params_path)

    # Make predictions using the loaded model
    y_pred = best_xgb_model.predict(X_scaled)

    # Measure performance using Mean Absolute Error (MAE)
    mae = mean_absolute_error(Y, y_pred)
    print(f"XGBoost Model Performance on New Data (MAE): {mae:.4f}")

    # Create a default values for model prediction: Calculate the mean for all features
    default_values = X.mean().to_dict()

    # Save the default values to a JSON file
    with open('default_values.json', 'w') as json_file:
        json.dump(default_values, json_file)
    print('... Default values for features created')


# Test and evaluate the model
test_model(data)


XGBoost Model Performance on New Data (MAE): 0.9897
... Default values for features created
