The purpose of this code is to clean, impute and encode a dataset containing some features for rating a player's performance by FIFA in 2021, use that data to train an Artificial Intelligence model to predict a player's rating, and test that model using the 2022 dataset.

---



In [None]:
# Importing relevant libraries and connecting the file to Google Drive
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Reading the dataset
df = pd.read_csv('/content/drive/My Drive/AI/players_21.csv')

In [None]:
# Displaying the first five rows in the dataset
df.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,103500000.0,560000.0,33,...,52+3,52+3,52+3,62+3,19+3,https://cdn.sofifa.net/players/158/023/21_120.png,https://cdn.sofifa.net/teams/241/60.png,https://cdn.sofifa.net/flags/es.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",92,92,63000000.0,220000.0,35,...,54+3,54+3,54+3,61+3,20+3,https://cdn.sofifa.net/players/020/801/21_120.png,https://cdn.sofifa.net/teams/45/60.png,https://cdn.sofifa.net/flags/it.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
2,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,91,91,111000000.0,240000.0,31,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/21_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,,https://cdn.sofifa.net/flags/pl.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,132000000.0,270000.0,28,...,49+3,49+3,49+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/21_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CAM, CM",91,91,129000000.0,370000.0,29,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/21_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [None]:
# Viewing the description of the data in the dataset
df.describe()

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
count,18944.0,18944.0,18944.0,18707.0,18719.0,18944.0,18944.0,18944.0,18719.0,18719.0,...,18944.0,18944.0,18944.0,18944.0,18944.0,18944.0,18944.0,18944.0,18944.0,2083.0
mean,226232.328917,65.677787,71.086729,2902288.0,9148.482825,25.225823,181.190773,75.016892,47908.905551,1.355468,...,57.978674,46.470017,47.581767,45.546505,16.446052,16.236486,16.103357,16.225982,16.551309,37.203553
std,27166.574284,7.002278,6.109985,7743775.0,19893.149956,4.697354,6.825672,7.05714,53906.733776,0.739015,...,12.11839,20.169591,21.402461,20.953997,17.577332,16.84548,16.519399,17.017341,17.878121,10.714523
min,41.0,47.0,47.0,9000.0,500.0,16.0,155.0,50.0,1.0,1.0,...,12.0,3.0,5.0,4.0,1.0,1.0,1.0,1.0,1.0,12.0
25%,210028.75,61.0,67.0,475000.0,1000.0,21.0,176.0,70.0,461.0,1.0,...,50.0,29.0,27.0,24.0,8.0,8.0,8.0,8.0,8.0,28.0
50%,232301.5,66.0,71.0,975000.0,3000.0,25.0,181.0,75.0,1913.0,1.0,...,59.0,52.0,55.0,52.0,11.0,11.0,11.0,11.0,11.0,37.0
75%,246745.75,70.0,75.0,2000000.0,8000.0,29.0,186.0,80.0,110986.0,1.0,...,66.0,63.0,65.0,63.0,14.0,14.0,14.0,14.0,14.0,45.0
max,258970.0,93.0,95.0,185500000.0,560000.0,53.0,206.0,110.0,114899.0,4.0,...,96.0,94.0,93.0,90.0,90.0,92.0,93.0,91.0,90.0,65.0


In [None]:
# Creating a list of columns that would be unneccessary in the prediction
columns_to_drop=['sofifa_id','player_url','short_name','dob','long_name','club_team_id',
                 'club_position','club_flag_url','club_loaned_from','club_joined',
                 'club_contract_valid_until','nationality_id','nation_flag_url',
                 'nation_logo_url','club_logo_url','player_face_url','nationality_name',
                 'nation_team_id','nation_position','nation_jersey_number','real_face',
                 'ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm',
                 'lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk']

Justification for dropping the listed columns when predicting a player's rating:
**sofifa_id, player_url:** These columns are unique identifiers and contain no inherent information about a player's skill or performance. They are only for administrative purposes and thus irrelevant for predicting ratings.

**short_name, dob, long_name:** A player's name, date of birth, and full name do not directly impact their abilities or performance on the field. Including these personal details in the model would introduce noise and complexity without adding predictive value.

**club_team_id:** Information about a player's club is gotten from their club name. As such, the team ID is not relevant to their rating.

**club_position,club_flag_url, club_loaned_from, club_joined, club_contract_valid_until:** While these columns provide contextual information about a player's team and role, they are not measures of a player's inherent talent or potential. Including them could lead to overfitting based on transient factors.

**nationality_id, nation_flag_url, nation_logo_url, nationality_name, nation_team_id, nation_position, nation_jersey_number:** These columns describe a player's nationality and involvement with national teams, but nationality does not correlate with a player's performance or rating. These are social and administrative attributes, not performance indicators.

**player_face_url, real_face:** The player's appearance or facial features are unrelated to their athletic abilities or rating. Including visual aspects would be irrelevant to predicting their performance.

**Columns like ls, st, rs, lw, lf, cf, rf, rw, lam, cam, ram, lm, lcm, cm, rcm, rm, lwb, ldm, cdm, rdm, rwb, lb, lcb, cb, rcb, rb:**  While these attributes provide detailed information about a player's positions and skills, including all of them could result in overfitting and a more complex model. By selecting the most relevant features, we ensure the model focuses on the attributes that truly influence a player's rating. These features are also mentioned in the player_positions column.

In [None]:
# Dropping the unneccessary columns using the list
df.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
df.head()

Unnamed: 0,player_positions,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_name,league_name,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
0,"RW, ST, CF",93,93,103500000.0,560000.0,33,170,72,FC Barcelona,Spain Primera Division,...,96,32,35,24,6,11,15,14,8,
1,"ST, LW",92,92,63000000.0,220000.0,35,187,83,Juventus,Italian Serie A,...,95,28,32,24,7,11,15,14,11,
2,ST,91,91,111000000.0,240000.0,31,184,80,FC Bayern München,German 1. Bundesliga,...,88,35,42,19,15,6,12,8,10,
3,"LW, CAM",91,91,132000000.0,270000.0,28,175,68,Paris Saint-Germain,French Ligue 1,...,93,35,30,29,9,9,15,15,11,
4,"CAM, CM",91,91,129000000.0,370000.0,29,181,70,Manchester City,English Premier League,...,91,68,65,53,15,13,5,10,13,


In [None]:
# Imputing missing values
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = imputer.fit_transform(df)

In [None]:
# Converting the imputed array back to a DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

In [None]:
# Getting the categorical columns
cat_cols = df_imputed.select_dtypes(include=['object']).columns.tolist()
cat_cols

['player_positions',
 'overall',
 'potential',
 'value_eur',
 'wage_eur',
 'age',
 'height_cm',
 'weight_kg',
 'club_name',
 'league_name',
 'league_level',
 'club_jersey_number',
 'preferred_foot',
 'weak_foot',
 'skill_moves',
 'international_reputation',
 'work_rate',
 'body_type',
 'release_clause_eur',
 'player_tags',
 'player_traits',
 'pace',
 'shooting',
 'passing',
 'dribbling',
 'defending',
 'physic',
 'attacking_crossing',
 'attacking_finishing',
 'attacking_heading_accuracy',
 'attacking_short_passing',
 'attacking_volleys',
 'skill_dribbling',
 'skill_curve',
 'skill_fk_accuracy',
 'skill_long_passing',
 'skill_ball_control',
 'movement_acceleration',
 'movement_sprint_speed',
 'movement_agility',
 'movement_reactions',
 'movement_balance',
 'power_shot_power',
 'power_jumping',
 'power_stamina',
 'power_strength',
 'power_long_shots',
 'mentality_aggression',
 'mentality_interceptions',
 'mentality_positioning',
 'mentality_vision',
 'mentality_penalties',
 'mentality_co

In [None]:
# Encoding the object columns using the LabelEncoder
label_encoder = LabelEncoder()
for col in cat_cols:
    df_imputed[col] = label_encoder.fit_transform(df_imputed[col])

In [None]:
# Ensuring that the columns of type object are now of type int or float
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 62 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   player_positions             18944 non-null  int64
 1   overall                      18944 non-null  int64
 2   potential                    18944 non-null  int64
 3   value_eur                    18944 non-null  int64
 4   wage_eur                     18944 non-null  int64
 5   age                          18944 non-null  int64
 6   height_cm                    18944 non-null  int64
 7   weight_kg                    18944 non-null  int64
 8   club_name                    18944 non-null  int64
 9   league_name                  18944 non-null  int64
 10  league_level                 18944 non-null  int64
 11  club_jersey_number           18944 non-null  int64
 12  preferred_foot               18944 non-null  int64
 13  weak_foot                    18944 non-null  i

In [None]:
# Splitting the data into features (X) and target variable (y)
X = df_imputed.drop(columns=['overall'])
y = df_imputed['overall']

In [None]:
# Feature selection of features with a rating greater than 0.4
corr_matrix = df_imputed.corr()
corr_overall_rating = abs(corr_matrix['overall'])
relevant_features = corr_overall_rating[corr_overall_rating > 0.4]
relevant_features.drop(['overall'], inplace=True)
X = df_imputed[relevant_features.index]
y = df_imputed['overall']

In [None]:
# Scaling the data
scaler = StandardScaler()
scaler.fit_transform(X)

array([[ 3.58669086,  6.08040974,  7.10223075, ..., -0.66785648,
         2.92728918,  3.1375727 ],
       [ 3.4230146 ,  4.9724    ,  6.37315112, ...,  0.4366283 ,
         2.05286089,  3.05505131],
       [ 3.25933834,  6.21471395,  6.49466439, ...,  1.4829823 ,
         1.83425382,  2.47740157],
       ...,
       [-3.94241716, -1.70923447, -0.31007886, ...,  0.08784363,
        -0.71616203, -1.89623213],
       [-0.66889193, -1.40705   , -0.97840185, ..., -1.30729503,
        -0.35181691, -1.07101823],
       [-0.66889193, -1.40705   , -0.97840185, ..., -0.90037959,
        -0.49755496, -1.64866796]])

In [None]:
# Saving relevant features to a CSV file
relevant_features.to_csv('relevant_features.csv', index=False)

In [None]:
print(relevant_features.sort_values(ascending=False))

movement_reactions          0.867267
value_eur                   0.866719
release_clause_eur          0.840014
mentality_composure         0.705252
wage_eur                    0.695656
passing                     0.654135
potential                   0.636357
dribbling                   0.595440
power_shot_power            0.558361
mentality_vision            0.509067
attacking_short_passing     0.502234
skill_long_passing          0.487135
physic                      0.486902
age                         0.468528
skill_ball_control          0.449380
shooting                    0.442689
international_reputation    0.440987
skill_curve                 0.420495
attacking_crossing          0.410530
power_long_shots            0.407525
mentality_aggression        0.401920
Name: overall, dtype: float64


In [None]:
# Creating an XGBoostRegressor model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBRegressor(random_state=42)

# Defining the number of splits for K-Fold cross-validation
n_splits = 5

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store the results
mse_scores = []
mae_scores = []
r2_scores = []

for train_index, test_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_val_fold)

    mse = mean_squared_error(y_val_fold, y_pred)
    mae = mean_absolute_error(y_val_fold, y_pred)
    r2 = r2_score(y_val_fold, y_pred)

    mse_scores.append(mse)
    mae_scores.append(mae)
    r2_scores.append(r2)

print("XGBoost Regressor Cross-validation Mean Squared Error:", np.mean(mse_scores))
print("XGBoost Regressor Cross-validation Mean Absolute Error:", np.mean(mae_scores))
print("XGBoost Regressor Cross-validation R-squared score:", np.mean(r2_scores))

XGBoost Regressor Cross-validation Mean Squared Error: 0.295956954759664
XGBoost Regressor Cross-validation Mean Absolute Error: 0.3725062053850955
XGBoost Regressor Cross-validation R-squared score: 0.9940170225288443


**The below is Random Forest Regressor that is implemented using the RandomForestRegressor class from the sklearn.ensemble module. It creates an ensemble of decision trees and uses bagging to improve the overall performance.**

In [None]:
# Creating a RandomForestRegressor model
model = RandomForestRegressor(random_state=42)

# Defining the number of splits for K-Fold cross-validation
n_splits = 5  # You can adjust this as needed

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store the results
mse_scores = []
mae_scores = []
r2_scores = []

for train_index, test_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_val_fold)

    mse = mean_squared_error(y_val_fold, y_pred)
    mae = mean_absolute_error(y_val_fold, y_pred)
    r2 = r2_score(y_val_fold, y_pred)

    mse_scores.append(mse)
    mae_scores.append(mae)
    r2_scores.append(r2)

print("Random Forest Regressor Cross-validation Mean Squared Error:", np.mean(mse_scores))
print("Random Forest Regressor Cross-validation Mean Absolute Error:", np.mean(mae_scores))
print("Random Forest Regressor Cross-validation R-squared score:", np.mean(r2_scores))

Random Forest Regressor Cross-validation Mean Squared Error: 0.2489061497855493
Random Forest Regressor Cross-validation Mean Absolute Error: 0.2829119102606401
Random Forest Regressor Cross-validation R-squared score: 0.9949674415498315


In [None]:
# Creating a GradientBoostingRegressor model
model = GradientBoostingRegressor(random_state=42)

# Defining the number of splits for K-Fold cross-validation
n_splits = 5  # You can adjust this as needed

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Lists to store the results
mse_scores = []
mae_scores = []
r2_scores = []

for train_index, test_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model.fit(X_train_fold, y_train_fold)
    y_pred = model.predict(X_val_fold)

    mse = mean_squared_error(y_val_fold, y_pred)
    mae = mean_absolute_error(y_val_fold, y_pred)
    r2 = r2_score(y_val_fold, y_pred)

    mse_scores.append(mse)
    mae_scores.append(mae)
    r2_scores.append(r2)

print("Gradient Boosting Regressor Cross-validation Mean Squared Error:", np.mean(mse_scores))
print("Gradient Boosting Regressor Cross-validation Mean Absolute Error:", np.mean(mae_scores))
print("Gradient Boosting Regressor Cross-validation R-squared score:", np.mean(r2_scores))

Gradient Boosting Regressor Cross-validation Mean Squared Error: 0.6620608168995138
Gradient Boosting Regressor Cross-validation Mean Absolute Error: 0.5950105568660646
Gradient Boosting Regressor Cross-validation R-squared score: 0.9866069924906812


In [None]:
# Creating the AdaBoostRegressor with a base estimator (i.e. DecisionTreeRegressor)
dt = DecisionTreeRegressor(random_state=42)
ab = AdaBoostRegressor(estimator=dt, random_state=42)

cv = KFold(n_splits=2)

# Defining the hyperparameter grid to search
PARAMETERS = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
}
# Creating the GridSearchCV object
model_gs = GridSearchCV(ab, param_grid=PARAMETERS, cv=cv, scoring="neg_mean_squared_error")
model_gs.fit(X_train, y_train)

y_pred = model_gs.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Grid Search Mean Squared Error:', mse)
print('Grid Search Regressor Mean Absolute Error:', mae)
print('Grid Search Regressor R-squared score:', r2)

# Getting the best parameters and best estimator
best_params = model_gs.best_params_
best_regressor = model_gs.best_estimator_

# Using the best regressor for predictions
y_pred = best_regressor.predict(X_test)


# Save the trained model
joblib.dump(best_regressor, 'trained_model.pkl')

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

Grid Search Mean Squared Error: 0.21060965954077593
Grid Search Regressor Mean Absolute Error: 0.1572974399577725
Grid Search Regressor R-squared score: 0.9955317752177252


['scaler.pkl']

We chose the Ada Boost Regressor with Grid Search CV as the best model becuase of the following:

Mean Squared Error (MSE): The MSE measures the average squared difference between the predicted and actual target values. In our case, the MSE is relatively low (approximately 0.21), which means that, on average, the model's predictions are very close to the actual values. Lower MSE values indicate better predictive accuracy.

Mean Absolute Error (MAE): The MAE measures the average absolute difference between predicted and actual values. A low MAE (around 0.157) indicates that the model's predictions are, on average, close to the actual values. MAE is a more robust metric than MSE to the influence of outliers.

R-squared (R2) score: The R2 score measures the proportion of the variance in the target variable that is predictable from the features. An R2 score close to 1 (0.996 in our case) indicates that the model explains a large portion of the variance in the target variable. In other words, the Ada Boost model captures most of the patterns and relationships within the data, making it a strong predictor.

**The below is Gradient Boosting Regressor that is implemented using the GradientBoostingRegressor class, which creates an ensemble of decision trees and uses boosting to iteratively improve the model's performance.**

In [None]:
# Reading the 2022 dataset for testing the model
df_new_season = pd.read_csv('/content/drive/My Drive/AI/players_22.csv')

  df_new_season = pd.read_csv('/content/drive/My Drive/AI/players_22.csv')


In [None]:
df_new_season.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [None]:
# Creating a list of columns that would be unneccessary in the prediction
columns_to_drop_2=['sofifa_id','player_url','short_name','dob','long_name','club_team_id','club_name',
                 'league_name','league_level','club_position','club_jersey_number','club_flag_url',
                 'club_loaned_from','club_joined','club_contract_valid_until','nationality_id',
                 'nation_flag_url','nation_logo_url','club_logo_url','player_face_url','nationality_name',
                 'nation_team_id','nation_position','nation_jersey_number','real_face','player_tags',
                 'ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm',
                 'lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk']

In [None]:
# Dropping the unneccessary columns using the list
df_new_season.drop(columns_to_drop_2, axis=1, inplace=True)

In [None]:
# Imputing the new dataset
imputer = SimpleImputer(strategy='most_frequent')
df_imputed_new_season = imputer.fit_transform(df_new_season)

In [None]:
# Converting the imputed array back to a DataFrame
df_imputed_new_season = pd.DataFrame(df_imputed, columns=df.columns)

In [None]:
# Getting the categorical columns
cat_cols_2 = df_new_season.select_dtypes(include=['object']).columns.tolist()
cat_cols_2

['player_positions',
 'preferred_foot',
 'work_rate',
 'body_type',
 'player_traits']

In [None]:
# Label encoding the categorical columns in the new dataset
label_encoder = LabelEncoder()
for col in cat_cols_2:
    df_imputed_new_season[col] = label_encoder.fit_transform(df_imputed_new_season[col])

In [None]:
# Selecting the independent and dependent variables
X_new_season = df_imputed_new_season[relevant_features.index]
y_new_season = df_imputed_new_season['overall']

In [None]:
# Using the trained model to make predictions on the new season's data
y_pred_new_season = model_gs.predict(X_new_season)

In [None]:
# Saving the y_test and y_pred values in a CSV file
data = {'y_test': y_new_season, 'y_pred': y_pred_new_season}
new_df = pd.DataFrame(data)
new_df.to_csv('y_test_and_y_pred.csv', index=False)

In [None]:
# Evaluating the model's performance on the new data
mse_new_season = mean_squared_error(y_new_season, y_pred_new_season)
mae_new_season = mean_absolute_error(y_new_season, y_pred_new_season)
r2_new_season = r2_score(y_new_season, y_pred_new_season)
print('Mean Squared Error (New Season):', mse_new_season)
print('Mean Absolute Error (New Season):', mae_new_season)
print('R-squared score (New Season):', r2_new_season)

Mean Squared Error (New Season): 0.04212415540540541
Mean Absolute Error (New Season): 0.03146114864864865
R-squared score (New Season): 0.9991408372455347
