In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle

**Load the data**

In [2]:
fifa_data = pd.read_csv('male_players (legacy).csv', low_memory=False)

In [3]:
fifa_data.head()

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,158023,/player/158023/lionel-messi/150002,15,2,2014-09-18,L. Messi,Lionel Andrés Messi Cuccittini,CF,93.0,95.0,...,62+3,62+3,62+3,54+3,45+3,45+3,45+3,54+3,15+3,https://cdn.sofifa.net/players/158/023/15_120.png
1,20801,/player/20801/c-ronaldo-dos-santos-aveiro/150002,15,2,2014-09-18,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"LW, LM",92.0,92.0,...,63+3,63+3,63+3,57+3,52+3,52+3,52+3,57+3,16+3,https://cdn.sofifa.net/players/020/801/15_120.png
2,9014,/player/9014/arjen-robben/150002,15,2,2014-09-18,A. Robben,Arjen Robben,"RM, LM, RW",90.0,90.0,...,64+3,64+3,64+3,55+3,46+3,46+3,46+3,55+3,14+3,https://cdn.sofifa.net/players/009/014/15_120.png
3,41236,/player/41236/zlatan-ibrahimovic/150002,15,2,2014-09-18,Z. Ibrahimović,Zlatan Ibrahimović,ST,90.0,90.0,...,65+3,65+3,61+3,56+3,55+3,55+3,55+3,56+3,17+3,https://cdn.sofifa.net/players/041/236/15_120.png
4,167495,/player/167495/manuel-neuer/150002,15,2,2014-09-18,M. Neuer,Manuel Peter Neuer,GK,90.0,90.0,...,40+3,40+3,36+3,36+3,38+3,38+3,38+3,36+3,87+3,https://cdn.sofifa.net/players/167/495/15_120.png


In [4]:
fifa_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5893 entries, 0 to 5892
Columns: 110 entries, player_id to player_face_url
dtypes: float64(60), int64(3), object(47)
memory usage: 4.9+ MB


**Drop irrelevant columns**

In [5]:
irrelevant_cols = [
    'player_id', 'player_url', 'fifa_version', 'fifa_update', 'fifa_update_date',
    'short_name', 'long_name', 'player_face_url'
]
fifa_data.drop(columns=irrelevant_cols, inplace=True)

**Drop columns with more than 40% missing values**

In [6]:
threshold = 0.4
columns_to_keep = fifa_data.columns[fifa_data.isnull().mean() < threshold]
fifa_data = fifa_data[columns_to_keep]

In [7]:
print(fifa_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5893 entries, 0 to 5892
Data columns (total 94 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   player_positions                5892 non-null   object 
 1   overall                         5892 non-null   float64
 2   potential                       5892 non-null   float64
 3   value_eur                       5730 non-null   float64
 4   wage_eur                        5761 non-null   float64
 5   age                             5892 non-null   float64
 6   dob                             5892 non-null   object 
 7   height_cm                       5892 non-null   float64
 8   weight_kg                       5892 non-null   float64
 9   league_id                       5761 non-null   float64
 10  league_name                     5761 non-null   object 
 11  league_level                    5758 non-null   float64
 12  club_team_id                    57

**Separate numeric and non-numeric features**

In [8]:
numeric_data = fifa_data.select_dtypes(include=np.number)
non_numeric = fifa_data.select_dtypes(include=['object'])

**Multivariate Imputation**

In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
numeric_data = pd.DataFrame(np.round(imp.fit_transform(numeric_data)), columns=numeric_data.columns)

**Deal with non-numeric data**

In [10]:
non_numeric = pd.get_dummies(non_numeric).astype(int)

**Combine numeric and non-numeric data**

In [11]:
X = pd.concat([numeric_data, non_numeric], axis=1)

**Separate target variable**

In [12]:
y = X['overall']
X = X.drop('overall', axis=1)

In [13]:

column_names = X.columns

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns=column_names)

**Split data into train and test sets**

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train = X_train.loc[:, (X_train != X_train.iloc[0]).any()]
X_test = X_test[X_train.columns]

**Feature Importance**

In [16]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [17]:
feature_importances = rf.feature_importances_
feature_names = X_train.columns

assert len(feature_importances) == len(feature_names), "Mismatch between features and importances"

importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values('importance', ascending=False)

print("Best features:")
print(importance_df.head(20))

top_features = importance_df['feature'].head(20).tolist()

available_top_features = list(set(top_features) & set(fifa_data.columns))

excluded_features = set(top_features) - set(available_top_features)
print("\nExcluded features:", excluded_features)

X_train = X_train[available_top_features]
X_test = X_test[available_top_features]

fifa_data_filtered = fifa_data[available_top_features + ['overall']]

print("\nFinal features used:")
print(available_top_features)

Best features:
                          feature  importance
1                       value_eur    0.811187
2                        wage_eur    0.117989
0                       potential    0.033816
3                             age    0.019503
19                      defending    0.000977
47      defending_standing_tackle    0.000726
42        mentality_interceptions    0.000684
48       defending_sliding_tackle    0.000550
14       international_reputation    0.000497
34             movement_reactions    0.000329
46    defending_marking_awareness    0.000288
22            attacking_finishing    0.000261
9588                        rb_75    0.000213
6                       league_id    0.000197
30             skill_ball_control    0.000161
29             skill_long_passing    0.000160
25              attacking_volleys    0.000150
8                    club_team_id    0.000141
9663                        gk_76    0.000139
21             attacking_crossing    0.000131

Excluded features:

**Define classification models**

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, auc

**Initialize the models**

In [19]:
dt = DecisionTreeClassifier()
knn = KNeighborsClassifier()
sv = SVC(probability=True)
nb = GaussianNB()
LR = LogisticRegression()

**Train and evaluate models**

In [20]:
for model in (dt, knn, sv, nb, LR):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.__class__.__name__)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

DecisionTreeClassifier
[[ 68   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  3 157   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  1   2 105  24   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   1  34  85   0   1   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0  89  26   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   1   1  36  46   1   1   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   1   0   1  69  10   1   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   1   0  12  61   1   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   0   2   0  58  21   2   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   1  19  32   0   1   0   0   0   0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVC
[[  0  68   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0 154   8   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0  40  92   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0  28  92   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   7  17   0  90   0   1   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   3  11   0  68   0   2   0   2   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   2   6   0  14   0  54   0   6   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   1   4   0  16   0  52   0   2   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   3   0   0   5   0  29   0  45   0   1   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   0   0  17   0  36   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   0   0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegression
[[ 1  0  0 23  1  0  2  6 41  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0 64  4  0  2 53 37  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0 57  1  0  1 13 59  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0 40  2  0  2 31 44  0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 1  0  0 33  2  0  0 50 29  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 24  5  0  1 39 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  3 15  2  0  0  8 54  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  8  9  0  1 45 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  3 16  0  0 23 40  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  1  6  0  0 16 29  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  3  0  0 16 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  0  4  0  0 14 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  0  4  0  0  3 24  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  2  0  0  7 14  0  0  0  0  0  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**hard voting**


In [21]:
from sklearn.ensemble import VotingClassifier
voting = VotingClassifier(estimators=[('dt', dt), ('knn', knn), ('sv', sv), ('nb', nb), ('LR', LR)], voting='hard')
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print(voting.__class__.__name__)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier
[[ 48  20   5   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  9 149   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  6  28  93   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  2  23  77  18   1   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  1   5   2   5  95   4   1   2   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   3   5   2  63   9   2   0   2   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   2   6   0  10   1  59   2   2   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   1   9   0  16  48   1   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  1   2   0   0   2   0  13  14  49   1   1   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   0   0   4   8  39   2   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**soft voting**

In [22]:
soft_voting = VotingClassifier(estimators=[('dt', dt), ('knn', knn), ('sv', sv), ('nb', nb), ('LR', LR)], voting='soft')
soft_voting.fit(X_train, y_train)
y_pred = soft_voting.predict(X_test)
print(soft_voting.__class__.__name__)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier
[[ 61  11   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  2 158   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  4  13  98  17   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  1   4  40  75   0   1   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  1   3   3   1  85  21   1   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   2   2   2  34  43   0   0   2   1   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   3   0   3   2  72   2   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   3   0   9  60   3   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   1   0   0   0   0   7   6  52  16   1   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   0   0   1   5  21  26   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0

**RandomForest classifier**

In [23]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(rf.__class__.__name__)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

RandomForestClassifier
[[ 59  15   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  1 160   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   5 116  11   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   2  42  76   1   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   2  92  21   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   1   3  33  44   4   1   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   1   4  77   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   0   0   7  66   2   0   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   0   1   6   6  58  11   1   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0   0   0   0   0   0   1   2  18  32   0   0   0   0   0   0   0   0
    0   0   0   0   0]
 [  0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Boosting**

In [24]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=100)
gb = GradientBoostingClassifier(n_estimators=100)

In [25]:
for model in (ada, gb):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.__class__.__name__)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

AdaBoostClassifier
[[ 68   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  3 156   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  1   2 108  21   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   1  33  86   0   1   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   2  88  25   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   2  35  47   1   1   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   1   0   0  76   3   1   1   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   1   0  10  63   1   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   1  59  20   2   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   2  18  33   0   0   0   0   0   0   0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GradientBoostingClassifier
[[ 65   9   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  2 160   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   3 111  18   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   2  34  85   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0  91  24   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   2  23  58   0   3   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   3  78   1   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   2   2   4  67   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   1  69  11   1   0   0   0   0   0   0   0
    0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   1  22  29   0   0   0   0   0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**selecting the best model**

In [26]:
models = {
    'DecisionTree': dt,
    'KNeighbors': knn,
    'SVC': sv,
    'GaussianNB': nb,
    'LogisticRegression': LR,
    'Voting (Hard)': voting,
    'Voting (Soft)': soft_voting,
    'RandomForest': rf,
    'AdaBoost': ada,
    'GradientBoosting': gb
}

In [27]:
best_model_name = None
best_model_score = 0

for name, model in models.items():
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    if score > best_model_score:
        best_model_score = score
        best_model_name = name

print(f"The best model is {best_model_name} with an accuracy of {best_model_score:.2f}")

The best model is GradientBoosting with an accuracy of 0.78


In [38]:
scaler = StandardScaler()
scaler.fit(X_train)

In [39]:
model_data = {
    'model': model,
    'feature_names': X.columns.tolist()
}
with open('best_model.pkl', 'wb') as file:
    pickle.dump(model_data, file)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

**Testing the model using the function**

In [36]:
new_data = pd.read_csv('players_22.csv', low_memory=False, nrows=10000)


In [37]:
def preprocess_data(data, train_columns, encoder, scaler=None):
    cat_columns = data.select_dtypes(include=['object']).columns
    encoded_cols = encoder.transform(data[cat_columns])
    encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cat_columns))

    data = data.drop(columns=cat_columns)
    data = pd.concat([data, encoded_df], axis=1)

    for col in train_columns:
        if col not in data.columns:
            data[col] = 0

    data = data[train_columns]

    data = data.fillna(0)

    if scaler:
        data = pd.DataFrame(scaler.transform(data), columns=data.columns)

    return data


In [None]:
def test_model_in_batches(model, X_new, y_new, train_columns, encoder, scaler=None, batch_size=1000):
    num_batches = len(X_new) // batch_size + 1
    all_predictions = []

    for i in range(num_batches):
        start = i * batch_size
        end = (i + 1) * batch_size
        X_batch = X_new[start:end]
        y_batch = y_new[start:end]

        X_batch = preprocess_data(X_batch, train_columns, encoder, scaler)

        # Make predictions
        y_pred_batch = model.predict(X_batch)
        all_predictions.extend(y_pred_batch)


    mse = mean_squared_error(y_new, all_predictions)
    r2 = r2_score(y_new, all_predictions)

    return {
        'mean_squared_error': mse,
        'r2_score': r2
    }

with open('best_model.pkl', 'rb') as file:
    model_data = pickle.load(file)

best_model = model_data['model']
train_columns = model_data['feature_names']
encoder = model_data['encoder']
scaler = model_data.get('scaler')

print("Model, encoder, and scaler loaded successfully.")

new_data = pd.read_csv('players_22.csv', low_memory=False)

y_new = new_data['overall']
X_new = new_data.drop('overall', axis=1)

results = test_model_in_batches(best_model, X_new, y_new, train_columns, encoder, scaler=scaler)

print(f"Mean Squared Error on new data: {results['mean_squared_error']:.2f}")
print(f"R2 Score on new data: {results['r2_score']:.2f}")