In [1]:
import pandas as pd

# Load the dataset to examine its structure and prepare for preprocessing
data = pd.read_csv('Final.csv')

# Display basic info and first few rows to understand the dataset's structure
data_info = data.info()
data_head = data.head()

data_info, data_head


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 55 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Player               1440 non-null   object 
 1   Nation               1440 non-null   object 
 2   Pos                  1440 non-null   object 
 3   Squad                1440 non-null   object 
 4   Age                  1440 non-null   float64
 5   90s                  1440 non-null   float64
 6   MP                   1440 non-null   int64  
 7   Gls                  1440 non-null   float64
 8   Ast                  1440 non-null   float64
 9   Sh                   1440 non-null   float64
 10  SoT                  1440 non-null   float64
 11  xG                   1440 non-null   float64
 12  npxG                 1440 non-null   float64
 13  xA                   1440 non-null   float64
 14  SCA                  1440 non-null   float64
 15  GCA                  1440 non-null   f

(None,
               Player Nation    Pos           Squad   Age   90s  MP  Gls  Ast  \
 0    Aaron Cresswell    ENG  DF,FW        West Ham  33.0   4.8  11  0.0  0.0   
 1  Aaron Wan-Bissaka    ENG     DF  Manchester Utd  25.0  19.8  22  0.0  2.0   
 2       Aarón Martín    ESP     DF           Genoa  26.0  15.3  22  0.0  1.0   
 3       Abakar Sylla    CIV     DF      Strasbourg  20.0  19.9  22  2.0  0.0   
 4        Abdel Abqar    MAR     DF          Alavés  24.0  25.7  27  0.0  0.0   
 
      Sh  ...  Carries  TotDist_possession  PrgDist_possession  PrgC  \
 0   0.0  ...    169.0               522.0               220.0   4.0   
 1   3.0  ...    594.0              2909.0              1429.0  30.0   
 2   5.0  ...    367.0              1839.0               972.0  28.0   
 3  11.0  ...   1060.0              5999.0              3215.0   8.0   
 4  10.0  ...    492.0              2460.0              1418.0   7.0   
 
    1/3_possession  CPA   Mis  Dis    Rec  PrgR  
 0             5.0  0

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Drop irrelevant columns
data_cleaned = data.drop(columns=['Player', 'Nation', 'Pos', 'Squad'])

# Handle missing values by filling with median values
data_cleaned['Succ%'].fillna(data_cleaned['Succ%'].median(), inplace=True)
data_cleaned['Tkld%'].fillna(data_cleaned['Tkld%'].median(), inplace=True)

# Separate features and target variable
X = data_cleaned.drop(columns=['market_value_in_eur'])
y = data_cleaned['market_value_in_eur']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the shape of the processed datasets
X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['Succ%'].fillna(data_cleaned['Succ%'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['Tkld%'].fillna(data_cleaned['Tkld%'].median(), inplace=True)


((1152, 50), (288, 50), (1152,), (288,))

In [4]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import random

# Jellyfish Search Optimizer (JSO) Parameters
population_size = 20
iterations = 50
search_space = {
    'max_depth': (3, 20),  # Range for max depth of the tree
    'min_samples_split': (2, 20)  # Range for min samples split
}

# Initialize population with random individuals within the search space
def initialize_population():
    population = []
    for _ in range(population_size):
        individual = {
            'max_depth': random.randint(search_space['max_depth'][0], search_space['max_depth'][1]),
            'min_samples_split': random.randint(search_space['min_samples_split'][0], search_space['min_samples_split'][1])
        }
        population.append(individual)
    return population

# Objective function: Mean Squared Error (MSE)
def objective_function(individual):
    model = DecisionTreeRegressor(max_depth=individual['max_depth'], min_samples_split=individual['min_samples_split'])
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predictions)
    return mse

# Update individual position based on JSO algorithm rules
def update_position(individual, best_individual):
    # Random perturbations around the best solution
    for param in individual.keys():
        if random.random() < 0.5:
            # Move towards the best solution
            individual[param] = int(best_individual[param] + random.uniform(-1, 1) * (search_space[param][1] - search_space[param][0]) / 2)
        else:
            # Random movement in the search space
            individual[param] = random.randint(search_space[param][0], search_space[param][1])
        # Clip the values to ensure they stay within the defined search space
        individual[param] = max(search_space[param][0], min(individual[param], search_space[param][1]))
    return individual

# Main JSO Optimization Loop
def jellyfish_search_optimizer():
    population = initialize_population()
    best_individual = min(population, key=objective_function)  # Best solution based on objective function

    for _ in range(iterations):
        for i in range(population_size):
            # Update each individual towards the best individual found so far
            population[i] = update_position(population[i], best_individual)

            # Evaluate and update the best individual
            if objective_function(population[i]) < objective_function(best_individual):
                best_individual = population[i]

    return best_individual

# Find optimal hyperparameters using JSO
best_hyperparameters = jellyfish_search_optimizer()
print("Best Hyperparameters:", best_hyperparameters)

# Train the optimized Decision Tree model
optimized_model = DecisionTreeRegressor(max_depth=best_hyperparameters['max_depth'],
                                        min_samples_split=best_hyperparameters['min_samples_split'])
optimized_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = optimized_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Best Hyperparameters: {'max_depth': 4, 'min_samples_split': 20}
Mean Squared Error: 224784100995955.88
R-squared: 0.3110472909313584


In [5]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import random

# Honey Badger Algorithm (HBA) Parameters
population_size = 20
iterations = 50
search_space = {
    'max_depth': (3, 20),  # Range for max depth of the tree
    'min_samples_split': (2, 20)  # Range for min samples split
}

# Initialize population with random individuals within the search space
def initialize_population():
    population = []
    for _ in range(population_size):
        individual = {
            'max_depth': random.randint(search_space['max_depth'][0], search_space['max_depth'][1]),
            'min_samples_split': random.randint(search_space['min_samples_split'][0], search_space['min_samples_split'][1])
        }
        population.append(individual)
    return population

# Objective function: Mean Squared Error (MSE)
def objective_function(individual):
    model = DecisionTreeRegressor(max_depth=individual['max_depth'], min_samples_split=individual['min_samples_split'])
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predictions)
    return mse

# Update individual position based on HBA algorithm rules
def update_position(individual, best_individual, iteration, max_iterations):
    for param in individual.keys():
        # Honey phase: Move closer to the best solution found
        if random.random() < 0.5:
            step = (best_individual[param] - individual[param]) * np.exp(-iteration / max_iterations)
            individual[param] = int(individual[param] + step)
        else:
            # Digging phase: Random movement in search space
            individual[param] = random.randint(search_space[param][0], search_space[param][1])
        
        # Clip values to ensure they stay within the defined search space
        individual[param] = max(search_space[param][0], min(individual[param], search_space[param][1]))
    return individual

# Main HBA Optimization Loop
def honey_badger_optimizer():
    population = initialize_population()
    best_individual = min(population, key=objective_function)  # Initial best solution

    for iteration in range(iterations):
        for i in range(population_size):
            # Update each individual based on Honey Badger algorithm
            population[i] = update_position(population[i], best_individual, iteration, iterations)

            # Update best solution if a better one is found
            if objective_function(population[i]) < objective_function(best_individual):
                best_individual = population[i]

    return best_individual

# Find optimal hyperparameters using HBA
best_hyperparameters = honey_badger_optimizer()
print("Best Hyperparameters:", best_hyperparameters)

# Train the optimized Decision Tree model
optimized_model = DecisionTreeRegressor(max_depth=best_hyperparameters['max_depth'],
                                        min_samples_split=best_hyperparameters['min_samples_split'])
optimized_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = optimized_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Best Hyperparameters: {'max_depth': 4, 'min_samples_split': 16}
Mean Squared Error: 222082364139423.12
R-squared: 0.3193279874675087
