In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import logging
import numpy as np

# Configure the root logger: emit records at INFO level and above,
# each formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Read the dataset from the specified file path and log success or error
def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None if reading failed for any reason
        (the failure is logged at ERROR level rather than raised).
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller decide what to do
        logging.error(f"Error loading dataset {file_path}: {err}")
        return None

    logging.info(f"Dataset {file_path} loaded successfully.")
    return frame

# Copy the original data, drop unnecessary columns, apply One-Hot Encoding, align columns, apply label encoding, handle missing values, and return the modified dataset
def preprocess_data(data, age_column_name, all_columns=None):
    """Encode and clean a raw player dataset so it can be fed to the models.

    Steps, in order: drop columns that are not used, one-hot encode the
    nominal columns, optionally align the columns to a reference layout,
    label-encode the ordinal/binary columns, and replace remaining NaNs
    with -1.

    Args:
        data: Raw player DataFrame. It is not modified; a copy is processed.
        age_column_name: Accepted for interface compatibility; not used by
            this function.
        all_columns: Optional list of column names (typically the columns of
            the preprocessed training data). When given, the result contains
            exactly these columns in this order; columns absent from `data`
            are created and filled with 0, columns not listed are discarded.

    Returns:
        A tuple `(preprocessed_data, label_encoders)` where `label_encoders`
        maps each label-encoded column name to its fitted LabelEncoder.
    """
    # Work on a copy so the caller's dataframe is never modified
    data = data.copy()

    # Drop columns that are not required for the analysis; a dataset that
    # simply lacks one of them needs no special handling
    data = data.drop(columns=['position', 'based_in', 'birth_region', 'nationality'], errors='ignore')

    # Nominal categorical columns that will be one-hot encoded
    nominal_columns = ['nation_of_birth', 'group', 'second_nationality', 'club', 'birth_city']
    present_nominal = [col for col in nominal_columns if col in data.columns]
    absent_nominal = [col for col in nominal_columns if col not in data.columns]
    if absent_nominal:
        logging.warning(f"Nominal columns not found and therefore not encoded: {absent_nominal}")

    # Apply One-Hot Encoding to the nominal categorical variables that exist
    if present_nominal:
        data = pd.get_dummies(data, columns=present_nominal)

    # Log the new columns after one-hot encoding for reference
    logging.info(f"Columns after one-hot encoding: {data.columns.tolist()}")

    # Align to the reference layout in a single operation: missing columns are
    # created with 0, extra columns are dropped, and the order follows
    # all_columns. Doing this with one reindex avoids inserting hundreds of
    # dummy columns one at a time.
    if all_columns is not None:
        data = data.reindex(columns=all_columns, fill_value=0)

    # Label encoders used for the ordinal and binary columns, keyed by column
    label_encoders = {}

    # Ordinal or binary categorical columns that will be label encoded
    ordinal_columns = ['division', 'division_tier', 'is_top_4_tier']

    # NOTE(review): a fresh encoder is fitted on every call, so the integer
    # codes produced for two different datasets only agree when both contain
    # the same set of values in these columns — confirm this holds for the
    # training and test data, or reuse the training encoders.
    for col in ordinal_columns:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].astype(str))
            label_encoders[col] = le  # Kept for potential inverse transformations

    # Replace any remaining NaNs with the placeholder value -1
    data = data.fillna(-1)

    # Return the preprocessed data along with the label encoders used
    return data, label_encoders


# Filter goalkeepers out of the dataset as they are not relevant to this particular investigation
def filter_goalkeepers(data):
    """Remove goalkeeper rows from a one-hot encoded player dataset.

    Args:
        data: DataFrame that is expected to contain the one-hot column
            'group_Goalkeeper'.

    Returns:
        The rows whose 'group_Goalkeeper' value is not 1. If the column is
        absent, a warning is logged and `data` is returned unchanged.
    """
    goalkeeper_flag = 'group_Goalkeeper'

    # Without the one-hot column there is nothing to filter on
    if goalkeeper_flag not in data.columns:
        logging.warning("Goalkeeper identification column 'group_Goalkeeper' not found. Check the encoding step.")
        return data

    logging.info(f"Before filtering, data shape: {data.shape}")
    keep_mask = data[goalkeeper_flag] != 1  # True for every non-goalkeeper row
    data = data[keep_mask]
    logging.info(f"After filtering goalkeepers, data shape: {data.shape}")
    return data

# Drop the columns that won't be used in the model and extract the target variables
def prepare_features_and_targets(data, age_column_name):
    """Split a preprocessed dataset into model features and the three targets.

    Args:
        data: Preprocessed DataFrame containing at least the columns 'goals',
            'appearances', 'tier_quality', 'name' and 'date_of_birth'.
        age_column_name: Accepted for interface compatibility; not used by
            this function.

    Returns:
        A tuple `(X, y_goals, y_appearances, y_tier_quality)`: the feature
        frame without the target and identifier columns, goals and
        appearances cast to int, and tier quality as stored.
    """
    # Targets and identifier columns must not leak into the feature matrix
    non_feature_columns = ['goals', 'appearances', 'tier_quality', 'name', 'date_of_birth']
    X = data.drop(columns=non_feature_columns)

    # Count targets are cast to integers; tier quality is passed through as-is
    y_goals = data['goals'].astype(int)
    y_appearances = data['appearances'].astype(int)
    y_tier_quality = data['tier_quality']

    return X, y_goals, y_appearances, y_tier_quality


# Set the hyperparameters based on what returns the best results through testing different parameters and RandomizedSearchCV
def tune_hyperparameters(X_train, y_train, is_classifier=False):
    """Run a randomized hyperparameter search for a random forest.

    Args:
        X_train: Training features.
        y_train: Training target.
        is_classifier: When True a RandomForestClassifier is tuned and scored
            by accuracy; otherwise a RandomForestRegressor is tuned and
            scored by negative mean squared error.

    Returns:
        The `best_estimator_` of the fitted RandomizedSearchCV.
    """
    # Candidate values sampled by the search
    search_space = {
        'n_estimators': np.arange(100, 1100, 100),  # 100 to 1000 trees in steps of 100
        'max_depth': [None] + list(np.arange(10, 51, 10)),  # Unrestricted (None) or 10 to 50
        'min_samples_split': [2, 5, 10, 15, 20],  # Larger values restrain overfitting
        'min_samples_leaf': [1, 2, 5, 10]  # Trades complexity against generalisation
    }

    # Classification and regression differ only in estimator class and metric
    estimator_class = RandomForestClassifier if is_classifier else RandomForestRegressor
    scoring = 'accuracy' if is_classifier else 'neg_mean_squared_error'
    model = estimator_class(random_state=42)

    # 100 sampled candidates, 5-fold cross-validation, all available cores
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=search_space,
        n_iter=100,
        cv=5,
        n_jobs=-1,
        verbose=2,
        scoring=scoring,
        random_state=42,
    )
    random_search.fit(X_train, y_train)

    # Log the best hyperparameters found during the search
    logging.info(f"Best parameters: {random_search.best_params_}")

    return random_search.best_estimator_


# Tune hyperparameters, train, evaluate the RandomForest models and log their performance metrics
def train_and_evaluate(X_train, y_train_goals, y_train_appearances, y_train_tier_quality, X_val, y_val_goals, y_val_appearances, y_val_tier_quality):
    """Tune one random forest per target and report validation metrics.

    Args:
        X_train: Training features.
        y_train_goals: Training target for the goals regressor.
        y_train_appearances: Training target for the appearances regressor.
        y_train_tier_quality: Training target for the tier quality classifier.
        X_val: Validation features.
        y_val_goals: Validation target for goals.
        y_val_appearances: Validation target for appearances.
        y_val_tier_quality: Validation target for tier quality.

    Returns:
        A tuple `(best_rf_goals, best_rf_appearances, best_rf_tier_quality)`
        of fitted models.
    """
    # tune_hyperparameters returns the search's best_estimator_, which
    # scikit-learn has already refitted on the whole training set (refit is on
    # by default), so no additional fit call is needed here.
    best_rf_goals = tune_hyperparameters(X_train, y_train_goals)
    best_rf_appearances = tune_hyperparameters(X_train, y_train_appearances)
    best_rf_tier_quality = tune_hyperparameters(X_train, y_train_tier_quality, is_classifier=True)

    # Evaluate the models on the validation set
    y_val_pred_goals = best_rf_goals.predict(X_val)
    y_val_pred_appearances = best_rf_appearances.predict(X_val)
    y_val_pred_tier_quality = best_rf_tier_quality.predict(X_val)

    logging.info(f'Validation MSE Goals: {mean_squared_error(y_val_goals, y_val_pred_goals):.0f}')
    logging.info(f'Validation MSE Appearances: {mean_squared_error(y_val_appearances, y_val_pred_appearances):.0f}')
    logging.info(f'Validation Accuracy Tier Quality: {accuracy_score(y_val_tier_quality, y_val_pred_tier_quality)}')
    logging.info(f'Validation Classification Report Tier Quality:\n{classification_report(y_val_tier_quality, y_val_pred_tier_quality)}')

    return best_rf_goals, best_rf_appearances, best_rf_tier_quality


# Evaluate the trained RF model and log the metrics and feature importance
def evaluate_models(best_rf_goals, best_rf_appearances, best_rf_tier_quality, X_test, y_test_goals, y_test_appearances, y_test_tier_quality):
    """Score the three fitted models on a test set and log the results.

    Logs the mean squared error for goals and appearances, the accuracy and
    classification report for tier quality, and each model's feature
    importances expressed as percentages.

    Args:
        best_rf_goals: Fitted model predicting goals.
        best_rf_appearances: Fitted model predicting appearances.
        best_rf_tier_quality: Fitted model predicting tier quality.
        X_test: Test features; its columns supply the feature names.
        y_test_goals: True goals values.
        y_test_appearances: True appearances values.
        y_test_tier_quality: True tier quality values.

    Returns:
        A tuple `(y_pred_goals, y_pred_appearances, y_pred_tier_quality)`.
    """
    def importance_percentages(model):
        # Map each feature name to its importance as a percentage string with 2 decimals
        scaled = model.feature_importances_ * 100
        return {name: f'{imp:.2f}' for name, imp in zip(X_test.columns, scaled)}

    # Predictions on the testing set
    y_pred_goals = best_rf_goals.predict(X_test)
    y_pred_appearances = best_rf_appearances.predict(X_test)
    y_pred_tier_quality = best_rf_tier_quality.predict(X_test)

    # Compute every metric before logging any of them
    mse_goals = mean_squared_error(y_test_goals, y_pred_goals)
    mse_appearances = mean_squared_error(y_test_appearances, y_pred_appearances)
    accuracy_tier_quality = accuracy_score(y_test_tier_quality, y_pred_tier_quality)

    logging.info(f'MSE Goals: {mse_goals:.0f}')
    logging.info(f'MSE Appearances: {mse_appearances:.0f}')
    logging.info(f'Accuracy Tier Quality: {accuracy_tier_quality}')
    logging.info(f'Test Classification Report Tier Quality:\n{classification_report(y_test_tier_quality, y_pred_tier_quality)}')

    # Feature importances paired with their names, one mapping per model
    goals_importances_with_names = importance_percentages(best_rf_goals)
    appearances_importances_with_names = importance_percentages(best_rf_appearances)
    tier_quality_importances_with_names = importance_percentages(best_rf_tier_quality)

    logging.info(f'Feature importances for goals (as percentages): {goals_importances_with_names}')
    logging.info(f'Feature importances for appearances (as percentages): {appearances_importances_with_names}')
    logging.info(f'Feature importances for tier quality (as percentages): {tier_quality_importances_with_names}')

    return y_pred_goals, y_pred_appearances, y_pred_tier_quality


# Save the predictions by filtering the original data to match the test set and output them to a CSV file
def save_predictions(original_data, X_test, y_pred_goals, y_pred_appearances, y_pred_tier_quality, output_file):
    """Write the predictions for the test rows, with player details, to a CSV file.

    Args:
        original_data: DataFrame holding the player details; must contain
            'name', 'position', 'date_of_birth', 'birth_month' and
            'birth_quarter', plus either a 'group' column or one-hot
            'group_*' columns.
        X_test: Test features; only its index is used, to select the matching
            rows of `original_data`.
        y_pred_goals: Predicted goals, rounded to integers before saving.
        y_pred_appearances: Predicted appearances, rounded to integers before
            saving.
        y_pred_tier_quality: Predicted tier quality, saved unchanged.
        output_file: Path of the CSV file to write (without the index).
    """
    # Select the rows that were predicted; the explicit copy guarantees that
    # adding the 'group' column below never touches original_data
    filtered_original_data = original_data.loc[X_test.index].copy()

    # Reconstruct the 'group' column from one-hot columns if they are present
    group_prefix = 'group_'
    group_columns = [col for col in filtered_original_data.columns if col.startswith(group_prefix)]
    if group_columns:
        # Strip only the leading prefix so category names that themselves
        # contain 'group_' are kept intact
        filtered_original_data['group'] = (
            filtered_original_data[group_columns].idxmax(axis=1).str[len(group_prefix):]
        )

    # Create predictions DataFrame
    predictions = pd.DataFrame({
        'name': filtered_original_data['name'],
        'position': filtered_original_data['position'],
        'group': filtered_original_data['group'],
        'date_of_birth': filtered_original_data['date_of_birth'],
        'birth_month': filtered_original_data['birth_month'],
        'birth_quarter': filtered_original_data['birth_quarter'],
        'appearances_pred': np.round(y_pred_appearances, 0).astype(int),
        'goals_pred': np.round(y_pred_goals, 0).astype(int),
        'tier_pred': y_pred_tier_quality
    })

    # Save predictions to CSV
    predictions.to_csv(output_file, index=False)
    logging.info(f"Predictions have been saved to '{output_file}'.")

def _run_season_pipeline(year, age_column_name):
    """Train on one season's senior players and predict for its youth players.

    Reads 'dataset_<year>_senior_players.csv', tunes and validates the three
    models on it, then applies them to 'dataset_<year>_youth_players.csv' and
    writes 'predictions_<year>_youth.csv'.

    Args:
        year: Season label used in the dataset and output file names.
        age_column_name: Name of the age column for that season; forwarded to
            the preprocessing and feature preparation steps.
    """
    # Load and preprocess the senior players dataset (training data)
    data_senior = load_data(f'dataset_{year}_senior_players.csv')
    if data_senior is None:
        # Without trained models and the training column layout the youth
        # data cannot be processed, so stop here instead of failing later
        logging.error(f"Senior dataset for {year} could not be loaded; skipping the {year} youth predictions.")
        return

    logging.info(f"Columns in the {year} senior dataset: {data_senior.columns.tolist()}")
    data_senior_preprocessed, _ = preprocess_data(data_senior, age_column_name)

    # Filter out goalkeepers after preprocessing
    data_senior_preprocessed = filter_goalkeepers(data_senior_preprocessed)

    # Capture the column layout of the training data so the youth data can be aligned to it
    all_columns = data_senior_preprocessed.columns.tolist()

    # Prepare features and targets for the senior dataset (training)
    X_senior, y_goals_senior, y_appearances_senior, y_tier_quality_senior = prepare_features_and_targets(
        data_senior_preprocessed, age_column_name)

    # Split the senior dataset into training and validation sets
    (X_train, X_val,
     y_train_goals, y_val_goals,
     y_train_appearances, y_val_appearances,
     y_train_tier_quality, y_val_tier_quality) = train_test_split(
        X_senior, y_goals_senior, y_appearances_senior, y_tier_quality_senior, test_size=0.2, random_state=42)

    # Train the models on the senior dataset and evaluate on the validation set
    best_rf_goals, best_rf_appearances, best_rf_tier_quality = train_and_evaluate(
        X_train, y_train_goals, y_train_appearances, y_train_tier_quality,
        X_val, y_val_goals, y_val_appearances, y_val_tier_quality)

    # Load and preprocess the youth players dataset (test data)
    data_youth = load_data(f'dataset_{year}_youth_players.csv')
    if data_youth is None:
        return

    logging.info(f"Columns in the {year} youth dataset: {data_youth.columns.tolist()}")
    data_youth_preprocessed, _ = preprocess_data(data_youth, age_column_name, all_columns=all_columns)

    # Filter out goalkeepers after preprocessing
    data_youth_preprocessed = filter_goalkeepers(data_youth_preprocessed)

    # Prepare features and targets for the youth dataset (testing)
    X_test_youth, y_test_goals_youth, y_test_appearances_youth, y_test_tier_quality_youth = prepare_features_and_targets(
        data_youth_preprocessed, age_column_name)

    # Make predictions on the youth dataset using the models trained on the senior dataset
    y_pred_goals_youth, y_pred_appearances_youth, y_pred_tier_quality_youth = evaluate_models(
        best_rf_goals, best_rf_appearances, best_rf_tier_quality,
        X_test_youth, y_test_goals_youth, y_test_appearances_youth, y_test_tier_quality_youth)

    # Save predictions for the youth dataset
    save_predictions(data_youth, X_test_youth, y_pred_goals_youth, y_pred_appearances_youth,
                     y_pred_tier_quality_youth, f'predictions_{year}_youth.csv')


def main():
    """Run the senior-to-youth prediction pipeline for the 2016 and 2024 datasets.

    Each season is processed independently; if a season's senior dataset
    cannot be loaded, that season is skipped and the next one still runs.
    """
    seasons = (
        (2016, 'age_(months)_on_1_july_2015'),
        (2024, 'age_(months)_on_1_july_2023'),
    )
    for year, age_column_name in seasons:
        _run_season_pipeline(year, age_column_name)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


2024-08-15 12:53:06,300 - INFO - Dataset dataset_2016_senior_players.csv loaded successfully.
2024-08-15 12:53:06,301 - INFO - Columns in the 2016 senior dataset: ['name', 'position', 'group', 'club', 'division', 'based_in', 'division_tier', 'tier_quality', 'date_of_birth', 'birth_month', 'birth_quarter', 'age_(days)_on_1_july_2015', 'age_(months)_on_1_july_2015', 'age_(years)_on_1_july_2015', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 'second_nationality', 'height_(ft_in)', 'height_(ft)', 'height_(in)', 'height_(cm)', 'weight_(kg)', 'goals', 'appearances', 'is_top_4_tier', 'train_or_test']
2024-08-15 12:53:06,321 - INFO - Columns after one-hot encoding: ['name', 'division', 'division_tier', 'tier_quality', 'date_of_birth', 'birth_month', 'birth_quarter', 'age_(days)_on_1_july_2015', 'age_(months)_on_1_july_2015', 'age_(years)_on_1_july_2015', 'height_(ft_in)', 'height_(ft)', 'height_(in)', 'height_(cm)', 'weight_(kg)', 'goals', 'appearances', 'is_top_4_tier', 'tra

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   9.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   9.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   9.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   9.3s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   9.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   9.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   9.5s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   9.7s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   2.4s
[CV] END max_dep

2024-08-15 13:03:07,027 - INFO - Best parameters: {'n_estimators': 1000, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_depth': 20}


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  11.5s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  11.7s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  11.8s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  11.8s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  11.9s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  11.9s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  11.9s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  12.2s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   3.0s
[CV] END max_dep

2024-08-15 13:15:28,138 - INFO - Best parameters: {'n_estimators': 100, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_depth': 10}


Fitting 5 folds for each of 100 candidates, totalling 500 fits




[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   2.3s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   2.3s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   2.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   2.5s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   2.5s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.2s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.3s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.3s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=

2024-08-15 13:18:20,861 - INFO - Best parameters: {'n_estimators': 900, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 50}
2024-08-15 13:18:23,458 - INFO - Validation MSE Goals: 258
2024-08-15 13:18:23,460 - INFO - Validation MSE Appearances: 2797
2024-08-15 13:18:23,461 - INFO - Validation Accuracy Tier Quality: 0.989821882951654
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024-08-15 13:18:23,505 - INFO - Validation Classification Report Tier Quality:
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         1
         1.0       1.00      0.98      0.99        51
         2.0       0.00      0.00      0.00         1
         3.0       1.00      0.99      0.99        79
         4.0       0.99      1.00      0.99        69
         5.0       0.97      1

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  19.8s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  20.0s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  19.9s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  20.3s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  20.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  20.5s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  20.5s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  20.6s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   4.8s
[CV] END max_dep

2024-08-15 13:35:29,029 - INFO - Best parameters: {'n_estimators': 500, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_depth': 10}


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  16.4s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  16.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  16.5s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=  16.5s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  16.9s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  17.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  17.3s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=  17.6s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   4.2s
[CV] END max_dep

2024-08-15 13:52:57,984 - INFO - Best parameters: {'n_estimators': 400, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_depth': 10}


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.0s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.0s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.8s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.9s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_d

2024-08-15 13:55:59,763 - INFO - Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 50}
2024-08-15 13:56:00,716 - INFO - Validation MSE Goals: 390
2024-08-15 13:56:00,719 - INFO - Validation MSE Appearances: 2670
2024-08-15 13:56:00,721 - INFO - Validation Accuracy Tier Quality: 0.9843342036553525
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024-08-15 13:56:00,735 - INFO - Validation Classification Report Tier Quality:
              precision    recall  f1-score   support

           1       1.00      0.95      0.97        40
           2       0.00      0.00      0.00         1
           3       0.96      0.99      0.97        69
           4       0.98      1.00      0.99        61
           5       1.00      1.00      1.00        71
           6       0.98      