In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
import logging
import numpy as np

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_data(file_path):
    try:
        data = pd.read_csv(file_path)
        logging.info(f"Dataset {file_path} loaded successfully.")
        return data
    except Exception as e:
        logging.error(f"Error loading dataset {file_path}: {e}")
        return None

def preprocess_data(data):
    data = data.copy()

    # Convert categorical variables to numerical using LabelEncoder
    label_encoders = {}
    categorical_columns = ['name', 'based_in', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 
                           'position', 'group', 'club', 'division', 'division_tier', 'second_nationality', 
                           'is_top_4_tier']

    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].astype(str))
            label_encoders[col] = le
        else:
            logging.warning(f"Column '{col}' not found in the dataset.")

    # Handle missing values - fill NaNs with a placeholder (e.g., -1) or the median for numerical columns
    data.fillna(-1, inplace=True)

    return data, label_encoders

def prepare_features_and_targets(data):
    features = data[['position', 'group','age_(months)_on_1_july_2023', 'birth_month', 'birth_quarter', 'second_nationality', 'height_(cm)', 'weight_(kg)', 
                     'club', 'division', 'division_tier', 'tier_quality']]
    target_goals = data['goals'].astype(int)
    target_appearances = data['appearances'].astype(int)
    target_tier_quality = data['tier_quality']

    X = features
    y_goals = target_goals
    y_appearances = target_appearances
    y_tier_quality = target_tier_quality

    return X, y_goals, y_appearances, y_tier_quality

def tune_hyperparameters(X_train, y_train, is_classifier=False):
    param_distributions = {
        'n_estimators': np.arange(100, 1100, 100),
        'max_depth': [None] + list(np.arange(10, 51, 10)),
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10]
    }
    
    if is_classifier:
        model = RandomForestClassifier(random_state=42)
        scoring = 'accuracy'
    else:
        model = RandomForestRegressor(random_state=42)
        scoring = 'neg_mean_squared_error'
    
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, 
                                       n_iter=100, cv=5, n_jobs=-1, verbose=2, scoring=scoring, random_state=42)
    
    random_search.fit(X_train, y_train)
    logging.info(f"Best parameters: {random_search.best_params_}")
    return random_search.best_estimator_

def train_and_evaluate(X_train, y_train_goals, y_train_appearances, y_train_tier_quality, X_val, y_val_goals, y_val_appearances, y_val_tier_quality):
    # Tune hyperparameters for goals prediction
    best_rf_goals = tune_hyperparameters(X_train, y_train_goals)
    best_rf_goals.fit(X_train, y_train_goals)

    # Tune hyperparameters for appearances prediction
    best_rf_appearances = tune_hyperparameters(X_train, y_train_appearances)
    best_rf_appearances.fit(X_train, y_train_appearances)

    # Check the class distribution of tier_quality
    class_counts = y_train_tier_quality.value_counts()
    min_class_count = class_counts.min()
    logging.info(f"Class distribution in training set: {class_counts.to_dict()}")

    # Adjust k_neighbors to be less than or equal to min_class_count - 1
    k_neighbors = min(5, max(1, min_class_count - 1))
    logging.info(f"Using k_neighbors={k_neighbors} for SMOTE")

    # Apply SMOTE to the training data for tier_quality
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_resampled, y_train_tier_quality_resampled = smote.fit_resample(X_train, y_train_tier_quality)

    # Tune hyperparameters for tier quality prediction
    best_rf_tier_quality = tune_hyperparameters(X_train_resampled, y_train_tier_quality_resampled, is_classifier=True)
    best_rf_tier_quality.fit(X_train_resampled, y_train_tier_quality_resampled)

    # Evaluate the models on validation set
    y_val_pred_goals = best_rf_goals.predict(X_val)
    y_val_pred_appearances = best_rf_appearances.predict(X_val)
    y_val_pred_tier_quality = best_rf_tier_quality.predict(X_val)

    logging.info(f'Validation MSE Goals: {mean_squared_error(y_val_goals, y_val_pred_goals):.0f}')
    logging.info(f'Validation MSE Appearances: {mean_squared_error(y_val_appearances, y_val_pred_appearances):.0f}')
    logging.info(f'Validation Accuracy Tier Quality: {accuracy_score(y_val_tier_quality, y_val_pred_tier_quality)}')
    logging.info(f'Validation Classification Report Tier Quality:\n{classification_report(y_val_tier_quality, y_val_pred_tier_quality)}')

    return best_rf_goals, best_rf_appearances, best_rf_tier_quality

def evaluate_models(best_rf_goals, best_rf_appearances, best_rf_tier_quality, X_test, y_test_goals, y_test_appearances, y_test_tier_quality):
    # Make predictions on the testing set
    y_pred_goals = best_rf_goals.predict(X_test)
    y_pred_appearances = best_rf_appearances.predict(X_test)
    y_pred_tier_quality = best_rf_tier_quality.predict(X_test)

    # Evaluate the models
    mse_goals = mean_squared_error(y_test_goals, y_pred_goals)
    mse_appearances = mean_squared_error(y_test_appearances, y_pred_appearances)
    accuracy_tier_quality = accuracy_score(y_test_tier_quality, y_pred_tier_quality)

    logging.info(f'MSE Goals: {mse_goals:.0f}')
    logging.info(f'MSE Appearances: {mse_appearances:.0f}')
    logging.info(f'Accuracy Tier Quality: {accuracy_tier_quality}')
    logging.info(f'Test Classification Report Tier Quality:\n{classification_report(y_test_tier_quality, y_pred_tier_quality)}')

    # Convert feature importances to percentages
    feature_names = X_test.columns  # Get feature names
    goals_feature_importances = best_rf_goals.feature_importances_ * 100
    appearances_feature_importances = best_rf_appearances.feature_importances_ * 100
    tier_quality_feature_importances = best_rf_tier_quality.feature_importances_ * 100

    # Pair feature names with their importance percentages, formatted to 2 decimal places
    goals_importances_with_names = {name: f'{imp:.2f}' for name, imp in zip(feature_names, goals_feature_importances)}
    appearances_importances_with_names = {name: f'{imp:.2f}' for name, imp in zip(feature_names, appearances_feature_importances)}
    tier_quality_importances_with_names = {name: f'{imp:.2f}' for name, imp in zip(feature_names, tier_quality_feature_importances)}

    # Log the feature importances with names
    logging.info(f'Feature importances for goals (as percentages): {goals_importances_with_names}')
    logging.info(f'Feature importances for appearances (as percentages): {appearances_importances_with_names}')
    logging.info(f'Feature importances for tier quality (as percentages): {tier_quality_importances_with_names}')

    return y_pred_goals, y_pred_appearances, y_pred_tier_quality

def save_predictions(original_data, X_test, y_pred_goals, y_pred_appearances, y_pred_tier_quality, output_file):
    # Filter the original data to match the X_test indices
    filtered_original_data = original_data.loc[X_test.index]

    # Create predictions DataFrame
    predictions = pd.DataFrame({
        'name': filtered_original_data['name'],
        'position': filtered_original_data['position'],
        'group': filtered_original_data['group'],
        'date_of_birth': filtered_original_data['date_of_birth'],
        'birth_month': filtered_original_data['birth_month'],
        'birth_quarter': filtered_original_data['birth_quarter'],  
        'appearances_pred': np.round(y_pred_appearances, 0).astype(int),
        'goals_pred': np.round(y_pred_goals, 0).astype(int),
        'tier_pred': y_pred_tier_quality
    })
    
    # Save predictions to CSV
    predictions.to_csv(output_file, index=False)
    logging.info(f"Predictions have been saved to '{output_file}'.")

def main():
    # Load and preprocess the senior players dataset
    data_senior = load_data('dataset_2024_senior_players.csv')
    if data_senior is not None:
        logging.info(f"Columns in the senior dataset: {data_senior.columns.tolist()}")
        data_senior_preprocessed, _ = preprocess_data(data_senior)

        # Filter the data for players aged 23 or below (276 months or below) and not goalkeepers
        data_senior_preprocessed = data_senior_preprocessed[
            (data_senior_preprocessed['age_(months)_on_1_july_2023'] <= 276) &
            (data_senior_preprocessed['group'] != 2)  # Where '2' is the encoded value for goalkeepers
        ]

        X_senior, y_goals_senior, y_appearances_senior, y_tier_quality_senior = prepare_features_and_targets(data_senior_preprocessed)
        
        # Split the senior dataset into training and validation sets
        X_train_senior, X_val_senior, y_train_goals_senior, y_val_goals_senior, y_train_appearances_senior, y_val_appearances_senior, y_train_tier_quality_senior, y_val_tier_quality_senior = train_test_split(
            X_senior, y_goals_senior, y_appearances_senior, y_tier_quality_senior, test_size=0.2, random_state=42)
        
        # Log the distribution of target variables
        logging.info(f"Goals distribution in senior training set: {y_train_goals_senior.describe()}")
        logging.info(f"Appearances distribution in senior training set: {y_train_appearances_senior.describe()}")

        # Train the model on the senior dataset and evaluate on the validation set
        best_rf_goals, best_rf_appearances, best_rf_tier_quality = train_and_evaluate(
            X_train_senior, y_train_goals_senior, y_train_appearances_senior, y_train_tier_quality_senior,
            X_val_senior, y_val_goals_senior, y_val_appearances_senior, y_val_tier_quality_senior)

    # Load and preprocess the youth players dataset
    data_youth = load_data('dataset_2024_youth_players.csv')
    if data_youth is not None:
        logging.info(f"Columns in the youth dataset: {data_youth.columns.tolist()}")
        data_youth_preprocessed, _ = preprocess_data(data_youth)

        # Filter the data for not goalkeepers
        data_youth_preprocessed = data_youth_preprocessed[
            (data_youth_preprocessed['group'] != 2)  # Where '2' is the encoded value for goalkeepers
        ]

        X_test_youth, y_test_goals_youth, y_test_appearances_youth, y_test_tier_quality_youth = prepare_features_and_targets(data_youth_preprocessed)
        
        # Make predictions on the youth dataset using the models trained on the senior dataset
        y_pred_goals_youth, y_pred_appearances_youth, y_pred_tier_quality_youth = evaluate_models(
            best_rf_goals, best_rf_appearances, best_rf_tier_quality, X_test_youth, y_test_goals_youth, y_test_appearances_youth, y_test_tier_quality_youth)

        # Save predictions for the youth dataset
        save_predictions(data_youth, X_test_youth, y_pred_goals_youth, y_pred_appearances_youth, y_pred_tier_quality_youth, 'predictions_2024_youth.csv')

if __name__ == "__main__":
    main()


2024-08-08 12:10:16,227 - INFO - Dataset dataset_2024_senior_players.csv loaded successfully.
2024-08-08 12:10:16,230 - INFO - Columns in the senior dataset: ['name', 'position', 'group', 'club', 'division', 'based_in', 'division_tier', 'tier_quality', 'date_of_birth', 'birth_month', 'birth_quarter', 'age_(days)_on_1_july_2023', 'age_(months)_on_1_july_2023', 'age_(years)_on_1_july_2023', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 'second_nationality', 'height_(cm)', 'weight_(kg)', 'goals', 'appearances', 'is_top_4_tier']
2024-08-08 12:10:16,285 - INFO - Goals distribution in senior training set: count    420.000000
mean       6.009524
std        8.303888
min        0.000000
25%        1.000000
50%        3.000000
75%        8.000000
max       68.000000
Name: goals, dtype: float64
2024-08-08 12:10:16,286 - INFO - Appearances distribution in senior training set: count    420.000000
mean      53.000000
std       28.151059
min       20.000000
25%       30.000000
50%  

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.2s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.3s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.3s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.3s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.3s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_dep

2024-08-08 12:11:24,483 - INFO - Best parameters: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 20}


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.3s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.6s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.6s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.6s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.7s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.8s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.9s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_dep

2024-08-08 12:12:41,229 - INFO - Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_depth': 10}
2024-08-08 12:12:41,676 - INFO - Class distribution in training set: {6: 129, 3: 87, 4: 87, 5: 64, 1: 38, 7: 14, 2: 1}
2024-08-08 12:12:41,676 - INFO - Using k_neighbors=1 for SMOTE


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 2, n_samples_fit = 1, n_samples = 1

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def load_data(file_path):
    try:
        data = pd.read_csv(file_path)
        return data
    except Exception as e:
        print(f"Error loading dataset {file_path}: {e}")
        return None

def encode_group(data):
    le = LabelEncoder()
    data['group_encoded'] = le.fit_transform(data['group'].astype(str))
    position_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    return position_mapping

# Load the dataset
data = load_data('dataset_senior_players.csv')

# Encode the position column and get the mapping
if data is not None:
    group_mapping = encode_group(data)
    print("Group encoding mapping:", group_mapping)
    # Print a sample to see the encoded position column
    print(data[['group', 'group_encoded']].head())


Error loading dataset dataset_senior_players.csv: [Errno 2] No such file or directory: 'dataset_senior_players.csv'
