In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import logging
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_data(file_path):
    try:
        data = pd.read_csv(file_path)
        logging.info(f"Dataset {file_path} loaded successfully.")
        return data
    except Exception as e:
        logging.error(f"Error loading dataset {file_path}: {e}")
        return None

def preprocess_data(data):
    data = data.copy()

    # Convert categorical variables to numerical using LabelEncoder
    label_encoders = {}
    categorical_columns = ['name', 'based_in', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 
                           'position', 'group', 'club', 'division', 'division_tier', 'second_nationality', 
                           'is_top_4_tier']

    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].astype(str))
            label_encoders[col] = le
        else:
            logging.warning(f"Column '{col}' not found in the dataset.")

    # Handle missing values - fill NaNs with a placeholder (e.g., -1) or the median for numerical columns
    data.fillna(-1, inplace=True)

    return data, label_encoders

def prepare_features_and_targets(data):
    features = data[['position', 'group', 'age_(months)_on_1_july_2023', 'second_nationality', 'height_(cm)', 'weight_(kg)', 
                     'club', 'division', 'division_tier', 'is_top_4_tier']]
    target_goals = data['goals'].astype(int)
    target_appearances = data['appearances'].astype(int)
    target_tier_quality = data['tier_quality']

    X = features
    y_goals = target_goals
    y_appearances = target_appearances
    y_tier_quality = target_tier_quality

    return X, y_goals, y_appearances, y_tier_quality

def tune_hyperparameters(X_train, y_train, is_classifier=False):
    param_distributions = {
        'n_estimators': np.arange(100, 1100, 100),
        'max_depth': [None] + list(np.arange(10, 51, 10)),
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10]
    }
    
    if is_classifier:
        model = RandomForestClassifier(random_state=42)
        scoring = 'accuracy'
    else:
        model = RandomForestRegressor(random_state=42)
        scoring = 'neg_mean_squared_error'
    
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, 
                                       n_iter=100, cv=5, n_jobs=-1, verbose=2, scoring=scoring, random_state=42)
    
    random_search.fit(X_train, y_train)
    logging.info(f"Best parameters: {random_search.best_params_}")
    return random_search.best_estimator_

def train_and_evaluate(X_train, y_train_goals, y_train_appearances, y_train_tier_quality, X_val, y_val_goals, y_val_appearances, y_val_tier_quality):
    # Tune hyperparameters for goals prediction
    best_rf_goals = tune_hyperparameters(X_train, y_train_goals)
    best_rf_goals.fit(X_train, y_train_goals)

    # Tune hyperparameters for appearances prediction
    best_rf_appearances = tune_hyperparameters(X_train, y_train_appearances)
    best_rf_appearances.fit(X_train, y_train_appearances)

    # Tune hyperparameters for tier quality prediction
    best_rf_tier_quality = tune_hyperparameters(X_train, y_train_tier_quality, is_classifier=True)
    best_rf_tier_quality.fit(X_train, y_train_tier_quality)

    # Evaluate the models on validation set
    y_val_pred_goals = best_rf_goals.predict(X_val)
    y_val_pred_appearances = best_rf_appearances.predict(X_val)
    y_val_pred_tier_quality = best_rf_tier_quality.predict(X_val)

    logging.info(f'Validation MSE Goals: {mean_squared_error(y_val_goals, y_val_pred_goals)}')
    logging.info(f'Validation MSE Appearances: {mean_squared_error(y_val_appearances, y_val_pred_appearances)}')
    logging.info(f'Validation Accuracy Tier Quality: {accuracy_score(y_val_tier_quality, y_val_pred_tier_quality)}')

    return best_rf_goals, best_rf_appearances, best_rf_tier_quality

def evaluate_models(best_rf_goals, best_rf_appearances, best_rf_tier_quality, X_test, y_test_goals, y_test_appearances, y_test_tier_quality):
    # Make predictions on the testing set
    y_pred_goals = best_rf_goals.predict(X_test)
    y_pred_appearances = best_rf_appearances.predict(X_test)
    y_pred_tier_quality = best_rf_tier_quality.predict(X_test)

    # Evaluate the models
    mse_goals = mean_squared_error(y_test_goals, y_pred_goals)
    mse_appearances = mean_squared_error(y_test_appearances, y_pred_appearances)
    accuracy_tier_quality = accuracy_score(y_test_tier_quality, y_pred_tier_quality)

    logging.info(f'MSE Goals: {mse_goals}')
    logging.info(f'MSE Appearances: {mse_appearances}')
    logging.info(f'Accuracy Tier Quality: {accuracy_tier_quality}')

    # Check feature importance
    logging.info(f'Feature importances for goals: {best_rf_goals.feature_importances_}')
    logging.info(f'Feature importances for appearances: {best_rf_appearances.feature_importances_}')
    logging.info(f'Feature importances for tier quality: {best_rf_tier_quality.feature_importances_}')

    return y_pred_goals, y_pred_appearances, y_pred_tier_quality

def save_predictions(original_data, X_test, y_pred_goals, y_pred_appearances, y_pred_tier_quality, output_file):
    predictions = pd.DataFrame({
        'Player name': original_data['name'],
        'Position': original_data['position'],
        'Group': original_data['group'],
        'Predicted appearances': y_pred_appearances,
        'Predicted goals': y_pred_goals,
        'Predicted tier quality': y_pred_tier_quality
    })
    predictions.to_csv(output_file, index=False)
    logging.info(f"Predictions have been saved to '{output_file}'.")

def main():
    # Load and preprocess the senior players dataset
    data_senior = load_data('dataset_senior_players.csv')
    if data_senior is not None:
        logging.info(f"Columns in the senior dataset: {data_senior.columns.tolist()}")
        data_senior_preprocessed, _ = preprocess_data(data_senior)
        X_senior, y_goals_senior, y_appearances_senior, y_tier_quality_senior = prepare_features_and_targets(data_senior_preprocessed)
        
        # Split the senior dataset into training and validation sets
        X_train_senior, X_val_senior, y_train_goals_senior, y_val_goals_senior, y_train_appearances_senior, y_val_appearances_senior, y_train_tier_quality_senior, y_val_tier_quality_senior = train_test_split(
            X_senior, y_goals_senior, y_appearances_senior, y_tier_quality_senior, test_size=0.2, random_state=42)
        
        # Log the distribution of target variables
        logging.info(f"Goals distribution in senior training set: {y_train_goals_senior.describe()}")
        logging.info(f"Appearances distribution in senior training set: {y_train_appearances_senior.describe()}")

        # Train the model on the senior dataset and evaluate on the validation set
        best_rf_goals, best_rf_appearances, best_rf_tier_quality = train_and_evaluate(
            X_train_senior, y_train_goals_senior, y_train_appearances_senior, y_train_tier_quality_senior,
            X_val_senior, y_val_goals_senior, y_val_appearances_senior, y_val_tier_quality_senior)

    # Load and preprocess the youth players dataset
    data_youth = load_data('dataset_youth_players.csv')
    if data_youth is not None:
        logging.info(f"Columns in the youth dataset: {data_youth.columns.tolist()}")
        data_youth_preprocessed, _ = preprocess_data(data_youth)
        X_test_youth, y_test_goals_youth, y_test_appearances_youth, y_test_tier_quality_youth = prepare_features_and_targets(data_youth_preprocessed)
        
        # Make predictions on the youth dataset using the models trained on the senior dataset
        y_pred_goals_youth, y_pred_appearances_youth, y_pred_tier_quality_youth = evaluate_models(
            best_rf_goals, best_rf_appearances, best_rf_tier_quality, X_test_youth, y_test_goals_youth, y_test_appearances_youth, y_test_tier_quality_youth)

        # Save predictions for the youth dataset
        save_predictions(data_youth, X_test_youth, y_pred_goals_youth, y_pred_appearances_youth, y_pred_tier_quality_youth, 'predictions_youth.csv')

if __name__ == "__main__":
    main()


2024-07-15 12:37:54,433 - INFO - Dataset dataset_senior_players.csv loaded successfully.
2024-07-15 12:37:54,433 - INFO - Columns in the senior dataset: ['name', 'position', 'group', 'club', 'division', 'based_in', 'division_tier', 'tier_quality', 'date_of_birth', 'birth_month', 'birth_quarter', 'age_(days)_on_1_july_2023', 'age_(months)_on_1_july_2023', 'age_(years)_on_1_july_2023', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 'second_nationality', 'height_(cm)', 'weight_(kg)', 'goals', 'appearances', 'is_top_4_tier']
2024-07-15 12:37:54,449 - INFO - Goals distribution in senior training set: count    1712.000000
mean       19.835864
std        31.708905
min         0.000000
25%         2.000000
50%         8.000000
75%        24.000000
max       279.000000
Name: goals, dtype: float64
2024-07-15 12:37:54,450 - INFO - Appearances distribution in senior training set: count    1712.000000
mean      179.133762
std       139.348590
min        20.000000
25%        64.7500

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   2.6s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   2.7s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   2.6s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   2.8s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   2.9s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.0s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.1s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.3s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_dep

2024-07-15 12:40:45,332 - INFO - Best parameters: {'n_estimators': 100, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_depth': 20}


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.5s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.5s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.6s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.7s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.7s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   3.7s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.8s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   3.8s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_dep

2024-07-15 12:43:39,069 - INFO - Best parameters: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 20}


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.2s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.3s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.8s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.8s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.9s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.9s
[CV] END max_depth=50, min_samples_leaf=10, min_samples_split=10, n_estimators=900; total time=   1.9s
[CV] END max_dep

2024-07-15 12:44:54,640 - INFO - Best parameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
2024-07-15 12:44:55,870 - INFO - Validation MSE Goals: 455.2667564614675
2024-07-15 12:44:55,870 - INFO - Validation MSE Appearances: 4830.674582627766
2024-07-15 12:44:55,871 - INFO - Validation Accuracy Tier Quality: 0.9813519813519813
2024-07-15 12:44:55,894 - INFO - Dataset dataset_youth_players.csv loaded successfully.
2024-07-15 12:44:55,894 - INFO - Columns in the youth dataset: ['name', 'position', 'group', 'club', 'division', 'based_in', 'division_tier', 'tier_quality', 'date_of_birth', 'birth_month', 'birth_quarter', 'age_(days)_on_1_july_2023', 'age_(months)_on_1_july_2023', 'age_(years)_on_1_july_2023', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 'second_nationality', 'height_(cm)', 'weight_(kg)', 'goals', 'appearances', 'is_top_4_tier']
2024-07-15 12:44:56,041 - INFO - MSE Goals: 65.86054658174572
2024-07-15 12:44:56,0

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_distributions(train_data, test_data, features):
    for feature in features:
        plt.figure(figsize=(12, 6))
        sns.histplot(train_data[feature], color='blue', kde=True, label='Senior Players')
        sns.histplot(test_data[feature], color='red', kde=True, label='Youth Players')
        plt.title(f'Distribution of {feature}')
        plt.legend()
        plt.show()

# Define the list of features to compare
features_to_compare = ['position', 'group', 'age_(months)_on_1_july_2023', 'second_nationality', 'height_(cm)', 
                       'weight_(kg)', 'club', 'division', 'division_tier', 'is_top_4_tier']

# Plot distributions
plot_feature_distributions(data_senior, data_youth, features_to_compare)


NameError: name 'data_senior' is not defined