# In [4]:  (Jupyter notebook cell marker)
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV
import logging

# Configure the root logger so pipeline events can be traced while the script runs.
# Records at INFO level and above are emitted, each formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_data(file_path):
    """Read the dataset CSV located at ``file_path``.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if reading failed. The
        failure is logged instead of raised, so callers only need to check
        for ``None``.
    """
    try:
        # Use the caller-supplied path; it was previously ignored in favour
        # of a hard-coded 'dataset.csv'.
        data = pd.read_csv(file_path)
    except Exception as e:
        # Deliberate best-effort boundary: any read/parse problem is reported
        # through the log and signalled to the caller with None.
        logging.error(f"Error loading dataset: {e}")
        return None

    logging.info("Dataset loaded successfully.")
    return data

def preprocess_data(data, reference_year=2023):
    """Return a model-ready copy of ``data``; the input frame is not modified.

    Steps, in order:

    1. Each known categorical column that is present is converted to integer
       codes (missing columns only trigger a warning).
    2. ``date_of_birth`` is parsed and replaced by an ``age`` column computed
       as ``reference_year - birth year``.
    3. Every remaining missing value, including ages whose birth date could
       not be parsed, is replaced by the placeholder ``-1``.

    Args:
        data: Raw dataset. It must contain a ``date_of_birth`` column.
        reference_year: Year used to turn a birth year into an age.

    Returns:
        The preprocessed ``pandas.DataFrame``.

    Raises:
        KeyError: If ``data`` has no ``date_of_birth`` column.
    """
    data = data.copy()

    categorical_columns = ['name', 'based_in', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality',
                           'position', 'group', 'club', 'division', 'division_tier', 'second_nationality',
                           'is_top_4_tier', 'train_or_test']

    # The downstream split treats code 0 as the training rows and code 1 as
    # the test rows. LabelEncoder numbers classes in sorted order, which would
    # give 'test' -> 0 and 'train' -> 1, so textual train/test labels are
    # mapped explicitly instead.
    # NOTE(review): the raw values of 'train_or_test' are not visible here;
    # any other labelling falls back to LabelEncoder unchanged.
    split_codes = {'train': 0, 'test': 1}

    for col in categorical_columns:
        if col not in data.columns:
            logging.warning(f"Column '{col}' not found in the dataset.")
            continue

        as_text = data[col].astype(str)

        if col == 'train_or_test':
            flags = as_text.str.strip().str.lower()
            if not flags.empty and set(flags.unique()) <= set(split_codes):
                data[col] = flags.map(split_codes).astype(int)
                continue

        data[col] = LabelEncoder().fit_transform(as_text)

    # Derive the age BEFORE filling missing values: otherwise a missing birth
    # date is first turned into the -1 placeholder and then fed to the date
    # parser, while ages that fail to parse stay NaN.
    birth_dates = pd.to_datetime(data['date_of_birth'], errors='coerce')
    data['age'] = reference_year - birth_dates.dt.year
    data = data.drop(columns=['date_of_birth'])

    # Placeholder for everything still missing (numeric gaps, unknown ages).
    data = data.fillna(-1)

    return data

def split_data(data):
    """Split the preprocessed frame into training and test features/targets.

    Rows with ``train_or_test == 0`` form the training set and rows with
    ``train_or_test == 1`` form the test set.

    NOTE(review): this relies on the upstream encoding using 0 for training
    rows and 1 for test rows. An alphabetical label encoding of the strings
    'train'/'test' would produce the opposite assignment - verify.

    Args:
        data: Preprocessed dataset containing the feature columns listed
            below plus ``goals``, ``appearances``, ``tier_quality`` and
            ``train_or_test``.

    Returns:
        An 8-tuple ``(X_train, y_train_goals, y_train_appearances,
        y_train_tier_quality, X_test, y_test_goals, y_test_appearances,
        y_test_tier_quality)``.

    Raises:
        KeyError: If any required column is missing.
    """
    feature_columns = ['name', 'position', 'group', 'age', 'second_nationality', 'height_(cm)', 'weight_(kg)',
                       'club', 'division', 'division_tier', 'is_top_4_tier']

    train_data = data[data['train_or_test'] == 0]
    test_data = data[data['train_or_test'] == 1]

    # Only the training targets are cast to int. Casting whole-frame targets
    # (as the unused locals did before) fails as soon as a held-out row has a
    # missing target, even though that value is never needed as an int.
    X_train = train_data[feature_columns]
    y_train_goals = train_data['goals'].astype(int)
    y_train_appearances = train_data['appearances'].astype(int)
    y_train_tier_quality = train_data['tier_quality']

    # Test targets are passed through exactly as stored.
    X_test = test_data[feature_columns]
    y_test_goals = test_data['goals']
    y_test_appearances = test_data['appearances']
    y_test_tier_quality = test_data['tier_quality']

    return X_train, y_train_goals, y_train_appearances, y_train_tier_quality, X_test, y_test_goals, y_test_appearances, y_test_tier_quality


def tune_hyperparameters(X_train, y_train, is_classifier=False):
    """Grid-search a random forest and return the best estimator found.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        is_classifier: When true a ``RandomForestClassifier`` scored by
            accuracy is tuned; otherwise a ``RandomForestRegressor`` scored
            by negative mean squared error.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    if is_classifier:
        estimator, metric = RandomForestClassifier(random_state=42), 'accuracy'
    else:
        estimator, metric = RandomForestRegressor(random_state=42), 'neg_mean_squared_error'

    # 4 x 4 x 3 = 48 candidate settings, each evaluated with 5-fold CV.
    search_space = {
        'n_estimators': [100, 200, 500, 1000],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }

    search = GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        scoring=metric,
        cv=5,
        n_jobs=-1,  # use every available core
        verbose=2,
    )
    search.fit(X_train, y_train)

    logging.info(f"Best parameters: {search.best_params_}")
    return search.best_estimator_

def train_and_evaluate(X_train, y_train_goals, y_train_appearances, y_train_tier_quality, X_test, y_test_goals, y_test_appearances, y_test_tier_quality):
    """Tune, evaluate and apply the three random-forest models.

    Regressors are tuned for goals and appearances and a classifier for tier
    quality. Mean squared error (regressors) and accuracy (classifier) are
    logged for the test set and, as an overfitting check, for the training
    set.

    Args:
        X_train: Training features.
        y_train_goals: Training targets for the goals regressor.
        y_train_appearances: Training targets for the appearances regressor.
        y_train_tier_quality: Training labels for the tier-quality classifier.
        X_test: Test features.
        y_test_goals: True goals for the test rows.
        y_test_appearances: True appearances for the test rows.
        y_test_tier_quality: True tier-quality labels for the test rows.

    Returns:
        ``(y_pred_goals, y_pred_appearances, y_pred_tier_quality)`` - the
        predictions for ``X_test``.
    """
    # GridSearchCV (refit=True by default) has already fitted best_estimator_
    # on the full training set with the winning parameters, so the models are
    # used as returned; fitting them again only repeated that work.
    best_rf_goals = tune_hyperparameters(X_train, y_train_goals)
    best_rf_appearances = tune_hyperparameters(X_train, y_train_appearances)
    best_rf_tier_quality = tune_hyperparameters(X_train, y_train_tier_quality, is_classifier=True)

    # Make predictions on the testing set
    y_pred_goals = best_rf_goals.predict(X_test)
    y_pred_appearances = best_rf_appearances.predict(X_test)
    y_pred_tier_quality = best_rf_tier_quality.predict(X_test)

    # Evaluate the models on the held-out rows
    mse_goals = mean_squared_error(y_test_goals, y_pred_goals)
    mse_appearances = mean_squared_error(y_test_appearances, y_pred_appearances)
    accuracy_tier_quality = accuracy_score(y_test_tier_quality, y_pred_tier_quality)

    logging.info(f'MSE Goals: {mse_goals}')
    logging.info(f'MSE Appearances: {mse_appearances}')
    logging.info(f'Accuracy Tier Quality: {accuracy_tier_quality}')

    # Training-set metrics for debugging: a large gap to the test metrics
    # above points to overfitting.
    y_train_pred_goals = best_rf_goals.predict(X_train)
    y_train_pred_appearances = best_rf_appearances.predict(X_train)
    y_train_pred_tier_quality = best_rf_tier_quality.predict(X_train)

    logging.info(f'Training MSE Goals: {mean_squared_error(y_train_goals, y_train_pred_goals)}')
    logging.info(f'Training MSE Appearances: {mean_squared_error(y_train_appearances, y_train_pred_appearances)}')
    logging.info(f'Training Accuracy Tier Quality: {accuracy_score(y_train_tier_quality, y_train_pred_tier_quality)}')

    return y_pred_goals, y_pred_appearances, y_pred_tier_quality

def save_predictions(X_test, y_pred_goals, y_pred_appearances, y_pred_tier_quality, output_file):
    """Write the test-set predictions to ``output_file`` as CSV.

    The identifying columns (``name``, ``position``, ``group``) are copied
    from ``X_test`` exactly as they are stored there, followed by the three
    prediction columns. The frame's index is not written.

    Args:
        X_test: Test features; must contain ``name``, ``position``, ``group``.
        y_pred_goals: Predicted goals, one value per row of ``X_test``.
        y_pred_appearances: Predicted appearances, one value per row.
        y_pred_tier_quality: Predicted tier quality, one value per row.
        output_file: Destination path of the CSV file.
    """
    report_columns = {
        'Player name': X_test['name'],
        'Position': X_test['position'],
        'Group': X_test['group'],
        'Predicted appearances': y_pred_appearances,
        'Predicted goals': y_pred_goals,
        'Predicted tier quality': y_pred_tier_quality,
    }
    report = pd.DataFrame(report_columns)
    report.to_csv(output_file, index=False)

    logging.info(f"Predictions have been saved to '{output_file}'.")

def main():
    """Run the pipeline: load, preprocess, split, train/evaluate, save.

    Reads 'dataset.csv' and writes 'predictions.csv'. Nothing further is
    done when the dataset cannot be loaded.
    """
    data = load_data('dataset.csv')
    if data is None:
        return

    logging.info(f"Columns in the dataset: {data.columns.tolist()}")

    # split_data returns (X_train, three training targets, X_test, three
    # test targets), which is exactly the argument order of
    # train_and_evaluate.
    splits = split_data(preprocess_data(data))
    predictions = train_and_evaluate(*splits)

    X_test = splits[4]
    save_predictions(X_test, *predictions, 'predictions.csv')


if __name__ == "__main__":
    main()


2024-07-12 12:22:49,410 - INFO - Dataset loaded successfully.
2024-07-12 12:22:49,411 - INFO - Columns in the dataset: ['name', 'position', 'group', 'club', 'division', 'based_in', 'division_tier', 'tier_quality', 'date_of_birth', 'birth_month', 'birth_quarter', 'age_(days)_on_1_july_2023', 'age_(months)_on_1_july_2023', 'age_(years)_on_1_july_2023', 'birth_city', 'nation_of_birth', 'birth_region', 'nationality', 'second_nationality', 'height_(cm)', 'weight_(kg)', 'goals', 'appearances', 'is_top_4_tier', 'train_or_test']


Fitting 5 folds for each of 48 candidates, totalling 240 fits


  data['date_of_birth'] = pd.to_datetime(data['date_of_birth'], errors='coerce')


[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=500; total time=   0.4s
[CV] END max_depth=None, min_samples_split=2, n_estimators=500; total time=   0.4s
[CV]

2024-07-12 12:23:04,451 - INFO - Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


[CV] END max_depth=30, min_samples_split=10, n_estimators=1000; total time=   0.6s
Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, 

2024-07-12 12:23:21,023 - INFO - Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=500; total time=   1.1s
[CV] END max_depth=None, 

2024-07-12 12:23:54,925 - INFO - Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
2024-07-12 12:23:55,600 - INFO - MSE Goals: 1401.1391872956563
2024-07-12 12:23:55,600 - INFO - MSE Appearances: 50985.34283045306
2024-07-12 12:23:55,601 - INFO - Accuracy Tier Quality: 0.9612330686595049
2024-07-12 12:23:55,654 - INFO - Training MSE Goals: 0.0
2024-07-12 12:23:55,654 - INFO - Training MSE Appearances: 0.0
2024-07-12 12:23:55,655 - INFO - Training Accuracy Tier Quality: 1.0
2024-07-12 12:23:55,668 - INFO - Predictions have been saved to 'predictions.csv'.
