In [None]:
import pandas as pd
!pip install autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import numpy as np
from google.colab import files
!pip install "pandas<2.0.0"

# --- Configuration -----------------------------------------------------------
# Name of the target column the regressor is trained to predict.
label = 'total_points'
# Columns stripped from the feature frames before training/prediction.
# 'xP' is only used further down as a comparison baseline.
drop_columns = ['season', 'round', 'name', 'team', 'xP']

# --- Load data ---------------------------------------------------------------
# The CSV's first column is read as the index and then discarded in favour of
# a fresh 0..n-1 index, so the split below operates on unique row labels.
train_csv_url = 'https://raw.githubusercontent.com/danismailov/fpl-with-fml/main/top200_lag1_last3_points_noxp_dropnan.csv'
final_data = pd.read_csv(train_csv_url, index_col=0).reset_index(drop=True)

# Seeded random 80/20 split: sampled rows train the model, the rest validate it.
train = final_data.sample(frac=0.8, random_state=200)
validation = final_data.drop(index=train.index)

# Prepare training data.
# Only the columns listed in drop_columns are removed; the label column stays
# in train_x because the predictor reads the target from the training frame.
train_x = train.drop(columns=drop_columns)
train_y = train[label]

# Wrap the feature frame in AutoGluon's tabular dataset type.
train_data = TabularDataset(train_x)

# Per-model-family hyperparameters. Each key maps to a list of configurations;
# 'GBM' has two entries, one of them using extra-trees mode with its own
# name suffix so the two variants can be told apart.
hyperparameters = {
    'GBM': [
        {'extra_trees': True, 'ag_args': {'name_suffix': 'XTrees'}, 'num_boost_round': 100},
        {'num_boost_round': 100},
    ],
    'RF': [{'n_estimators': 1000}],
    'XT': [{'n_estimators': 1000}],
    'CAT': [{'iterations': 100}],
    # Neural network with a limited number of epochs to control resource usage
    'NN_TORCH': [{'num_epochs': 10}],
}

# Settings forwarded to TabularPredictor.fit; adjust or experiment with these
# based on the dataset and problem specifics.
fit_settings = dict(
    num_gpus=1,
    hyperparameters=hyperparameters,
    time_limit=3600,      # seconds, i.e. a one-hour budget
    num_bag_folds=10,     # bagging with 10 folds
    num_stack_levels=3,   # stacking can help but also increases memory usage
)

# Regression model: build the predictor first, then fit it on the training data.
predictor = TabularPredictor(label=label, problem_type='regression')
predictor = predictor.fit(train_data, **fit_settings)

# Score the fitted predictor on the held-out validation rows and print the
# per-model leaderboard for the same data (display=True prints the results).
predictor.evaluate(data=validation, display=True)
predictor.leaderboard(data=validation, display=True)

# Load new test data (same read pattern as the training CSV: first column is
# the index, which is then replaced by a fresh 0..n-1 index).
test_csv_url = 'https://raw.githubusercontent.com/danismailov/fpl-with-fml/main/test_data_noxp_dropnan.csv'
test = pd.read_csv(test_csv_url, index_col=0).reset_index(drop=True)

# Feature importance (left disabled)
# with pd.option_context(
#     'display.max_rows', None,
#     'display.max_columns', None,
#     'display.precision', 3):
#     print(predictor.feature_importance(test))

# Evaluate models on new test data
predictor.evaluate(data=test, display=True)

# Prepare test data: remove the same non-feature columns that were removed
# from the training frame, and keep the true target for scoring.
test_x = test.drop(columns=drop_columns)
test_y = test[label]  # Actual total_points

# Make predictions with the regression model.
# Predict on the prepared frame (previously built but unused) so 'xP' and the
# identifier columns are explicitly kept away from the model, mirroring how
# the training features were prepared.
regression_predictions = predictor.predict(test_x)

# Error of the regression model against the actual points (RMSE and MAE)
rmse = np.sqrt(mean_squared_error(test_y, regression_predictions))
mae = mean_absolute_error(test_y, regression_predictions)

# Error of the 'xP' column against the actual points, as a baseline
xP_rmse = np.sqrt(mean_squared_error(test_y, test['xP']))
xP_mae = mean_absolute_error(test_y, test['xP'])

print("Regression model RMSE:", rmse)
print("Regression model MAE:", mae)
print("'xP' RMSE:", xP_rmse)
print("'xP' MAE:", xP_mae)

# Create a DataFrame with actual, predicted, and 'xP' values.
# .assign returns a new frame, so the original `test` frame is left untouched.
comparison = test[['name', 'season', 'round', 'total_points', 'xP']].assign(
    regression_predicted_total_points=regression_predictions
)

# A bare `comparison.head()` in the middle of a cell renders nothing; call
# display() (available as a builtin inside IPython/Colab kernels) so the
# preview is actually shown.
display(comparison.head())

# Persist the comparison table and hand it to the browser via Colab.
comparison.to_csv('results.csv', index=False)
files.download('results.csv')

Collecting autogluon
  Downloading autogluon-1.0.0-py3-none-any.whl (9.9 kB)
Collecting autogluon.core[all]==1.0.0 (from autogluon)
  Downloading autogluon.core-1.0.0-py3-none-any.whl (229 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.1/229.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==1.0.0 (from autogluon)
  Downloading autogluon.features-1.0.0-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.tabular[all]==1.0.0 (from autogluon)
  Downloading autogluon.tabular-1.0.0-py3-none-any.whl (306 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.0/306.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==1.0.0 (from autogluon)
  Downloading autogluon.multimodal-1.0.0-py3-none-any.whl (416 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

Collecting pandas<2.0.0
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.4
    Uninstalling pandas-2.1.4:
      Successfully uninstalled pandas-2.1.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires kaleido, which is not installed.
autogluon-common 1.0.0 requires pandas<2.2.0,>=2.0.0, but you have pandas 1.5.3 which is incompatible.
autogluon-core 1.0.0 requires pandas<2.2.0,>=2.0.0, but you have pandas 1.5.3 which is incompatible.
autogluon-features 1.0.0 requires pandas<2.2.0,>=2.0.0, but you have pandas 1.5.3 which is incompatible.
autogluon-multimodal 1.0.0

No path specified. Models will be saved in: "AutogluonModels/ag-20240124_174551"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels/ag-20240124_174551"
AutoGluon Version:  1.0.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sat Nov 18 15:31:17 UTC 2023
CPU Count:          2
Memory Avail

                    model  score_test  score_val              eval_metric  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         LightGBM_BAG_L2   -2.779014  -2.867703  root_mean_squared_error        3.752055       3.954443   623.644457                 0.585039                0.139126          52.200365            2       True          9
1         LightGBM_BAG_L4   -2.783057  -2.851708  root_mean_squared_error       10.881356      13.468947  2086.525918                 0.648336                0.558117         219.383932            4       True         23
2     WeightedEnsemble_L5   -2.783381  -2.840773  root_mean_squared_error       11.463054      14.110869  2292.033432                 0.005809                0.000667           1.117927            5       True         28
3     WeightedEnsemble_L3   -2.783990  -2.846241  root_mean_squared_error        7.275058       8.528428  1215.70634

Evaluation: root_mean_squared_error on test data: -3.2388660014760067
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -3.2388660014760067,
    "mean_squared_error": -10.490252975517175,
    "mean_absolute_error": -2.357676086991004,
    "r2": 0.04914985159728158,
    "pearsonr": 0.23341203193929538,
    "median_absolute_error": -1.7958699464797974
}


Regression model RMSE: 3.2388660014760067
Regression model MAE: 2.357676086991004
'xP' RMSE: 2.6842399888047446
'xP' MAE: 1.902976548406494


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>