In [27]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from utils_data_structures import col_prefixes


In [28]:
grouped_df = pd.read_csv('sorare_data/large_grouped_sorare_data.csv')

grouped_df = grouped_df.fillna(0.0)

print(grouped_df['So_5_Scores_9'].isna().sum())

0


In [29]:
# get dummies for current club
processed_df = pd.get_dummies(grouped_df, columns=['Current_Club'])
le = LabelEncoder()
processed_df['Position_Encoded'] = le.fit_transform(processed_df['Position'])

columns_to_drop = ['Display_Name', 'First_Name','Last_Name','Player_Number', 'Position']
processed_df = processed_df.drop(columns=columns_to_drop)

processed_df.fillna(0.0)

# pd.set_option('display.max_columns', None)
processed_df.dtypes.value_counts()

bool       2012
float64     308
int64         4
int32         1
Name: count, dtype: int64

In [30]:
# Create X and y and split into training and testing sets
target_column = 'So_5_Scores_9'
columns_to_drop = [f'{col}_9' for col in col_prefixes]

X = processed_df.drop(columns=columns_to_drop)
y = processed_df[target_column]


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(y.isna().sum())

0


In [32]:
# Initialize the LightGBM regressor
lgbm = lgb.LGBMRegressor(
    boosting_type='gbdt',  # Gradient boosting decision tree
    n_estimators=400,      # Number of boosting rounds (trees)
    learning_rate=0.01,     # Step size shrinkage
    max_depth=10,          # Unlimited depth of trees (-1 means no limit)
    random_state=42,        # For reproducibility
)
# 

# Train the model
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005733 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15395
[LightGBM] [Info] Number of data points in the train set: 12443, number of used features: 407
[LightGBM] [Info] Start training from score 17.266825


In [33]:
# Make predictions
y_pred = lgbm.predict(X_test)

# Evaluate the model using RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

RMSE: 16.009569975291974


In [34]:
# Define the parameter distribution
param_distributions = {
    'n_estimators': [250, 400, 500, 800, 900, 950, 1000, 1050, 1100, 1200],
    'learning_rate': [0.01, 0.1, 0.025],
    'max_depth': [-1, 10, 20, 30],
    'num_leaves': [20, 30, 40],  # Number of leaves in the tree
    'min_child_samples': [10, 20, 30],
    'subsample': [0.3, 0.5, 0.7, 0.8, 1.0]
}

# Initialize RandomizedSearchCV
lgbm_random = RandomizedSearchCV(estimator=lgb.LGBMRegressor(random_state=42), 
                                 param_distributions=param_distributions, 
                                 n_iter=25,  # Number of random combinations to try
                                 cv=4,       # Cross-validation
                                 scoring='neg_mean_squared_error', 
                                 random_state=42, 
                                 n_jobs=-1)

# Fit RandomizedSearchCV
lgbm_random.fit(X_train, y_train)

# Get the best parameters and best score
best_params = lgbm_random.best_params_
best_rmse = np.sqrt(-lgbm_random.best_score_)

print(f"Best Parameters: {best_params}")
print(f"Best RMSE: {best_rmse}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15890
[LightGBM] [Info] Number of data points in the train set: 12443, number of used features: 654
[LightGBM] [Info] Start training from score 17.266825
Best Parameters: {'subsample': 0.5, 'num_leaves': 20, 'n_estimators': 400, 'min_child_samples': 10, 'max_depth': 10, 'learning_rate': 0.01}
Best RMSE: 16.585778055588275
