In [54]:
import pandas as pd
import glob
import os

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)

## Import

In [59]:
os.chdir("../data/")
all_df = pd.DataFrame()
all_filenames = [file for file in glob.glob(r"*.csv") if file[:3] == 'PER']
all_df = pd.concat([pd.read_csv(f) for f in all_filenames])

## Cleaning and Formatting

In [60]:
all_df['NAME'] = all_df['FIRST_NAME'] + " " + all_df['LAST_NAME']
all_df.drop(['FIRST_NAME', 'LAST_NAME', 'P_ID','TEAM_ID',
             'TEAM_ABBREVIATION','TEAM','factor','vop','drbp',
             'uPER','T_PACE','L_PACE','adjustment','aPER',
            'GS','MIN','FGM','FGA','FG3M','FG3A','FTM','FTA'], axis=1, inplace=True)

all_df['REB'] = all_df['REB'] / all_df['GP']
all_df['AST'] = all_df['AST'] / all_df['GP']
all_df['STL'] = all_df['STL'] / all_df['GP']
all_df['BLK'] = all_df['BLK'] / all_df['GP']
all_df['TOV'] = all_df['TOV'] / all_df['GP']
all_df['PF'] = all_df['PF'] / all_df['GP']
all_df['PTS'] = all_df['PTS'] / all_df['GP']

all_df['SEASON_ID'] = all_df.SEASON_ID.apply(lambda x: int(x[:4]))
all_df.head()

Unnamed: 0,SEASON_ID,AGE,GP,FG%,FG3%,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,MPG,PER,NAME
0,1983,37,80,0.578,0.0,0.723,169.0,418.0,7.3375,2.6375,0.6875,1.7875,2.7625,2.6375,21.4625,32.78,23.05,Kareem Abdul-Jabbar
1,1983,29,70,0.462,0.0,0.825,118.0,201.0,4.557143,3.128571,1.042857,0.442857,1.671429,2.785714,9.571429,20.74,17.29,Alvan Adams
2,1983,24,79,0.524,0.268,0.749,161.0,308.0,5.936709,4.531646,1.012658,0.278481,3.607595,3.113924,29.493671,36.71,24.34,Mark Aguirre
3,1983,25,71,0.46,0.273,0.821,29.0,87.0,1.633803,2.28169,0.577465,0.056338,0.985915,2.014085,5.408451,16.25,11.96,Danny Ainge
4,1983,23,78,0.426,0.158,0.773,136.0,270.0,5.205128,2.474359,0.589744,0.358974,1.397436,2.346154,8.5,17.69,17.2,Richard Anderson


In [61]:
check = all_df[['NAME', 'PER']]
check.shape

(12373, 2)

## Lagging

In [62]:
labels = all_df[['NAME','PER','SEASON_ID']]
labels['SEASON_ID'] = labels['SEASON_ID']-1
labels = labels.rename(columns={'PER':'NEXT_PER'})
all_df = all_df.merge(labels, on=['SEASON_ID','NAME'], how='left')    

idx = all_df[all_df.NEXT_PER.isnull()].index.to_list()
check.drop(idx, axis=0, inplace=True)
check.reset_index(drop=True, inplace=True)

all_df.dropna(inplace=True)
all_df.reset_index(drop=True, inplace=True)
all_df.drop(['SEASON_ID','NAME','PER'], axis=1, inplace=True)
all_df.shape, check.shape

KeyError: '[4 5 16 ... 12817 12818 12819] not found in axis'

## Machine Learning

In [14]:
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import train_test_split, cross_validate, \
    cross_val_score, RepeatedKFold
from sklearn.preprocessing import RobustScaler, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_validate
import numpy as np
from tqdm import tqdm

In [15]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if
                dataframe[col].dtypes == "O"]

    num_but_cat = [col for col in dataframe.columns if
                   dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]

    cat_but_car = [col for col in dataframe.columns if
                   dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]

    cat_cols = cat_cols + num_but_cat

    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if
                dataframe[col].dtypes != "O"]

    num_cols = [col for col in num_cols if col not in num_but_cat]

    return cat_cols, num_cols, cat_but_car, num_but_cat

In [16]:
def validate(model, X, y):
        results = pd.DataFrame(cross_validate(model, X, y, cv=5,
                                              scoring=["neg_mean_squared_error",
                                                       "r2"]))
        results["test_neg_mean_squared_error"] = results[
            "test_neg_mean_squared_error"].apply(lambda x: -x)
        results["rmse"] = results["test_neg_mean_squared_error"].apply(
            lambda x: np.sqrt(x))
        return results.mean().to_frame().T

In [17]:
def results(dataframe, target, scale=False, ordinal=False):
    
    X = dataframe.drop(target, axis=1)
    y = dataframe[target]

    if scale:
        cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(X)
        ss = StandardScaler()
        for col in num_cols:
            X[col] = ss.fit_transform(X[[col]])

    if ordinal:
        cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(X)
        for col in cat_cols:
            if X[col].dtype.name == 'category':
                oe = OrdinalEncoder(
                    categories=[X[col].dtype.categories.to_list()])
                X[col] = oe.fit_transform(X[[col]])

    X = pd.get_dummies(X, drop_first=True)

    models = [catboost.CatBoostRegressor(random_state=42, silent=True),
              RandomForestRegressor(random_state=42),
              ExtraTreesRegressor(random_state=42),
              xgb.XGBRegressor(random_state=42),
              lgbm.LGBMRegressor(random_state=42)]

    result = pd.DataFrame()
    for model in tqdm(models, desc='Fitting '):
        mdl = model
        res = validate(mdl, X, y)
        result = pd.concat([result, res])

    result.index = ['CatB','RF', 'ET', 'XGB','LGBM']
    result = result[['test_neg_mean_squared_error', 'test_r2', 'rmse']]
    result = result.rename(columns={'test_neg_mean_squared_error': 'MSE',
                                    'test_r2': 'R2',
                                    'rmse': 'RMSE'})
    return result.T

In [18]:
results(all_df, 'NEXT_PER', scale=True)

Fitting : 100%|██████████| 5/5 [02:08<00:00, 25.66s/it]


Unnamed: 0,CatB,RF,ET,XGB,LGBM
MSE,7.78802,8.210121,8.110977,8.676255,7.883905
R2,0.577294,0.554382,0.560513,0.528539,0.571863
RMSE,2.788923,2.863343,2.845194,2.943948,2.806212


## Feature Selection

In [19]:
from sklearn.feature_selection import RFE

In [20]:
rmse = {}
for i in range(9,15): 
    print(f"Trying {i} features.")
    X = all_df.drop('NEXT_PER', axis=1)
    y = all_df['NEXT_PER']
    cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(X)
    ss = StandardScaler()
    for col in num_cols:
        X[col] = ss.fit_transform(X[[col]])
    X = pd.get_dummies(X, drop_first=True)
    rfe_selector = RFE(estimator=catboost.CatBoostRegressor(silent=True,
                                                            random_state=42),
                       n_features_to_select=i, step=1, verbose=0)
    rfe_selector.fit(X, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:,rfe_support].columns.tolist()
    X = X[rfe_feature]
    models = [('CatB', catboost.CatBoostRegressor(silent=True, random_state=42)),
              ('RF', RandomForestRegressor(random_state=42)),
              ('ET', ExtraTreesRegressor(random_state=42)),
              ('XGB', xgb.XGBRegressor(random_state=42)),
              ('LGBM', lgbm.LGBMRegressor(random_state=42))]
    for id, mod in models:
        rmse[f"{id}_{i}"] = (validate(mod, X, y).rmse.values[0], rfe_feature)

min_rmse = min(rmse.values())[0]
using = (min(rmse.values())[1])
est = min(rmse, key=rmse.get)

print(f"--- Min RMSE ---\n{min_rmse}\n--- Used Features ---\n{using}\n--- Estimator ---\n{est}")

Trying 9 features.


KeyboardInterrupt: 

## Hiperparametre

import optuna
from IPython.display import clear_output

# CatBoost
def objective(trial):
    iterations = trial.suggest_int('iterations', 500, 1000)
    depth = trial.suggest_int('depth', 6, 10)
    model = catboost.CatBoostRegressor(random_seed=42,
                                       loss_function='RMSE',
                                       logging_level='Silent',
                                       iterations=iterations,
                                       depth=depth)
    score = np.sqrt(-cross_val_score(model, X, y, cv=5,
                                     scoring="neg_mean_squared_error"))
    clear_output()
    return score.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500)
study.best_params

In [21]:
X = all_df.drop('NEXT_PER', axis=1)
y = all_df['NEXT_PER']
cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(X)
ss = StandardScaler()
for col in num_cols:
    X[col] = ss.fit_transform(X[[col]])

## Get predictions

In [22]:
cb = catboost.CatBoostRegressor(random_state=42, silent=True)
cb.fit(X, y)
y_pred = cb.predict(X)
all_df['y_pred'] = y_pred

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

In [23]:
all_df.head()

Unnamed: 0,AGE,GP,FG%,FG3%,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,MPG,NEXT_PER,y_pred
0,37,80,0.578,0.0,0.723,169.0,418.0,7.3375,2.6375,0.6875,1.7875,2.7625,2.6375,21.4625,32.78,25.71,23.771302
1,29,70,0.462,0.0,0.825,118.0,201.0,4.557143,3.128571,1.042857,0.442857,1.671429,2.785714,9.571429,20.74,22.2,16.3194
2,24,79,0.524,0.268,0.749,161.0,308.0,5.936709,4.531646,1.012658,0.278481,3.607595,3.113924,29.493671,36.71,23.28,24.555229
3,25,71,0.46,0.273,0.821,29.0,87.0,1.633803,2.28169,0.577465,0.056338,0.985915,2.014085,5.408451,16.25,15.5,12.577146
4,24,76,0.423,0.118,0.793,49.0,107.0,2.052632,4.381579,1.026316,0.052632,2.236842,1.486842,8.855263,22.53,14.94,14.690903


In [215]:
all_df.tail()

Unnamed: 0,AGE,GP,FG%,FG3%,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,MPG,NEXT_PER,y_pred
12504,25,41,0.514,0.374,0.631,76.0,319.0,9.634146,1.731707,0.829268,1.170732,1.95122,2.146341,20.97561,32.34,18.34,19.898169
12505,29,36,0.464,0.348,0.789,37.0,127.0,4.555556,5.027778,1.611111,0.527778,1.333333,1.25,10.388889,27.75,11.58,13.976601
12506,29,27,0.462,0.398,0.833,28.0,77.0,3.888889,3.592593,1.592593,0.407407,1.296296,1.111111,10.037037,27.75,11.58,13.081779
12508,22,63,0.438,0.343,0.886,38.0,207.0,3.888889,9.428571,0.84127,0.190476,4.142857,1.761905,25.301587,33.73,23.43,24.029415
12510,24,72,0.652,0.25,0.789,189.0,330.0,7.208333,1.25,0.333333,0.861111,1.125,2.597222,9.027778,22.35,18.67,18.269461
