In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
from dataset import get_dataset

In [4]:
points_data_set, _ = get_dataset()
for _, player_data in points_data_set.items():
    points_data_set = player_data
    break
player_data = points_data_set.copy()
for key, value in points_data_set.items():
    if not key.startswith("GW"):
        continue
    
    points_data_set[key] = value['points']

In [116]:
df = pd.DataFrame(points_data_set, index=[0])
df = df.drop(columns=['first_name', 'last_name', 'name', 'team', 'position'])
df.head(100)

Unnamed: 0,id,GW77,GW78,GW80,GW81,GW82,GW83,GW84,GW85,GW86,...,GW296,GW305,GW306,GW307,GW308,GW309,GW310,GW311,GW312,GW313
0,387,12,-2,0,9,9,1,2,1,1,...,0,1,0,4,1,1,2,0,1,6


In [117]:
gw_columns = [col for col in df.columns if col.startswith('GW')]
df_melted = df.melt(id_vars=['id'],
                    value_vars=gw_columns,
                    var_name='GW',
                    value_name='Points')
df_melted.dropna(subset=['Points'], inplace=True)
df_melted.head()

Unnamed: 0,id,GW,Points
0,387,GW77,12
1,387,GW78,-2
2,387,GW80,0
3,387,GW81,9
4,387,GW82,9


In [118]:
df_melted['GW'] = df_melted['GW'].str.replace('GW', '').astype(int)
df_melted.head()

Unnamed: 0,id,GW,Points
0,387,77,12
1,387,78,-2
2,387,80,0
3,387,81,9
4,387,82,9


In [119]:
df_melted['RecentFormAvg'] = df_melted.groupby('id')['Points'].rolling(window=3, min_periods=1).mean().reset_index(0,drop=True)
df_melted.head()

Unnamed: 0,id,GW,Points,RecentFormAvg
0,387,77,12,12.0
1,387,78,-2,5.0
2,387,80,0,3.333333
3,387,81,9,2.333333
4,387,82,9,6.0


In [120]:
df_melted['HistoricalPerformanceAvg'] = df_melted.groupby('id')['Points'].expanding().mean().reset_index(0,drop=True)
df_melted.head()

Unnamed: 0,id,GW,Points,RecentFormAvg,HistoricalPerformanceAvg
0,387,77,12,12.0,12.0
1,387,78,-2,5.0,5.0
2,387,80,0,3.333333,3.333333
3,387,81,9,2.333333,4.75
4,387,82,9,6.0,5.6


In [121]:
df_melted = df_melted.sort_values(by=['id', 'GW'])
df = df_melted
df_melted.head()

Unnamed: 0,id,GW,Points,RecentFormAvg,HistoricalPerformanceAvg
0,387,77,12,12.0,12.0
1,387,78,-2,5.0,5.0
2,387,80,0,3.333333,3.333333
3,387,81,9,2.333333,4.75
4,387,82,9,6.0,5.6


In [122]:
df = df.drop(columns=['GW'])
df.head()

Unnamed: 0,id,Points,RecentFormAvg,HistoricalPerformanceAvg
0,387,12,12.0,12.0
1,387,-2,5.0,5.0
2,387,0,3.333333,3.333333
3,387,9,2.333333,4.75
4,387,9,6.0,5.6


In [123]:
X = df.drop(columns=['Points'])
y = df['Points']

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [128]:
grid = {'max_depth': np.arange(1, 25, 1), 'n_estimators': np.arange(25, 100, 1)}

rfr = RandomForestRegressor(max_features= 1/3)
rfrCV = GridSearchCV(estimator=rfr, param_grid=grid, n_jobs=-1)
rfrCV.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [126]:
print('Random Forest Regressor:')
print(' Optimal Parameters:', rfrCV.best_params_)
print(' Optimal Valid R2 =', rfrCV.best_score_)

Random Forest Regressor:
 Optimal Parameters: {'max_depth': 3, 'n_estimators': 37}
 Optimal Valid R2 = 0.17440065770906868


In [127]:
rfrCV.best_estimator_.score(X_test, y_test)
rfrCV.best_estimator_.predict(X_test)

0.22456570704056456

0.4372

In [129]:
df.head()

Unnamed: 0,id,Points,RecentFormAvg,HistoricalPerformanceAvg
0,387,12,12.0,12.0
1,387,-2,5.0,5.0
2,387,0,3.333333,3.333333
3,387,9,2.333333,4.75
4,387,9,6.0,5.6


In [70]:
def do_forest(player_data, pred_by):
    filtered_data = player_data.copy()
    for key, value in filtered_data.items():
        if not key.startswith("GW"):
            continue

        filtered_data[key] = value['points']

    df = pd.DataFrame(filtered_data, index=[0])
    df = df.drop(columns=['first_name', 'last_name', 'name', 'team', 'position'])

    gw_columns = [col for col in df.columns if col.startswith("GW")]
    df = df.melt(id_vars=['id'],
                        value_vars=gw_columns,
                        var_name='GW',
                        value_name='points')
    df.dropna(subset=['points'], inplace=True)
    df['GW'] = df['GW'].str.replace('GW', '').astype(int)
    df = df.sort_values(by=['id', 'GW'])
    df = df.drop(columns=['GW', 'id'])

    grid = {'max_depth': np.arange(1, 25, 2), 'n_estimators': np.arange(25, 100, 5)}

    rfr = RandomForestRegressor(max_features=1 / 3)
    rfrCV = GridSearchCV(estimator=rfr, param_grid=grid, n_jobs=-1)
    best_estimator = None
    pred = []

    for _ in pred_by:
        df['RecentFormAvg'] = df['points'].rolling(window=3, min_periods=1).mean().reset_index(0, drop=True)
        df['HistoricalPerformanceAvg'] = df['points'].expanding().mean().reset_index(0, drop=True)

        last_row = df.iloc[-1]
        X = df[:-1].drop(columns=['points'])
        y = df[:-1]['points']

        if best_estimator is None:
            rfrCV.fit(X.to_numpy(), y)
            best_estimator = rfrCV.best_estimator_

        pred.append(best_estimator.predict([last_row.drop('points')]))
        df.loc[len(df)] = {'points': pred[-1][0], 'RecentFormAvg': 0, 'HistoricalPerformanceAvg': 0}

    overall = sum(pred)

    if overall / len(pred_by) >= 20 or overall / len(pred_by) <= -20:
        raise "bad"

    return pred

In [71]:
do_forest(player_data, [1, 2]) 

[array([1.87339326]), array([2.36452923])]