In [56]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [57]:
from dataset import get_dataset

In [58]:
points_data_set, _ = get_dataset()

In [59]:
df = pd.DataFrame.from_dict(points_data_set, orient='index')
df = df.drop(columns=['first_name', 'last_name', 'name'])
df[df['id'] == 387]

Unnamed: 0,team,id,position,GW77,GW78,GW80,GW81,GW82,GW83,GW84,...,GW299,GW195,GW107,GW217,GW223,GW224,GW190,GW229,GW218,GW304
387,MUN,387,DEF,"{'diff': 2, 'points': 12, 'team': 'FUL'}","{'diff': 4, 'points': -2, 'team': 'LIV'}","{'diff': 2, 'points': 0, 'team': 'NFO'}","{'diff': 2, 'points': 9, 'team': 'IPS'}","{'diff': 2, 'points': 9, 'team': 'NEW'}","{'diff': 3, 'points': 1, 'team': 'AVL'}","{'diff': 3, 'points': 2, 'team': 'WOL'}",...,,,,,,,,,,


In [60]:
gw_columns = [col for col in df.columns if col.startswith('GW')]
for col in gw_columns:
    df[col] = df[col].apply(lambda x: x['points'] if pd.notnull(x) else None)
df_melted = df.melt(id_vars=['team', 'position', 'id'],
                    value_vars=gw_columns,
                    var_name='GW',
                    value_name='Points')
df_melted.dropna(subset=['Points'], inplace=True)
df_melted.head(1)

Unnamed: 0,team,position,id,GW,Points
0,MUN,DEF,387,GW77,12.0


In [61]:
df_melted['GW'] = df_melted['GW'].str.replace('GW', '').astype(int)
df_melted = pd.get_dummies(df_melted, columns=['team', 'position'])
df_melted.head(1)

Unnamed: 0,id,GW,Points,team_ARS,team_AVL,team_BHA,team_BOU,team_BRE,team_CHE,team_CRY,...,team_NEW,team_NFO,team_SOU,team_TOT,team_WHU,team_WOL,position_DEF,position_FWD,position_GKP,position_MID
0,387,77,12.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [62]:
df_melted['RecentFormAvg'] = df_melted.groupby('id')['Points'].rolling(window=3, min_periods=1).mean().reset_index(0,drop=True)
df_melted.head(1)

Unnamed: 0,id,GW,Points,team_ARS,team_AVL,team_BHA,team_BOU,team_BRE,team_CHE,team_CRY,...,team_NFO,team_SOU,team_TOT,team_WHU,team_WOL,position_DEF,position_FWD,position_GKP,position_MID,RecentFormAvg
0,387,77,12.0,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,12.0


In [63]:
df_melted['HistoricalPerformanceAvg'] = df_melted.groupby('id')['Points'].expanding().mean().reset_index(0,drop=True)
df_melted.head()


Unnamed: 0,id,GW,Points,team_ARS,team_AVL,team_BHA,team_BOU,team_BRE,team_CHE,team_CRY,...,team_SOU,team_TOT,team_WHU,team_WOL,position_DEF,position_FWD,position_GKP,position_MID,RecentFormAvg,HistoricalPerformanceAvg
0,387,77,12.0,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,12.0,12.0
1,216,77,3.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,3.0,3.0
2,80,77,6.0,False,False,False,True,False,False,False,...,False,False,False,False,True,False,False,False,6.0,6.0
3,468,77,11.0,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,11.0,11.0
4,309,77,7.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,7.0,7.0


In [64]:
df_melted = df_melted.sort_values(by=['id', 'GW'])
df = df_melted[df_melted['GW'] == df_melted['GW'].max()]

In [65]:
df = df.drop(columns=['GW'])
df.head()

Unnamed: 0,id,Points,team_ARS,team_AVL,team_BHA,team_BOU,team_BRE,team_CHE,team_CRY,team_EVE,...,team_SOU,team_TOT,team_WHU,team_WOL,position_DEF,position_FWD,position_GKP,position_MID,RecentFormAvg,HistoricalPerformanceAvg
72866,1,1.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,2.0,3.689076
73153,2,1.0,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,5.0,3.783784
73007,3,6.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,5.666667,3.684932
73256,5,0.0,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,0.0,0.0
73138,8,2.0,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,1.333333,3.540541


In [66]:
X = df.drop(columns=['Points'])
y = df['Points']

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [68]:
model = LinearRegression()
model.fit(X_train, y_train)

In [69]:
y_pred = model.predict(X_test)

In [70]:
r2_score(y_test, y_pred)

0.22038564600227595