In [1]:
import pandas as pd
import numpy  as np

import polars as pl

In [2]:
#results = pd.read_parquet('../data/raw/pl_game_results.parquet')

In [3]:
results = pl.read_delta('../data/raw/game_results.delta')

In [4]:
results.filter(pl.col('date').dt.year() == 2023).sort('round')

season,round,venue,date,home_team_name,home_team_points,away_team_name,away_team_points
i64,i64,str,date,str,i16,str,i16
2023,1,"""CommBank Stadi…",2023-03-02,"""Eels""",12,"""Storm""",16
2023,1,"""Sky Stadium, W…",2023-03-03,"""Warriors""",20,"""Knights""",12
2023,1,"""BlueBet Stadiu…",2023-03-03,"""Panthers""",12,"""Broncos""",13
2023,1,"""4 Pines Park, …",2023-03-04,"""Sea Eagles""",31,"""Bulldogs""",6
2023,1,"""Queensland Cou…",2023-03-04,"""Cowboys""",19,"""Raiders""",18
2023,1,"""PointsBet Stad…",2023-03-04,"""Sharks""",18,"""Rabbitohs""",27
2023,1,"""Suncorp Stadiu…",2023-03-05,"""Dolphins""",28,"""Roosters""",18
2023,1,"""Leichhardt Ova…",2023-03-05,"""Wests Tigers""",10,"""Titans""",22
2023,2,"""BlueBet Stadiu…",2023-03-09,"""Panthers""",16,"""Rabbitohs""",10
2023,2,"""CommBank Stadi…",2023-03-10,"""Eels""",26,"""Sharks""",30


In [5]:
#results = results.with_columns(pl.col("home_team_points").cast(pl.Int64))

In [6]:
#rdf = results 

#all_results

In [7]:
def result_expander(rdf):
    '''
    Duplicate results table such that each game has a row where each team is prime and opt.

    This creates a dataframe that can be used for creating lagged effects on each team's performance.
    '''
    results_home = rdf.clone()
    results_away = rdf.clone()
    results_home = results_home.with_columns(home_away=pl.lit('home'))
    results_away = results_away.with_columns(home_away=pl.lit('away'))

    home_cols = ['season', 'round', 'venue', 'date', 'prime_team', 'prime_team_points',
                    'opt_team', 'opt_team_points', 'home_away']
    away_cols = ['season', 'round', 'venue', 'date', 'opt_team', 'opt_team_points',
                    'prime_team', 'prime_team_points', 'home_away']
    results_home.columns = home_cols
    results_away.columns = away_cols

    results_home = results_home.select(home_cols)
    results_away = results_away.select(home_cols) # this looks weird but is intended to select these columns so that the two dataframes can be concatenated

    all_results = pl.concat([results_home, results_away])
    all_results = all_results.with_columns(prime_margin=pl.col('prime_team_points') - pl.col('opt_team_points'))
    all_results=all_results.with_columns(result=np.sign(pl.col('prime_margin')))
    #all_results = all_results.reset_index(drop=True)

        ## make categorical things categorical types
    all_results=all_results.with_columns(home_away=pl.col('home_away').cast(pl.Categorical))
    all_results=all_results.with_columns(venue=pl.col('venue').cast(pl.Categorical))
    all_results=all_results.with_columns(prime_team=pl.col('prime_team').cast(pl.Categorical))
    all_results=all_results.with_columns(opt_team=pl.col('opt_team').cast(pl.Categorical))
    return all_results

    
def create_features(erdf):
    erdf = erdf.sort('prime_team', 'date')
    lagged_cols = [f'prime_margin_{i}' for i in range(1,7)]
    lags = erdf.groupby('prime_team').agg([pl.col('prime_margin').shift(i).alias(f"prime_margin_{i}") for i in range(1,7)]).explode(lagged_cols)
    lags = lags.select(*lagged_cols)
    l6wm = lags.mean(axis=1).alias('pm_l6w').to_frame()
    erdf = pl.concat([erdf, lags, l6wm], how='horizontal')

    return erdf

In [8]:
unplayed = pl.read_delta("../data/raw/unplayed_games.delta")
next_round = unplayed.filter(pl.col('round') == pl.col('round').min())

results = pl.concat([results, next_round])

In [9]:
next_round

season,round,venue,date,home_team_name,home_team_points,away_team_name,away_team_points
i64,i64,str,date,str,i16,str,i16
2023,11,"""AAMI Park, Mel…",2023-05-11,"""Storm""",,"""Broncos""",
2023,11,"""Accor Stadium,…",2023-05-12,"""Bulldogs""",,"""Warriors""",
2023,11,"""BlueBet Stadiu…",2023-05-12,"""Panthers""",,"""Roosters""",
2023,11,"""Accor Stadium,…",2023-05-13,"""Rabbitohs""",,"""Wests Tigers""",
2023,11,"""Queensland Cou…",2023-05-13,"""Cowboys""",,"""Dragons""",
2023,11,"""GIO Stadium, C…",2023-05-13,"""Raiders""",,"""Eels""",
2023,11,"""McDonald Jones…",2023-05-14,"""Knights""",,"""Titans""",
2023,11,"""4 Pines Park, …",2023-05-14,"""Sea Eagles""",,"""Sharks""",


In [10]:
expanded_results = result_expander(results)
feature_df = create_features(expanded_results)

In [11]:
feature_df.filter(pl.col('result')==0)

season,round,venue,date,prime_team,prime_team_points,opt_team,opt_team_points,home_away,prime_margin,result,prime_margin_1,prime_margin_2,prime_margin_3,prime_margin_4,prime_margin_5,prime_margin_6,pm_l6w
i64,i64,cat,date,cat,i16,cat,i16,cat,i16,i16,i16,i16,i16,i16,i16,i16,f64
2019,17,"""Suncorp Stadiu…",2019-07-13,"""Warriors""",18,"""Broncos""",18,"""away""",0,0,4,-1,4,-22,-6,20,-0.166667
2020,3,"""Campbelltown S…",2020-05-31,"""Knights""",14,"""Panthers""",14,"""away""",0,0,18,20,-44,34,-42,36,3.666667
2023,5,"""Glen Willow Ov…",2023-04-01,"""Knights""",32,"""Sea Eagles""",32,"""away""",0,0,10,-16,2,-8,-22,-10,-7.333333
2023,5,"""Glen Willow Ov…",2023-04-01,"""Knights""",32,"""Sea Eagles""",32,"""away""",0,0,0,10,-16,2,-8,-22,-5.666667
2019,17,"""Suncorp Stadiu…",2019-07-13,"""Broncos""",18,"""Warriors""",18,"""home""",0,0,2,-14,-28,-8,6,5,-6.166667
2023,5,"""Glen Willow Ov…",2023-04-01,"""Sea Eagles""",32,"""Knights""",32,"""home""",0,0,-1,4,25,-1,-42,-34,-8.166667
2023,5,"""Glen Willow Ov…",2023-04-01,"""Sea Eagles""",32,"""Knights""",32,"""home""",0,0,0,-1,4,25,-1,-42,-2.5
2020,3,"""Campbelltown S…",2020-05-31,"""Panthers""",14,"""Knights""",14,"""home""",0,0,4,6,44,-16,-14,-12,2.0


In [12]:
feature_df.write_parquet('../data/raw/features.parquet')

In [13]:
feature_df.sort(['season', 'round'])

season,round,venue,date,prime_team,prime_team_points,opt_team,opt_team_points,home_away,prime_margin,result,prime_margin_1,prime_margin_2,prime_margin_3,prime_margin_4,prime_margin_5,prime_margin_6,pm_l6w
i64,i64,cat,date,cat,i16,cat,i16,cat,i16,i16,i16,i16,i16,i16,i16,i16,f64
2018,1,"""Optus Stadium,…",2018-03-10,"""Rabbitohs""",20,"""Warriors""",32,"""home""",-12,-1,,,,,,,
2018,1,"""Optus Stadium,…",2018-03-10,"""Warriors""",32,"""Rabbitohs""",20,"""away""",12,1,,,,,,,
2018,1,"""Optus Stadium,…",2018-03-10,"""Storm""",36,"""Bulldogs""",18,"""away""",18,1,,,,,,,
2018,1,"""McDonald Jones…",2018-03-09,"""Knights""",19,"""Sea Eagles""",18,"""home""",1,1,,,,,,,
2018,1,"""Cbus Super Sta…",2018-03-11,"""Titans""",30,"""Raiders""",28,"""home""",2,1,,,,,,,
2018,1,"""Panthers Stadi…",2018-03-11,"""Eels""",14,"""Panthers""",24,"""away""",-10,-1,,,,,,,
2018,1,"""Optus Stadium,…",2018-03-10,"""Bulldogs""",18,"""Storm""",36,"""home""",-18,-1,,,,,,,
2018,1,"""Netstrata Jubi…",2018-03-08,"""Broncos""",12,"""Dragons""",34,"""away""",-22,-1,,,,,,,
2018,1,"""Stadium Austra…",2018-03-10,"""Wests Tigers""",10,"""Roosters""",8,"""home""",2,1,,,,,,,
2018,1,"""1300SMILES Sta…",2018-03-09,"""Cowboys""",20,"""Sharks""",14,"""home""",6,1,,,,,,,


In [14]:
feature_df.filter(pl.col('season') == 2023)

season,round,venue,date,prime_team,prime_team_points,opt_team,opt_team_points,home_away,prime_margin,result,prime_margin_1,prime_margin_2,prime_margin_3,prime_margin_4,prime_margin_5,prime_margin_6,pm_l6w
i64,i64,cat,date,cat,i16,cat,i16,cat,i16,i16,i16,i16,i16,i16,i16,i16,f64
2023,1,"""PointsBet Stad…",2023-03-04,"""Rabbitohs""",27,"""Sharks""",18,"""away""",9,1,-10,10,-4,26,38,-1,9.833333
2023,2,"""BlueBet Stadiu…",2023-03-09,"""Rabbitohs""",10,"""Panthers""",16,"""away""",-6,-1,9,-10,10,-4,26,38,11.5
2023,3,"""Allianz Stadiu…",2023-03-17,"""Rabbitohs""",18,"""Roosters""",20,"""away""",-2,-1,-6,9,-10,10,-4,26,4.166667
2023,4,"""Accor Stadium,…",2023-03-25,"""Rabbitohs""",13,"""Sea Eagles""",12,"""home""",1,1,-2,-6,9,-10,10,-4,-0.5
2023,5,"""Accor Stadium,…",2023-03-31,"""Rabbitohs""",10,"""Storm""",18,"""home""",-8,-1,1,-2,-6,9,-10,10,0.333333
2023,5,"""Accor Stadium,…",2023-03-31,"""Rabbitohs""",10,"""Storm""",18,"""home""",-8,-1,-8,1,-2,-6,9,-10,-2.666667
2023,6,"""Accor Stadium,…",2023-04-07,"""Rabbitohs""",50,"""Bulldogs""",16,"""away""",34,1,-8,-8,1,-2,-6,9,-2.333333
2023,7,"""Suncorp Stadiu…",2023-04-13,"""Rabbitohs""",36,"""Dolphins""",14,"""away""",22,1,34,-8,-8,1,-2,-6,1.833333
2023,8,"""Accor Stadium,…",2023-04-20,"""Rabbitohs""",20,"""Panthers""",18,"""home""",2,1,22,34,-8,-8,1,-2,6.5
2023,9,"""Suncorp Stadiu…",2023-04-28,"""Rabbitohs""",32,"""Broncos""",6,"""away""",26,1,2,22,34,-8,-8,1,7.166667
