In [75]:
import pandas as pd

# Load the match table; `date` is parsed so it can be compared with timestamps below.
df = pd.read_csv("../data/spi_matches.csv", parse_dates=["date"])
# Keep a single league (id 2411) and only rows without a recorded score1
# (presumably fixtures that have not been played yet -- confirm against the data).
df = df[df.league_id == 2411]
df = df[df.score1.isnull()]

# (year, month, day) tuples for dates that are excluded from the candidate fixtures.
dates_to_ignore = [(2019, 2, 22), (2019, 2, 23), (2019, 2, 24), # League Cup Final
                   (2019, 3, 16), (2019, 3, 17), (2019, 3, 18), # FA Cup Quarters
                   (2019, 4,  6), (2019, 4,  7), (2019, 4,  8)] # FA Cup Semis
# pd.Timestamp is used instead of pd.datetime: that alias was deprecated and
# later removed from pandas, so the original spelling fails on current versions.
dates_to_ignore_dt = [pd.Timestamp(year=y, month=m, day=d) for (y, m, d) in dates_to_ignore]
# 2019 4 30 is week ten
# Home teams to exclude; empty here, so this filter currently removes nothing.
teams_used = []
# Window is exclusive on both ends: strictly after 28 Jan and strictly before 30 Apr 2019.
df = df[(df.date > pd.Timestamp(2019, 1, 28)) &
        (df.date < pd.Timestamp(2019, 4, 30)) &
        (~df.date.isin(dates_to_ignore_dt)) &
        (~df.team1.isin(teams_used))]

In [76]:
# Distinct match dates that survive the filters above.
df["date"].unique()

array(['2019-01-29T00:00:00.000000000', '2019-01-30T00:00:00.000000000',
       '2019-02-02T00:00:00.000000000', '2019-02-03T00:00:00.000000000',
       '2019-02-04T00:00:00.000000000', '2019-02-09T00:00:00.000000000',
       '2019-02-10T00:00:00.000000000', '2019-02-11T00:00:00.000000000',
       '2019-02-26T00:00:00.000000000', '2019-02-27T00:00:00.000000000',
       '2019-03-02T00:00:00.000000000', '2019-03-09T00:00:00.000000000',
       '2019-03-30T00:00:00.000000000', '2019-04-13T00:00:00.000000000',
       '2019-04-20T00:00:00.000000000', '2019-04-27T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [77]:
# Highest prob1 for each team1 across the remaining fixtures, strongest first.
# prob1 is selected *before* aggregating so the max is computed for that one
# column only, instead of for every column of df and then discarded.
max_per_team = (
    df.groupby("team1")["prob1"]
    .max()
    .reset_index()
    .sort_values("prob1", ascending=False)
    .reset_index(drop=True)  # renumber so positional rank == index label
)
max_per_team

Unnamed: 0,team1,prob1
0,Liverpool,0.8885
1,Manchester City,0.8437
2,Tottenham Hotspur,0.7966
3,Chelsea,0.7875
4,Arsenal,0.7065
5,Manchester United,0.6793
6,Leicester City,0.5575
7,Crystal Palace,0.5527
8,Southampton,0.5404
9,Wolverhampton,0.5395


In [78]:
# Highest prob2 for each team2 across the remaining fixtures, strongest first.
# prob2 is selected before aggregating so only that column is reduced.
max_per_team_a = (
    df.groupby("team2")["prob2"]
    .max()
    .reset_index()
    .sort_values("prob2", ascending=False)
    .reset_index(drop=True)
    # Name the team column "team1" so this frame can be merged with the
    # per-team1 table on a common key.
    .rename(columns={"team2": "team1"})
)

In [79]:
# Put each team's best prob1 and best prob2 side by side (inner join on team name).
home_away_comparison = max_per_team.merge(max_per_team_a, on="team1")
# Show the teams whose best prob2 exceeds their best prob1.
prob2_is_better = home_away_comparison["prob2"] > home_away_comparison["prob1"]
home_away_comparison.loc[prob2_is_better]

Unnamed: 0,team1,prob1,prob2


# No teams have their best match away from home

In [80]:
# Look up the fixture behind the best prob1 of the teams ranked 6-9 (positions 6..9
# of max_per_team). Joining on both team1 and prob1 picks out the matching row(s).
lower_ranked = max_per_team.iloc[6:10]
shown_columns = ["date", "team1", "team2", "prob1"]
df.merge(lower_ranked, how="inner", on=["team1", "prob1"]).loc[:, shown_columns]

Unnamed: 0,date,team1,team2,prob1
0,2019-02-09,Southampton,Cardiff City,0.5404
1,2019-03-02,Wolverhampton,Cardiff City,0.5395
2,2019-03-09,Leicester City,Fulham,0.5575
3,2019-03-30,Crystal Palace,Huddersfield Town,0.5527


In [86]:
# For the ten top-ranked teams in max_per_team, find the fixture that carries
# each team's best prob1 (inner join on team1 and prob1).
top_ten = max_per_team.iloc[:10]
pick_columns = ["date", "team1", "team2", "prob1"]
picks = df.merge(top_ten, how="inner", on=["team1", "prob1"]).loc[:, pick_columns]
picks

Unnamed: 0,date,team1,team2,prob1
0,2019-01-29,Arsenal,Cardiff City,0.7065
1,2019-01-29,Manchester United,Burnley,0.6793
2,2019-02-02,Chelsea,Huddersfield Town,0.7875
3,2019-02-09,Southampton,Cardiff City,0.5404
4,2019-02-26,Manchester City,West Ham United,0.8437
5,2019-03-02,Wolverhampton,Cardiff City,0.5395
6,2019-03-09,Leicester City,Fulham,0.5575
7,2019-03-30,Crystal Palace,Huddersfield Town,0.5527
8,2019-04-13,Tottenham Hotspur,Huddersfield Town,0.7966
9,2019-04-27,Liverpool,Huddersfield Town,0.8885


In [83]:
# Fixtures dated 20 April 2019 with both teams' probabilities.
# pd.Timestamp is used instead of pd.datetime: that alias was deprecated and
# later removed from pandas, so the original spelling fails on current versions.
df.loc[df.date == pd.Timestamp(2019, 4, 20), ["team1", "team2", "prob1", "prob2"]]

Unnamed: 0,team1,team2,prob1,prob2
20240,AFC Bournemouth,Fulham,0.525,0.2288
20241,Everton,Manchester United,0.3362,0.4051
20242,Arsenal,Crystal Palace,0.5688,0.1876
20243,Chelsea,Burnley,0.7747,0.0609
20247,Newcastle,Southampton,0.3881,0.3183
20248,Cardiff City,Liverpool,0.0836,0.7509
20251,Wolverhampton,Brighton and Hove Albion,0.4917,0.2117
20252,West Ham United,Leicester City,0.4098,0.3043
20253,Manchester City,Tottenham Hotspur,0.6028,0.1821
20254,Huddersfield Town,Watford,0.2999,0.4093


Use Manchester United in the first week and Arsenal on the 20th (20 April).



In [93]:
# Combined probability, in percent, that every pick wins: the product of the
# individual probabilities. The first two rows of `picks` are skipped and
# replaced by the two hand-entered values below.
united_first_week = .6793  # Manchester United v Burnley, as listed in `picks`
arsenal_on_20th = .5688    # Arsenal v Crystal Palace from the 2019-04-20 table
other_picks = picks.iloc[2:].prob1.product()
round(other_picks * united_first_week * arsenal_on_20th * 100, 1)

1.6