In [1]:
# /Users/chriswesterman/Library/Jupyter/nbextensions/snippets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

# Exploratory notebook

In [2]:
# Directory that contains the SQLite dump; change this to match your checkout.
path = "../data/"
database = ''.join([path, 'database.sqlite'])

# Single connection reused by every query in this notebook.
conn = sqlite3.connect(database)


In [3]:
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', 500)

# Competition and season to analyse. The id is the value of Match.league_id;
# NOTE(review): presumably the English top flight, judging by the clubs in the
# output below -- confirm against the League table.
LEAGUE_ID = 1729
SEASON = '2015/2016'

# One row per fixture, with both team ids resolved to their long names.
# The filter values are bound as query parameters instead of being spliced
# into the SQL text, so changing league/season only means editing the
# constants above.
matches = pd.read_sql(
    """
    SELECT m.id,
           m.season, m.stage, m.date,
           ht.team_long_name AS home_team,
           at.team_long_name AS away_team,
           m.home_team_goal,
           m.away_team_goal
    FROM Match AS m
    LEFT JOIN Team AS ht ON ht.team_api_id = m.home_team_api_id
    LEFT JOIN Team AS at ON at.team_api_id = m.away_team_api_id
    WHERE m.league_id = ? AND m.season = ?
    ;""",
    conn,
    params=(LEAGUE_ID, SEASON),
)
matches.head()

Unnamed: 0,id,season,stage,date,home_team,away_team,home_team_goal,away_team_goal
0,4389,2015/2016,1,2015-08-09 00:00:00,Arsenal,West Ham United,0,2
1,4390,2015/2016,1,2015-08-08 00:00:00,Bournemouth,Aston Villa,0,1
2,4391,2015/2016,1,2015-08-08 00:00:00,Chelsea,Swansea City,2,2
3,4392,2015/2016,1,2015-08-08 00:00:00,Everton,Watford,2,2
4,4393,2015/2016,1,2015-08-08 00:00:00,Leicester City,Sunderland,4,2


In [4]:
# Put fixtures in chronological order; the running totals computed later
# depend on this row order. The original row labels are kept.
matches = matches.sort_values(by='date')

In [5]:
# Re-display the first rows to confirm the frame is now in date order.
matches.head()

Unnamed: 0,id,season,stage,date,home_team,away_team,home_team_goal,away_team_goal
1,4390,2015/2016,1,2015-08-08 00:00:00,Bournemouth,Aston Villa,0,1
2,4391,2015/2016,1,2015-08-08 00:00:00,Chelsea,Swansea City,2,2
3,4392,2015/2016,1,2015-08-08 00:00:00,Everton,Watford,2,2
4,4393,2015/2016,1,2015-08-08 00:00:00,Leicester City,Sunderland,4,2
5,4394,2015/2016,1,2015-08-08 00:00:00,Manchester United,Tottenham Hotspur,1,0


In [6]:
# One-hot encode the result of each fixture as three 0/1 integer columns.
home_goals = matches['home_team_goal']
away_goals = matches['away_team_goal']

matches['home_w'] = (home_goals > away_goals).astype('int64')
matches['away_w'] = (home_goals < away_goals).astype('int64')
matches['draw'] = (home_goals == away_goals).astype('int64')

In [7]:
def _prior_running_total(frame, team_col, value_col):
    """Per-team running total of ``value_col`` *before* each row.

    Rows are grouped by ``team_col``; within each group the values are
    cumulatively summed and shifted down one row, so the current fixture is
    excluded and a team's first row gets 0. Row order inside ``frame`` is
    used as-is, so the frame must already be sorted chronologically.

    ``transform`` is used rather than ``groupby(...).apply``: it always
    returns a result indexed like ``frame``. On pandas >= 2.0 ``apply``
    prepends the group key to the index, which makes the column assignment
    below fail.
    """
    return frame.groupby(team_col)[value_col].transform(
        lambda s: s.cumsum().shift(fill_value=0)
    )


# --- Home side: record in previous HOME fixtures only ----------------------
matches['home_t_home_goals'] = _prior_running_total(matches, 'home_team', 'home_team_goal')
# Placeholder so the column sits next to its home-only counterpart; it is
# filled in later by ht_total_goals.
matches['home_t_total_goals'] = 0
matches['home_t_goals_against'] = _prior_running_total(matches, 'home_team', 'away_team_goal')

matches['home_t_home_wins'] = _prior_running_total(matches, 'home_team', 'home_w')
# A home team's loss is an away win, hence the away_w flag.
matches['home_t_home_losses'] = _prior_running_total(matches, 'home_team', 'away_w')
matches['home_t_home_draws'] = _prior_running_total(matches, 'home_team', 'draw')

# --- Away side: record in previous AWAY fixtures only ----------------------
matches['away_t_away_goals'] = _prior_running_total(matches, 'away_team', 'away_team_goal')
# Placeholder; filled in later by at_total_goals.
matches['away_t_total_goals'] = 0
matches['away_t_away_goals_against'] = _prior_running_total(matches, 'away_team', 'home_team_goal')

matches['away_t_away_wins'] = _prior_running_total(matches, 'away_team', 'away_w')
# An away team's loss is a home win, hence the home_w flag.
matches['away_t_away_losses'] = _prior_running_total(matches, 'away_team', 'home_w')
matches['away_t_away_draws'] = _prior_running_total(matches, 'away_team', 'draw')

# TODO: the season-wide columns (*_total_goals_against, *_total_wins,
# *_total_losses, *_total_draws) for both sides are not computed yet.

In [None]:
# Preview the frame with the running-total columns added by the previous cell.
matches.head()

In [None]:
# Sanity check: goals Everton scored away from home in stages 1-3.
everton_early_away = matches[(matches['away_team'] == 'Everton') & (matches['stage'] < 4)]
everton_early_away.groupby('away_team')['away_team_goal'].sum()

In [None]:
# Every Everton fixture, home or away, for checking the running totals by hand.
matches[(matches.home_team =='Everton') | (matches.away_team == 'Everton')]

# Expected: in row 3, home_t_total_goals should be 5
# (2 home goals in week 1 + 3 away goals in week 2).

In [8]:
def ht_total_goals(st, ht, hg, df=None):
    """Goals the home team has scored so far, home and away combined.

    Parameters
    ----------
    st : int
        Stage (match week) of the fixture being scored.
    ht : str
        Name of the home team in that fixture.
    hg : int
        Goals the team has already scored in earlier home fixtures
        (the ``home_t_home_goals`` value of the row).
    df : pandas.DataFrame, optional
        Fixture table with ``stage``, ``away_team`` and ``away_team_goal``
        columns. Defaults to the notebook-level ``matches`` frame.

    Returns
    -------
    int
        ``hg`` plus the goals ``ht`` scored as the away side in every
        fixture whose stage is lower than ``st``; 0 for stage 1.
    """
    if df is None:
        df = matches
    if st == 1:
        # Nothing has been played before the first stage.
        return 0
    # NOTE(review): earlier fixtures are selected by stage number while `hg`
    # is accumulated in row order; rescheduled fixtures could make the two
    # disagree -- confirm this is acceptable.
    earlier_away = df.loc[
        (df['stage'] < st) & (df['away_team'] == ht), 'away_team_goal'
    ]
    # Summing the filtered column directly yields 0 when the team has not
    # played away yet. The previous groupby(...).sum()[0] raised on that
    # empty result and relied on deprecated positional Series indexing.
    return earlier_away.sum() + hg
    
    
    

def at_total_goals(st, at, ag, df=None):
    """Goals the away team has scored so far, home and away combined.

    Parameters
    ----------
    st : int
        Stage (match week) of the fixture being scored.
    at : str
        Name of the away team in that fixture.
    ag : int
        Goals the team has already scored in earlier away fixtures
        (the ``away_t_away_goals`` value of the row).
    df : pandas.DataFrame, optional
        Fixture table with ``stage``, ``home_team`` and ``home_team_goal``
        columns. Defaults to the notebook-level ``matches`` frame.

    Returns
    -------
    int
        ``ag`` plus the goals ``at`` scored as the home side in every
        fixture whose stage is lower than ``st``; 0 for stage 1.
    """
    if df is None:
        df = matches
    if st == 1:
        # Nothing has been played before the first stage.
        return 0
    # NOTE(review): earlier fixtures are selected by stage number while `ag`
    # is accumulated in row order; rescheduled fixtures could make the two
    # disagree -- confirm this is acceptable.
    earlier_home = df.loc[
        (df['stage'] < st) & (df['home_team'] == at), 'home_team_goal'
    ]
    # Summing the filtered column directly yields 0 when the team has not
    # played at home yet. The previous groupby(...).sum()[0] raised on that
    # empty result and relied on deprecated positional Series indexing.
    return earlier_home.sum() + ag


# Fill the season-wide goal tallies row by row: each fixture combines the
# team's same-venue running total with its goals at the other venue.
matches['home_t_total_goals'] = matches.apply(
    lambda fixture: ht_total_goals(
        fixture['stage'], fixture['home_team'], fixture['home_t_home_goals']
    ),
    axis=1,
)

matches['away_t_total_goals'] = matches.apply(
    lambda fixture: at_total_goals(
        fixture['stage'], fixture['away_team'], fixture['away_t_away_goals']
    ),
    axis=1,
)



In [11]:
# All Arsenal fixtures (either side) to eyeball the computed tallies.
matches.loc[matches['home_team'].eq('Arsenal') | matches['away_team'].eq('Arsenal')]

Unnamed: 0,id,season,stage,date,home_team,away_team,home_team_goal,away_team_goal,home_w,away_w,draw,home_t_home_goals,home_t_total_goals,home_t_goals_against,home_t_home_wins,home_t_home_losses,home_t_home_draws,away_t_away_goals,away_t_total_goals,away_t_away_goals_against,away_t_away_wins,away_t_away_losses,away_t_away_draws
0,4389,2015/2016,1,2015-08-09 00:00:00,Arsenal,West Ham United,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
111,4500,2015/2016,2,2015-08-16 00:00:00,Crystal Palace,Arsenal,1,2,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0
220,4609,2015/2016,3,2015-08-24 00:00:00,Arsenal,Liverpool,0,0,0,0,1,0,2,2,0,1,0,1,2,0,1,0,0
325,4714,2015/2016,4,2015-08-29 00:00:00,Newcastle United,Arsenal,0,1,0,1,0,2,2,2,0,0,1,2,2,1,1,0,0
330,4719,2015/2016,5,2015-09-12 00:00:00,Arsenal,Stoke City,2,0,1,0,0,0,3,2,0,1,1,3,3,3,0,0,2
342,4731,2015/2016,6,2015-09-19 00:00:00,Chelsea,Arsenal,2,0,1,0,0,3,7,4,0,1,1,3,5,1,2,0,0
350,4739,2015/2016,7,2015-09-26 00:00:00,Leicester City,Arsenal,2,5,0,1,0,8,13,5,2,0,1,3,5,3,2,1,0
360,4749,2015/2016,8,2015-10-04 00:00:00,Arsenal,Manchester United,3,0,1,0,0,2,10,2,1,1,1,5,12,4,2,1,0
378,4767,2015/2016,9,2015-10-17 00:00:00,Watford,Arsenal,0,3,0,1,0,1,6,1,1,1,2,8,13,5,3,1,0
10,4399,2015/2016,10,2015-10-24 00:00:00,Arsenal,Everton,2,1,1,0,0,5,16,2,2,1,1,6,12,2,2,0,2


In [None]:
# Debug aid: for every fixture, print the home team's earlier away goals
# (grouped by team) plus its running home tally.
for _, fixture in matches.iterrows():
    played_away_before = (matches['stage'] < fixture['stage']) & (matches['away_team'] == fixture['home_team'])
    away_goal_sum = matches[played_away_before].groupby('away_team')['away_team_goal'].sum()
    print(away_goal_sum + fixture['home_t_home_goals'])

In [None]:
# Tiny throwaway frame for trying out row-wise apply.
test = pd.DataFrame({
    'a': [1, 1, 2, 2],
    'b': [2, 5, 5, 2],
    'c': [3, 6, 6, 1],
})

print(test)

def test_func(a,b):
    return a + b


# Column 'd' is test_func evaluated on each row's 'a' and 'b' values.
test['d'] = [test_func(a_val, b_val) for a_val, b_val in zip(test['a'], test['b'])]

print(test)