In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.plotly as py
import cufflinks as cf

#plotly.offline.init_notebook_mode()

In [2]:
fac = pd.read_csv('facup_v2.csv')

In [3]:
fac.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68543 entries, 0 to 68542
Data columns (total 14 columns):
away_score    68543 non-null float64
away_team     68543 non-null object
date          68543 non-null object
home_score    68543 non-null float64
home_team     68543 non-null object
match_id      68543 non-null int64
qualifier     68543 non-null bool
replay        68543 non-null bool
round         68543 non-null object
round_id      68543 non-null int64
stage         68543 non-null float64
year          68543 non-null int64
year_id       68543 non-null int64
stage_f       68543 non-null float64
dtypes: bool(2), float64(4), int64(4), object(4)
memory usage: 6.9+ MB


In [4]:
fac.head()

Unnamed: 0,away_score,away_team,date,home_score,home_team,match_id,qualifier,replay,round,round_id,stage,year,year_id,stage_f
0,0,Crystal Palace (1),09.03.72,3,Royal Engineers,16066,False,True,Semi-Final,10682,4,1872,71234,13
1,0,Queen's Park (Glasgow),05.03.72,0,Wanderers,16065,False,False,Semi-Final,10682,4,1872,71234,13
2,0,Crystal Palace (1),17.02.72,0,Royal Engineers,16064,False,False,Semi-Final,10682,4,1872,71234,13
3,0,Queen's Park (Glasgow),01.01.01,0,Wanderers,16067,False,True,Semi-Final,10682,4,1872,71234,13
4,5,Royal Engineers,10.01.72,0,Hitchin,16057,False,False,Second Round Proper,10681,2,1872,71234,11


#### Dropping qualifying matches, metadata columns, and setting index to match_id

In [5]:
# Keep only "proper" round matches; `qualifier` is a boolean column, so negate it directly.
fac = fac[~fac.qualifier]

In [6]:
# Keep only the columns used downstream. The explicit .copy() makes `fac` an
# independent frame, so the column assignments in later cells do not write into
# a slice of the filtered frame (SettingWithCopyWarning).
fac = fac[[
    'home_team', 'away_team',
    'home_score', 'away_score',
    'round', 'year', 'date',
    'replay', 'qualifier',
    'stage_f', 'match_id',
]].copy()

In [7]:
fac.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16606 entries, 0 to 68542
Data columns (total 11 columns):
home_team     16606 non-null object
away_team     16606 non-null object
home_score    16606 non-null float64
away_score    16606 non-null float64
round         16606 non-null object
year          16606 non-null int64
date          16606 non-null object
replay        16606 non-null bool
qualifier     16606 non-null bool
stage_f       16606 non-null float64
match_id      16606 non-null int64
dtypes: bool(2), float64(3), int64(2), object(4)
memory usage: 1.3+ MB


#### Regularising text strings to upper-case 

In [8]:
# Normalise the case of every free-text column in one pass.
for text_col in ('home_team', 'away_team', 'round'):
    fac[text_col] = fac[text_col].str.upper()

#### Removing AFC from end of team names, spotted in a later (merge) QC and fix inserted here

First check that AFC does not appear in the middle of any team names

In [9]:
# Remove " AFC" only where it is a suffix: at the very end of the name, or
# directly before a trailing "(n)" disambiguation marker (those markers are
# still present at this point). An " AFC" in the middle of a name is kept.
# regex=True is passed explicitly because the default of `str.replace` differs
# between pandas versions.
for side in ('home_team', 'away_team'):
    fac[side] = fac[side].str.replace(r' AFC(?=$| \(\d\)$)', '', regex=True)

#### Reversing stage counter

In [10]:
# Flip the stage counter so it counts down towards the final:
# stage_f had every final at 14; 15 - stage_f puts every final at 1.
fac = (
    fac
    .assign(stage=15 - fac['stage_f'])
    .drop('stage_f', axis=1)
)

fac.head()

Unnamed: 0,home_team,away_team,home_score,away_score,round,year,date,replay,qualifier,match_id,stage
0,ROYAL ENGINEERS,CRYSTAL PALACE (1),3,0,SEMI-FINAL,1872,09.03.72,True,False,16066,2
1,WANDERERS,QUEEN'S PARK (GLASGOW),0,0,SEMI-FINAL,1872,05.03.72,False,False,16065,2
2,ROYAL ENGINEERS,CRYSTAL PALACE (1),0,0,SEMI-FINAL,1872,17.02.72,False,False,16064,2
3,WANDERERS,QUEEN'S PARK (GLASGOW),0,0,SEMI-FINAL,1872,01.01.01,True,False,16067,2
4,HITCHIN,ROYAL ENGINEERS,0,5,SECOND ROUND PROPER,1872,10.01.72,False,False,16057,4


#### Finding teams with non-unique names

In [11]:
# Every team name that appears in either the home or the away column.
teams = pd.concat([fac.home_team, fac.away_team])

# Names carrying a "(n)" disambiguation marker. The pattern is a raw string:
# "\(" and "\d" are regex escapes, not valid Python string escapes.
teams[teams.str.contains(r'\(\d\)')].unique()

array(['CRYSTAL PALACE (1)', 'BRENTWOOD (1)', 'ROMFORD (1)', 'HENDON (1)',
       'BIRMINGHAM (1)', 'BOOTLE (1)', 'HORNCHURCH (1)', 'BELPER TOWN (1)',
       'LOUGHBOROUGH (1)', 'WORKINGTON (1)', 'BOSTON (1)',
       'BEDFORD TOWN (1)', 'ROMFORD (2)', 'SOUTH SHIELDS (2)',
       'BRIDGWATER TOWN (1)', 'MAIDSTONE UNITED (1)',
       'ACCRINGTON STANLEY (2)', 'DORKING (2)', 'NEWPORT COUNTY (2)',
       'KIDDERMINSTER (1)', 'TAUNTON TOWN (1)', 'GATESHEAD (2)'], dtype=object)

#### Finding any years that have duplicate teams in same year e.g. ARSENAL (1) ARSENAL (2)

In [12]:
def find_dups(group):
    """Return the base team names shared by more than one team in `group`.

    Parameters
    ----------
    group : DataFrame
        Matches to inspect (one year when used with ``groupby('year')``);
        must provide ``home_team`` and ``away_team`` columns.

    Returns
    -------
    Series
        Number of distinct original names per base name, restricted to base
        names that occur more than once. Empty when there is no collision.
    """
    # distinct team names appearing on either side of a match
    teams_unq = pd.Series(
        pd.concat([group.home_team, group.away_team], ignore_index=True).unique()
    )
    # Remove the "(n)" marker AND the whitespace it leaves behind, so that
    # "X (1)" collides with a plain "X" as well as with "X (2)".
    # regex=True is explicit: the default of `str.replace` differs between
    # pandas versions, and a literal match would strip nothing.
    base_names = teams_unq.str.replace(r'\(\d\)', '', regex=True).str.strip()
    teams_unq_strip = base_names.value_counts()
    return teams_unq_strip[teams_unq_strip.values > 1]

# Run the check one season at a time; any row in the result is a base name
# shared by two or more distinct team names in that year.
fac.groupby('year').apply(find_dups)

There are no duplicate teams occurring in the same year! This is good, as we can strip the values from the end of the team names before matching with league tables :)

In [13]:
# Drop the "(n)" disambiguation marker, then trim the space it leaves behind.
# Raw string: "\(" and "\d" are regex escapes, not Python string escapes.
# regex=True is explicit: pandas >= 2.0 treats the pattern as a literal by
# default, which would silently strip nothing.
for side in ('home_team', 'away_team'):
    fac[side] = fac[side].str.replace(r'\(\d\)', '', regex=True).str.strip()

In [14]:
fac.head()

Unnamed: 0,home_team,away_team,home_score,away_score,round,year,date,replay,qualifier,match_id,stage
0,ROYAL ENGINEERS,CRYSTAL PALACE,3,0,SEMI-FINAL,1872,09.03.72,True,False,16066,2
1,WANDERERS,QUEEN'S PARK (GLASGOW),0,0,SEMI-FINAL,1872,05.03.72,False,False,16065,2
2,ROYAL ENGINEERS,CRYSTAL PALACE,0,0,SEMI-FINAL,1872,17.02.72,False,False,16064,2
3,WANDERERS,QUEEN'S PARK (GLASGOW),0,0,SEMI-FINAL,1872,01.01.01,True,False,16067,2
4,HITCHIN,ROYAL ENGINEERS,0,5,SECOND ROUND PROPER,1872,10.01.72,False,False,16057,4


#### creating match winner columns, NaN for draws

In [15]:
fac[(fac['home_team'] == "BYE")].head()

Unnamed: 0,home_team,away_team,home_score,away_score,round,year,date,replay,qualifier,match_id,stage


In [16]:
fac[(fac['away_team'] == "BYE")].head()

Unnamed: 0,home_team,away_team,home_score,away_score,round,year,date,replay,qualifier,match_id,stage
12,QUEEN'S PARK (GLASGOW),BYE,0,0,THIRD ROUND PROPER,1872,01.01.01,False,False,16061,3
17,HAMPSTEAD HEATHENS,BYE,0,0,FIRST ROUND PROPER,1872,01.01.01,False,False,16048,5
23,QUEEN'S PARK (GLASGOW),BYE,0,0,THIRD ROUND PROPER,1873,01.01.01,False,False,16087,4
24,WANDERERS,BYE,0,0,THIRD ROUND PROPER,1873,01.01.01,False,False,16088,4
26,QUEEN'S PARK (GLASGOW),BYE,0,0,FOURTH ROUND PROPER,1873,01.01.01,False,False,16090,3


In [17]:
home_win = fac.home_score > fac.away_score
drawn = fac.home_score == fac.away_score
is_bye = fac.away_team == 'BYE'

# Later rules override earlier ones: outright winner, then draws, then byes.
winner = np.where(home_win, fac.home_team, fac.away_team)  # higher score wins
winner = np.where(drawn, np.nan, winner)                   # level scores: no winner
fac['winner'] = np.where(is_bye, 'BYE', winner)            # byes are labelled 'BYE'

In [18]:
# checking BYE matches always have 0-0 scorelines and replace score with NaN

fac[(fac['away_team'] == "BYE")].home_score.max()

0.0

In [19]:
fac[(fac['away_team'] == "BYE")].away_score.max()

0.0

In [20]:
# Replace the placeholder 0-0 scoreline of BYE rows with NaN, printing the
# affected rows before and after.
# `.loc` replaces `.ix`, which has been removed from pandas; `print(...)` with
# a single argument works on both Python 2 and Python 3.
bye_rows = fac['away_team'] == "BYE"
score_cols = ['home_score', 'away_score']

print(fac.loc[bye_rows, score_cols].head())

fac.loc[bye_rows, score_cols] = np.nan

print(fac.loc[bye_rows, score_cols].head())

    home_score  away_score
12           0           0
17           0           0
23           0           0
24           0           0
26           0           0
    home_score  away_score
12         NaN         NaN
17         NaN         NaN
23         NaN         NaN
24         NaN         NaN
26         NaN         NaN


#### Checking "fixtures"

A fixture is defined as any number of matches between two teams in the same round
(there can be more than one match in each 'fixture' due to replays)

Each fixture should have exactly one match which has a winner
Each fixture should have exactly one non-replay match

Creating new columns "left" and "right" which organise the teams so that in each fixture the same team always appears in the same column rather than varying between the home_team and away_team columns

In [21]:
# Order each pairing alphabetically so a fixture has the same (left, right)
# key no matter which side played at home.
team_pair = fac[['home_team', 'away_team']]
fac['left'] = team_pair.max(axis=1)    # alphabetically later name
fac['right'] = team_pair.min(axis=1)   # alphabetically earlier name

In [22]:
fac.head()

Unnamed: 0,home_team,away_team,home_score,away_score,round,year,date,replay,qualifier,match_id,stage,winner,left,right
0,ROYAL ENGINEERS,CRYSTAL PALACE,3,0,SEMI-FINAL,1872,09.03.72,True,False,16066,2,ROYAL ENGINEERS,ROYAL ENGINEERS,CRYSTAL PALACE
1,WANDERERS,QUEEN'S PARK (GLASGOW),0,0,SEMI-FINAL,1872,05.03.72,False,False,16065,2,,WANDERERS,QUEEN'S PARK (GLASGOW)
2,ROYAL ENGINEERS,CRYSTAL PALACE,0,0,SEMI-FINAL,1872,17.02.72,False,False,16064,2,,ROYAL ENGINEERS,CRYSTAL PALACE
3,WANDERERS,QUEEN'S PARK (GLASGOW),0,0,SEMI-FINAL,1872,01.01.01,True,False,16067,2,,WANDERERS,QUEEN'S PARK (GLASGOW)
4,HITCHIN,ROYAL ENGINEERS,0,5,SECOND ROUND PROPER,1872,10.01.72,False,False,16057,4,ROYAL ENGINEERS,ROYAL ENGINEERS,HITCHIN


In [23]:
# A fixture is identified by season, round and the ordered pair of teams.
fixture_keys = ['year', 'round', 'left', 'right']
fix = fac.groupby(fixture_keys)

In [24]:
fix.size().max()

5

In [25]:
fix.filter(lambda x: len(x) == 5).shape #total number of matches belonging to 5 match fixtures

(25, 14)

The record number of matches in a fixture for "proper" rounds is 5; there are 5 instances of this occurring (25 matches in total).

Finding number of matches with non-NaN winners in each fixture

In [26]:
fix['winner'].count()

year  round                left                     right                 
1872  FINAL                WANDERERS                ROYAL ENGINEERS           1
      FIRST ROUND PROPER   CIVIL SERVICE            BARNES                    1
                           HAMPSTEAD HEATHENS       BYE                       1
                           HITCHIN                  CRYSTAL PALACE            0
                           MARLOW                   MAIDENHEAD                1
                           QUEEN'S PARK (GLASGOW)   DONNINGTON SCHOOL         0
                           ROYAL ENGINEERS          REIGATE PRIORY            0
                           UPTON PARK               CLAPHAM ROVERS            1
                           WANDERERS                HARROW CHEQUERS           0
      SECOND ROUND PROPER  HAMPSTEAD HEATHENS       BARNES                    1
                           MAIDENHEAD               CRYSTAL PALACE            1
                           QUEEN'S PARK (GLAS

finding fixtures that have multiple winners

In [27]:
def _has_multiple_winners(fixture):
    """True when more than one match of the fixture has a non-null winner."""
    return fixture['winner'].count() > 1


# All matches belonging to fixtures with more than one decided match,
# saved for manual inspection.
overfix = fix.filter(_has_multiple_winners)

overfix.to_csv('overfix.csv')

In [28]:
over_plt = overfix.year.value_counts().reindex(range(1872, 2016))

In [29]:
over_plt.iplot(kind = 'bar', width = 0.1, dimensions = (900,400))

In [30]:
# Most "multiple winner" fixtures are from 1946, when EVERY fixture was
# replayed; the remainder come from results that were voided, so those matches
# get winner = 'VOID'.

# Before using 'VOID' as a marker, make sure no real team carries that name.

all_team_names = pd.concat([fac.home_team, fac.away_team])
all_team_names.eq('VOID').any()

False

In [31]:
void_match = pd.read_csv('void_match.csv')

In [32]:
void_match.head()

Unnamed: 0,home_team,away_team,home_score,away_score,round,year,match_id
0,SOUTH NORWOOD,WINDSOR HOME PARK,1,0,SECOND ROUND PROPER,1873,16082
1,NOTTS COUNTY,WEDNESBURY STROLLERS,5,3,SECOND ROUND PROPER,1882,16547
2,DREADNOUGHT,SOUTH READING,1,2,FIRST ROUND PROPER,1883,16610
3,BLACKBURN OLYMPIC,CHURCH,4,2,FIRST ROUND PROPER,1886,16976
4,HURST,BRADSHAW,2,1,FIRST ROUND PROPER,1886,16997


filter out winner = VOID rows and check that the only 'multi-winner' fixtures are from 1946

In [33]:
# Flag every match listed in void_match as VOID.
# `.loc` replaces `.ix`, which has been removed from pandas.
is_void = fac.match_id.isin(void_match.match_id)
fac.loc[is_void, 'winner'] = 'VOID'

In [34]:
# Per-year count of matches in fixtures that still have more than one winner
# once VOID matches are excluded.
non_void = fac[fac.winner != 'VOID']
multi_winner = (
    non_void
    .groupby(['year', 'round', 'left', 'right'])
    .filter(lambda fixture: fixture['winner'].count() > 1)
)
over_plt2 = multi_winner['year'].value_counts().reindex(range(1872, 2016))

In [35]:
figure = over_plt2.iplot(kind = 'bar', width = 0.1, dimensions = (900,400), asFigure = True)

figure['layout']['xaxis1'].update({'range':[1872,2015]})

py.iplot(figure)

now looking at fixtures which have no winners

In [36]:
def _has_no_winner(fixture):
    """True when no match of the fixture has a non-null winner."""
    return fixture['winner'].count() < 1


# All matches belonging to fixtures without any decided match,
# saved for manual inspection.
underfix = fix.filter(_has_no_winner)

underfix.to_csv('underfix.csv')

In [37]:
under_plt = underfix.year.value_counts().reindex(range(1872, 2016))

figure = under_plt.iplot(kind = 'bar', width = 0.1, dimensions = (900,400), asFigure = True)

figure['layout']['xaxis1'].update({'range':[1872,2015]})

py.iplot(figure)

'missing' winners were either penalty shootouts in replays (1992 onwards) or walkovers (pre-1940)

actual match winner or BYE researched and filled in as appropriate

In [38]:
miss_win = pd.read_csv('no_winner.csv')

miss_win.head()

Unnamed: 0,home_team,away_team,round,year,replay,match_id,winner
0,WANDERERS,QUEEN'S PARK (GLASGOW),SEMI-FINAL,1872,False,16065,BYE
1,WANDERERS,QUEEN'S PARK (GLASGOW),SEMI-FINAL,1872,True,16067,BYE
2,QUEEN'S PARK (GLASGOW),DONNINGTON SCHOOL,SECOND ROUND PROPER,1872,False,16058,BYE
3,WANDERERS,CRYSTAL PALACE,THIRD ROUND PROPER,1872,False,16063,BYE
4,HITCHIN,CRYSTAL PALACE,FIRST ROUND PROPER,1872,False,16049,BYE


#### Setting index equal to match_id

In [39]:
# Index both frames by match_id (in place) so that rows can be addressed by
# match_id label.
fac.set_index('match_id', inplace = True)

miss_win.set_index('match_id', inplace = True)

In [40]:
# Update the winner column with the researched results (penalty shootout
# winners, or BYE where appropriate). Both frames are indexed by match_id, so
# the assignment aligns on that label.
# `.loc` replaces `.ix`, which has been removed from pandas.
fac.loc[miss_win.index, 'winner'] = miss_win['winner']

In [41]:
# checking that there are no longer any 'fixtures' without a winner

fac.groupby(['year','round','left','right']).filter(lambda x: x['winner'].count() < 1)

Unnamed: 0_level_0,home_team,away_team,home_score,away_score,round,year,date,replay,qualifier,stage,winner,left,right
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


setting home and away score in all BYE and VOID matches to NaN

In [42]:
bye_void = fac.winner.isin(['BYE','VOID'])

In [43]:
# BYE and VOID matches have no meaningful scoreline, so blank both scores.
# `.loc` replaces the removed `.ix`; `np.nan` replaces the `np.NaN` alias,
# which no longer exists in NumPy 2.0.
fac.loc[bye_void, ['home_score', 'away_score']] = np.nan

In [44]:
# Which winner labels do the rows with a blank home score carry?
score_missing = fac.home_score.isnull()
fac.loc[score_missing, 'winner'].value_counts()

BYE     295
VOID     37
Name: winner, dtype: int64

In [45]:
fac.to_csv('facup_v3.csv')