In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

%matplotlib inline
matplotlib.style.use('ggplot')

In [2]:
# Load the raw match table; the path is relative to the notebook's working directory.
fac = pd.read_csv('facup_v2.csv')

In [3]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
fac.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68516 entries, 0 to 68515
Data columns (total 14 columns):
away_score    68516 non-null float64
away_team     68516 non-null object
date          68516 non-null object
home_score    68516 non-null float64
home_team     68516 non-null object
match_id      68516 non-null float64
qualifier     68516 non-null bool
replay        68516 non-null bool
round         68516 non-null object
round_id      68516 non-null int64
stage         68515 non-null float64
year          68516 non-null int64
year_id       68516 non-null int64
stage_f       68515 non-null float64
dtypes: bool(2), float64(5), int64(3), object(4)
memory usage: 6.9+ MB


In [None]:
# Preview the first rows of the raw frame.
fac.head()

In [None]:
# Keep only the rows whose `home` flag is False.
# NOTE(review): the `fac.info()` output earlier in this notebook lists no `home` column, so on a
# fresh kernel this line would raise AttributeError — looks like a leftover from an earlier
# version of the CSV; confirm the intended filter (or remove the cell).
fac = fac[fac.home == False]

In [None]:
# NOTE(review): `fac4` is not assigned anywhere above this cell in the visible notebook, so
# Restart & Run All fails here with NameError. Presumably a removed cell derived it from
# `fac` — restore that assignment before this point.
fac4.info()

In [None]:
# Narrow fac4 to the columns used by the rest of the analysis, in this display order.
# NOTE(review): `stage_r` is not among the columns reported by `fac.info()` at the top of the
# notebook — presumably it was created in a cell that is no longer present; confirm.
fac4 = fac4.loc[:, [
    'home_team',
    'away_team',
    'home_score',
    'away_score',
    'round',
    'year',
    'date',
    'replay',
    'qualifier',
    'stage_r',
    'match_id',
]]

In [None]:
# Confirm the column selection: dtypes and non-null counts of the narrowed frame.
fac4.info()

In [None]:
# Preview the first rows of the narrowed frame.
fac4.head()

#### Finding teams with non-unique names

In [None]:
# Distinct home-team names that carry a bracketed single-digit suffix such as "(1)".
# The pattern is a raw string: '\(' and '\d' are regex escapes, not valid Python string
# escapes, and a non-raw literal triggers an invalid-escape warning on recent Python versions.
numbered = fac4['home_team'].str.contains(r'\(\d\)')
fac4.loc[numbered, 'home_team'].unique()

#### Finding any years that have duplicate teams in the same year, e.g. ARSENAL (1) and ARSENAL (2)

In [None]:
def find_dups(group):
    """Return base team names that occur under more than one spelling in ``group``.

    ``group`` must provide ``home_team`` and ``away_team`` columns. The distinct names from
    both columns are collected, the bracketed single-digit suffix (e.g. ``"(1)"``) and any
    surrounding whitespace are removed, and the stripped names are counted.

    Returns a Series indexed by stripped team name holding the count, restricted to names
    that occur more than once; an empty Series means there are no clashes.
    """
    teams_unq = pd.Series(
        pd.concat([group.home_team, group.away_team], ignore_index=True).unique()
    )
    # regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
    # string by default, which would leave every "(n)" suffix in place and hide all clashes.
    # The strip makes "ARSENAL" and "ARSENAL (1)" collapse to the same key as well.
    base_counts = (
        teams_unq.str.replace(r'\(\d\)', '', regex=True).str.strip().value_counts()
    )
    return base_counts[base_counts > 1]

# Run the duplicate check separately for each year; any rows in the result are
# (year, stripped team name) pairs that clash within that year.
fac4.groupby('year').apply(find_dups)


There are no duplicate teams occurring in the same year! This is good, as we can strip the bracketed numbers from the end of team names before matching with league tables :)

#### Reversing stage counter

In [None]:
# Flip the direction of the stage counter: stage = 15 - stage_r.
# NOTE(review): 15 is presumably tied to the largest stage_r value in the data — confirm and
# consider naming the constant.
fac4['stage'] = 15 - fac4.stage_r

In [None]:
# stage_r has been replaced by `stage`; drop it. Rebinding instead of inplace=True keeps the
# step an ordinary expression and avoids mutating a frame other cells may have displayed.
fac4 = fac4.drop('stage_r', axis=1)

In [None]:
# Check that `stage` is present and `stage_r` is gone.
fac4.head()

#### Regularising team names by converting to upper case, removing the bracketed numbers from duplicates, and stripping whitespace

In [None]:
# Upper-case both team-name columns so later comparisons are case-insensitive.
for side in ('home_team', 'away_team'):
    fac4[side] = fac4[side].str.upper()

In [None]:
# Remove the bracketed single-digit suffix (e.g. "(1)") from both team-name columns, then trim
# surrounding whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal string
# by default, which would silently leave the suffixes in place. The raw string keeps '\(' and
# '\d' from being parsed as (invalid) Python string escapes.
for side in ('home_team', 'away_team'):
    fac4[side] = fac4[side].str.replace(r'\(\d\)', '', regex=True).str.strip()

In [None]:
# Check the normalised team names.
fac4.head()

#### Creating the match winner column, NaN for draws

In [None]:
# Winner per match: the home side when it outscored the away side, otherwise the away side;
# level scores are then blanked to NaN to mark a draw.
home_won = fac4['home_score'] > fac4['away_score']
drawn = fac4['home_score'] == fac4['away_score']
winner = fac4['away_team'].where(~home_won, fac4['home_team'])
fac4['winner'] = winner.mask(drawn, np.nan)

In [None]:
# Any matches where the home side is the "BYE" placeholder?
fac4.loc[fac4['home_team'].eq("BYE")].head()

In [None]:
# Any matches where the away side is the "BYE" placeholder?
fac4.loc[fac4['away_team'].eq("BYE")].head()

In [None]:
# Mark byes: wherever the away side is the "BYE" placeholder, record 'BYE' as the winner and
# leave every other row's winner untouched.
# NOTE(review): rows with home_team == "BYE" are not handled here — confirm none exist.
fac4.loc[fac4['away_team'] == 'BYE', 'winner'] = 'BYE'

#### Checking that every "fixture" has a winner. A fixture is defined as any number of matches between two teams in the same round
(there can be more than one match in each 'fixture' due to replays)

Creating new columns "left" and "right" which organise the teams so that in each fixture the same team always appears in the same column rather than varying between the home_team and away_team columns

In [None]:
# Order-independent fixture key: row-wise max/min over the two name columns, so for any pair
# of teams `left` holds the name that compares greater and `right` the one that compares
# smaller, regardless of which side was at home.
fac4['left'] = fac4[['home_team','away_team']].max(axis=1)
fac4['right'] = fac4[['home_team','away_team']].min(axis=1)

In [None]:
# Check the new `left` / `right` columns.
fac4.head()

Sort values so that the final match in each fixture is always the top row in each 'fix' group

In [None]:
# Newest first: descending by year, then stage, then date. Rebinding instead of inplace=True
# keeps the step an ordinary expression and avoids mutating a frame other cells have shown.
# NOTE(review): `date` is loaded as an object (string) column, so it sorts lexicographically;
# that is only chronological if the strings are in a sortable format such as ISO — confirm.
fac4 = fac4.sort_values(['year', 'stage', 'date'], ascending=False)

In [None]:
# One group per fixture: same year, same round, same (left, right) pair of teams.
fix = fac4.groupby(['year','round','left','right'])

In [None]:
# Largest number of matches recorded for a single fixture.
fix.size().max()

This shows 6 matches played in a single round as the record (5 replays); it belongs to [Alvechurch and Oxford City in 1971](http://www.theguardian.com/sport/2009/nov/21/oxford-city-alvechurch-longest-fa-cup)

4 of the matches were on consecutive days!

In [None]:
# Show every row belonging to a fixture that took exactly six matches.
fix.filter(lambda matches: len(matches) == 6)

In [None]:
# Display-only: a copy of fac4 ordered by ascending year (fac4 itself is not modified).
fac4.sort_values('year')

In [None]:
# Per-fixture maximum of `winner`.
# NOTE(review): presumably used to spot fixtures with no winner at all (every match a draw,
# so only NaN) — confirm how max() treats the NaN/str mix in this column.
fix['winner'].max()

In [None]:
# Spot check: all rows of one fixture, looked up by its (year, round, left, right) key.
fix.get_group((2015, 'Third Round Qualifying', 'MICKLEOVER SPORTS', 'GOSPORT BOROUGH'))

In [None]:
# Spot check: index labels of the rows that make up one fixture.
fix.groups[(1933, 'Extra Preliminary Round', 'WEALDSTONE', 'ABINGDON TOWN')]

In [None]:
# All 2015 matches played in the Third Round Qualifying.
in_2015 = fac4['year'] == 2015
third_qualifying = fac4['round'] == 'Third Round Qualifying'
fac4[in_2015 & third_qualifying]