In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet.
matplotlib.style.use('ggplot')

In [3]:
# Load the match results from results.csv into the working frame `matches`.
matches = pd.read_csv('results.csv')

In [4]:
# Column overview (dtypes and non-null counts), then the first few rows.
matches.info()
matches.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68510 entries, 0 to 68509
Data columns (total 8 columns):
home_team     68508 non-null object
away_team     68510 non-null object
match_id      68510 non-null int64
round_id      68510 non-null int64
replay        68510 non-null bool
home_score    68420 non-null float64
date          68510 non-null object
away_score    68420 non-null float64
dtypes: bool(1), float64(2), int64(2), object(3)
memory usage: 4.2+ MB


Unnamed: 0,home_team,away_team,match_id,round_id,replay,home_score,date,away_score
0,Royal Engineers,Crystal Palace (1),16066,10682,True,3,09.03.72,0
1,Wanderers,Queen's Park (Glasgow),16065,10682,False,0,05.03.72,0
2,Royal Engineers,Crystal Palace (1),16064,10682,False,0,17.02.72,0
3,Wanderers,Queen's Park (Glasgow),16067,10682,True,0,01.01.01,0
4,Hitchin,Royal Engineers,16057,10681,False,0,10.01.72,5


In [5]:
# Load the per-round lookup table: round name, year and stage for each round_id.
# It is merged with the results in a later cell.
#
# 'stage' is the numeric round number within a particular year,
# e.g. first qualifying round = 1 and final = number of rounds in that year.

years = pd.read_csv('years_edit.csv')

years.info()
years.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1507 entries, 0 to 1506
Data columns (total 5 columns):
round_id    1507 non-null int64
round       1507 non-null object
year_id     1507 non-null int64
year        1507 non-null int64
stage       1507 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 70.6+ KB


Unnamed: 0,round_id,round,year_id,year,stage
0,10682,Semi-Final,71234,1872,4
1,10681,Second Round Proper,71234,1872,2
2,10683,Third Round Proper,71234,1872,3
3,10680,First Round Proper,71234,1872,1
4,10689,Third Round Proper,71235,1873,3


In [6]:
# Sanity check that every round was scraped: the number of distinct round_id
# values in the results should equal 1507, the row count of `years`.

matches['round_id'].nunique()

1507

In [7]:
# Attach round name, year_id, year and stage to every match via its round_id.
matches = matches.merge(years, how='inner', on='round_id')

matches.info()
matches.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68510 entries, 0 to 68509
Data columns (total 12 columns):
home_team     68508 non-null object
away_team     68510 non-null object
match_id      68510 non-null int64
round_id      68510 non-null int64
replay        68510 non-null bool
home_score    68420 non-null float64
date          68510 non-null object
away_score    68420 non-null float64
round         68510 non-null object
year_id       68510 non-null int64
year          68510 non-null int64
stage         68510 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 6.3+ MB


Unnamed: 0,home_team,away_team,match_id,round_id,replay,home_score,date,away_score,round,year_id,year,stage
0,Royal Engineers,Crystal Palace (1),16066,10682,True,3,09.03.72,0,Semi-Final,71234,1872,4
1,Wanderers,Queen's Park (Glasgow),16065,10682,False,0,05.03.72,0,Semi-Final,71234,1872,4
2,Royal Engineers,Crystal Palace (1),16064,10682,False,0,17.02.72,0,Semi-Final,71234,1872,4
3,Wanderers,Queen's Park (Glasgow),16067,10682,True,0,01.01.01,0,Semi-Final,71234,1872,4
4,Hitchin,Royal Engineers,16057,10681,False,0,10.01.72,5,Second Round Proper,71234,1872,2


In [8]:
# Collect every row that has at least one missing value in any column.

missing = matches.loc[matches.isnull().any(axis='columns')]

missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92 entries, 55448 to 68470
Data columns (total 12 columns):
home_team     90 non-null object
away_team     92 non-null object
match_id      92 non-null int64
round_id      92 non-null int64
replay        92 non-null bool
home_score    2 non-null float64
date          92 non-null object
away_score    2 non-null float64
round         92 non-null object
year_id       92 non-null int64
year          92 non-null int64
stage         92 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 8.7+ KB


In [9]:
# Of the rows with gaps, 2 lack a home_team and 90 lack a scoreline; the scores are analysed first.
# Keep every (round_id, home_team, away_team) group in which at least one row has no home_score.

no_score = (
    matches
    .groupby(['round_id', 'home_team', 'away_team'])
    .filter(lambda fixture: fixture['home_score'].isnull().any())
)

In [10]:
# Groups with more than one row are matches which were rescheduled,
# so their rows with missing values can be dropped.

(
    no_score
    .groupby(['round_id', 'home_team', 'away_team'])
    .filter(lambda fixture: len(fixture) > 1)
    .sort_values(['home_team', 'round_id'])
    .head()
)

Unnamed: 0,home_team,away_team,match_id,round_id,replay,home_score,date,away_score,round,year_id,year,stage
67298,AFC Bournemouth,Burton Albion,390156,12744,False,4.0,14.01.14,1.0,Third Round Proper,75769,2014,9
67315,AFC Bournemouth,Burton Albion,383299,12744,False,,04.01.14,,Third Round Proper,75769,2014,9
63527,Accrington Stanley,Gillingham,98745,12237,False,1.0,19.01.10,0.0,Third Round Proper,75425,2010,9
63533,Accrington Stanley,Gillingham,98589,12237,False,,12.01.10,,Third Round Proper,75425,2010,9
63548,Accrington Stanley,Gillingham,98084,12237,False,,02.01.10,,Third Round Proper,75425,2010,9


In [11]:
# Groups with exactly one row have no rescheduled counterpart, so their
# scores need to be filled in by hand (ref. wikipedia).

(
    no_score
    .groupby(['round_id', 'home_team', 'away_team'])
    .filter(lambda fixture: len(fixture) == 1)
)

Unnamed: 0,home_team,away_team,match_id,round_id,replay,home_score,date,away_score,round,year_id,year,stage
55448,Gillingham,Darlington,71833,9723,False,,20.11.99,,Second Round Proper,71048,2000,7
59563,Thornaby,Dunston UTS,14788,10144,True,,28.09.05,,Second Qualifying Round,71054,2006,4
59957,Stocksbridge Park Steels,Washington,14787,10142,True,,30.08.05,,Preliminary Round,71054,2006,2


In [12]:
# Fill in the scores of the three single-row fixtures that had none.
# Rows are addressed by index label with .loc; the .ix indexer used previously
# has been removed from pandas.

# Gillingham v Darlington
matches.loc[55448, 'home_score'] = 3
matches.loc[55448, 'away_score'] = 1

# Thornaby v Dunston UTS
matches.loc[59563, 'home_score'] = 2
matches.loc[59563, 'away_score'] = 1

# Stocksbridge Park Steels v Washington
matches.loc[59957, 'home_score'] = 1
matches.loc[59957, 'away_score'] = 1

In [13]:
# Next, look at the rows whose home_team is missing so they can be filled too.

matches[matches['home_team'].isnull()]

Unnamed: 0,home_team,away_team,match_id,round_id,replay,home_score,date,away_score,round,year_id,year,stage
66729,,AFC Sudbury,364896,12739,False,0,28.09.13,0,Second Round Qualifying,75769,2014,4
68030,,Weston Super Mare,433203,12821,False,1,11.10.14,4,Third Round Qualifying,75823,2015,5


In [14]:
# Fill the two missing home_team values, addressed by index label.
# .loc replaces the .ix indexer, which has been removed from pandas.
matches.loc[66729, 'home_team'] = 'Bye'
matches.loc[68030, 'home_team'] = 'Flackwell Heath'

In [15]:
# Drop all remaining rows that still contain NaN values.
matches = matches.dropna()

In [16]:
# Re-inspect the frame: after the manual fills and dropna every column should be fully non-null.
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68423 entries, 0 to 68509
Data columns (total 12 columns):
home_team     68423 non-null object
away_team     68423 non-null object
match_id      68423 non-null int64
round_id      68423 non-null int64
replay        68423 non-null bool
home_score    68423 non-null float64
date          68423 non-null object
away_score    68423 non-null float64
round         68423 non-null object
year_id       68423 non-null int64
year          68423 non-null int64
stage         68423 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 6.3+ MB


In [17]:
# Count matches per round name to see which names occur; this cell only
# inspects them, the 'qualifier' column itself is created in a later cell.

matches['round'].value_counts()

First Qualifying Round     14314
Preliminary Round          13864
Second Qualifying Round     8063
First Round Proper          5786
Extra Preliminary Round     4796
Third Qualifying Round      4290
Third Round Proper          3968
Fourth Qualifying Round     3448
Second Round Proper         2958
Fourth Round Proper         1949
First Round Qualifying      1134
Fifth Round Proper           932
Second Round Qualifying      786
Sixth Round Proper           458
Fifth Qualifying Round       409
Third Round Qualifying       390
Fourth Round Qualifying      323
Semi-Final                   321
Sixth Qualifying Round       120
Intermediate Round            70
Final                         28
Semi Final                    16
Name: round, dtype: int64

In [18]:
# n.b. both 'Semi Final' and 'Semi-Final' appear in the counts above; the two spellings are merged later in the notebook

In [19]:
# Boolean flag per match: True where the round name matches the regex below.
matches['qualifier'] = matches['round'].str.contains("Qualifying|Preliminary", regex=True)

matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68423 entries, 0 to 68509
Data columns (total 13 columns):
home_team     68423 non-null object
away_team     68423 non-null object
match_id      68423 non-null int64
round_id      68423 non-null int64
replay        68423 non-null bool
home_score    68423 non-null float64
date          68423 non-null object
away_score    68423 non-null float64
round         68423 non-null object
year_id       68423 non-null int64
year          68423 non-null int64
stage         68423 non-null float64
qualifier     68423 non-null bool
dtypes: bool(2), float64(3), int64(4), object(4)
memory usage: 6.4+ MB


In [20]:
# Load the finals from their own .csv file. Most finals were not scraped with
# the main results because they are formatted differently on the website.

finals = pd.read_csv('results_finals.csv')

finals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 0 to 119
Data columns (total 13 columns):
home_team     120 non-null object
away_team     120 non-null object
match_id      120 non-null int64
round_id      120 non-null int64
year_id       120 non-null int64
replay        120 non-null bool
home_score    120 non-null int64
year          120 non-null int64
date          120 non-null object
away_score    120 non-null int64
round         120 non-null object
stage         120 non-null int64
qualifier     120 non-null bool
dtypes: bool(2), int64(7), object(4)
memory usage: 11.5+ KB


In [21]:
# Append the separately loaded finals to the main results; ignore_index
# renumbers the rows of the combined frame from 0.
matches = pd.concat(objs=[matches, finals], ignore_index=True)

matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68543 entries, 0 to 68542
Data columns (total 13 columns):
away_score    68543 non-null float64
away_team     68543 non-null object
date          68543 non-null object
home_score    68543 non-null float64
home_team     68543 non-null object
match_id      68543 non-null int64
qualifier     68543 non-null bool
replay        68543 non-null bool
round         68543 non-null object
round_id      68543 non-null int64
stage         68543 non-null float64
year          68543 non-null int64
year_id       68543 non-null int64
dtypes: bool(2), float64(3), int64(4), object(4)
memory usage: 6.4+ MB


In [22]:
# Merge the two spellings of the semi-final round name.
# Only the 'round' column is touched: a frame-wide replace would scan every
# column and could rewrite an identical value elsewhere (e.g. a team name).
matches['round'] = matches['round'].replace('Semi Final', 'Semi-Final')

matches['round'].value_counts()

First Qualifying Round     14314
Preliminary Round          13864
Second Qualifying Round     8063
First Round Proper          5786
Extra Preliminary Round     4796
Third Qualifying Round      4290
Third Round Proper          3968
Fourth Qualifying Round     3448
Second Round Proper         2958
Fourth Round Proper         1949
First Round Qualifying      1134
Fifth Round Proper           932
Second Round Qualifying      786
Sixth Round Proper           458
Fifth Qualifying Round       409
Third Round Qualifying       390
Semi-Final                   337
Fourth Round Qualifying      323
Final                        148
Sixth Qualifying Round       120
Intermediate Round            70
Name: round, dtype: int64

In [23]:
# Semi-Final / Semi Final regularisation complete

# the newly joined finals did not have a match_id value so confirming this then assigning a unique match_id to them

In [24]:
# Most frequent match_id values; any count above 1 means the id is shared by several rows.
matches['match_id'].value_counts().head()

0        120
68340      1
60168      1
37639      1
39686      1
Name: match_id, dtype: int64

In [28]:
# Give every row whose match_id is the placeholder 0 an id of its own:
# the row's index label + 5,000,000.
# The index is filtered with the same mask as the left-hand side so both sides
# have the same length; assigning the full-length index to a subset of rows is
# a length mismatch that current pandas rejects.
needs_id = matches['match_id'] == 0
matches.loc[needs_id, 'match_id'] = matches.index[needs_id] + 5000000

In [30]:
# Re-check the most frequent match_id values; the top counts should now all be 1.
matches['match_id'].value_counts().head()

16376    1
43776    1
56078    1
49933    1
51980    1
Name: match_id, dtype: int64

In [33]:
# sorted :) now match_id is unique for each row

# now the correct 'stage' value needs to be assigned to the new finals rows (currently = 0)
# we start by giving the new finals a generic large placeholder value, then apply a per-year function that sets each Final's stage to that year's Semi-Final stage + 1

In [34]:
# Rows with stage 0 get the placeholder value 100.
matches.loc[matches['stage'].eq(0), 'stage'] = 100

In [35]:
# Distribution of stage values, including the placeholder 100 set in the previous cell.
matches['stage'].value_counts()

2      18432
1      11293
3      11153
4       6483
6       4307
5       4154
7       3571
8       3388
9       2940
10      1484
11       741
12       361
100      120
13       116
Name: stage, dtype: int64

In [36]:
# function to set final 'stage' equal to semi-final 'stage' + 1 for each group

def final_stage(grp):
    """Set the 'stage' of a group's Final rows to its Semi-Final stage + 1.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of one group (the notebook groups by 'year'). Must contain the
        columns 'round' and 'stage'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``grp`` in which every row whose 'round' is 'Final' has
        'stage' equal to the largest 'Semi-Final' stage in the group plus 1.
        If the group has no 'Semi-Final' row that maximum is NaN, so the
        Final rows receive NaN.
    """
    # Work on a copy so the frame handed in by the caller is never mutated.
    grp = grp.copy()
    is_final = grp['round'] == 'Final'
    is_semi = grp['round'] == 'Semi-Final'
    # .loc replaces the .ix indexer, which has been removed from pandas.
    grp.loc[is_final, 'stage'] = grp.loc[is_semi, 'stage'].max() + 1
    return grp

# Group by year and apply final_stage to each year's rows.
# The groups are iterated explicitly instead of using groupby.apply: depending
# on the pandas version, apply may prepend the group key to the index or leave
# the grouping column out of the frame passed to the function. Concatenating the
# per-year results and sorting by index restores the original row order and
# keeps every column.

matches = pd.concat(final_stage(grp) for _, grp in matches.groupby('year')).sort_index()

In [37]:
# Re-check the stage distribution; the placeholder value 100 should no longer appear.
matches['stage'].value_counts()

2     18432
1     11293
3     11153
4      6483
6      4311
5      4156
7      3576
8      3391
9      2944
10     1488
11      746
12      369
13      158
14       43
Name: stage, dtype: int64

In [38]:
# Overview of the frame (row count, dtypes, non-null counts) before it is saved.
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68543 entries, 0 to 68542
Data columns (total 13 columns):
away_score    68543 non-null float64
away_team     68543 non-null object
date          68543 non-null object
home_score    68543 non-null float64
home_team     68543 non-null object
match_id      68543 non-null int64
qualifier     68543 non-null bool
replay        68543 non-null bool
round         68543 non-null object
round_id      68543 non-null int64
stage         68543 non-null float64
year          68543 non-null int64
year_id       68543 non-null int64
dtypes: bool(2), float64(3), int64(4), object(4)
memory usage: 6.4+ MB


In [39]:
# sorted :) 

# saving dataframe

In [40]:
# Write the frame to facup_v1.csv; index = False leaves the row index out of the file.
matches.to_csv('facup_v1.csv', index = False)