In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
matplotlib.style.use('ggplot')

In [3]:
# load the scraped match results into the working dataframe
matches = pd.read_csv('results.csv')

In [4]:
# column dtypes and non-null counts, followed by a preview of the first rows
matches.info()
matches.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68511 entries, 0 to 68510
Data columns (total 8 columns):
home_team     68509 non-null object
away_team     68511 non-null object
match_id      68511 non-null int64
round_id      68511 non-null int64
replay        68511 non-null bool
home_score    68421 non-null float64
date          68511 non-null object
away_score    68421 non-null float64
dtypes: bool(1), float64(2), int64(2), object(3)
memory usage: 4.2+ MB


Unnamed: 0,home_team,away_team,match_id,round_id,replay,home_score,date,away_score
0,Royal Engineers,Hampstead Heathens,16062,10683,False,3,27.01.72,0
1,Wanderers,Crystal Palace (1),16063,10683,False,0,20.01.72,0
2,Queen's Park (Glasgow),Bye,16061,10683,False,0,01.01.01,0
3,Royal Engineers,Crystal Palace (1),16066,10682,True,3,09.03.72,0
4,Wanderers,Queen's Park (Glasgow),16065,10682,False,0,05.03.72,0


In [5]:
# Load the round lookup table: round name, year and stage for each round_id.
# It is merged with the results in the following cell.

# 'stage' is the numeric round number within a particular year,
# e.g. first qualifying round = 1 and final = number of rounds in that year

years = pd.read_csv('years_edit.csv')

years.info()
years.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1507 entries, 0 to 1506
Data columns (total 5 columns):
round_id    1507 non-null int64
round       1507 non-null object
year_id     1507 non-null int64
year        1507 non-null int64
stage       1507 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 70.6+ KB


Unnamed: 0,round_id,round,year_id,year,stage
0,10682,Semi-Final,71234,1872,4
1,10681,Second Round Proper,71234,1872,2
2,10683,Third Round Proper,71234,1872,3
3,10680,First Round Proper,71234,1872,1
4,10689,Third Round Proper,71235,1873,3


In [6]:
# attach round name, year_id, year and stage to every match through its round_id
matches = matches.merge(years, how='inner', on='round_id')

matches.info()
matches.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68511 entries, 0 to 68510
Data columns (total 12 columns):
home_team     68509 non-null object
away_team     68511 non-null object
match_id      68511 non-null int64
round_id      68511 non-null int64
replay        68511 non-null bool
home_score    68421 non-null float64
date          68511 non-null object
away_score    68421 non-null float64
round         68511 non-null object
year_id       68511 non-null int64
year          68511 non-null int64
stage         68511 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 6.3+ MB


Unnamed: 0,home_team,away_team,match_id,round_id,replay,home_score,date,away_score,round,year_id,year,stage
0,Royal Engineers,Hampstead Heathens,16062,10683,False,3,27.01.72,0,Third Round Proper,71234,1872,3
1,Wanderers,Crystal Palace (1),16063,10683,False,0,20.01.72,0,Third Round Proper,71234,1872,3
2,Queen's Park (Glasgow),Bye,16061,10683,False,0,01.01.01,0,Third Round Proper,71234,1872,3
3,Royal Engineers,Crystal Palace (1),16066,10682,True,3,09.03.72,0,Semi-Final,71234,1872,4
4,Wanderers,Queen's Park (Glasgow),16065,10682,False,0,05.03.72,0,Semi-Final,71234,1872,4


In [7]:
# collect every row that has a null in at least one column

missing = matches.loc[matches.isnull().any(axis=1)]

missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92 entries, 55430 to 68329
Data columns (total 12 columns):
home_team     90 non-null object
away_team     92 non-null object
match_id      92 non-null int64
round_id      92 non-null int64
replay        92 non-null bool
home_score    2 non-null float64
date          92 non-null object
away_score    2 non-null float64
round         92 non-null object
year_id       92 non-null int64
year          92 non-null int64
stage         92 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 8.7+ KB


In [8]:
# The incomplete rows are mainly missing scorelines, plus a few missing team names.
# Save them to a separate .csv file (for manual repair) and remove them from the dataframe.

missing.to_csv('results_missing.csv')
# rebind instead of drop(..., inplace=True): the frame produced by the merge is left
# untouched and `matches` simply points at the filtered result
matches = matches.drop(missing.index)

matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68419 entries, 0 to 68510
Data columns (total 12 columns):
home_team     68419 non-null object
away_team     68419 non-null object
match_id      68419 non-null int64
round_id      68419 non-null int64
replay        68419 non-null bool
home_score    68419 non-null float64
date          68419 non-null object
away_score    68419 non-null float64
round         68419 non-null object
year_id       68419 non-null int64
year          68419 non-null int64
stage         68419 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 6.3+ MB


In [9]:
# reload the incomplete rows after manual repair: missing values were filled in
# by hand (from wikipedia) and some duplicates were dropped

missing_edit = pd.read_csv('results_missing_edit.csv')

missing_edit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 0 to 76
Data columns (total 12 columns):
home_team     77 non-null object
away_team     77 non-null object
match_id      77 non-null int64
round_id      77 non-null int64
replay        77 non-null bool
home_score    77 non-null int64
date          77 non-null object
away_score    77 non-null int64
round         77 non-null object
year_id       77 non-null int64
year          77 non-null int64
stage         77 non-null int64
dtypes: bool(1), int64(7), object(4)
memory usage: 7.3+ KB


In [10]:
# concatenate main dataframe with edited missing values

In [11]:
# append the repaired rows and rebuild a clean 0..n-1 index
matches = pd.concat(
    [matches, missing_edit],
    ignore_index=True,
)

matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68496 entries, 0 to 68495
Data columns (total 12 columns):
home_team     68496 non-null object
away_team     68496 non-null object
match_id      68496 non-null int64
round_id      68496 non-null int64
replay        68496 non-null bool
home_score    68496 non-null float64
date          68496 non-null object
away_score    68496 non-null float64
round         68496 non-null object
year_id       68496 non-null int64
year          68496 non-null int64
stage         68496 non-null float64
dtypes: bool(1), float64(3), int64(4), object(4)
memory usage: 6.3+ MB


In [12]:
# list the distinct round names and how many matches each one has;
# these names are what the new 'qualifier' column (created below) is derived from

matches['round'].value_counts()

First Qualifying Round     14314
Preliminary Round          13876
Second Qualifying Round     8063
First Round Proper          5786
Extra Preliminary Round     4801
Third Qualifying Round      4290
Third Round Proper          3988
Fourth Qualifying Round     3448
Second Round Proper         2976
Fourth Round Proper         1950
First Round Qualifying      1141
Fifth Round Proper           934
Second Round Qualifying      790
Sixth Round Proper           459
Fifth Qualifying Round       409
Third Round Qualifying       391
Fourth Round Qualifying      325
Semi-Final                   321
Sixth Qualifying Round       120
Intermediate Round            70
Final                         28
Semi Final                    16
Name: round, dtype: int64

In [13]:
# n.b. Semi Final and Semi-Final, will fix this later in notebook

In [14]:
# boolean flag per match: True when the round name contains "Qualifying" or "Preliminary"
matches['qualifier'] = matches['round'].str.contains(
    "Qualifying|Preliminary",  # regex alternation
    regex=True,
)

matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68496 entries, 0 to 68495
Data columns (total 13 columns):
home_team     68496 non-null object
away_team     68496 non-null object
match_id      68496 non-null int64
round_id      68496 non-null int64
replay        68496 non-null bool
home_score    68496 non-null float64
date          68496 non-null object
away_score    68496 non-null float64
round         68496 non-null object
year_id       68496 non-null int64
year          68496 non-null int64
stage         68496 non-null float64
qualifier     68496 non-null bool
dtypes: bool(2), float64(3), int64(4), object(4)
memory usage: 6.4+ MB


In [15]:
# load the finals from their own .csv file: most finals were not scraped with the
# main results because they are formatted differently on the website

finals = pd.read_csv('results_finals.csv')

finals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 0 to 119
Data columns (total 13 columns):
home_team     120 non-null object
away_team     120 non-null object
match_id      120 non-null int64
round_id      120 non-null int64
year_id       120 non-null int64
replay        120 non-null bool
home_score    120 non-null int64
year          120 non-null int64
date          120 non-null object
away_score    120 non-null int64
round         120 non-null object
stage         120 non-null int64
qualifier     120 non-null bool
dtypes: bool(2), int64(7), object(4)
memory usage: 11.5+ KB


In [16]:
# append the separately loaded finals and rebuild a clean 0..n-1 index
matches = pd.concat(
    [matches, finals],
    ignore_index=True,
)

matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68616 entries, 0 to 68615
Data columns (total 13 columns):
away_score    68616 non-null float64
away_team     68616 non-null object
date          68616 non-null object
home_score    68616 non-null float64
home_team     68616 non-null object
match_id      68616 non-null int64
qualifier     68616 non-null bool
replay        68616 non-null bool
round         68616 non-null object
round_id      68616 non-null int64
stage         68616 non-null float64
year          68616 non-null int64
year_id       68616 non-null int64
dtypes: bool(2), float64(3), int64(4), object(4)
memory usage: 6.4+ MB


In [17]:
# Normalise the one inconsistent round label ('Semi Final' -> 'Semi-Final').
# The replacement is restricted to the 'round' column so that no other column can be
# touched by accident, and the column is reassigned rather than mutated in place.
matches['round'] = matches['round'].replace('Semi Final', 'Semi-Final')

matches['round'].value_counts()

First Qualifying Round     14314
Preliminary Round          13876
Second Qualifying Round     8063
First Round Proper          5786
Extra Preliminary Round     4801
Third Qualifying Round      4290
Third Round Proper          3988
Fourth Qualifying Round     3448
Second Round Proper         2976
Fourth Round Proper         1950
First Round Qualifying      1141
Fifth Round Proper           934
Second Round Qualifying      790
Sixth Round Proper           459
Fifth Qualifying Round       409
Third Round Qualifying       391
Semi-Final                   337
Fourth Round Qualifying      325
Final                        148
Sixth Qualifying Round       120
Intermediate Round            70
Name: round, dtype: int64

In [18]:
# Semi-Final / Semi Final regularisation complete

# the newly joined finals did not have a match_id value so confirming this then assigning a unique match_id to them

In [19]:
# most frequent match_id values; a count above 1 means several rows share that id
matches['match_id'].value_counts().head()

0        120
16087      2
16060      2
16057      2
16088      2
Name: match_id, dtype: int64

In [20]:
# Rows with match_id == 0 are the finals that came without a real id.
# Give each of them its own row label divided by 100000: the labels are distinct
# (the index was rebuilt with ignore_index=True), so the new ids are distinct too.
no_match_id = matches['match_id'] == 0
# the column is int64; convert explicitly instead of relying on a silent int -> float upcast
matches['match_id'] = matches['match_id'].astype('float64')
# take only the labels of the placeholder rows so both sides of the assignment have the same length
matches.loc[no_match_id, 'match_id'] = matches.index[no_match_id.values] / 100000.0

In [21]:
# re-check the most frequent match_id values now that the 0 placeholders are replaced
matches['match_id'].value_counts().head()

16088    2
16087    2
16085    2
16060    2
16059    2
Name: match_id, dtype: int64

In [22]:
# sorted :) the new finals no longer share match_id 0 (n.b. some scraped match_ids still appear on two rows, see counts above)

# now the correct 'stage' value needs to be assigned to new finals rows. 
# we start by giving new finals a generic large value and then creating a rank function

In [23]:
# distribution of 'stage' values across all matches
matches['stage'].value_counts()

2     18450
1     11289
3     11164
4      6487
6      4309
5      4155
7      3580
8      3400
9      2956
10     1485
11      743
12      362
0       120
13      116
Name: stage, dtype: int64

In [24]:
# rows whose stage is 0 get a generic large placeholder (100) until the real value is computed
matches.loc[matches['stage'].eq(0), 'stage'] = 100

In [25]:
# confirm that the rows which had stage 0 now carry the placeholder 100
matches['stage'].value_counts()

2      18450
1      11289
3      11164
4       6487
6       4309
5       4155
7       3580
8       3400
9       2956
10      1485
11       743
12       362
100      120
13       116
Name: stage, dtype: int64

In [26]:
# function to set final 'stage' equal to semi-final 'stage' + 1 for each group

def final_stage(grp):
    """Set the 'stage' of every 'Final' row to the highest 'Semi-Final' stage plus one.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of one group (one year when used with groupby); must contain the
        columns 'round' and 'stage'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``grp`` with the same index. If the group contains no
        'Semi-Final' row the copy is returned unchanged, so an existing stage
        value is never overwritten with NaN.
    """
    # work on a copy: the frame handed over by groupby().apply() should not be mutated
    result = grp.copy()

    # .loc replaces the removed .ix indexer; the selection is purely label/boolean based
    semi_stages = result.loc[result['round'] == 'Semi-Final', 'stage']
    if semi_stages.empty:
        return result

    result.loc[result['round'] == 'Final', 'stage'] = semi_stages.max() + 1
    return result

# Group by year and apply final_stage to each year's rows.
# - the raw year values are used as the grouping key (instead of the column name) so the
#   'year' column stays inside every group and in the result across pandas versions
# - group_keys=False keeps the original row index instead of prefixing it with the year

matches = matches.groupby(matches['year'].values, group_keys=False).apply(final_stage)

In [27]:
# check the stage distribution after applying final_stage (the placeholder 100 should be gone)
matches['stage'].value_counts()

2     18450
1     11289
3     11164
4      6487
6      4313
5      4157
7      3585
8      3403
9      2960
10     1489
11      748
12      370
13      158
14       43
Name: stage, dtype: int64

In [28]:
# sorted :) 

# saving dataframe

In [29]:
# write the cleaned results to disk without the row index
matches.to_csv('facup_v1.csv', index=False)