In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.plotly as py
import cufflinks as cf

In [2]:
# Load the FA Cup match data from CSV.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
fac_csv = r'C:\Users\David\Python\scrapy\facup\00_fa_cup\facup_v3.csv'
fac = pd.read_csv(filepath_or_buffer=fac_csv)

Dropping the years we will not use in the merge: from 1952 onwards there is a consistent number of matches in each round, and the teams/league have had some post-WW2 recovery time to settle.

In [3]:
# Keep only the rows whose year is after 1951 (i.e. 1952 onwards).
after_1951 = fac['year'].gt(1951)
fac = fac.loc[after_1951]

Creating new dataframe concatenating all teams into a single column

In [4]:
# Columns carried over unchanged for both sides of each fixture.
shared_cols = ['round', 'year', 'stage']

# One frame per side, with the side-specific team column renamed to a common 'team'.
home = fac[['home_team'] + shared_cols].rename(columns={'home_team': 'team'})
away = fac[['away_team'] + shared_cols].rename(columns={'away_team': 'team'})

# Stack home rows on top of away rows under a fresh 0..n-1 index.
teams = pd.concat([home, away], ignore_index=True)

Reducing the dataframe so that we keep only one row per year/team combination, taken from the first stage in which that team appears in that particular year, e.g.:

ARSENAL THIRD ROUND PROPER 1952 6 

(6 is the first stage in which Arsenal appeared in 1952, i.e. the third round proper)

In [5]:
# Higher stage codes are earlier rounds (8 = first round proper, 6 = third round
# proper), so the row with the largest stage per (year, team) is that team's
# first appearance in the competition that year.
first_row_idx = teams.groupby(['year', 'team'])['stage'].idxmax()
first = teams.loc[first_row_idx]

first.head()

Unnamed: 0,team,round,year,stage
19,ACCRINGTON STANLEY,FIRST ROUND PROPER,1952,8
10242,AFC BOURNEMOUTH,FIRST ROUND PROPER,1952,8
13,ALDERSHOT,FIRST ROUND PROPER,1952,8
10311,ARSENAL,THIRD ROUND PROPER,1952,6
10310,ASTON VILLA,THIRD ROUND PROPER,1952,6


#### Saving dataframe containing one row per team/year for merging

In [13]:
# Write the one-row-per-team/year frame to the working directory, without the index column.
teams_csv = 'facup_teams.csv'
first.to_csv(teams_csv, index=False)

In [6]:
# Count the number of teams entering at each stage in each year,
# then pivot so that rows are years and columns are stages.
entries_per_year_stage = first.groupby(['year', 'stage']).size()
first_count = entries_per_year_stage.unstack('stage')

first_count.head()

stage,6.0,8.0
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1952,44,80
1953,44,80
1954,44,80
1955,44,80
1956,44,80


In [7]:
# Line chart of the number of teams entering at each stage, one line per stage.
# A title and axis labels are set so the figure can be read on its own.
first_count.iplot(
    kind='line',
    dimensions=(900, 500),
    title='Teams entering the FA Cup at each stage, by year',
    xTitle='Year',
    yTitle='Number of teams',
)

Since 1952 there have consistently been 80 teams entering in the 1st round (stage 8) and 44 entering in the 3rd round (stage 6).

In 2000 there were only 43 'new' teams entering in the 3rd round, as Manchester United elected to play in the Club World Cup instead ([1999–2000 FA Cup on Wikipedia](https://en.wikipedia.org/wiki/1999%E2%80%932000_FA_Cup)); however, this space was filled by Darlington, despite their having lost their 2nd round fixture.

Now checking for consistency in number of teams in top 4 divisions between 1952-2015.

In [8]:
# Load the joined league tables and keep only rows whose year is after 1951,
# matching the year cut-off applied to the FA Cup data.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
tab_csv = r'C:\Users\David\Python\scrapy\facup\01_league_tables\fl_pl_join_v1.csv'
tab = pd.read_csv(tab_csv).loc[lambda league: league['year'] > 1951]

tab.head()

Unnamed: 0,pos,team,w,d,l,f,a,pts,pld,year,tier
444,1,MANCHESTER UNITED,23,11,8,95,52,57,42,1952,11
445,2,TOTTENHAM HOTSPUR,22,9,11,76,51,53,42,1952,11
446,3,ARSENAL,21,11,10,80,61,53,42,1952,11
447,4,PORTSMOUTH,20,8,14,68,58,48,42,1952,11
448,5,BOLTON WANDERERS,19,10,13,65,61,48,42,1952,11


In [9]:
# Count the league-table rows per (year, tier), then pivot so that rows are
# years and columns are tier codes; missing year/tier combinations become NaN.
rows_per_year_tier = tab.groupby(['year', 'tier']).size()
league_count = rows_per_year_tier.unstack('tier')

league_count.head()

tier,11,21,31,32,41
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,22,22,24,24,
1953,22,22,24,24,
1954,22,22,24,24,
1955,22,22,24,24,
1956,22,22,24,24,


In [10]:
# Stacked bar chart of the number of league teams per tier, one bar per year.
# A title and axis labels are set so the figure can be read on its own.
league_count.iplot(
    kind='bar',
    barmode='stack',
    bargap=0.4,
    dimensions=(900, 500),
    title='Number of league teams in each tier, by year',
    xTitle='Year',
    yTitle='Number of teams',
)

Investigating the years in which there are more than 44 teams in the top two tiers, as we know that only 44 teams enter at stage 6.

In [11]:
# Teams whose first FA Cup appearance in 1992 was at stage 6 (third round proper).
entered_stage6_1992 = first['year'].eq(1992) & first['stage'].eq(6)
fac_1992 = first.loc[entered_stage6_1992]

# 1992 league rows with a tier code below 30 (codes 11 and 21, the top two
# tiers), ordered by team name.
top_two_tiers_1992 = tab['year'].eq(1992) & tab['tier'].lt(30)
tab_1992 = tab.loc[top_two_tiers_1992].sort_values(by='team')

In [12]:
# Outer-join the 1992 stage-6 entrants with the 1992 top-two-tier league rows,
# so teams present on only one side are kept (with NaN in the other side's columns).
# The join keys are given explicitly rather than inferred from whichever column
# names the two frames happen to share; 'team' and 'year' are the shared columns.
merge_1992 = pd.merge(fac_1992, tab_1992, how='outer', on=['team', 'year'])
merge_1992

Unnamed: 0,team,round,year,stage,pos,w,d,l,f,a,pts,pld,tier
0,ARSENAL,THIRD ROUND PROPER,1992,6.0,4.0,19.0,15.0,8.0,81.0,46.0,72.0,42.0,11.0
1,ASTON VILLA,THIRD ROUND PROPER,1992,6.0,7.0,17.0,9.0,16.0,48.0,44.0,60.0,42.0,11.0
2,BARNSLEY,THIRD ROUND PROPER,1992,6.0,16.0,16.0,11.0,19.0,45.0,57.0,59.0,46.0,21.0
3,BLACKBURN ROVERS,THIRD ROUND PROPER,1992,6.0,6.0,21.0,11.0,14.0,70.0,53.0,74.0,46.0,21.0
4,BRIGHTON & HOVE ALBION,THIRD ROUND PROPER,1992,6.0,23.0,12.0,11.0,23.0,56.0,77.0,47.0,46.0,21.0
5,BRISTOL CITY,THIRD ROUND PROPER,1992,6.0,17.0,13.0,15.0,18.0,55.0,71.0,54.0,46.0,21.0
6,BRISTOL ROVERS,THIRD ROUND PROPER,1992,6.0,13.0,16.0,14.0,16.0,60.0,63.0,62.0,46.0,21.0
7,CAMBRIDGE UNITED,THIRD ROUND PROPER,1992,6.0,5.0,19.0,17.0,10.0,65.0,47.0,74.0,46.0,21.0
8,CHARLTON ATHLETIC,THIRD ROUND PROPER,1992,6.0,7.0,20.0,11.0,15.0,54.0,48.0,71.0,46.0,21.0
9,CHELSEA,THIRD ROUND PROPER,1992,6.0,14.0,13.0,14.0,15.0,50.0,60.0,53.0,42.0,11.0


Between 1992 and 1995 there is some inconsistency in the stage at which teams from each tier enter the FA Cup.

It is likely that, as the leagues were restructured, some teams were entered into earlier stages than the rest of the teams in their tier. Worth being aware of this when we come to merge!