In [2]:
# import libraries
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup as bs

# seasons available on the historical results pages, as strings (2005 through 2018)
years = list(map(str, range(2005, 2019)))
this_year = '2019'
print(years)

# data sources: the current season has its own page, past seasons share a base URL
current_year_url = 'http://www.superrugby.co.nz/Grandstand'
url = 'http://www.superrugby.co.nz/Grandstand/HistoricalResults/' # year appends here

# getter function
def get_rugby_data(url, year, timeout=30):
    '''Download a results page from the super rugby website and parse it.

    url:     page address; for past seasons the year is appended to it.
    year:    season as a string. When it equals the module-level ``this_year``
             nothing is appended, because the current season lives at ``url`` itself.
    timeout: seconds to wait for the server before giving up (default 30).

    Returns the page as a BeautifulSoup object (parsed with 'html.parser').
    Raises requests.exceptions.HTTPError for 4xx/5xx responses and
    requests.exceptions.Timeout when the server does not answer in time.
    '''
    suffix = '' if year == this_year else year
    # requests never times out by default, so a stalled server would hang the cell
    page = requests.get(url + suffix, timeout=timeout)
    # fail loudly rather than letting an error page be saved as season data
    page.raise_for_status()
    return bs(page.text, 'html.parser')

['2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']


In [28]:
# get all previous years data: run this once
for i in years:
    data = get_rugby_data(url, i)
    # the with-block closes the file even if the write raises
    with open("data/data_" + i + ".txt", "w+") as f:
        f.write(str(data))

In [31]:
# get this year's data: run this after every round
data = get_rugby_data(current_year_url, this_year)
# the with-block closes the file even if the write raises
with open("data/data_" + this_year + ".txt", "w+") as f:
    f.write(str(data))

In [66]:
#### organise data into pandas dataframes

# regex for finding round names
# (raw string: "\ " and "\d" are regex escapes, not valid Python string escapes)
pattern = re.compile(r"^(Round|Week|Semifinal|Final|Qualifiers|Semis)(\ \d{1,2})?.*$")

def outcome(f):
    '''Turn a score margin into a result code.

    Returns 'W' for a positive margin, 'L' for a negative one and 'D' for
    anything else (zero, or a value such as NaN that is neither above nor below zero).
    '''
    if f > 0:
        return 'W'
    return 'L' if f < 0 else 'D'

def fix_round(f):
    '''Shorten a round heading to a compact label.

    'Week ...'  -> the two characters after 'Week '
    'Round ...' -> the two characters after 'Round '
    'Qualifiers...' -> 'QF'
    'Finals...', 'Semifinals' or 'Semis' -> 'SF'
    'Final' or 'Final ...' -> 'F'
    Any other heading is returned unchanged.
    '''
    if f.startswith('Week'):
        return f[5:7]
    if f.startswith('Round'):
        return f[6:8]
    if f.startswith('Qualifiers'):
        return 'QF'
    # 'Finals...' is checked before 'Final ' so the plural never reaches the 'F' branch
    if f.startswith('Finals') or f in ('Semifinals', 'Semis'):
        return 'SF'
    if f.startswith('Final ') or f == 'Final':
        return 'F'
    return f
    
def data_nicey(year):
    '''Parse the saved results page for one season into a DataFrame of matches.

    year: season as a string, e.g. '2012'. It selects the file
          'data/data_<year>.txt' and is stored in the 'year' column.

    Returns a DataFrame with columns date, location, time, round, year,
    home, away, hp (home points), ap (away points), sm (score margin, hp - ap)
    and ho (home outcome: 'W', 'L' or 'D').

    Rows that are missing any field, or that appear before the first round
    heading, are dropped. The column assignment assumes each match row yields
    exactly five non-empty cells (date, teams, location, time, score);
    NOTE(review): a page with a different layout will raise on the rename.
    '''
    table_nice = []
    table_round = []
    with open('data/data_' + year + '.txt') as f:
        # name the parser explicitly (the same one get_rugby_data uses) so the
        # result does not depend on which optional parsers are installed
        data = bs(f.read(), 'html.parser')
    for row in data.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        table_nice.append([text for text in cells if text])  # Get rid of empty values
        # cells that look like a round heading; empty strings never match the pattern
        table_round.append([text for text in cells if pattern.match(text)])
    df1 = pd.DataFrame(table_nice)
    # carry each round heading down onto the match rows below it
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    df2 = pd.DataFrame(table_round).ffill()
    df = pd.concat([df1, df2], axis=1).dropna()
    df['year'] = year
    df.columns = ['date','teams','location','time','score','round','year']
    # Split on a standalone "v" surrounded by whitespace: a bare split('v') also
    # cuts inside any team name containing the letter "v" and leaves stray spaces.
    # Rows without a whitespace-delimited "v" fall back to the plain split.
    sides = df['teams'].str.extract(r'^\s*(.+?)\s+v\s+(.+?)\s*$')
    legacy = df['teams'].str.split('v')
    df['home'] = sides[0].fillna(legacy.str[0])
    df['away'] = sides[1].fillna(legacy.str[1])
    points = df['score'].str.split('-')
    df['hp'] = points.str[0].astype('int') # home points
    df['ap'] = points.str[1].astype('int') # away points
    df['sm'] = df['hp'] - df['ap'] # score margin
    df['ho'] = df['sm'].map(outcome) # home outcome
    df['round'] = df['round'].map(fix_round)
    remove_columns = ['teams','score']
    df = df.drop(columns=remove_columns)
    return df
    
# data inconsistencies to fix:
# 2012 tables messed up, needs fixing
# finals data missing in 2008
# semifinals data missing in 2007
# remove bogus final data from 2006 (maybe order results, result index then drop lines?)
# 2005 no final

# parse the 2012 season and preview its first and last 20 matches
df = data_nicey('2012')
display(df.head(20), df.tail(20))



Unnamed: 0,date,location,time,round,year,home,away,hp,ap,sm,ho
2,21 Jul,"AMI Stadium, Christchurch",7:35 PM,QF,2012,Crusaders,Bulls,28,13,15,W
3,21 Jul,Brisbane,9:40 PM,QF,2012,Reds,Sharks,17,30,-13,L
5,27 Jul,"FMG Stadium Waikato, Hamilton",7:35 PM,SF,2012,Chiefs,Crusaders,20,17,3,W
6,29 Jul,Cape Town,3:05 AM,SF,2012,Stormers,Sharks,19,26,-7,L
8,04 Aug,"FMG Stadium Waikato, Hamilton",7:35 PM,F,2012,Chiefs,Sharks,37,6,31,W
13,24 Feb,"Eden Park, Auckland",7:35 PM,1,2012,Blues,Crusaders,18,19,-1,L
14,24 Feb,Canberra,9:40 PM,1,2012,Brumbies,Force,19,17,2,W
15,25 Feb,Pretoria,6:10 AM,1,2012,Bulls,Sharks,18,13,5,W
16,25 Feb,"FMG Stadium Waikato, Hamilton",7:35 PM,1,2012,Chiefs,Highlanders,19,23,-4,L
17,25 Feb,Sydney,9:40 PM,1,2012,Warratahs,Reds,21,25,-4,L


Unnamed: 0,date,location,time,round,year,home,away,hp,ap,sm,ho
153,29 Jun,"Forsyth Barr Stadium, Dunedin",7:35 PM,19,2012,Highlanders,Chiefs,21,27,-6,L
154,29 Jun,Melbourne,9:40 PM,19,2012,Rebels,Reds,17,32,-15,L
155,30 Jun,"AMI Stadium, Christchurch",7:35 PM,19,2012,Crusaders,Hurricanes,22,23,-1,L
156,30 Jun,Perth,9:40 PM,19,2012,Force,Brumbies,17,28,-11,L
157,01 Jul,Cape Town,3:05 AM,19,2012,Stormers,Lions,27,17,10,W
158,01 Jul,Pretoria,5:10 AM,19,2012,Bulls,Cheetahs,40,24,16,W
163,06 Jul,"FMG Stadium Waikato, Hamilton",7:35 PM,20,2012,Chiefs,Crusaders,21,28,-7,L
164,06 Jul,Brisbane,9:40 PM,20,2012,Reds,Highlanders,19,13,6,W
165,07 Jul,Durban,5:10 AM,20,2012,Sharks,Bulls,32,10,22,W
166,07 Jul,"Eden Park, Auckland",7:35 PM,20,2012,Blues,Force,32,9,23,W
