In [1]:
# import libraries
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup as bs

# season labels for the historical pages: '2005' through '2018'
years = list(map(str, range(2005, 2019)))
this_year = '2019'
print(years)

# data sources: the current season page, and the historical base URL
current_year_url = 'http://www.superrugby.co.nz/Grandstand'
url = 'http://www.superrugby.co.nz/Grandstand/HistoricalResults/' # year appends here

# getter function
def get_rugby_data(url, year, timeout=30):
    '''Download a results page and return it as a parsed BeautifulSoup tree.

    url     -- base address to fetch
    year    -- season label (string); appended to `url` unless it equals
               the module-level `this_year`, in which case `url` is used as is
    timeout -- seconds to wait for the server before giving up

    Raises requests.HTTPError on a 4xx/5xx response, so an error page is
    never handed back (and saved) as if it were season data.
    '''
    suffix = '' if year == this_year else year
    # without a timeout a stalled connection would hang the cell indefinitely
    page = requests.get(url + suffix, timeout=timeout)
    page.raise_for_status()
    return bs(page.text, 'html.parser')

['2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']


In [28]:
# get all previous years' data: run this once
for year in years:
    data = get_rugby_data(url, year)
    # the context manager closes the file even if the write raises
    with open("data/data_" + year + ".txt", "w+") as f:
        f.write(str(data))

In [2]:
# get this year's data, run this after every round
data = get_rugby_data(current_year_url, this_year)
# the context manager closes the file even if the write raises
with open("data/data_" + this_year + ".txt", "w+") as f:
    f.write(str(data))

In [26]:
#### organise data into pandas dataframes

# regex for finding round names
# (raw string: the regex escapes "\ " and "\d" are not valid Python string
# escapes, so a plain literal triggers an invalid-escape warning)
pattern = re.compile(r"^(Round|Week|Semifinal|Final|Qualifiers|Semis)(\ \d{1,2})?.*$")

def outcome(f):
    '''Turn a score margin into a result code.

    Returns 'W' for a positive margin, 'L' for a negative one, and 'D'
    for everything else (zero, or a value such as NaN that is neither
    above nor below zero).
    '''
    if f > 0:
        return 'W'
    return 'L' if f < 0 else 'D'

def fix_round(f):
    '''Normalise a round heading to a short label.

    'Week N...' / 'Round N...'            -> the (up to two-character) number
    'Qualifiers...'                       -> 'QF'
    'Finals...', 'Semifinals', 'Semis'    -> 'SF'
    'Final' or 'Final ...'                -> 'GF'
    anything else                         -> returned unchanged
    '''
    if f.startswith('Week'):
        # the two-character slice picks up a trailing space when a
        # single-digit number is followed by more text; strip it so the
        # label is '1' rather than '1 '
        return f[5:7].strip()
    if f.startswith('Round'):
        return f[6:8].strip()
    if f.startswith('Qualifiers'):
        return 'QF' # quarter final
    # 'Finals' must be tested before 'Final' so the plural maps to SF
    if f.startswith('Finals') or f in ('Semifinals', 'Semis'):
        return 'SF' # semi final
    if f.startswith('Final ') or f == 'Final':
        return 'GF' # grand final
    return f
    
def data_nicey(year):
    '''Parse the saved page for one season into a results DataFrame.

    Reads 'data/data_<year>.txt', collects the non-empty cell text of every
    <tr>, and separately collects cells that look like a round heading
    (see `pattern`). Round headings are forward-filled onto the rows below
    them; rows that do not have every column populated are dropped.

    year -- season label (string); used in the file name and stored in the
            'year' column

    Returns a DataFrame with columns
    date, location, time, round, year, home, away, hp, ap, sm, ho.
    '''
    table_nice = []
    table_round = []
    with open('data/data_' + year + '.txt') as f:
        # name the parser explicitly (the same one get_rugby_data uses) so the
        # parse does not depend on which optional parsers are installed
        data = bs(f.read(), 'html.parser')
    for row in data.find_all('tr'):
        cells = [td.text.strip() for td in row.find_all('td')]
        table_nice.append([c for c in cells if c]) # Get rid of empty values
        table_round.append([c for c in cells if c and pattern.match(c)])
    df1 = pd.DataFrame(table_nice)
    # carry each round heading down to the match rows beneath it
    df2 = pd.DataFrame(table_round).ffill()
    df = pd.concat([df1, df2], axis=1).dropna()
    df['year'] = year
    df.columns = ['date','teams','location','time','score','round','year']
    # split on a standalone "v" only: a bare 'v' split would cut team names
    # that contain the letter and would leave padding around the names
    teams = df['teams'].str.split(r'\s+v\s+', n=1)
    df['home'] = teams.str[0]
    df['away'] = teams.str[1]
    points = df['score'].str.split('-')
    df['hp'] = points.str[0].astype('int') # home points
    df['ap'] = points.str[1].astype('int') # away points
    df['sm'] = df['hp'] - df['ap'] # score margin
    df['ho'] = [outcome(x) for x in df['sm']] # home outcome
    df['round'] = [fix_round(x) for x in df['round']]
    remove_columns = ['teams','score']
    df = df.drop(columns=remove_columns)
    return df

In [98]:
# data inconsistencies to fix:
# 2012 tables messed up, needs fixing
# finals data missing in 2008
# semifinals data missing in 2007

df_2005 = data_nicey('2005')
df_2006 = data_nicey('2006')
df_2007 = data_nicey('2007')
df_2008 = data_nicey('2008')

# fixing inconsistencies
df_2005.loc[(df_2005['date'] == '28 May'), 'round'] = "GF" # 2005 no final fixed
df_2006 = df_2006.drop(5) # remove bogus final data from 2006

# add the missing 2007 semifinals by hand; hp/ap/sm are ints so those columns
# keep the integer dtype that data_nicey produces (strings would turn them
# into mixed object columns)
semis_2007 = pd.DataFrame([
    {'date' : '12 May',
     'location' : 'Durban',
     'time' : '3:00 PM',
     'round' : 'SF',
     'year' : '2007',
     'home' : 'Sharks',
     'away' : 'Blues',
     'hp' : 34,
     'ap' : 18,
     'sm' : 16,
     'ho' : 'W'},
    {'date' : '12 May',
     'location' : 'Pretoria',
     'time' : '5:30 PM',
     'round' : 'SF',
     'year' : '2007',
     'home' : 'Bulls',
     'away' : 'Crusaders',
     'hp' : 27,
     'ap' : 12,
     'sm' : 15,
     'ho' : 'W'},
])
# DataFrame.append was removed in pandas 2.0; concat is the replacement
df_2007 = pd.concat([df_2007, semis_2007], ignore_index=True)

display(df_2008)

Unnamed: 0,date,location,time,round,year,home,away,hp,ap,sm,ho
2,15 Feb,"AMI Stadium, Christchurch",7:35 PM,1,2008,Crusaders,Brumbies,34,3,31,W
3,15 Feb,Brisbane,10:05 PM,1,2008,Reds,Highlanders,22,16,6,W
4,16 Feb,Durban,6:10 AM,1,2008,Sharks,W Force,17,10,7,W
5,16 Feb,"Eden Park, Auckland",7:35 PM,1,2008,Blues,Chiefs,32,14,18,W
6,16 Feb,Sydney,9:40 PM,1,2008,Waratahs,Hurricanes,20,3,17,W
7,17 Feb,Cape Town,4:00 AM,1,2008,Stormers,Bulls,9,16,-7,L
8,17 Feb,Bloemfontein,6:05 AM,1,2008,Cheetahs,Lions,22,23,-1,L
10,22 Feb,"Westpac Stadium, Wellington",7:35 PM,2,2008,Hurricanes,Reds,23,18,5,W
11,23 Feb,Bloemfontein,5:00 AM,2,2008,Cheetahs,W Force,15,16,-1,L
12,23 Feb,Pretoria,7:05 AM,2,2008,Bulls,Crusaders,19,54,-35,L
