In [41]:
# import libraries
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup as bs

# past seasons (2005 through 2018) as strings, plus the season still in progress
years = list(map(str, range(2005, 2019)))
this_year = '2019'
print(years)

# data sources: one page for the current season, one archive page per past season
current_year_url = 'http://www.superrugby.co.nz/Grandstand'
url = 'http://www.superrugby.co.nz/Grandstand/HistoricalResults/'  # the season year is appended to this

# getter function
def get_rugby_data(url, year):
    '''Download one season's results page and return it parsed.

    When ``year`` equals the module-level ``this_year`` nothing is appended
    to ``url``; for any other value the year string is appended to ``url``
    before the request is made.

    Parameters
    ----------
    url : str
        Base URL to fetch.
    year : str
        Season year, e.g. '2010'.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the 'html.parser' backend.

    Raises
    ------
    requests.RequestException
        If the request fails or times out, or the server answers with an
        HTTP error status.
    '''
    suffix = '' if year == this_year else year
    # Without a timeout the request can block indefinitely on a stalled server.
    page = requests.get(url + suffix, timeout=30)
    # Fail loudly instead of letting an error page be saved as season data.
    page.raise_for_status()
    return bs(page.text, 'html.parser')

['2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']


In [28]:
# get all previous years data: run this once
for i in years:
    data = get_rugby_data(url, i)
    # context manager closes the file even if the write raises
    with open("data/data_" + i + ".txt", "w+") as f:
        f.write(str(data))

In [31]:
# get this year's data: run this after every round
data = get_rugby_data(current_year_url, this_year)
# context manager closes the file even if the write raises
with open("data/data_" + this_year + ".txt", "w+") as f:
    f.write(str(data))

In [107]:
# organise data into pandas dataframes

# regex for finding round names: text starting with Round / Week / Semifinal / Final,
# optionally followed by a space and a 1-2 digit number (e.g. "Week 1", "Finals").
# Raw string so the regex escapes are not interpreted as (invalid) string escapes.
pattern = re.compile(r"^(Round|Week|Semifinal|Final)(\ \d{1,2})?.*$")

def outcome(f):
    '''Translate a score margin into a result code.

    Returns 'W' for a positive margin, 'L' for a negative one and 'D'
    for anything else.
    '''
    if f > 0:
        return 'W'
    return 'L' if f < 0 else 'D'
    
def data_nicey(year):
    '''Build a tidy results table for one season from its saved HTML page.

    Reads ``data/data_<year>.txt``, walks every ``<tr>`` in it and collects
    the non-empty ``<td>`` texts of each row. Cell texts that match the
    module-level ``pattern`` are taken as round labels and forward-filled
    onto the rows that follow. Rows with any missing column are dropped.

    Parameters
    ----------
    year : str
        Season year; used in the file name and stored in the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        Columns: date, location, time, round, year, home, away,
        hp (home points), ap (away points), sm (score margin, hp - ap) and
        ho (home outcome as returned by ``outcome``).
    '''
    table_nice = []
    table_round = []
    with open('data/data_' + year + '.txt') as f:
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed; same parser as the download step.
        data = bs(f.read(), 'html.parser')
    for row in data.find_all('tr'):
        cell_texts = [cell.text.strip() for cell in row.find_all('td')]
        table_nice.append([text for text in cell_texts if text])  # Get rid of empty values
        # a non-empty list only for rows that contain a round label
        table_round.append([text for text in cell_texts if pattern.match(text)])
    df1 = pd.DataFrame(table_nice)
    # carry each round label down to the rows below it
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    df2 = pd.DataFrame(table_round).ffill()
    df = pd.concat([df1, df2], axis=1).dropna()
    df['year'] = year
    df.columns = ['date', 'teams', 'location', 'time', 'score', 'round', 'year']
    # Split once on a stand-alone "v" surrounded by whitespace. Splitting on
    # the bare letter would cut any team name that contains a "v" in half.
    teams = df['teams'].str.split(r'\s+v\s+', n=1)
    df['home'] = teams.str[0].str.strip()
    df['away'] = teams.str[1].str.strip()
    # NOTE(review): a teams cell that itself starts with a round word (e.g.
    # "Final - Bulls v Stormers") matches `pattern`, so it is used as the
    # round label and "Final - Bulls" becomes the home team. Confirm whether
    # that prefix should be stripped.
    points = df['score'].str.split('-')
    df['hp'] = points.str[0].astype('int')  # home points
    df['ap'] = points.str[1].astype('int')  # away points
    df['sm'] = df['hp'] - df['ap']  # score margin
    df['ho'] = [outcome(x) for x in df['sm']]  # home outcome
    return df.drop(columns=['teams', 'score'])
    
# ready to be dropped:    
# data_nicey('2005')

df = data_nicey('2010')

# preview the first and last ten rows of the 2010 table
display(df.head(10), df.tail(10))



Unnamed: 0,date,location,time,round,year,home,away,hp,ap,sm,ho
2,23 May,Soweto,3:00 AM,Finals,2010,Bulls,Crusaders,39,24,15,W
3,23 May,Cape Town,5:10 AM,Finals,2010,Stormers,Waratahs,25,6,19,W
4,30 May,Soweto,3:00 AM,Final - Bulls v Stormers,2010,Final - Bulls,Stormers,25,17,8,W
7,12 Feb,"Eden Park, Auckland",7:35 PM,Week 1,2010,Blues,Hurricanes,20,34,-14,L
8,13 Feb,Perth,12:05 AM,Week 1,2010,W Force,Brumbies,15,24,-9,L
9,13 Feb,Bloemfontein,6:10 AM,Week 1,2010,Cheetahs,Bulls,34,51,-17,L
10,13 Feb,"AMI Stadium, Christchurch",7:35 PM,Week 1,2010,Crusaders,Highlanders,32,17,15,W
11,13 Feb,Brisbane,9:40 PM,Week 1,2010,Reds,Waratahs,28,30,-2,L
12,14 Feb,Johannesburg,4:05 AM,Week 1,2010,Lions,Stormers,13,26,-13,L
13,14 Feb,Durban,6:10 AM,Week 1,2010,Sharks,Chiefs,18,19,-1,L


Unnamed: 0,date,location,time,round,year,home,away,hp,ap,sm,ho
107,09 May,Pretoria,1:00 AM,Week 13,2010,Bulls,Crusaders,40,35,5,W
108,09 May,Bloemfontein,3:05 AM,Week 13,2010,Cheetahs,W Force,29,14,15,W
109,09 May,Durban,5:10 AM,Week 13,2010,Sharks,Stormers,20,14,6,W
111,14 May,"AMI Stadium, Christchurch",7:35 PM,Week 14,2010,Crusaders,Brumbies,40,22,18,W
112,14 May,Sydney,9:40 PM,Week 14,2010,Waratahs,Hurricanes,32,16,16,W
113,15 May,Durban,5:10 AM,Week 14,2010,Sharks,W Force,27,22,5,W
114,15 May,"Eden Park, Auckland",7:35 PM,Week 14,2010,Blues,Chiefs,30,20,10,W
115,15 May,Brisbane,9:40 PM,Week 14,2010,Reds,Highlanders,38,36,2,W
116,16 May,Bloemfontein,1:00 AM,Week 14,2010,Cheetahs,Lions,59,10,49,W
117,16 May,Cape Town,3:05 AM,Week 14,2010,Stormers,Bulls,38,10,28,W
