In [87]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import sys
import timeit

# for pretty printing
from IPython.display import display

In [94]:
# Maps the CSS class found on the outcome cell to a single-letter result code.
_OUTCOME_CODES = {'won': 'W', 'tie': 'D', 'lost': 'L'}


def _parse_match_row(row):
    """Turn one <tr> of the match table into a match dict.

    Returns None when the row has to be left out: it is flagged
    'non-official', it lacks one of the expected cells, its score is not
    numeric, or neither participant is 'Magyarország'.
    """
    row_classes = row.attrs.get('class')
    if row_classes and row_classes[0] == 'non-official':
        # exclude 'non-official' games
        return None

    # extract individual tds; select_one yields None instead of raising
    td_date = row.select_one('td.date')
    td_match = row.select_one('td.match')
    td_outcome = row.select_one('td.official')
    td_league = row.select_one('td.league')
    td_score = row.select_one('td.result')
    cells = (td_date, td_match, td_outcome, td_league, td_score)
    if any(cell is None for cell in cells):
        # not a regular match row (e.g. a separator or an extra header row)
        return None

    match_link = td_match.find('a')
    if match_link is None:
        return None

    # goals; presumably fixtures without a final result carry a non-numeric
    # score - such rows are skipped instead of aborting the whole page
    try:
        home_text, away_text = td_score.text.split(' : ')[:2]
        home_goals = int(home_text)
        away_goals = int(away_text)
    except ValueError:
        return None

    participants = match_link.text.split(' - ')
    if len(participants) < 2:
        return None
    home_team = participants[0].strip()
    away_team = participants[1].strip()

    # outcome (W, D, L); the result class is read from the second class of
    # the cell, anything unknown or missing is reported as '?'
    outcome_classes = td_outcome.attrs.get('class') or []
    outcome_class = outcome_classes[1] if len(outcome_classes) > 1 else None

    # keys are inserted in a fixed order so downstream column order is stable
    data_row = {
        'date': td_date.text,
        'outcome': _OUTCOME_CODES.get(outcome_class, '?'),
        'type': td_league.text,
        'home_goals': home_goals,
        'away_goals': away_goals,
    }

    # opponent, home_or_away (H/A), hungary_goals and opponent_goals
    if home_team == 'Magyarország':
        data_row['home_or_away'] = 'H'
        data_row['opponent'] = away_team
        data_row['hungary_goals'] = home_goals
        data_row['opponent_goals'] = away_goals
    elif away_team == 'Magyarország':
        data_row['home_or_away'] = 'A'
        data_row['opponent'] = home_team
        data_row['hungary_goals'] = away_goals
        data_row['opponent_goals'] = home_goals
    else:
        # catch weird cases (things like Budapest instead of Hungary)
        return None

    return data_row


def parse_matches(raw_html):
    """Parse the matches listed on a magyarfutball.hu season page.

    Args:
        raw_html: HTML source of the page as a string.

    Returns:
        A list of dictionaries, one per official match involving
        'Magyarország', with the following keys:
            date            text of the date cell
            outcome         'W', 'D' or 'L' ('?' if it cannot be determined)
            type            text of the league cell (i.e. friendly,
                            qualifier, etc.)
            home_goals      goals of the home team (int)
            away_goals      goals of the away team (int)
            home_or_away    'H' or 'A'
            opponent        name of the other team
            hungary_goals   goals scored by Hungary (int)
            opponent_goals  goals scored by the opponent (int)
        The list is empty when the page contains no match table. Rows that
        cannot be parsed are skipped.
    """
    # create a new parser from the html
    soup = BeautifulSoup(raw_html, "html.parser")
    # extract the match table only
    match_table = soup.select('table.big.matches.zebra')
    # ensure there is a table
    if not match_table:
        return []

    rows = []
    # there should only be one item that matches the selection;
    # the first row is the header, so it is skipped
    for table_row in match_table[0].find_all('tr')[1:]:
        data_row = _parse_match_row(table_row)
        if data_row is not None:
            rows.append(data_row)

    # finally, return the list
    return rows


# for testing
#req2004 = requests.get('http://www.magyarfutball.hu/hu/magyar_valogatott/merkozesek/2004')
#matches2004 = parse_matches(req2004.text)
#for m in matches2004:
#    print(m)

In [95]:
# Scrape one page per year, parse it with parse_matches and collect every
# match in `dataset` (running counter -> match dict), then build a DataFrame.
dataset = {}
match_counter = 0
# years whose page could not be downloaded, reported at the end
failed_years = []

# loop through the years 1907 - current year
start_year = 1907
current_year = datetime.now().year

base_url = 'http://www.magyarfutball.hu/hu/magyar_valogatott/merkozesek/{}'
# seconds to wait for the server; without a timeout a stalled connection
# would block the cell indefinitely
request_timeout = 30

# start a timer
t1 = timeit.default_timer()

for y in range(start_year, current_year+1):
    print('Requesting {}...'.format(y))
    sys.stdout.flush()
    try:
        req = requests.get(base_url.format(y), timeout=request_timeout)
    except requests.RequestException as err:
        # keep what has been scraped so far instead of aborting the run
        print('\tRequest failed: {}'.format(err))
        failed_years.append(y)
        continue
    if req.status_code != 200:
        print('\tRequest failed with HTTP status {}'.format(req.status_code))
        failed_years.append(y)
        continue

    print('\tRequest successful, now parsing...')
    raw_html = req.text
    matches = parse_matches(raw_html)
    print('\t{} matches found'.format(len(matches)))
    sys.stdout.flush()
    for m in matches:
        dataset[match_counter] = m
        match_counter += 1

t2 = timeit.default_timer()
print('\nDone! Scraped {} matches in {} seconds'.format(len(dataset), round(t2-t1,1)))
if failed_years:
    # make incomplete data visible rather than silently dropping seasons
    print('WARNING: no data for the following years: {}'.format(failed_years))

# create pandas DataFrame
df = pd.DataFrame.from_dict(dataset, orient='index')

Requesting 1907...
	Request successful, now parsing...
	3 matches found
Requesting 1908...
	Request successful, now parsing...
	4 matches found
Requesting 1909...
	Request successful, now parsing...
	6 matches found
Requesting 1910...
	Request successful, now parsing...
	3 matches found
Requesting 1911...
	Request successful, now parsing...
	7 matches found
Requesting 1912...
	Request successful, now parsing...
	10 matches found
Requesting 1913...
	Request successful, now parsing...
	3 matches found
Requesting 1914...
	Request successful, now parsing...
	6 matches found
Requesting 1915...
	Request successful, now parsing...
	3 matches found
Requesting 1916...
	Request successful, now parsing...
	4 matches found
Requesting 1917...
	Request successful, now parsing...
	5 matches found
Requesting 1918...
	Request successful, now parsing...
	4 matches found
Requesting 1919...
	Request successful, now parsing...
	3 matches found
Requesting 1920...
	Request successful, now parsing...
	3 match

In [97]:
# write the scraped matches to a CSV file in the current working directory
csv_path = 'hungarian_nt_matches.csv'
df.to_csv(csv_path)