In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import sys
import timeit

# for pretty printing
from IPython.display import display

# Step 1: scrape raw data

In [2]:
def parse_matches(raw_html, year):
    """Parse Hungary's official matches from one magyarfutball.hu year page.

    Args:
        raw_html: HTML source of the page to parse.
        year: Year the page belongs to; it is appended to the text of the
            date cell to form the ``date`` value.

    Returns:
        A list of dictionaries, one per parsed match, with the keys:
            date            text of the date cell followed by ``year``
            outcome         'W', 'D' or 'L' ('?' if it cannot be determined)
            type            text of the league cell (friendly, qualifier, ...)
            home_goals      goals of the team listed first
            away_goals      goals of the team listed second
            home_or_away    'H' or 'A', from Hungary's point of view
            opponent        name of the team that is not 'Magyarország'
            hungary_goals   goals scored by Hungary
            opponent_goals  goals scored by the opponent
        An empty list is returned when the page has no match table.

        Rows are skipped when they are flagged 'non-official', when neither
        team is named 'Magyarország', or when an expected cell, the team
        link or a numeric score is missing.
    """
    rows = []
    # create a new parser from the html
    soup = BeautifulSoup(raw_html, "html.parser")
    # extract the match table only
    match_tables = soup.select('table.big.matches.zebra')
    if not match_tables:
        return rows

    cell_selectors = ('td.date', 'td.match', 'td.official', 'td.league', 'td.result')
    outcome_codes = (('won', 'W'), ('tie', 'D'), ('lost', 'L'))

    # there should only be one table that matches the selection;
    # its first row is the header, so start from the second one
    for m in match_tables[0].find_all('tr')[1:]:
        elem_class = m.attrs.get('class')
        if elem_class and elem_class[0] == 'non-official':
            # exclude 'non-official' games
            continue

        # extract individual tds; a row lacking any of them is not a match row
        found = [m.select(selector) for selector in cell_selectors]
        if not all(found):
            continue
        td_date, td_match, td_outcome, td_league, td_score = [cells[0] for cells in found]

        # outcome (W, D, L): look the marker class up anywhere in the class
        # list instead of relying on it sitting at a fixed position
        outcome_classes = td_outcome.attrs.get('class') or []
        outcome = '?'
        for css_class, code in outcome_codes:
            if css_class in outcome_classes:
                outcome = code
                break

        # goals: "<home> : <away>"; int() tolerates the surrounding blanks.
        # Rows without a numeric score are skipped -- presumably fixtures
        # that have not been played yet (TODO confirm against the live page).
        score_parts = td_score.text.split(':')
        try:
            home_goals = int(score_parts[0])
            away_goals = int(score_parts[1])
        except (IndexError, ValueError):
            continue

        # participants: "<home team> - <away team>" inside the match link
        link = td_match.find('a')
        if link is None:
            continue
        participants = link.text.split(' - ')
        if len(participants) < 2:
            continue
        home_team = participants[0].strip()
        away_team = participants[1].strip()

        if home_team == 'Magyarország':
            home_or_away, opponent = 'H', away_team
            hungary_goals, opponent_goals = home_goals, away_goals
        elif away_team == 'Magyarország':
            home_or_away, opponent = 'A', home_team
            hungary_goals, opponent_goals = away_goals, home_goals
        else:
            # catch weird cases (things like Budapest instead of Hungary)
            continue

        rows.append({
            'date': "{} {}".format(td_date.text, year),
            'outcome': outcome,
            'type': td_league.text,
            'home_goals': home_goals,
            'away_goals': away_goals,
            'home_or_away': home_or_away,
            'opponent': opponent,
            'hungary_goals': hungary_goals,
            'opponent_goals': opponent_goals,
        })

    # finally, return the list
    return rows


In [3]:
# Scrape one page per year, from 1907 up to the current year, and collect the
# parsed matches in `dataset` (running match number -> match dictionary).
dataset = {}
match_counter = 0
# years for which no page could be retrieved
failed_years = []

# loop through the years 1907 - current year
start_year = 1907
current_year = datetime.now().year

# without a timeout a stalled connection would block the loop forever
request_timeout = 30  # seconds

# start a timer
t1 = timeit.default_timer()

for y in range(start_year, current_year + 1):
    print('Requesting {}...'.format(y))
    sys.stdout.flush()
    try:
        req = requests.get(
            'http://www.magyarfutball.hu/hu/magyar_valogatott/merkozesek/{}'.format(y),
            timeout=request_timeout)
    except requests.RequestException as err:
        # one bad request should not throw away the years already scraped
        print('\tRequest failed: {}'.format(err))
        failed_years.append(y)
        continue

    if req.status_code != 200:
        # report the gap instead of skipping the year silently
        print('\tRequest failed with status code {}'.format(req.status_code))
        failed_years.append(y)
        continue

    print('\tRequest successful, now parsing...')
    raw_html = req.text
    matches = parse_matches(raw_html, y)
    print('\t{} matches found'.format(len(matches)))
    sys.stdout.flush()
    for m in matches:
        dataset[match_counter] = m
        match_counter += 1

t2 = timeit.default_timer()
print('\nDone! Scraped {} matches in {} seconds'.format(len(dataset), round(t2-t1,1)))
if failed_years:
    print('No data retrieved for: {}'.format(', '.join(str(y) for y in failed_years)))

# create pandas DataFrame (one row per match, indexed by match number)
df = pd.DataFrame.from_dict(dataset, orient='index')

Requesting 1907...
	Request successful, now parsing...
	3 matches found
Requesting 1908...
	Request successful, now parsing...
	4 matches found
Requesting 1909...
	Request successful, now parsing...
	6 matches found
Requesting 1910...
	Request successful, now parsing...
	3 matches found
Requesting 1911...
	Request successful, now parsing...
	7 matches found
Requesting 1912...
	Request successful, now parsing...
	10 matches found
Requesting 1913...
	Request successful, now parsing...
	3 matches found
Requesting 1914...
	Request successful, now parsing...
	6 matches found
Requesting 1915...
	Request successful, now parsing...
	3 matches found
Requesting 1916...
	Request successful, now parsing...
	4 matches found
Requesting 1917...
	Request successful, now parsing...
	5 matches found
Requesting 1918...
	Request successful, now parsing...
	4 matches found
Requesting 1919...
	Request successful, now parsing...
	3 matches found
Requesting 1920...
	Request successful, now parsing...
	3 match

# Step 2: data cleaning
First, convert the scraped date strings (Hungarian month name, day, year) into proper datetime values.

In [15]:
# Hungarian month names in calendar order, numbered from 1
months = {
    name: number
    for number, name in enumerate(
        [
            "január",
            "február",
            "március",
            "április",
            "május",
            "június",
            "július",
            "augusztus",
            "szeptember",
            "október",
            "november",
            "december",
        ],
        start=1,
    )
}


def _month_name_to_number(raw_date):
    """Swap the first word of `raw_date` (a month name) for its month number."""
    month_name = raw_date[:raw_date.index(" ")]
    return raw_date.replace(month_name, str(months[month_name]))


# step 1: turn the leading month name into a number ...
df["date"] = df["date"].apply(_month_name_to_number)
# step 2: ... so the string can be parsed with a purely numeric format
df["date"] = pd.to_datetime(df["date"], format="%m %d. %Y")
df.head()

Unnamed: 0,away_goals,opponent,home_goals,home_or_away,opponent_goals,hungary_goals,outcome,type,date
0,1,Ausztria,4,H,1,4,W,barátságos,1907-11-03
1,3,Cseh- és Morvaország,5,A,5,3,L,barátságos,1907-10-06
2,2,Cseh- és Morvaország,5,H,2,5,W,barátságos,1907-04-07
3,3,Ausztria,5,H,3,5,W,barátságos,1908-11-01
4,7,Anglia,0,H,7,0,L,barátságos,1908-06-10


Define a function to get the match type (qualifier, Euro, WC, friendly etc.)

In [16]:
# coarse match categories:
# Friendly, Qualifier, Olympics, European Championship, World Cup, Other
def extract_match_type(match_type):
    """Map a raw match-type label to one of the coarse categories.

    The comparison is case-insensitive. A label equal to 'barátságos' is a
    'Friendly'; otherwise the first keyword found in the label decides, in
    this order: 'selejtező' -> 'Qualifier', 'olimpia' -> 'Olympics',
    'eb' -> 'European Championship', 'vb' -> 'World Cup', 'ek' -> 'Friendly'.
    Anything else is 'Other'. Qualifiers are deliberately not split by
    competition.
    """
    label = match_type.lower()
    if label == 'barátságos':
        return 'Friendly'

    # order matters: 'selejtező' must win over the competition keywords
    keyword_rules = (
        ('selejtező', 'Qualifier'),
        ('olimpia', 'Olympics'),
        ('eb', 'European Championship'),
        ('vb', 'World Cup'),
        ('ek', 'Friendly'),
    )
    for keyword, category in keyword_rules:
        if keyword in label:
            return category
    return 'Other'

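Apply the function to derive a `match_type` column, then drop the uncategorised ('Other') matches and the raw `type` column.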

In [17]:
# derive the coarse category of every match from its raw type label
df['match_type'] = df['type'].map(extract_match_type)
# discard the matches that ended up in the 'Other' bucket
other_rows = df.loc[df['match_type'] == 'Other'].index
df.drop(other_rows, inplace=True)
# show which categories are left
print(df['match_type'].unique())

# the raw label is no longer needed
del df["type"]
df.head()

['Friendly' 'Olympics' 'World Cup' 'Qualifier' 'European Championship']


Unnamed: 0,away_goals,opponent,home_goals,home_or_away,opponent_goals,hungary_goals,outcome,date,match_type
0,1,Ausztria,4,H,1,4,W,1907-11-03,Friendly
1,3,Cseh- és Morvaország,5,A,5,3,L,1907-10-06,Friendly
2,2,Cseh- és Morvaország,5,H,2,5,W,1907-04-07,Friendly
3,3,Ausztria,5,H,3,5,W,1908-11-01,Friendly
4,7,Anglia,0,H,7,0,L,1908-06-10,Friendly


Map the opponents to English country names

In [18]:
# Lookup table: Hungarian opponent name -> English name, used to translate
# the `opponent` column.
# The mapping is not one-to-one: several Hungarian names share one English
# value (e.g. "Cseh- és Morvaország" and "Csehország" -> "Czech Republic",
# "Horvát Bánság" and "Horvátország" -> "Croatia", "Ausztria" and
# "Német-Ausztria" -> "Austria", "Moszkva" and "Oroszország" -> "Russia").
# NOTE(review): keys are presumably spelled exactly as on the scraped site --
# verify whenever a new opponent shows up.
countries = {
    "Albánia": "Albania",
    "Algéria": "Algeria",
    "Andorra": "Andorra",
    "Anglia": "England",
    "Antigua és Barbuda": "Antigua and Barbuda",
    "Argentína": "Argentina",
    "Ausztria": "Austria",
    "Ausztrália": "Australia",
    "Azerbajdzsán": "Azerbaijan",
    "Belgium": "Belgium",
    "Bolivia": "Bolivia",
    "Bosznia-Hercegovina": "Bosnia and Herzegovina",
    "Brazília": "Brazil",
    "Bulgária": "Bulgaria",
    "Chile": "Chile",
    "Ciprus": "Cyprus",
    "Cseh- és Morvaország": "Czech Republic",
    "Csehország": "Czech Republic",
    "Csehszlovákia": "Czechoslovakia",
    "Dánia": "Denmark",
    "Egyesült Arab Emírségek": "United Arab Emirates",
    "Egyesült Arab Köztársaság": "United Arab Republic",
    "Egyesült Államok": "USA",
    "Egyiptom": "Egypt",
    "Elefántcsontpart": "Cote d'Ivoire",
    "Fehéroroszország": "Belarus",
    "Feröer": "Faroe Islands",
    "Finnország": "Finland",
    "Franciaország": "France",
    "Grúzia": "Georgia",
    "Görögország": "Greece",
    "Holland-India": "Netherlands East-Indies",
    "Hollandia": "Netherlands",
    "Horvát Bánság": "Croatia",
    "Horvátország": "Croatia",
    "India": "India",
    "Irán": "Iran",
    "Izland": "Iceland",
    "Izrael": "Israel",
    "Japán": "Japan",
    "Jordánia": "Jordan",
    "Jugoszlávia": "Yugoslavia",
    "Kanada": "Canada",
    "Katar": "Qatar",
    "Kazahsztán": "Kazakhstan",
    "Kolumbia": "Colombia",
    "Koreai Köztársaság": "South Korea",
    "Kuvait": "Kuwait",
    "Kína": "China",
    "Lengyelország": "Poland",
    "Lettország": "Latvia",
    "Liechtenstein": "Liechtenstein",
    "Litvánia": "Lithuania",
    "Luxemburg": "Luxembourg",
    "Macedónia": "FYR Macedonia",
    "Mexikó": "Mexico",
    "Moldova": "Moldova",
    "Montenegró": "Montenegro",
    "Moszkva": "Russia",
    "Málta": "Malta",
    "Nagy-Britannia": "United Kingdom",
    "Norvégia": "Norway",
    "Német Demokratikus Köztársaság": "East Germany",
    "Német Szövetségi Köztársaság": "West Germany",
    "Német-Ausztria": "Austria",
    "Németország": "Germany",
    "Olaszország": "Italy",
    "Oroszország": "Russia",
    "Peru": "Peru",
    "Portugália": "Portugal",
    "Románia": "Romania",
    "Salvador": "El Salvador",
    "San Marino": "San Marino",
    "Skócia": "Scotland",
    "Spanyolország": "Spain",
    "Svájc": "Switzerland",
    "Svédország": "Sweden",
    "Szaúd-Arábia": "Saudi Arabia",
    "Szlovákia": "Slovakia",
    "Szlovénia": "Slovenia",
    "Szovjetunió": "Soviet Union",
    "Törökország": "Turkey",
    "Ukrajna": "Ukraine",
    "Uruguay": "Uruguay",
    "Wales": "Wales",
    "Ázsia válogatott": "Asian All Stars",
    "Észak-Írország": "Northern Ireland",
    "Észtország": "Estonia",
    "Írország": "Republic of Ireland",
    "Örményország": "Armenia",
    "Új-Zéland": "New Zealand"
}

In [19]:
# Translate the opponent names to English.
# Series.map() with a dict yields NaN for every name that is not a key of
# `countries`, which would silently erase the opponent. Report such names and
# keep the original spelling for them instead.
opponent_en = df["opponent"].map(countries)
unmapped = df.loc[opponent_en.isnull() & df["opponent"].notnull(), "opponent"].unique()
if len(unmapped) > 0:
    print("No English name for: {}".format(", ".join(sorted(map(str, unmapped)))))
df["opponent"] = opponent_en.fillna(df["opponent"])
df.head()

Unnamed: 0,away_goals,opponent,home_goals,home_or_away,opponent_goals,hungary_goals,outcome,date,match_type
0,1,Austria,4,H,1,4,W,1907-11-03,Friendly
1,3,Czech Republic,5,A,5,3,L,1907-10-06,Friendly
2,2,Czech Republic,5,H,2,5,W,1907-04-07,Friendly
3,3,Austria,5,H,3,5,W,1908-11-01,Friendly
4,7,England,0,H,7,0,L,1908-06-10,Friendly


In [20]:
# write the cleaned matches to disk, leaving out the DataFrame index
csv_path = 'hungarian_nt_matches.csv'
df.to_csv(csv_path, index=False)