In [3]:
# Import relevant libraries
from bs4 import BeautifulSoup as bs
import requests
import re

In [4]:
def fetch_season_links(): # Make a request
    req = requests.get('https://levskisofia.info/seasons/')
    page = bs(req.content)
    # Extract all links to the seasons in the 2010s and 2020s
    season_links = []
    for link in page.find_all("a",string=re.compile(r'^20[12]')):
        season_links.append(link['href'])
    
    return season_links


In [5]:
# Attributes we want: Opponent, Result, Scorers, Coach, Date, Hosted?, Tournament
def fetch_match_links(season_link):
    try:
        req = requests.get(season_link)
    except Exception as e:
        print(e)
    page = bs(req.content)

    tournament_links = [] 
    for link in page.select("td.plt.bds > a"):
        tournament_links.append(link['href']) #to-do: remove code duplication

    matches_links = []

    for tournament in tournament_links:
        req = requests.get(tournament)
        page = bs(req.content)
        for link in page.select("td.nobrlt.nobrrt > a"):
            matches_links.append(link['href'])
    
    return matches_links

In [6]:
# Result of match, paramters are levski's goalscore and the opponent's
def result(levski, opp):
    if (levski > opp):
        return "win"
    if (levski == opp):
        return "tie"
    return "loss"

def fetch_match_info(match_link): 
    try:
        req = requests.get(match_link)
    except Exception as e:
        print(e)
        
    page = bs(req.content)
    # We'll gather relevant match info in a dictionary for the match:
    match_info = {}
    # Select opponent's tag and extract the text:
    match_info['opponent_name'] = page.find("div",class_="description").find("a").text

    # Results of matches: '1:1', '3:1', '2:2'.
    score_pattern = re.compile(r'(\d):(\d)')
    # We're getting this result from the match description since it's standardized into "Levski's score : Opponent's score"
    text = page.select_one('div.description > p > span').text
    
    levski_goals = int(score_pattern.search(text).group(1))
    opp_goals = int(score_pattern.search(text).group(2))
 
    match_info['levski_goals'] = levski_goals
    match_info['opponent_goals'] = opp_goals
    match_info['result'] = result(levski_goals,opp_goals)

    # Levski's coach, they have a link on the website so that why we use .find("a") to distuingish them
    match_info['coach'] = page.find("div",class_="coach").find("a").text

    # Did Levski host the match? Levksi's home stadium is Стадион "Георги Аспарухов", София
    info_box = page.find("div",class_="info").text
    match_info['hosted'] = bool(re.search("Георги Аспарухов", info_box))

    # Add list of scorers
    if levski_goals > 0:
        scorers_tags = page.find_all("span", class_="gRes")
        match_info['scorers'] = [scorer.next_sibling.text for scorer in scorers_tags]

    # Day, date, hour of match
    date_info = page.select_one('div.info > p.p1').text.split(sep=", ")
    match_info['date'] = date_info[1]
   
    # Add tournament name
    match_info['tournament'] = page.select_one('div.info > p.p2 > a').text
    return match_info

In [7]:
fetch_match_info('https://levskisofia.info/match/19116605-sevlievo-sevlievo-levski-sofia-1-2/')

{'opponent_name': 'Видима-Раковски (Севлиево)',
 'levski_goals': 2,
 'opponent_goals': 1,
 'result': 'win',
 'coach': 'Ясен Петров',
 'hosted': False,
 'scorers': ['Гара Дембеле', 'Дарко Тасевски', 'Герасим Заков'],
 'date': '30 август 2010 година',
 'tournament': '"А" Футболна група'}

In [8]:
import itertools

def one_season_all_matches_info(season_link):
    matches = fetch_match_links(season_link)
    
    list_of_matches = []
    for match_link in matches:
        try:
            list_of_matches.append(fetch_match_info(match_link))
        except Exception as e:
            print(match_link)
            print(e)
    
    return list_of_matches

def scrape_matches():
    all_matches = []
    season_links = fetch_season_links()
    for season_link in season_links:
        all_matches.append(one_season_all_matches_info(season_link))
        
    return list(itertools.chain.from_iterable(all_matches))

    

In [9]:
all_matches = scrape_matches()

https://levskisofia.info/match/19116619-kaliakra-1923-kavarna-levski-sofia-0-3/
'NoneType' object has no attribute 'find'
https://levskisofia.info/match/19120403-liteks-lovech-levski-sofia-1-1/
'NoneType' object has no attribute 'group'
https://levskisofia.info/match/19240502-spartak-tarnava-levski-sofia-2-1/
'NoneType' object has no attribute 'group'
https://levskisofia.info/match/19320409-beroe-stara-zagora-levski-sofia-3-3/
'NoneType' object has no attribute 'group'
https://levskisofia.info/match/19420404-levski-sofia-cska-sofia-0-0/
'NoneType' object has no attribute 'group'
https://levskisofia.info/match/19616602-levski-sofia-liteks-lovech-2-2/
'NoneType' object has no attribute 'find'
https://levskisofia.info/match/19616611-liteks-lovech-levski-sofia-1-2/
'NoneType' object has no attribute 'find'
https://levskisofia.info/match/19616620-levski-sofia-liteks-lovech-3-0/
'NoneType' object has no attribute 'find'
https://levskisofia.info/match/19620401-pomorie-pomorie-levski-sofia-0-0

In [10]:
# TO-DO: matches with continuation for penalties aren't correctly processed
# TO-DO: Auto wins, anulled matches - no descriptions, goals for Levski even though they didn't score. *** Manual entry?
# TO-DO: exclude from getting pages of future matches
len(all_matches)

459

In [11]:
import json

def save_data(title, data):
  base_path = "../data/"
  with open(base_path + title, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
save_data("levski_new.json", all_matches)