# Scraper

## Library Imports

In [189]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm
import pandas as pd
import os
import numpy as np
import pickle

import data_cleaning
import feature_engineering

In [116]:
# Create the output directory tree used by the scraper cells below.
# exist_ok=True makes each call a no-op when the directory already exists,
# replacing the separate (race-prone) os.path.exists check.
os.makedirs('./data/to_predict', exist_ok=True)

leagues = ['primera_division', 'segunda_division', # Spain
            'serie_a', 'serie_b',  # Italy
            'bundesliga', '2_liga', # Germany
            'ligue_1', 'ligue_2', # France
            'premier_league', 'championship', # England
            'eredivisie', 'eerste_divisie', # Netherlands
            ]
# One sub-directory per league for the scraped results CSV files.
for league in leagues:
    os.makedirs(f'./data/to_predict/{league}', exist_ok=True)

# Pickled ELO dictionaries and per-match info CSV files.
os.makedirs('./data/elo', exist_ok=True)
os.makedirs('./data/match_info', exist_ok=True)

In [200]:
class Scraper():
    """Scrape results, ELO ratings and match details for one league season.

    The pages are fetched with ``requests`` from ``url`` (besoccer.com by
    default) and parsed with BeautifulSoup. Typical use is to call
    ``scrape_league()`` first, because it collects the match links that
    ``scrape_elo()`` and ``scrape_cards()`` iterate over.

    Attributes:
        league: League slug used in the URLs and in output file names.
        url: Base competition URL.
        year: Season identifier used in the URLs and in output file names.
        matchday: Number of rounds that ``scrape_league()`` will request.
        links: Match links, set by ``scrape_league()``.
    """

    # Placeholder stored whenever a value cannot be found on a page.
    MISSING = 'N/A'

    def __init__(self, league:str, url:str='https://www.besoccer.com/competition', year: int=2023) -> None:
        """Store the settings and determine how many rounds to scrape.

        For 'championship' and 'eerste_divisie' seasons before 2023 the round
        count is hard-coded (46 and 38); otherwise it is read from the first
        number found in the 'panel-title' element of the league scores page,
        which requires a network request.
        """
        self.league = league
        self.url = url
        self.year = year
        if self.league == 'championship' and self.year < 2023:
            self.matchday = 46
        elif self.league == 'eerste_divisie' and self.year < 2023:
            self.matchday = 38
        else:
            r = requests.get(f"{self.url}/scores/{self.league}/{self.year}")
            soup = bs(r.content, 'html.parser')
            matchday_str = soup.find('div', {'class': 'panel-title'}).text
            # First whitespace-separated token that is purely digits.
            self.matchday = [int(s) for s in matchday_str.split() if s.isdigit()][0]

    @classmethod
    def _text_or_na(cls, tag, strip: bool = True) -> str:
        """Return ``tag.text`` (stripped by default) or 'N/A' when tag is None."""
        if tag is None:
            return cls.MISSING
        return tag.text.strip() if strip else tag.text

    @classmethod
    def _parse_elo(cls, elo_box) -> tuple:
        """Return ``(home_elo, away_elo)`` read from the comparison box.

        The values are taken from the second ``<tr>`` of the box. 'N/A' is
        returned for anything missing, including a box with fewer than two
        rows (which previously raised IndexError).
        """
        if elo_box is None:
            return cls.MISSING, cls.MISSING
        rows = elo_box.find_all('tr')
        if len(rows) < 2:
            return cls.MISSING, cls.MISSING
        elo_row = rows[1]
        home_elo = cls._text_or_na(elo_row.find('td', {'class': 'team1-c'}))
        away_elo = cls._text_or_na(elo_row.find('td', {'class': 'team2-c'}))
        return home_elo, away_elo

    def scrape_league(self) -> None:
        """Scrape every round of the season and save the results as CSV.

        Writes ``./data/to_predict/{league}/Results_{year}_{league}.csv`` and
        stores the scraped match links in ``self.links``.
        """
        results = {'Home_Team': [], 'Away_Team': [], 'Result': [], 'Link': [], 'Season': [], 'Round': [], 'League': []}
        # These two leagues use an extra 'group1' segment in the round URL.
        grouped = self.league in ('championship', 'eerste_divisie')
        for matchday in tqdm(range(self.matchday)):
            round_path = f"group1/round{matchday+1}" if grouped else f"round{matchday+1}"
            r = requests.get(f"{self.url}/scores/{self.league}/{self.year}/{round_path}")
            soup = bs(r.content, 'html.parser')
            matches_box = soup.find('div', {'class': 'panel-body p0 match-list-new'})
            if matches_box is None:
                # No match list on this page: nothing to record for the round.
                continue
            for match in matches_box.find_all('a', {'class': 'match-link'}):
                home_team = match.find('div', {'class': 'team-info ta-r'}).find('div', {'class': 'name'}).text.strip()
                away_team = match.find_all('div', {'class': 'team-info'})[1].find('div', {'class': 'name'}).text.strip()
                # A missing marker or score span yields 'N/A' for that side.
                marker = match.find('div', {'class': 'marker'})
                home_score = self._text_or_na(marker.find('span', {'class': 'r1'}) if marker is not None else None)
                away_score = self._text_or_na(marker.find('span', {'class': 'r2'}) if marker is not None else None)
                results['Home_Team'].append(home_team)
                results['Away_Team'].append(away_team)
                results['Result'].append(f'{home_score}-{away_score}')
                results['Link'].append(match.get('href'))
                results['Season'].append(self.year)
                # NOTE(review): Round is stored 0-based (the loop index) while
                # the URL uses the 1-based round number - confirm this is what
                # downstream code expects.
                results['Round'].append(matchday)
                results['League'].append(self.league)
        results_df = pd.DataFrame(results)
        results_df.to_csv(f'./data/to_predict/{self.league}/Results_{self.year}_{self.league}.csv', index=False)
        self.links = results['Link']

    def scrape_elo(self) -> None:
        """Scrape the home/away ELO of every match in ``self.links``.

        Requests ``{link}/analysis`` for each match and pickles a dict mapping
        the link to ``{'Elo_home': ..., 'Elo_away': ...}`` into
        ``./data/elo/elo_dict_{year}_{league}.pkl``. Requires ``self.links``,
        which is set by ``scrape_league()``.
        """
        elo_dict = {}
        for link in tqdm(self.links):
            r = requests.get(link + '/analysis')
            soup = bs(r.content, 'html.parser')
            elo_box = soup.find('div', {'class': 'panel-body pn compare-data'})
            home_elo, away_elo = self._parse_elo(elo_box)
            elo_dict[link] = {'Elo_home': home_elo,
                              'Elo_away': away_elo}

        with open(f'./data/elo/elo_dict_{self.year}_{self.league}.pkl', 'wb') as f:
            pickle.dump(elo_dict, f)

    def scrape_cards(self) -> None:
        """Scrape date, referee and card counts of every match in ``self.links``.

        Writes ``./data/match_info/Match_Info_{year}_{league}.csv``. Requires
        ``self.links``, which is set by ``scrape_league()``.
        """
        match_info = {'Link': [], 'Date_New': [], 'Referee': [], 'Home_Yellow': [], 'Home_Red': [],
                        'Away_Yellow': [], 'Away_Red': []}
        for link in tqdm(self.links):
            r = requests.get(link)
            soup = bs(r.content, 'html.parser')

            date_box = soup.find('div', {'class': 'date header-match-date'})
            if date_box is not None:
                date_new = self.edit_date(date_box.text.strip(), self.year)
            else:
                date_new = np.nan

            ref_field = soup.find('div', string ='On field referee')
            if ref_field is not None:
                # The name is read three parse-tree elements after the label.
                ref = ref_field.next_element.next_element.next_element.text
            else:
                ref = self.MISSING

            # The first 'cards' box is treated as the home side, the second as
            # the away side; any further boxes are ignored. Counts default to '0'.
            cards = {'Home_Yellow': '0', 'Home_Red': '0', 'Away_Yellow': '0', 'Away_Red': '0'}
            card_boxes = soup.find_all('div', {'class': 'cards'})
            for side, team_cards in zip(('Home', 'Away'), card_boxes):
                yc = team_cards.find('span', {'class': 'yc'})
                rc = team_cards.find('span', {'class': 'rc'})
                if yc is not None:
                    cards[f'{side}_Yellow'] = yc.text
                if rc is not None:
                    cards[f'{side}_Red'] = rc.text

            match_info['Link'].append(link)
            match_info['Date_New'].append(date_new)
            match_info['Referee'].append(ref)
            for column, count in cards.items():
                match_info[column].append(count)
        match_info_df = (pd.DataFrame(match_info)
                            .assign(Date_New = lambda df_: pd.to_datetime(df_.Date_New, dayfirst=True))
        )
        match_info_df.to_csv(f'./data/match_info/Match_Info_{self.year}_{self.league}.csv', index=False)

    def edit_date(self, date, year):
        """Turn a scraped date such as 'SAT, 05 AUG 21:00' into '05-08-{year} 21:00'.

        The weekday prefix is removed and the month abbreviation is replaced by
        '-MM-{year}'. Text that matches neither pattern is returned unchanged.

        NOTE(review): every month is stamped with the same ``year``; if a
        season spans two calendar years, part of its dates will carry the
        wrong year - confirm against the source pages.
        """
        weekday_prefixes = ('FRI, ', 'SAT, ', 'SUN, ', 'MON, ', 'TUE, ', 'WED, ', 'THU, ')
        month_names = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
                       'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')
        for prefix in weekday_prefixes:
            date = date.replace(prefix, '')
        for month_number, month_name in enumerate(month_names, start=1):
            date = date.replace(f' {month_name}', f'-{month_number:02d}-{year}')

        return date


In [201]:
# Scrape the Premier League with the default season (year=2023): results
# first, then the ELO ratings and the per-match details for the same links.
prem_2023 = Scraper(league='premier_league')
for scrape_step in (prem_2023.scrape_league, prem_2023.scrape_elo, prem_2023.scrape_cards):
    scrape_step()

100%|██████████| 14/14 [00:05<00:00,  2.41it/s]
100%|██████████| 140/140 [01:54<00:00,  1.22it/s]
100%|██████████| 140/140 [01:18<00:00,  1.79it/s]


In [168]:
# Load the Premier League match-info CSV and preview its first rows.
match_info_prem_2023_path = './data/match_info/Match_Info_2023_premier_league.csv'
match_info_prem_2023_df = pd.read_csv(match_info_prem_2023_path)
match_info_prem_2023_df.head()
# pd.to_datetime(match_info_prem_2023_df.Date_New, dayfirst=True)

Unnamed: 0,Link,Date_New,Referee,Home_Yellow,Home_Red,Away_Yellow,Away_Red
0,https://www.besoccer.com/match/crystal-palace-...,05-08-2023 21:00,Anthony Taylor,1,0,2,0
1,https://www.besoccer.com/match/fulham/liverpoo...,06-08-2023 13:30,Andy Madley,2,0,2,0
2,https://www.besoccer.com/match/afc-bournemouth...,06-08-2023 16:00,Peter Bankes,3,0,3,0
3,https://www.besoccer.com/match/leeds-united-af...,06-08-2023 16:00,Robert Jones,2,0,3,0
4,https://www.besoccer.com/match/newcastle-unite...,06-08-2023 16:00,Simon Hooper,3,0,3,0


In [192]:
# League slugs to scrape, grouped by country.
leagues_by_country = {
    'Spain': ('primera_division', 'segunda_division'),
    'Italy': ('serie_a', 'serie_b'),
    'Germany': ('bundesliga', '2_liga'),
    'France': ('ligue_1', 'ligue_2'),
    'England': ('premier_league', 'championship'),
    'Netherlands': ('eredivisie', 'eerste_divisie'),
}
# Flatten to one list, keeping the country order above.
leagues = [slug for slugs in leagues_by_country.values() for slug in slugs]

# Season identifiers to scrape.
years = [2022, 2023]

In [None]:
# Run the full scrape (results, ELO, match details) for every season/league
# pair, iterating over all leagues of one season before moving to the next.
for year, league in [(y, lg) for y in years for lg in leagues]:
    print(f'Getting information about: {league} - {year}')
    league_year = Scraper(league, year=year)
    for scrape_step in (league_year.scrape_league, league_year.scrape_elo, league_year.scrape_cards):
        scrape_step()

In [89]:
# Merge the per-season, per-league ELO pickles into one dictionary keyed by
# match link and save it. Each file is opened in a `with` block so its handle
# is closed as soon as it has been read.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
elo_dict = dict()
for year in years:
    for league in leagues:
        with open(f'./data/elo/elo_dict_{year}_{league}.pkl', 'rb') as f:
            elo_year_league = pickle.load(f)
        elo_dict.update(elo_year_league)

with open('./data/elo/elo_dict_predict.pkl', 'wb') as f:
    pickle.dump(elo_dict, f)

In [91]:
# Reload the merged ELO dictionary; the `with` block closes the file handle.
with open('./data/elo/elo_dict_predict.pkl', 'rb') as f:
    elo_dict = pickle.load(f)

In [195]:
# Combine the per-season, per-league match-info CSV files into one table.
# The frames are collected first and concatenated once, instead of re-copying
# the growing result on every iteration.
match_info_frames = []
for year in years:
    for league in leagues:
        league_year_match_info_df = pd.read_csv(f'./data/match_info/Match_Info_{year}_{league}.csv')
        match_info_frames.append(league_year_match_info_df)

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
match_info_df = pd.concat(match_info_frames) if match_info_frames else pd.DataFrame()

match_info_df.to_csv('./data/match_info/Match_Info_predict.csv', index=False)

In [199]:
# Reload the combined match-info table from disk.
match_info_predict_path = './data/match_info/Match_Info_predict.csv'
match_info_df = pd.read_csv(match_info_predict_path)