## SoccerCovid - Data Scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [None]:
def get_match_raw(url):
    """Download a match page and return it as a parsed BeautifulSoup document.

    Args:
        url: Address of the match page to fetch.

    Returns:
        The response body parsed with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # requests has no default timeout; without one a single stalled
    # connection would block the whole scraping run indefinitely.
    response = requests.get(url, timeout=30)
    return BeautifulSoup(response.text, "html.parser")

def get_parced_data(raw):
    """Parse an HTML string into a BeautifulSoup document.

    Args:
        raw: HTML markup, e.g. a page that was previously saved to CSV.

    Returns:
        The markup parsed with the ``html.parser`` backend.
    """
    soup = BeautifulSoup(raw, features='html.parser')
    return soup

def get_match_schedule(raw):
    """Return the text of the first ``<time>`` tag on a parsed match page.

    Args:
        raw: Parsed page exposing ``find_all``.

    Returns:
        The ``.string`` of the first ``<time>`` tag, or ``None`` when the
        page has no ``<time>`` tag.
    """
    time_tags = raw.find_all("time")
    if not time_tags:
        return None
    return time_tags[0].string

def get_venue_name(raw):
    """Return the venue text from a parsed match page.

    Looks for ``<span>`` tags carrying the
    ``sdc-site-match-header__detail-venue`` class.

    Args:
        raw: Parsed page exposing ``find_all``.

    Returns:
        The ``.string`` of the first matching span, or ``None`` when no such
        span exists.
    """
    venue_spans = raw.find_all("span", {"class": "sdc-site-match-header__detail-venue"})
    if not venue_spans:
        return None
    return venue_spans[0].string

def get_match_title(raw):
    """Return the text of the page's ``<title>`` element.

    Args:
        raw: Parsed page exposing a ``title`` attribute.

    Returns:
        The ``.string`` of ``raw.title``.
    """
    title_tag = raw.title
    return title_tag.string

def get_team_names(title, title_regex):
    """Extract both team names from a match page title.

    Args:
        title: Page title text.
        title_regex: Compiled pattern with ``teamA`` and ``teamB`` named groups.

    Returns:
        ``(teamA, teamB)`` when the pattern matches at the start of ``title``,
        otherwise ``None``.
    """
    found = title_regex.match(title)
    if found is None:
        return None
    return found.group('teamA', 'teamB')

def get_team_scores(title, title_regex):
    """Extract both scores from a match page title.

    Args:
        title: Page title text.
        title_regex: Compiled pattern with ``teamA_score`` and ``teamB_score``
            named groups.

    Returns:
        ``(teamA_score, teamB_score)`` as the matched strings when the pattern
        matches at the start of ``title``, otherwise ``None``.
    """
    found = title_regex.match(title)
    if found is None:
        return None
    return found.group('teamA_score', 'teamB_score')

def get_winner_details(record):
    """Return the winning side of a match as ``(team, score)``.

    Args:
        record: Mapping (dict or DataFrame row) with the keys ``teamA``,
            ``teamB``, ``teamA_score`` and ``teamB_score``.

    Returns:
        ``(teamA, teamA_score)`` when team A scored more, otherwise
        ``(teamB, teamB_score)``. The score is returned in the form it was
        given. Returns ``None`` when ``teamA_score`` is missing (``None`` or
        an empty string).
    """
    score_a = record['teamA_score']
    score_b = record['teamB_score']

    # Test for "missing" explicitly: plain truthiness would also throw away a
    # legitimate integer score of 0.
    if score_a is None or score_a == '':
        return None

    # Scores parsed from the page title are strings, and comparing strings is
    # lexicographic ('10' > '9' is False), so compare numerically.
    try:
        team_a_wins = float(score_a) > float(score_b)
    except (TypeError, ValueError):
        # Not numeric: fall back to comparing the raw values.
        team_a_wins = score_a > score_b

    if team_a_wins:
        return record['teamA'], score_a
    # NOTE(review): a draw also ends up here and is credited to team B --
    # confirm that is the intended handling of drawn matches.
    return record['teamB'], score_b

In [None]:
# Results index page; the season suffix (e.g. '2019-20') is appended to it.
base_url = 'https://www.skysports.com/premier-league-results/'

def get_match_details(year_range, file_name):
    """Build, or load from cache, the per-match table for one season.

    Two CSV caches in the working directory are consulted:

    * ``'match_details_' + file_name`` -- the finished table; when it exists
      it is read and returned unchanged.
    * ``'raw_' + file_name`` -- match URLs together with each page's HTML;
      when it exists the pages are not downloaded again.

    Args:
        year_range: Season suffix appended to ``base_url``.
        file_name: Base CSV name from which both cache file names are derived.

    Returns:
        A DataFrame that includes the columns ``url``, ``schedule``,
        ``venue``, ``teamA``, ``teamB``, ``teamA_score``, ``teamB_score``,
        ``winner_Team`` and ``winner_score``.
    """
    print('Entered get_match_details')
    details_path = 'match_details_' + file_name
    raw_path = 'raw_' + file_name

    # Finished table already on disk: nothing to download or parse.
    if os.path.exists(details_path):
        print('Completed get_match_details')
        return pd.read_csv(details_path)

    if os.path.exists(raw_path):
        match_details = pd.read_csv(raw_path)
        # The CSV stores each page as text, so it has to be parsed again.
        match_details['raw'] = match_details['raw'].apply(get_parced_data)
    else:
        # Fetch the results index. requests has no default timeout, so set
        # one to keep a stalled connection from hanging the run.
        site = requests.get(base_url + year_range, timeout=30).text
        match_url_regex = re.compile(r'<a href="(https:\/\/www\.skysports\.com\/football\/[a-zA-Z-\/]+\d+)" class="matches__item matches__link"')
        match_urls = match_url_regex.findall(site)
        match_details = pd.DataFrame(match_urls, columns=['url'])
        match_details['raw'] = match_details['url'].apply(get_match_raw)
        match_details.to_csv(raw_path)

    # Parsing runs for freshly downloaded and cached pages alike, so the very
    # first call for a season also returns the finished table.
    match_details['schedule'] = match_details['raw'].apply(get_match_schedule)
    match_details['venue'] = match_details['raw'].apply(get_venue_name)
    match_details['raw_title'] = match_details['raw'].apply(get_match_title)

    title_regex = re.compile(r'(?P<teamA>[a-zA-Z_ \']*) (?P<teamA_score>\d?\d) - (?P<teamB_score>\d?\d) (?P<teamB>[a-zA-Z_ \']*) -')

    # Each helper yields one tuple per row; expand the tuples into columns.
    team_names = match_details['raw_title'].apply(lambda x: get_team_names(x, title_regex))
    match_details[['teamA', 'teamB']] = pd.DataFrame(team_names.tolist(), index=match_details.index)

    team_scores = match_details['raw_title'].apply(lambda x: get_team_scores(x, title_regex))
    match_details[['teamA_score', 'teamB_score']] = pd.DataFrame(team_scores.tolist(), index=match_details.index)

    winners = match_details.apply(get_winner_details, axis=1)
    match_details[['winner_Team', 'winner_score']] = pd.DataFrame(winners.tolist(), index=match_details.index)

    # The page markup and raw title are only needed for extraction.
    match_details.drop(columns=['raw', 'raw_title'], inplace=True)
    match_details.to_csv(details_path)

    print('Completed get_match_details')
    return match_details

In [None]:
# Path of the ChromeDriver executable passed to Selenium's Service wrapper.
chromedriver_path = 'C:/webdrivers/chromedriver'
# Built once when this cell runs; SoccerCovid.get_driver passes it to webdriver.Chrome.
service = Service(chromedriver_path)

# Link config
# Prefix of the Google search URL; SoccerCovid.get_url appends the query to it.
google_base_url = 'https://www.google.com/search?q='

class SoccerCovid:
    """Browser-driven scraper that adds location columns to a match table.

    For every distinct venue and every distinct winning team it runs a Google
    search for the matching Wikipedia article and reads the "Location" row of
    that article. Relies on the module-level ``service`` and
    ``google_base_url`` settings.
    """

    # XPath of the table cell that follows a "Location" header cell.
    _LOCATION_XPATH = "//th[contains(text(), 'Location' )]/following-sibling::td"

    def __init__(self):
        self.final_data = {}

    def get_driver(self):
        """Start a Chrome WebDriver session using the module-level ``service``."""
        return webdriver.Chrome(service=service)

    def get_url(self, param):
        """Return the Google search URL for ``param`` plus the word "wikipedia".

        The query is URL-encoded (spaces become ``+``) so that characters such
        as ``&``, ``#`` or ``'`` in a name cannot corrupt the query string.
        """
        from urllib.parse import quote_plus

        return google_base_url + quote_plus(param + ' wikipedia')

    def get_by_xpath(self, driver, xpath):
        """Return every element on the current page that matches ``xpath``."""
        return driver.find_elements(By.XPATH, xpath)

    def navigate_to_site(self, driver, url):
        """Open ``url`` in the browser, then pause for two seconds."""
        driver.get(url)
        time.sleep(2)

    def get_wiki_url(self, driver, param, ext = False):
        """Search for ``param`` and return the first Wikipedia link found.

        Args:
            driver: Active WebDriver session.
            param: Search term; converted with ``str``.
            ext: When true, " Football Club" is appended to the search term.

        Returns:
            The ``href`` of the first link containing ``wikipedia.org/wiki``,
            or ``None`` when the result page has no such link.
        """
        query = str(param)
        if ext:
            query += ' Football Club'
        self.navigate_to_site(driver, self.get_url(query))
        links = self.get_by_xpath(driver, "//a[contains(@href, 'wikipedia.org/wiki')]")
        # A result page without any Wikipedia link must not abort the whole
        # run with an IndexError.
        if not links:
            return None
        return links[0].get_attribute('href')

    def get_venue_location(self, match, driver):
        """Return the "Location" text of a venue's Wikipedia article.

        Args:
            match: Row or mapping with a ``venue`` entry.
            driver: Active WebDriver session.

        Returns:
            The location text, or ``None`` when the venue is missing, no
            article link is found, or the article has no "Location" row.
        """
        venue = match['venue']
        # A missing venue would otherwise be searched for as the text "nan".
        if pd.isna(venue):
            return None
        wiki_url = self.get_wiki_url(driver, venue)
        if not wiki_url:
            return None
        self.navigate_to_site(driver, wiki_url)
        location_cells = self.get_by_xpath(driver, self._LOCATION_XPATH)
        if location_cells:
            return location_cells[0].text
        return None

    def get_winner_location(self, match, driver):
        """Return the "Location" text of the winning team's ground.

        Opens the team's Wikipedia article, follows the first link in its
        "Ground" row and reads the "Location" row of the linked article.

        Args:
            match: Row or mapping with a ``winner_Team`` entry.
            driver: Active WebDriver session.

        Returns:
            The location text, or ``None`` when any step finds nothing.
        """
        winner = match['winner_Team']
        # A missing winner would otherwise be searched for as the text "nan".
        if pd.isna(winner):
            return None
        wiki_url = self.get_wiki_url(driver, winner, True)
        if not wiki_url:
            return None
        self.navigate_to_site(driver, wiki_url)
        ground_links = self.get_by_xpath(
            driver, "//th[contains(text(), 'Ground' )]/following-sibling::td/a")
        if not ground_links:
            return None
        self.navigate_to_site(driver, ground_links[0].get_attribute('href'))
        location_cells = self.get_by_xpath(driver, self._LOCATION_XPATH)
        if location_cells:
            return location_cells[0].text
        return None

    def get_unique_venues(self, match_details, driver):
        """Return a frame with one row per distinct venue and its location."""
        unique_venue = pd.DataFrame()
        unique_venue['venue'] = match_details['venue'].unique()
        unique_venue['venue_location'] = unique_venue.apply(
            lambda x: self.get_venue_location(x, driver), axis = 1)
        return unique_venue

    def get_unique_winners(self, match_details, driver):
        """Return a frame with one row per distinct winner and its location."""
        unique_winner = pd.DataFrame()
        unique_winner['winner_Team'] = match_details['winner_Team'].unique()
        unique_winner['winner_location'] = unique_winner.apply(
            lambda x: self.get_winner_location(x, driver), axis = 1)
        return unique_winner

    def scrape_pages(self, match_details):
        """Add ``venue_location`` and ``winner_location`` columns.

        Args:
            match_details: DataFrame with ``venue`` and ``winner_Team`` columns.

        Returns:
            ``match_details`` left-merged with the scraped locations.
        """
        driver = self.get_driver()
        # Quit the browser even when scraping raises, so no Chrome or
        # chromedriver process is left running.
        try:
            unique_venue = self.get_unique_venues(match_details, driver)
            match_details = match_details.merge(unique_venue, on='venue', how='left')

            unique_winner = self.get_unique_winners(match_details, driver)
            match_details = match_details.merge(unique_winner, on='winner_Team', how='left')
        finally:
            driver.quit()
        return match_details

    def start_scraping(self, match_details_df):
        """Entry point: return ``scrape_pages(match_details_df)``."""
        return self.scrape_pages(match_details_df)

In [None]:
# Seasons to process; each one is written to its own CSV file.
dates = ['2019-20', '2020-21', '2021-22']

for date in dates:
    file_name = 'covid_soccer_' + date + '.csv'

    # A season whose output file already exists is left untouched.
    if os.path.exists(file_name):
        continue

    print(f'Started {file_name}')
    result_df = get_match_details(date, file_name)
    scraper = SoccerCovid()
    result_df = scraper.start_scraping(result_df)
    result_df.to_csv(file_name)
    print(f'Completed {file_name}')