## References:

1. https://www.espncricinfo.com
2. https://medium.com/swlh/web-scraping-cricinfo-data-c134fce79a33

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
class IPLDataScrapper:

    def __init__(self, year, data_dir='Data'):
        self.domain = 'https://www.espncricinfo.com'
        self.data_dir = data_dir
        if year >= 2010 and year <= 2020:
            self.year = year
            if self.year >= 2014 and self.year <= 2015:
                self.season_url = self.domain + '/scores/series/8048/season/{}/pepsi-indian-premier-league?view=results'.format(self.year)
            else:
                self.season_url = self.domain + '/scores/series/8048/season/{}/indian-premier-league?view=results'.format(self.year)
        else:
            raise ValueError("Scrapper is defined only for the seasons from 2010 to 2020")

    def __extract_match_urls(self):
        season_page = requests.get(self.season_url)
        if season_page.status_code == 200:
            soup = BeautifulSoup(season_page.content, 'html.parser')
            matches = soup.find_all(class_='col-md-8 col-16')
            match_urls = []
            for match in matches:
                match_url = self.domain + match.find('a', href=True)['href']
                match_urls.append(match_url)
        else:
            raise ValueError("Response status code: {}".format(season_page.status_code))

        return match_urls

    def __extract_batsman_data(self, soup):
        batsman_tables = soup.find_all(class_="table batsman")
        #assert len(batsman_tables) == 2

        columns = ['name', 'wicket', 'runs', 'balls', 'duration', 'fours', 'sixes', 'strike_rate']
        for inning, batsman_table in enumerate(batsman_tables, start=1):
            rows = batsman_table.find_all('tr')
            batsman_list = []
            for i in range(1, len(rows), 2):
                batsman_row = rows[i]
                cells = batsman_row.find_all('td')
                cells = [cell.text.strip() for cell in cells]

                if cells[0] == 'Extras':
                    row = ['Extras', 'Extras', cells[2], '0', '0', '0', '0', '0']
                    batsman_list.append(row)
                elif len(cells) > 7:
                    row = cells
                    batsman_list.append(row)
                else:
                    batsmen = [batsman.strip() for batsman in cells[0][len('Did not bat: '):].split(',')]
                    for batsman in batsmen:
                        row = [batsman, 'Did not bat', '0', '0', '0', '0', '0', '0']
                        batsman_list.append(row)
                
            batsman_df = pd.DataFrame(batsman_list, columns=columns)
            if inning == 1:
                batsman_df_1 = batsman_df
                batsman_df_1['inning'] = 1
            elif inning == 2:  
                batsman_df_2 = batsman_df
                batsman_df_2['inning'] = 2

        if len(batsman_tables) == 2:
            batsman_df = pd.concat([batsman_df_1, batsman_df_2])
        elif len(batsman_tables) == 1:
            batsman_df = batsman_df_1
        elif len(batsman_tables) == 0:
            batsman_df = pd.DataFrame(columns=columns)

        return batsman_df 


    def __extract_bowler_data(self, soup):
        bowler_tables = soup.find_all(class_="table bowler")
        #assert len(bowler_tables) == 2

        columns = ['name', 'overs', 'maidens', 'runs', 'wickets', 'economy', 'dots', 'fours', 'sixes', 'wides', 'no_balls']
        for inning, bowler_table in enumerate(bowler_tables, start=1):
            rows = bowler_table.find_all('tr')
            bowler_list = []
            for i in range(1, len(rows)):
                bowler_row = rows[i]
                cells = bowler_row.find_all('td')
                cells = [cell.text.strip() for cell in cells]
                row = cells
                bowler_list.append(row)
                
            bowler_df = pd.DataFrame(bowler_list, columns=columns)
            if inning == 1:
                bowler_df_1 = bowler_df
                bowler_df_1['inning'] = 1
            elif inning == 2:  
                bowler_df_2 = bowler_df
                bowler_df_2['inning'] = 2
            

        if len(bowler_tables) == 2:
            bowler_df = pd.concat([bowler_df_1, bowler_df_2])
        elif len(bowler_tables) == 1:
            bowler_df = bowler_df_1
        elif len(bowler_tables) == 0:
            bowler_df = pd.DataFrame(columns=columns)

        return bowler_df

    def scrape(self):
        self.match_urls = self.__extract_match_urls()
        self.season_dir = os.path.join(self.data_dir, str(self.year))
        if not os.path.exists(self.season_dir):
            os.mkdir(self.season_dir)
        for match_url in tqdm(self.match_urls, desc="Matches", leave=False):
            match_page = requests.get(match_url)
            soup = BeautifulSoup(match_page.content, 'html.parser')
            match_id, location, date, _ = soup.find(class_='desc text-truncate').get_text().split(',')
            match_id = match_id.replace('/', ' and ')
            batsman_df = self.__extract_batsman_data(soup)
            bowler_df = self.__extract_bowler_data(soup)
            match_dir = os.path.join(self.season_dir, match_id)
            if not os.path.exists(match_dir):
                os.mkdir(match_dir)
            batsman_df.to_csv(os.path.join(match_dir, 'batsman_df.csv'), index=False)
            bowler_df.to_csv(os.path.join(match_dir, 'bowler_df.csv'), index=False)


In [6]:
for year in tqdm(range(2011, 2020), 'Seasons'):
    scrapper = IPLDataScrapper(year=year)
    scrapper.scrape()

HBox(children=(FloatProgress(value=0.0, description='Seasons', max=9.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Matches', max=74.0, style=ProgressStyle(description_width…




UnboundLocalError: local variable 'batsman_df' referenced before assignment