# Required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

# 1 Results Table 

In [10]:
# Match-results page of ESPNcricinfo's stats engine for tournament id 14450.
url = r'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'
# The second positional argument of requests.get is `params`, not a parser name:
# passing 'lxml' there appended a stray "&lxml" to the query string. The HTML parser
# is selected later, when the response text is handed to BeautifulSoup.
html = requests.get(url, timeout=30)   # timeout so a stalled connection cannot hang the notebook
html.raise_for_status()                # stop here on a 4xx/5xx instead of parsing an error page

In [11]:
# Echo the Response object; its repr shows the HTTP status code (200 means the request succeeded).
html

<Response [200]>

In [12]:
soup = BeautifulSoup(html.text, 'lxml')
# find() returns only the FIRST <table> carrying class "engineTable" (not all of them);
# the cells below treat that table as the match-results table.
table = soup.find('table', class_='engineTable')
if table is None or table.tbody is None:
    # Fail with a readable message instead of "'NoneType' object has no attribute ...".
    raise ValueError("table with class 'engineTable' (and a <tbody>) not found in the downloaded page")
rows = table.tbody.find_all('tr')      # one <tr> per result row

In [13]:
# Map row position -> [text of every <td> in the row ..., absolute scorecard URL].
table_dict = {}
for row_no, row in enumerate(rows):
    cells = row.find_all('td')                    # parse the row's cells once
    values = [cell.text for cell in cells]
    # The cell at index 6 (the 7th column, "Scorecard") holds a relative link to the
    # scorecard page; prefix it with the site root to get a full URL.
    values.append("https://www.espncricinfo.com" + cells[6].a['href'])
    table_dict[row_no] = values

In [17]:
# Column names are the texts of the <th> cells inside the table's <thead>.
heading = []
for header_cell in table.thead.find_all('th'):
    heading.append(header_cell.text)
heading

['Team 1', 'Team 2', 'Winner', 'Margin', 'Ground', 'Match Date', 'Scorecard']

In [18]:
# Add the name of the extra link column that was appended to every row of table_dict.
# Guarded so that re-running this cell does not append the name a second time, which
# would leave `heading` one entry longer than the rows and break the DataFrame build.
if 'scorecard_link' not in heading:
    heading.append('scorecard_link')
print(heading)

['Team 1', 'Team 2', 'Winner', 'Margin', 'Ground', 'Match Date', 'Scorecard', 'scorecard_link']


In [19]:
# table_dict maps row number -> list of values, so build it column-wise (labels from
# `heading`) and transpose to get one DataFrame row per match.
result_df = pd.DataFrame(data=table_dict, index=heading).transpose()

In [21]:
# Preview the first two match rows. Per the original author's note, the full frame had shape (45, 8).
result_df.head(2)

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,scorecard_link
0,Namibia,Sri Lanka,Namibia,55 runs,Geelong,"Oct 16, 2022",T20I # 1823,https://www.espncricinfo.com/ci/engine/match/1...
1,Netherlands,U.A.E.,Netherlands,3 wickets,Geelong,"Oct 16, 2022",T20I # 1825,https://www.espncricinfo.com/ci/engine/match/1...


In [20]:
# Persist the results table (row index included) to a CSV file in the working directory.
result_df.to_csv(path_or_buf='Result_df.csv')

# 2 Batting Data

In [23]:
# One parsed scorecard page per row of result_df, in the same order, so that a position
# in soup_list corresponds to the same position in result_df.
soup_list = []
for url in result_df['scorecard_link']:
    # requests.get's second positional argument is `params`; the old 'lxml' value there
    # was appended to each URL's query string. The parser belongs to BeautifulSoup only.
    html = requests.get(url, timeout=30)   # timeout so one stalled page cannot hang the loop
    soup = BeautifulSoup(html.text, 'lxml')
    soup_list.append(soup)

In [24]:
# Empty frame that fixes the column order of the batting data collected further down.
batting_columns = [
    'batsmanName', 'dismissal', 'runs', 'balls', 'M', '4s', '6s', 'SR',
    'match', 'contry', 'battingPos',
]
batting_df = pd.DataFrame(columns=batting_columns)
batting_df

Unnamed: 0,batsmanName,dismissal,runs,balls,M,4s,6s,SR,match,contry,battingPos


In [25]:
# Names for the cells of one batter row on a scorecard.
# NOTE: `heading` is rebound here (it held the results-table headers before) and is
# rebound again in the bowling section, so cells must be run top to bottom.
heading = [
    'batsmanName',
    'dismissal',
    'runs',
    'balls',
    'M',
    '4s',
    '6s',
    'SR',
]

In [26]:
# Column layout of one batter row on a scorecard. Kept local to this cell instead of
# reading the shared `heading` variable: other sections rebind `heading` to different
# lists, and re-running this cell afterwards used to fail silently inside a bare except.
batting_cols = ['batsmanName', 'dismissal', 'runs', 'balls', 'M', '4s', '6s', 'SR']


def _batting_frame(table, match_name, country):
    """Convert one batting scorecard table into a DataFrame of batter rows.

    Args:
        table: BeautifulSoup <table> element for one innings' batting card.
        match_name: "<team 1> VS <team 2>" label written to the 'match' column.
        country: name written to the 'contry' column for every row.

    Returns:
        DataFrame with the batting_cols columns plus 'match', 'contry' and
        'battingPos' (1-based order of appearance in the table). Row labels are the
        positions of the kept <tr> rows inside the table body, so they contain gaps.

    Raises:
        AttributeError: the table has no <tbody>.
        ValueError: a kept row does not have exactly len(batting_cols) cells.
    """
    skip_exact = ('', 'Extras', 'TOTAL')
    skip_prefixes = ('Did not bat', 'Fall of wickets')

    row_dict = {}
    for pos, row in enumerate(table.tbody.find_all('tr')):
        cells = [td.text for td in row.find_all('td')]
        if not cells:
            # A row without <td> cells is not a batter row; previously it raised
            # IndexError and the whole match was discarded.
            continue
        first = cells[0]
        if first in skip_exact or first.startswith(skip_prefixes):
            continue                      # spacer, extras, total, DNB and FOW rows
        row_dict[pos] = cells

    frame = pd.DataFrame(row_dict, index=batting_cols).T
    n_players = frame.shape[0]            # number of batters kept for this innings
    frame['match'] = [match_name] * n_players
    frame['contry'] = [country] * n_players
    frame['battingPos'] = list(range(1, n_players + 1))
    return frame


batting_frames = []
for index, page in enumerate(soup_list):
    try:
        # Spans with this class are used as the two team names (first, second).
        teams = [tag.text for tag in page.find_all(
            'span', class_="ds-text-title-xs ds-font-bold ds-capitalize")]
        match_name = teams[0] + ' VS ' + teams[1]

        tables = page.find_all('table')
        # tables[0] / tables[2] are read as the batting cards of team 1 / team 2.
        innings_1 = _batting_frame(tables[0], match_name, teams[0])
        innings_2 = _batting_frame(tables[2], match_name, teams[1])
    except Exception as exc:
        # Pages without a usable scorecard (the original author notes abandoned matches
        # at indexes 20, 24, 25) are skipped, but now visibly rather than silently.
        print('Skipping match index', index, '-', type(exc).__name__, exc)
        continue
    # Added only when both innings parsed, matching the previous all-or-nothing behaviour.
    batting_frames.extend([innings_1, innings_2])

# Single concatenation instead of growing the frame inside the loop.
batting_df = pd.concat([batting_df] + batting_frames)

In [28]:
# Preview the first two batter rows. Per the original author's note, the full frame had shape (699, 11).
batting_df.head(2)

Unnamed: 0,batsmanName,dismissal,runs,balls,M,4s,6s,SR,match,contry,battingPos
0,Michael van Lingen,c Pramod Madushan b Chameera,3,6,7,0,0,50.0,Namibia VS Sri Lanka,Namibia,1
2,Divan la Cock,c Shanaka b Pramod Madushan,9,9,15,1,0,100.0,Namibia VS Sri Lanka,Namibia,2


In [33]:
# Persist the batting table (row index included) to a CSV file in the working directory.
batting_df.to_csv(path_or_buf='Batter_df.csv')

# 3 Bowling data

In [34]:
# Names for the 11 cells of one bowler row; `heading` is rebound for the bowling section.
heading = [
    'bowlerName', 'overs', 'maiden', 'runs', 'wickets', 'economy',
    '0s', '4s', '6s', 'wides', 'noBalls',
]
# Empty frame fixing the column order: the scraped columns followed by the two labels
# that are added per match.
bowling_df = pd.DataFrame(columns=[*heading, 'match', 'contry'])
bowling_df

Unnamed: 0,bowlerName,overs,maiden,runs,wickets,economy,0s,4s,6s,wides,noBalls,match,contry


In [35]:
# Column layout of one bowler row. Kept local to this cell so the loop does not depend
# on whatever the shared `heading` variable currently holds.
bowling_cols = ['bowlerName', 'overs', 'maiden', 'runs', 'wickets', 'economy',
                '0s', '4s', '6s', 'wides', 'noBalls']


def _bowling_frame(table, match_name, country):
    """Convert one bowling scorecard table into a DataFrame of bowler rows.

    Args:
        table: BeautifulSoup <table> element for one innings' bowling card.
        match_name: "<team 1> VS <team 2>" label written to the 'match' column.
        country: name written to the 'contry' column for every row.

    Returns:
        DataFrame with the bowling_cols columns plus 'match' and 'contry'. Row labels
        are the positions of the kept <tr> rows inside the table body.

    Raises:
        AttributeError: the table has no <tbody>.
    """
    row_dict = {}
    for pos, row in enumerate(table.tbody.find_all('tr')):
        cells = [td.text for td in row.find_all('td')]
        # Keep only rows with exactly one cell per column. The check used to sit inside
        # the cell-append loop, so a longer row stored a list that kept growing and then
        # failed the whole match when the DataFrame was built.
        if len(cells) == len(bowling_cols):
            row_dict[pos] = cells

    frame = pd.DataFrame(row_dict, index=bowling_cols).T
    n_players = frame.shape[0]            # number of bowlers kept for this innings
    frame['match'] = [match_name] * n_players
    frame['contry'] = [country] * n_players
    return frame


bowling_frames = []
for index, page in enumerate(soup_list):
    try:
        # Spans with this class are used as the two team names (first, second).
        teams = [tag.text for tag in page.find_all(
            'span', class_="ds-text-title-xs ds-font-bold ds-capitalize")]
        match_name = teams[0] + ' VS ' + teams[1]

        tables = page.find_all('table')
        # tables[1] is the bowling card listed with the first innings, so its bowlers
        # are labelled with the second team; tables[3] is the reverse.
        bowlers_1 = _bowling_frame(tables[1], match_name, teams[1])
        bowlers_2 = _bowling_frame(tables[3], match_name, teams[0])
    except Exception as exc:
        # Pages without a usable scorecard are skipped, visibly rather than silently.
        print('Skipping match index', index, '-', type(exc).__name__, exc)
        continue
    # Added only when both cards parsed, matching the previous all-or-nothing behaviour.
    bowling_frames.extend([bowlers_1, bowlers_2])

# Single concatenation instead of growing the frame inside the loop.
bowling_df = pd.concat([bowling_df] + bowling_frames)

In [37]:
# Preview the first two bowler rows. Per the original author's note, the full frame had shape (500, 13).
bowling_df.head(2)

Unnamed: 0,bowlerName,overs,maiden,runs,wickets,economy,0s,4s,6s,wides,noBalls,match,contry
0,Maheesh Theekshana,4,0,23,1,5.75,7,0,0,2,0,Namibia VS Sri Lanka,Sri Lanka
2,Dushmantha Chameera,4,0,39,1,9.75,6,3,1,2,0,Namibia VS Sri Lanka,Sri Lanka


In [38]:
# Persist the bowling table (row index included) to a CSV file in the working directory.
bowling_df.to_csv(path_or_buf='Bowler_df.csv')

# 4 Players Individual Data

In [39]:
# Empty frame that fixes the two columns collected for each player: profile link and team.
player_columns = ['link', 'contry']
player_df = pd.DataFrame(columns=player_columns)
player_df

Unnamed: 0,link,contry


In [40]:
def _player_links(table, country, batting):
    """Collect {player name: [profile URL, country]} from one scorecard table.

    Args:
        table: BeautifulSoup <table> element (a batting or a bowling card).
        country: name stored next to every player found in this table.
        batting: True to apply the batting-card row filter (skip spacer, 'Extras',
            'TOTAL', 'Did not bat' and 'Fall of wickets' rows, and strip non-breaking
            spaces from the name); False to apply the bowling-card filter (keep rows
            with exactly 11 cells).

    Returns:
        dict in table order mapping the first cell's text to [absolute link, country].

    Raises:
        AttributeError: the table has no <tbody>.
        TypeError: a kept row's first cell contains no <a> link.
    """
    skip_exact = ('', 'Extras', 'TOTAL')
    skip_prefixes = ('Did not bat', 'Fall of wickets')

    links = {}
    for row in table.tbody.find_all('tr'):
        cells = row.find_all('td')
        if batting:
            if not cells:
                continue                  # no cells, so not a player row
            first = cells[0].text
            if first in skip_exact or first.startswith(skip_prefixes):
                continue
            name = first.replace('\xa0', '')
        else:
            # The length test used to run inside the cell-append loop, which also let
            # longer rows through; a bowler row has exactly 11 cells.
            if len(cells) != 11:
                continue
            name = cells[0].text
        links[name] = ["https://www.espncricinfo.com" + cells[0].a['href'], country]
    return links


player_frames = []
for index, page in enumerate(soup_list):
    try:
        # Spans with this class are used as the two team names (first, second).
        teams = [tag.text for tag in page.find_all(
            'span', class_="ds-text-title-xs ds-font-bold ds-capitalize")]

        tables = page.find_all('table')

        # Same table order as before, so a name seen twice keeps its first position
        # and takes the value of its last occurrence.
        row_dict = {}
        row_dict.update(_player_links(tables[1], teams[1], batting=False))  # bowlers, labelled team 2
        row_dict.update(_player_links(tables[3], teams[0], batting=False))  # bowlers, labelled team 1
        row_dict.update(_player_links(tables[0], teams[0], batting=True))   # batters, labelled team 1
        row_dict.update(_player_links(tables[2], teams[1], batting=True))   # batters, labelled team 2

        df = pd.DataFrame(row_dict, index=['link', 'contry']).T
    except Exception as exc:
        # Pages without a usable scorecard are skipped, visibly rather than silently.
        print('Skipping match index', index, '-', type(exc).__name__, exc)
        continue
    player_frames.append(df)

# Single concatenation instead of growing the frame inside the loop.
player_df = pd.concat([player_df] + player_frames)

In [41]:
# Keep the first occurrence of each (link, contry) pair. Duplicates are judged on the
# column values only; the row label (player name) is not part of the comparison.
player_df = player_df.drop_duplicates(keep='first')

In [42]:
# Filled by the profile-scraping loop: row position in player_df -> list of profile fields.
info_dict = dict()

In [45]:
# Profile fields collected for every player, in the order they are stored.
profile_keys = ('Batting Style', 'Bowling Style', 'Playing Role')

# enumerate() gives the row POSITION; player_df is labelled by player name, so the old
# integer `player_df['link'][index]` lookup relied on pandas' deprecated positional fallback.
for index, url in enumerate(player_df['link']):
    # 'lxml' is a BeautifulSoup parser name; as requests.get's second positional
    # argument it was sent as query parameters, so it is no longer passed here.
    html = requests.get(url, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')
    details = soup.find('div', class_='ds-grid lg:ds-grid-cols-3 ds-grid-cols-2 ds-gap-4 ds-mb-8')

    data_dict = {}
    if details is None:
        # Keep one entry per player so info_dict stays aligned with player_df's rows.
        print('No details block found for', url, '- storing NaN values')
    else:
        for block in details.find_all('div'):
            label, value = block.p, block.h5      # first <p> / <h5> inside the div, or None
            # Store a pair only when BOTH parts exist. Previously the key was appended
            # before the value lookup could fail, which shifted all later values onto
            # the wrong keys when the two lists were zipped.
            if label is not None and value is not None:
                data_dict[label.text] = value.text

    # Fields that are absent from the profile page stay NaN.
    info_dict[index] = [data_dict.get(key, np.nan) for key in profile_keys]

In [47]:
# info_dict maps row position -> [batting style, bowling style, role]; build it
# column-wise with these three labels, then transpose to one row per player.
profile_columns = ['battingStyle', 'bowlingStyle', 'playingRole']
df = pd.DataFrame(data=info_dict, index=profile_columns).transpose()

In [48]:
# Preview the first two profile rows. Per the original author's note, the full frame had shape (213, 3).
df.head(2)

Unnamed: 0,battingStyle,bowlingStyle,playingRole
0,Right hand Bat,Right arm Offbreak,Bowler
1,Right hand Bat,Right arm Fast,Bowler


In [50]:
# Give `df` the same row labels as player_df (assigned by position) so the two frames
# line up when they are concatenated side by side.
df = df.assign(index=player_df.index).set_index('index')
df.head(2)

Unnamed: 0_level_0,battingStyle,bowlingStyle,playingRole
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maheesh Theekshana,Right hand Bat,Right arm Offbreak,Bowler
Dushmantha Chameera,Right hand Bat,Right arm Fast,Bowler


In [51]:
# Join link/country with the scraped profile fields side by side, aligned on row labels.
final_player_df = pd.concat(objs=[player_df, df], axis='columns')

In [54]:
# Preview the first two player rows. Per the original author's note, the full frame had shape (213, 5).
final_player_df.head(2)

Unnamed: 0,link,contry,battingStyle,bowlingStyle,playingRole
Maheesh Theekshana,https://www.espncricinfo.com/cricketers/mahees...,Sri Lanka,Right hand Bat,Right arm Offbreak,Bowler
Dushmantha Chameera,https://www.espncricinfo.com/cricketers/dushma...,Sri Lanka,Right hand Bat,Right arm Fast,Bowler


In [56]:
# Persist the player table (player-name index included) to a CSV file in the working directory.
final_player_df.to_csv(path_or_buf='Player_df.csv')