In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

#Scrape match summary data
years = ['2020-2020', '2021-2021', '2022-2022', '2024-2024']
base_url = "https://www.espncricinfo.com/records/year/team-match-results/{}/twenty20-internationals-3?team=6"

all_rows = []
all_headers = None

for yr in years:
    url = base_url.format(yr)
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('div', class_='ds-p-0')

    # Extract headers
    if all_headers is None:
        all_headers = [th.text.strip() for th in table.find('thead').find_all('td')] + ['Scorecard Link']
    
    # Extract rows for df
    for tr in table.find('tbody').find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        scorecard_td = tr.find_all('td')[-1]
        scorecard_link = scorecard_td.find('a')['href'] if scorecard_td.find('a') else None
        full_scorecard_link = "https://www.espncricinfo.com" + scorecard_link if scorecard_link else None
        cells.append(full_scorecard_link)
        all_rows.append(cells)

# Create match summary df
match_summary_df = pd.DataFrame(all_rows, columns=all_headers)

# Extract match ID from string
match_summary_df = match_summary_df[match_summary_df['Scorecard Link'].notna()]
match_summary_df['Match_id'] = match_summary_df['Scorecard'].str.extract(r'T20I # (\d+)').astype(int)

# Create DataFrame with 'Match_id' and 'Scorecard Link' for future use
matches_df = match_summary_df[['Match_id', 'Scorecard Link']]

In [2]:
matches_df.head()

Unnamed: 0,Match_id,Scorecard Link
0,1116,https://www.espncricinfo.com/series/india-in-a...
1,1115,https://www.espncricinfo.com/series/india-in-a...
2,1114,https://www.espncricinfo.com/series/india-in-a...
3,1037,https://www.espncricinfo.com/series/india-in-n...
4,1036,https://www.espncricinfo.com/series/india-in-n...


In [3]:
matches_df.tail()

Unnamed: 0,Match_id,Scorecard Link
76,2658,https://www.espncricinfo.com/series/icc-men-s-...
77,2639,https://www.espncricinfo.com/series/icc-men-s-...
78,2435,https://www.espncricinfo.com/series/afghanista...
79,2431,https://www.espncricinfo.com/series/afghanista...
80,2428,https://www.espncricinfo.com/series/afghanista...


In [4]:
def extract_table_data(table):
    headers = [th.text.strip() for th in table.find('thead').find_all('th')]
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        if len(cells) > 1:  # Avoid empty rows
            rows.append(cells)
    return headers, rows

def scrape_match_scorecard(url, match_id):
    response = requests.get(url)
    response.raise_for_status()  # Raise an error if the request was unsuccessful
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Define class names for tables
    batting_table_class = 'ci-scorecard-table'
    bowling_table_class = 'ds-w-full ds-table ds-table-md ds-table-auto'
    
    # Find all batting and bowling tables
    batting_tables = soup.find_all('table', class_=batting_table_class)
    bowling_tables = soup.find_all('table', class_=bowling_table_class)
    
    # Extract batting data and add an "Innings" and "Match ID" column
    batting_rows = []
    batting_headers = None
    for i, table in enumerate(batting_tables):
        headers, rows = extract_table_data(table)
        innings = 'First Innings' if i < len(batting_tables) / 2 else 'Second Innings'
        if batting_headers is None:
            batting_headers = headers + ['Innings', 'Match ID']
        for row in rows:
            row.append(innings)
            row.append(match_id)
        batting_rows.extend(rows)
    batting_rows = [row for row in batting_rows if 'TOTAL' not in row and 'Extras' not in row]
    
    # Extract bowling data and add an "Innings" and "Match ID" column
    bowling_rows = []
    bowling_headers = None
    for i, table in enumerate(bowling_tables):
        headers, rows = extract_table_data(table)
        innings = 'First Innings' if i < len(bowling_tables) / 2 else 'Second Innings'
        if bowling_headers is None:
            bowling_headers = headers + ['Innings', 'Match ID']
        for row in rows:
            row.append(innings)
            row.append(match_id)
        bowling_rows.extend(rows)
    
    # Debugging information
    # print(f"Scraping {url}")
    # print(f"Batting Headers: {batting_headers}")
    # print(f"First Batting Row: {batting_rows[0] if batting_rows else 'No Data'}")
    # print(f"Bowling Headers: {bowling_headers}")
    # print(f"First Bowling Row: {bowling_rows[0] if bowling_rows else 'No Data'}")
    
    # Ensure correct number of columns
    if batting_headers and batting_rows:
        assert len(batting_headers) == len(batting_rows[0]), "Mismatch in batting columns"
    if bowling_headers and bowling_rows:
        assert len(bowling_headers) == len(bowling_rows[0]), "Mismatch in bowling columns"
    
    batting_df = pd.DataFrame(batting_rows, columns=batting_headers)
    bowling_df = pd.DataFrame(bowling_rows, columns=bowling_headers)
    
    return batting_df, bowling_df

# Main logic to iterate over match URLs and combine DataFrames
all_batting_dfs = []
all_bowling_dfs = []

for _, row in matches_df.iterrows():
    match_id = row['Match_id']
    url = row['Scorecard Link']
    try:
        batting_df, bowling_df = scrape_match_scorecard(url, match_id)
        all_batting_dfs.append(batting_df)
        all_bowling_dfs.append(bowling_df)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        continue

# Combine all individual DataFrames into one
final_batting_df = pd.concat(all_batting_dfs, ignore_index=True)
final_bowling_df = pd.concat(all_bowling_dfs, ignore_index=True)

# Print the final DataFrames
print("Final Batting DataFrame")
print(final_batting_df)
print("\n")

print("Final Bowling DataFrame")
print(final_bowling_df)
print("\n")

Error scraping https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2021-22-1267897/afghanistan-vs-india-33rd-match-group-2-1273744/full-scorecard: 10 columns passed, passed data had 11 columns
Error scraping https://www.espncricinfo.com/series/south-africa-in-india-2022-23-1327499/india-vs-south-africa-2nd-t20i-1327507/full-scorecard: 10 columns passed, passed data had 11 columns
Final Batting DataFrame
               BATTING                                              R   B   M  \
0       Matthew Wade †                               lbw b Thakur  80  53   -   
1      Aaron Finch (c)               c Pandya b Washington Sundar   0   2   -   
2         Steven Smith                        b Washington Sundar  24  23   -   
3        Glenn Maxwell                                b Natarajan  54  36   -   
4     Moises Henriques                                    not out   5   2   -   
...                ...                                        ...  ..  ..  ..   
1206      Shubman 

In [5]:
final_batting_df

Unnamed: 0,BATTING,Unnamed: 2,R,B,M,4s,6s,SR,Innings,Match ID
0,Matthew Wade †,lbw b Thakur,80,53,-,7,2,150.94,First Innings,1116
1,Aaron Finch (c),c Pandya b Washington Sundar,0,2,-,0,0,0.00,First Innings,1116
2,Steven Smith,b Washington Sundar,24,23,-,1,0,104.34,First Innings,1116
3,Glenn Maxwell,b Natarajan,54,36,-,3,3,150.00,First Innings,1116
4,Moises Henriques,not out,5,2,-,1,0,250.00,First Innings,1116
...,...,...,...,...,...,...,...,...,...,...
1206,Shubman Gill,st †Rahmanullah Gurbaz b Mujeeb Ur Rahman,23,12,16,5,0,191.66,Second Innings,2428
1207,Tilak Varma,c Gulbadin Naib b Azmatullah Omarzai,26,22,34,2,1,118.18,Second Innings,2428
1208,Shivam Dube,not out,60,40,61,5,2,150.00,Second Innings,2428
1209,Jitesh Sharma †,c Ibrahim Zadran b Mujeeb Ur Rahman,31,20,21,5,0,155.00,Second Innings,2428


# cleaning

In [6]:
final_batting_df = final_batting_df.rename(columns={'': 'dismissed by'})

In [10]:
final_batting_df['dismissed by']

0                                    lbw b Thakur
1                    c Pandya b Washington Sundar
2                             b Washington Sundar
3                                     b Natarajan
4                                         not out
                          ...                    
1206    st †Rahmanullah Gurbaz b Mujeeb Ur Rahman
1207         c Gulbadin Naib b Azmatullah Omarzai
1208                                      not out
1209          c Ibrahim Zadran b Mujeeb Ur Rahman
1210                                      not out
Name: dismissed by, Length: 1211, dtype: object

In [13]:
# Function to extract fielder and bowler from dismissal info
def extract_credit(dismissal_info):
    if dismissal_info == 'not out':
        return pd.Series([None, None, 'not out'], index=['Fielder', 'Bowler', 'Status'])
    
    parts = dismissal_info.split(' b ')
    if len(parts) == 2:
        bowler = parts[1].strip()
        if parts[0].startswith('c '):
            fielder = parts[0][2:].strip()
            return pd.Series([fielder, bowler, 'caught'], index=['Fielder', 'Bowler', 'Status'])
        elif parts[0].startswith('st †'):
            fielder = parts[0][3:].strip()
            return pd.Series([fielder, bowler, 'stumped'], index=['Fielder', 'Bowler', 'Status'])
        else:
            return pd.Series([None, bowler, parts[0].strip()], index=['Fielder', 'Bowler', 'Status'])
    else:
        return pd.Series([None, dismissal_info.split(' ')[-1], dismissal_info.split(' ')[0]], index=['Fielder', 'Bowler', 'Status'])

# Apply the function to the 'dismissed by' column
final_batting_df[['Fielder', 'Bowler', 'Status']] = final_batting_df['dismissed by'].apply(extract_credit)
final_batting_df

Unnamed: 0,BATTING,dismissed by,R,B,M,4s,6s,SR,Innings,Match ID,Fielder,Bowler,Status
0,Matthew Wade †,lbw b Thakur,80,53,-,7,2,150.94,First Innings,1116,,Thakur,lbw
1,Aaron Finch (c),c Pandya b Washington Sundar,0,2,-,0,0,0.00,First Innings,1116,Pandya,Washington Sundar,caught
2,Steven Smith,b Washington Sundar,24,23,-,1,0,104.34,First Innings,1116,,Sundar,b
3,Glenn Maxwell,b Natarajan,54,36,-,3,3,150.00,First Innings,1116,,Natarajan,b
4,Moises Henriques,not out,5,2,-,1,0,250.00,First Innings,1116,,,not out
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1206,Shubman Gill,st †Rahmanullah Gurbaz b Mujeeb Ur Rahman,23,12,16,5,0,191.66,Second Innings,2428,†Rahmanullah Gurbaz,Mujeeb Ur Rahman,stumped
1207,Tilak Varma,c Gulbadin Naib b Azmatullah Omarzai,26,22,34,2,1,118.18,Second Innings,2428,Gulbadin Naib,Azmatullah Omarzai,caught
1208,Shivam Dube,not out,60,40,61,5,2,150.00,Second Innings,2428,,,not out
1209,Jitesh Sharma †,c Ibrahim Zadran b Mujeeb Ur Rahman,31,20,21,5,0,155.00,Second Innings,2428,Ibrahim Zadran,Mujeeb Ur Rahman,caught


In [17]:
# Function to clean player names
def clean_name(name):
    # Remove special characters and extra spaces
    name = re.sub(r'†|\(c\)', '', name)
    return name.strip()

# Apply the function to the 'BATTING' column
final_batting_df['BATTING'] = final_batting_df['BATTING'].apply(clean_name)

In [18]:
final_batting_df

Unnamed: 0,BATTING,dismissed by,R,B,M,4s,6s,SR,Innings,Match ID,Fielder,Bowler,Status
0,Matthew Wade,lbw b Thakur,80,53,-,7,2,150.94,First Innings,1116,,Thakur,lbw
1,Aaron Finch,c Pandya b Washington Sundar,0,2,-,0,0,0.00,First Innings,1116,Pandya,Washington Sundar,caught
2,Steven Smith,b Washington Sundar,24,23,-,1,0,104.34,First Innings,1116,,Sundar,b
3,Glenn Maxwell,b Natarajan,54,36,-,3,3,150.00,First Innings,1116,,Natarajan,b
4,Moises Henriques,not out,5,2,-,1,0,250.00,First Innings,1116,,,not out
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1206,Shubman Gill,st †Rahmanullah Gurbaz b Mujeeb Ur Rahman,23,12,16,5,0,191.66,Second Innings,2428,†Rahmanullah Gurbaz,Mujeeb Ur Rahman,stumped
1207,Tilak Varma,c Gulbadin Naib b Azmatullah Omarzai,26,22,34,2,1,118.18,Second Innings,2428,Gulbadin Naib,Azmatullah Omarzai,caught
1208,Shivam Dube,not out,60,40,61,5,2,150.00,Second Innings,2428,,,not out
1209,Jitesh Sharma,c Ibrahim Zadran b Mujeeb Ur Rahman,31,20,21,5,0,155.00,Second Innings,2428,Ibrahim Zadran,Mujeeb Ur Rahman,caught


In [22]:
csv_file_path = 'batting_summary.csv'
final_batting_df.to_csv(csv_file_path, index=False)

In [6]:
final_bowling_df

Unnamed: 0,BOWLING,O,M,R,W,ECON,0s,4s,6s,WD,NB,Innings,Match ID
0,Deepak Chahar,4,0,34,0,8.50,8,4,0,1,0,First Innings,1116
1,Washington Sundar,4,0,34,2,8.50,9,4,0,1,0,First Innings,1116
2,T Natarajan,4,0,33,1,8.25,8,3,0,2,0,First Innings,1116
3,Yuzvendra Chahal,4,0,41,0,10.25,3,1,2,2,1,First Innings,1116
4,Shardul Thakur,4,0,43,1,10.75,6,2,3,0,0,First Innings,1116
...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,Mujeeb Ur Rahman,4,1,21,2,5.25,15,4,0,0,0,Second Innings,2428
888,Mohammad Nabi,2,0,24,0,12.00,2,1,1,2,0,Second Innings,2428
889,Naveen-ul-Haq,3.3,0,43,0,12.28,1,4,2,0,0,Second Innings,2428
890,Azmatullah Omarzai,4,0,33,1,8.25,5,4,0,0,0,Second Innings,2428


In [24]:
csv_file_path = 'bowling_summary.csv'
final_bowling_df.to_csv(csv_file_path, index=False)

In [22]:
match_summary_df.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard,Scorecard Link,Match_id
0,Australia,India,Australia,12 runs,Sydney,"Dec 8, 2020",T20I # 1116,https://www.espncricinfo.com/series/india-in-a...,1116
1,Australia,India,India,6 wickets,Sydney,"Dec 6, 2020",T20I # 1115,https://www.espncricinfo.com/series/india-in-a...,1115
2,Australia,India,India,11 runs,Canberra,"Dec 4, 2020",T20I # 1114,https://www.espncricinfo.com/series/india-in-a...,1114
3,New Zealand,India,India,7 runs,Mount Maunganui,"Feb 2, 2020",T20I # 1037,https://www.espncricinfo.com/series/india-in-n...,1037
4,New Zealand,India,tied,-,Wellington,"Jan 31, 2020",T20I # 1036,https://www.espncricinfo.com/series/india-in-n...,1036


In [23]:
match_summary_df = match_summary_df.drop('Scorecard', axis=1)

In [24]:
match_summary_df.head()

Unnamed: 0,Team 1,Team 2,Winner,Margin,Ground,Match Date,Scorecard Link,Match_id
0,Australia,India,Australia,12 runs,Sydney,"Dec 8, 2020",https://www.espncricinfo.com/series/india-in-a...,1116
1,Australia,India,India,6 wickets,Sydney,"Dec 6, 2020",https://www.espncricinfo.com/series/india-in-a...,1115
2,Australia,India,India,11 runs,Canberra,"Dec 4, 2020",https://www.espncricinfo.com/series/india-in-a...,1114
3,New Zealand,India,India,7 runs,Mount Maunganui,"Feb 2, 2020",https://www.espncricinfo.com/series/india-in-n...,1037
4,New Zealand,India,tied,-,Wellington,"Jan 31, 2020",https://www.espncricinfo.com/series/india-in-n...,1036


In [26]:
csv_file_path = 'match_summary.csv'
match_summary_df.to_csv(csv_file_path, index=False)