In [1]:
%%time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import time
from tqdm import tqdm 


def setup_driver():
    """Initialize Chrome driver with optimized settings"""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-images')  # Disable image loading
    return webdriver.Chrome(options=options)

CPU times: total: 422 ms
Wall time: 11.1 s


In [2]:
def get_match_summary(driver, url):
    """Fetch match summary data and scorecard links"""
    match_data = []
    scorecard_links = []
    
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ds-table"))
        )
        
        table = driver.find_element(By.CLASS_NAME, "ds-table")
        rows = table.find_elements(By.TAG_NAME, "tr")
        
        # Skip header row
        for row in rows[1:]:
            try:
                cols = row.find_elements(By.TAG_NAME, "td")
                cols_text = [col.text for col in cols]
                
                if cols_text:
                    match_data.append({
                        "Team1": cols_text[0],
                        "Team2": cols_text[1],
                        "Winner": cols_text[2],
                        "Margin": cols_text[3],
                        "Ground": cols_text[4],
                        "Match_Date": cols_text[5],
                        "Scorecard": ""  # Will be updated with link
                    })
                    
                    # Get scorecard link
                    for col in cols:
                        link = col.find_element(By.TAG_NAME, "a") if col.find_elements(By.TAG_NAME, "a") else None
                        if link:
                            href = link.get_attribute("href")
                            if href and "scorecard" in href:
                                scorecard_links.append(href)
                                match_data[-1]["Scorecard"] = href  # Update scorecard link
            
            except Exception as e:
                print(f"Error processing match summary row: {str(e)}")
                continue
                
    except Exception as e:
        print(f"Error fetching match summary: {str(e)}")
        
    return match_data, scorecard_links

def get_scorecard_data(href, progress_bar=None):
    """Process a single scorecard page"""
    driver = setup_driver()
    batting_data = []
    bowling_data = []
    
    try:
        driver.get(href)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ds-table"))
        )
        
        # Get team names from the match header
        try:
            team_names = driver.find_elements(By.CLASS_NAME, "ds-text-tight-l")
            team1 = team_names[0].text.strip()
            team2 = team_names[1].text.strip()
        except:
            team1, team2 = "Unknown", "Unknown"
        
        # Process all tables
        all_tables = driver.find_elements(By.CLASS_NAME, "ds-table")
        current_innings = 1  # Track which innings we're processing
        batting_position = 1  # Initialize batting position counter
        
        for table in all_tables:
            # Determine batting and bowling teams based on innings
            batting_team = team1 if current_innings % 2 == 1 else team2
            bowling_team = team2 if current_innings % 2 == 1 else team1
            
            # Batting table
            if "ci-scorecard-table" in table.get_attribute("class"):
                rows = table.find_elements(By.TAG_NAME, "tr")[1:]  
                batting_position = 1  # Reset position counter for each innings
                
                for row in rows:
                    try:
                        cells = row.find_elements(By.TAG_NAME, "td")
                        if cells and len(cells) >= 8:
                            # Check for not out based on correct class
                            is_not_out = "ci-scorecard-player-notout" in cells[0].get_attribute("class")
                            
                            # Skip extras/total rows
                            player_name = cells[0].text.strip()
                            if not any(x in player_name.lower() for x in ['extras', 'total']):
                                batting_data.append({
                                    "Match_URL": href,
                                    "Innings": current_innings,
                                    "Batting_Team": batting_team,
                                    "Bowling_Team": bowling_team,
                                    "Position": batting_position,
                                    "Player": player_name,
                                    "Out_Status": "not out" if is_not_out else "out",
                                    "Runs": cells[2].text.strip(),
                                    "Balls": cells[3].text.strip(),
                                    "Fours": cells[5].text.strip(),
                                    "Sixes": cells[6].text.strip(),
                                    "SR": cells[7].text.strip()
                                })
                                batting_position += 1  # Increment position only for valid batsmen
                    except (IndexError, NoSuchElementException):
                        continue
                current_innings += 1  # Increment innings counter after processing a batting table
            
            # Bowling table
            else:
                try:
                    headers = table.find_elements(By.TAG_NAME, "th")
                    header_texts = [h.text.strip() for h in headers]
                    
                    if "O" in header_texts and "W" in header_texts:
                        rows = table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header
                        for row in rows:
                            try:
                                cells = row.find_elements(By.TAG_NAME, "td")
                                if cells and len(cells) >= 10:
                                    bowling_data.append({
                                        "Match_URL": href,
                                        "Innings": current_innings - 1,  # Adjust innings number for bowling
                                        "Bowling_Team": bowling_team,
                                        "Batting_Team": batting_team,
                                        "Bowler": cells[0].text.strip(),
                                        "Overs": cells[1].text.strip(),
                                        "Maidens": cells[2].text.strip(),
                                        "Runs": cells[3].text.strip(),
                                        "Wickets": cells[4].text.strip(),
                                        "Economy": cells[5].text.strip(),
                                        "Dots": cells[6].text.strip(),
                                        "Fours": cells[7].text.strip(),
                                        "Sixes": cells[8].text.strip(),
                                        "WD": cells[9].text.strip(),
                                        "NB": cells[10].text.strip() if len(cells) > 10 else "0"
                                    })
                            except (IndexError, NoSuchElementException):
                                continue
                except Exception as e:
                    print(f"Error processing bowling table: {str(e)}")
                    continue
        
        if progress_bar:
            progress_bar.update(1)
            
    except Exception as e:
        print(f"Error processing scorecard {href}: {str(e)}")
    finally:
        driver.quit()
        
    return batting_data, bowling_data

In [3]:
def main(max_matches=None):
    url = "https://www.espncricinfo.com/records/season/team-match-results/2024-2024?trophy=117"
    driver = setup_driver()
    
    try:
        # Get match summary and scorecard links
        match_data, scorecard_links = get_match_summary(driver, url)
        
        if max_matches:
            scorecard_links = scorecard_links[:max_matches]
            match_data = match_data[:max_matches]
        
        # Process scorecards in parallel
        all_batting_data = []
        all_bowling_data = []
        
        with tqdm(total=len(scorecard_links), desc="Processing scorecards") as pbar:
            with ThreadPoolExecutor(max_workers=4) as executor:
                futures = [executor.submit(get_scorecard_data, href, pbar) 
                          for href in scorecard_links]
                
                for future in futures:
                    batting_data, bowling_data = future.result()
                    all_batting_data.extend(batting_data)
                    all_bowling_data.extend(bowling_data)
        
        # Create DataFrames
        match_df = pd.DataFrame(match_data)
        batting_df = pd.DataFrame(all_batting_data)
        bowling_df = pd.DataFrame(all_bowling_data)
        
        return match_df, batting_df, bowling_df
        
    finally:
        driver.quit()


In [4]:
%%time
if __name__ == "__main__":
    match_df, batting_df, bowling_df = main()

Processing scorecards: 100%|██████████| 72/72 [18:28<00:00, 15.40s/it]


CPU times: total: 43.7 s
Wall time: 19min 13s


In [5]:
bowling_df

Unnamed: 0,Match_URL,Innings,Bowling_Team,Batting_Team,Bowler,Overs,Maidens,Runs,Wickets,Economy,Dots,Fours,Sixes,WD,NB
0,https://www.espncricinfo.com/series/indian-pre...,1,SRH,KKR,Mitchell Starc,3,0,14,2,4.66,11,2,0,0,0
1,https://www.espncricinfo.com/series/indian-pre...,1,SRH,KKR,Vaibhav Arora,3,0,24,1,8.00,9,2,1,4,0
2,https://www.espncricinfo.com/series/indian-pre...,1,SRH,KKR,Harshit Rana,4,1,24,2,6.00,13,2,1,1,0
3,https://www.espncricinfo.com/series/indian-pre...,1,SRH,KKR,Sunil Narine,4,0,16,1,4.00,9,0,0,0,0
4,https://www.espncricinfo.com/series/indian-pre...,1,SRH,KKR,Andre Russell,2.3,0,19,3,7.60,7,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,https://www.espncricinfo.com/series/indian-pre...,2,CSK,RCB,Alzarri Joseph,3.4,0,38,0,10.36,5,1,2,4,0
853,https://www.espncricinfo.com/series/indian-pre...,2,CSK,RCB,Karn Sharma,2,0,24,1,12.00,4,0,3,0,0
854,https://www.espncricinfo.com/series/indian-pre...,2,CSK,RCB,Mayank Dagar,2,0,6,0,3.00,6,0,0,0,0
855,https://www.espncricinfo.com/series/indian-pre...,2,CSK,RCB,Cameron Green,3,0,27,2,9.00,5,1,1,3,0


In [6]:
match_df.to_csv('match_data.csv', index=False)
batting_df.to_csv('batting_data.csv', index=False)
bowling_df.to_csv('bowling_data.csv', index=False)