In [40]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
import csv
import time

In [41]:
class IPLScraper:
    """Headless-Chrome scraper for IPL squad and player-profile pages on ESPNcricinfo.

    Workflow (see ``scrape_players``): open the squads index page, collect the
    team links, collect the player links from every team page, then visit each
    player page and write one CSV row per player.

    The scraper owns a browser process, so it should always be closed. It can
    be used as a context manager for that purpose::

        with IPLScraper() as scraper:
            scraper.scrape_players()

    Attributes:
        driver: Selenium Chrome WebDriver created by ``setup_driver``.
        wait: ``WebDriverWait`` bound to ``driver``.
    """

    # Defaults; each can be overridden per instance through ``__init__``.
    squads_url = "https://www.espncricinfo.com/series/indian-premier-league-2024-1410320/squads"
    wait_timeout = 10          # seconds given to every explicit wait
    output_path = 'ipl_players.csv'

    # Fixed pause after opening a team page, before the explicit wait starts.
    PAGE_SETTLE_SECONDS = 2

    # The same CSS selector is used for team links and for player links.
    LINK_SELECTOR = "div.ds-flex.ds-space-x-2 a"
    BIO_SELECTOR = "div.ci-player-bio-content p"

    # Column order of the CSV written by ``scrape_players``.
    CSV_FIELDS = ['name', 'image', 'batting_style', 'bowling_style', 'playing_role', 'description']

    # Optional profile fields: output key -> label text shown on the player page.
    PROFILE_FIELDS = {
        'batting_style': 'Batting Style',
        'bowling_style': 'Bowling Style',
        'playing_role': 'Playing Role',
    }

    # XPath of the value paragraph that follows a given label paragraph.
    _PROFILE_XPATH = (
        "//p[contains(@class, 'ds-text-tight-m') and contains(text(), '{label}')]"
        "/following-sibling::span[contains(@class, 'ds-text-title-s')]/p"
    )

    def __init__(self, squads_url=None, wait_timeout=None, output_path=None):
        """Start the browser.

        Args:
            squads_url: Squads index page to start from; defaults to the
                class-level ``squads_url``.
            wait_timeout: Seconds for explicit waits; defaults to the
                class-level ``wait_timeout``.
            output_path: Default CSV destination for ``scrape_players``;
                defaults to the class-level ``output_path``.
        """
        if squads_url is not None:
            self.squads_url = squads_url
        if wait_timeout is not None:
            self.wait_timeout = wait_timeout
        if output_path is not None:
            self.output_path = output_path
        self.setup_driver()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def setup_driver(self):
        """Create the headless Chrome driver and the explicit-wait helper."""
        chrome_options = Options()
        for flag in (
            '--headless',
            '--disable-gpu',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--blink-settings=imagesEnabled=false',
            '--disable-extensions',
        ):
            chrome_options.add_argument(flag)
        chrome_options.page_load_strategy = 'eager'

        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, self.wait_timeout)

    @classmethod
    def _profile_xpath(cls, label):
        """Build the XPath of the value element belonging to a profile label."""
        return cls._PROFILE_XPATH.format(label=label)

    def _collect_links(self):
        """Return the hrefs of the link anchors on the currently loaded page.

        Anchors without an href are skipped and duplicates are dropped while
        keeping first-seen order, so callers never pass ``None`` or the same
        URL twice to ``driver.get``.
        """
        anchors = self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, self.LINK_SELECTOR))
        )
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        return list(dict.fromkeys(href for href in hrefs if href))

    def get_all_team_links(self):
        """Get links for all IPL teams.

        Returns:
            list[str]: Unique team-page URLs found on the squads index page.
        """
        self.driver.get(self.squads_url)
        return self._collect_links()

    def get_player_links_for_team(self, team_url):
        """Get all player links for a specific team.

        Args:
            team_url: URL of a team squad page.

        Returns:
            list[str]: Unique player-page URLs found on that page.
        """
        self.driver.get(team_url)
        time.sleep(self.PAGE_SETTLE_SECONDS)  # Allow dynamic content to load
        return self._collect_links()

    def _find_player_image(self, name):
        """Return the ``src`` of the first image whose alt text mentions the player.

        An image matches when any whitespace-separated part of ``name`` occurs
        (case-insensitively) in its alt text. Returns "" when nothing matches.
        NOTE(review): substring matching on single name parts is loose and may
        pick an unrelated image for very short name parts - confirm acceptable.
        """
        name_parts = name.lower().split()
        for img in self.driver.find_elements(By.TAG_NAME, "img"):
            # ``get_attribute`` may return None; treat that as empty alt text
            # instead of letting it abort the whole player.
            alt_text = (img.get_attribute('alt') or "").lower()
            if any(part in alt_text for part in name_parts):
                return img.get_attribute('src') or ""
        return ""

    def get_player_info(self, url):
        """Scrape one player page.

        Args:
            url: Player profile URL.

        Returns:
            dict | None: A dict with the keys in ``CSV_FIELDS``; optional
            fields that are absent on the page are "". ``None`` when the page
            cannot be processed (the error is printed) or the name is empty.
        """
        try:
            self.driver.get(url)

            # The full name is mandatory: wait for it, and fail the page if it never appears.
            name_locator = (By.XPATH, self._profile_xpath('Full Name'))
            name = self.wait.until(EC.presence_of_element_located(name_locator)).text.strip()

            info = {
                'name': name,
                'image': self._find_player_image(name),
            }

            # The remaining profile fields are best effort: missing ones become "".
            for key, label in self.PROFILE_FIELDS.items():
                try:
                    element = self.driver.find_element(By.XPATH, self._profile_xpath(label))
                    info[key] = element.text.strip()
                except Exception:
                    info[key] = ""

            info['description'] = self.get_description()

            return info if info['name'] else None

        except Exception as e:
            print(f"\nError extracting info from {url}: {e}")
            return None

    def get_description(self):
        """Return the biography paragraphs of the current page joined by spaces.

        Returns "" when there are no paragraphs or the lookup fails.
        """
        try:
            elements = self.driver.find_elements(By.CSS_SELECTOR, self.BIO_SELECTOR)
            # Read each element's text once; every ``.text`` access is a driver call.
            paragraphs = (elem.text.strip() for elem in elements)
            return " ".join(text for text in paragraphs if text)
        except Exception:
            return ""

    def scrape_players(self, output_path=None):
        """Scrape every player of every team and write the results to a CSV file.

        Args:
            output_path: CSV file to (over)write; defaults to ``self.output_path``.
        """
        destination = output_path or self.output_path

        # Get all team links
        team_links = self.get_all_team_links()
        print(f"Found {len(team_links)} teams")

        # Collect all player links
        all_player_links = []
        for team_url in tqdm(team_links, desc="Collecting team rosters", unit="team"):
            all_player_links.extend(self.get_player_links_for_team(team_url))
        # Drop URLs collected from more than one page so no player is scraped twice.
        all_player_links = list(dict.fromkeys(all_player_links))

        print(f"\nFound {len(all_player_links)} players total")

        # Create/open CSV file
        with open(destination, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.CSV_FIELDS)
            writer.writeheader()

            # Scrape each player's info with progress bar
            for url in tqdm(all_player_links, desc="Scraping player data", unit="player"):
                player_info = self.get_player_info(url)
                if player_info:
                    writer.writerow(player_info)

    def close(self):
        """Quit the browser; does nothing if the driver was never created."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

In [42]:
def main():
    """Run a full scrape and always shut the browser down afterwards.

    The completion message is printed only when scraping finished without
    raising; on failure the browser is still closed and the exception
    propagates to the caller.
    """
    scraper = IPLScraper()
    try:
        scraper.scrape_players()
    finally:
        # Release the browser process whether or not scraping succeeded.
        scraper.close()
    print("\nScraping completed. Check ipl_players.csv for results.")

In [43]:
# Entry point: run the scrape when this code is executed directly (the recorded
# output below shows the guard passed when this cell was run).
if __name__ == "__main__":
    main()

Found 10 teams


Collecting team rosters: 100%|██████████| 10/10 [00:41<00:00,  4.18s/team]



Found 262 players total


Scraping player data: 100%|██████████| 262/262 [13:27<00:00,  3.08s/player]



Scraping completed. Check ipl_players.csv for results.
