# Scraper Plan
- Identify all competitions
- Identify the 30 most-voted code bases
- Collect the 30 most-voted notebooks
- Collect the comments on the 30 most-voted notebooks

In [None]:
import os
import random
import time
import json
import traceback  # <-- Add this
import re
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# File with the competitions data from the previous scraping run.
CHECKPOINT_FILE = './data/competitions_list.json'


def load_competitions():
    """Load the competitions list from the checkpoint JSON file.

    Returns:
        list: The decoded competition records, or an empty list when the
        checkpoint file is missing, cannot be read, is not valid JSON, or
        does not hold a JSON array.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return []

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
            comps = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print("Error loading checkpoint:", e)
        return []

    # The caller iterates over the records, so anything but a list is unusable.
    if not isinstance(comps, list):
        print("Error loading checkpoint:",
              f"expected a JSON array, got {type(comps).__name__}")
        return []

    print(f"Loaded {len(comps)} competitions from checkpoint.")
    return comps

def save_json(data, filepath):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    Args:
        data: Any JSON-serializable object.
        filepath: Destination path; an existing file is overwritten.
    """
    with open(filepath, 'w') as sink:
        sink.write(json.dumps(data, indent=4))
    print(f"Saved JSON data to {filepath}")

def setup_driver():
    """Create and return a configured undetected_chromedriver Chrome driver.

    The browser is started headless with a fixed desktop user-agent string
    and with the Blink "AutomationControlled" feature disabled; the window is
    maximized before the driver is handed back.
    """
    chrome_arguments = (
        # Headless mode is active; remove this entry to watch the browser.
        '--headless',
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
        '--disable-blink-features=AutomationControlled',
    )

    chrome_options = uc.ChromeOptions()
    for argument in chrome_arguments:
        chrome_options.add_argument(argument)

    browser = uc.Chrome(options=chrome_options)
    browser.maximize_window()
    return browser

def scroll_and_collect_solutions(driver, url, target_count=50, min_score=0.75,
                                 max_idle_scrolls=3):
    """Scroll a code listing page and collect notebook URLs with their scores.

    Loads ``url``, then repeatedly scans the listing cards and scrolls to the
    bottom of the page so that more cards get loaded. A card is kept only when
    it shows a "Score: <decimal>" label of at least ``min_score`` and contains
    a link whose href includes "/code/".

    Args:
        driver: Selenium WebDriver used to load and scroll the page.
        url: Address of the listing page.
        target_count: Maximum number of solutions to collect.
        min_score: Lowest score a card may show and still be collected.
        max_idle_scrolls: Number of consecutive scan passes that may find
            neither a new solution nor additional cards before giving up.
            Without this limit the loop would never end on listings that
            hold fewer than ``target_count`` qualifying cards.

    Returns:
        dict: Maps solution URL to its score (float) in page order. Holds at
        most ``target_count`` entries, and fewer when the listing runs out.

    Raises:
        selenium TimeoutException: If no listing card is present within
            5 seconds at the start of a scan pass.
    """
    card_xpath = "//li[contains(@class, 'MuiListItem-root')]"
    link_xpath = ".//a[contains(@class, 'sc-uYFMi') and contains(@href, '/code/')]"

    driver.get(url)
    collected = {}
    idle_scrolls = 0
    previous_card_count = 0

    while len(collected) < target_count:
        # Wait for the list items to be present.
        print(f"Collecting solutions from {url}...")
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, card_xpath))
        )
        cards = driver.find_elements(By.XPATH, card_xpath)
        print(f"Found {len(cards)} cards on the page.")
        collected_before = len(collected)

        for card in cards:
            if len(collected) >= target_count:
                break
            # Cheap pre-filter on the raw HTML before running further XPath queries.
            outer_html = card.get_attribute("outerHTML") or ""
            if "Score: " not in outer_html:
                continue
            score_elems = card.find_elements(By.XPATH, ".//span[contains(text(), 'Score: ')]")
            print(f"Found {len(score_elems)} score elements in the card.")
            if not score_elems:
                continue
            score_text = score_elems[0].text  # e.g., "Score: 0.80143"
            score_match = re.search(r"Score: (\d+\.\d+)", score_text)
            if not score_match:
                print("Could not extract score from text. Skipping card...")
                continue
            score_val = float(score_match.group(1))
            # NOTE(review): this filter presumes a higher-is-better metric;
            # confirm before using it on competitions scored by an error measure.
            if score_val < min_score:
                continue
            # find_elements returns an empty list instead of raising, so one
            # card without a link no longer aborts the whole collection.
            # NOTE(review): a logged run produced hrefs ending in "/comments";
            # confirm this anchor is the intended notebook link.
            link_elems = card.find_elements(By.XPATH, link_xpath)
            if not link_elems:
                print("Could not find solution link in card. Skipping card...")
                continue
            solution_url = link_elems[0].get_attribute("href")
            if not solution_url or solution_url in collected:
                continue
            collected[solution_url] = score_val
            print(f"Collected solution: {solution_url} (Score: {score_val})")

        if len(collected) >= target_count:
            break

        # Stop once scrolling no longer yields new solutions or new cards.
        made_progress = (len(collected) > collected_before
                         or len(cards) > previous_card_count)
        previous_card_count = len(cards)
        idle_scrolls = 0 if made_progress else idle_scrolls + 1
        if idle_scrolls >= max_idle_scrolls:
            print(f"No new solutions after {idle_scrolls} scrolls. "
                  f"Stopping with {len(collected)} collected.")
            break

        # Scroll down to load more solutions.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    return collected

def _notebook_files(folder):
    """Return the set of ``.ipynb`` file names currently present in ``folder``."""
    return {fname for fname in os.listdir(folder) if fname.endswith('.ipynb')}


def download_solution(driver, solution_url, competition_folder):
    """Open a notebook page and download its code into ``competition_folder``.

    The function points the browser's download directory at the folder, opens
    the page, opens the "More options" menu and clicks "Download code". The
    download counts as successful only when an ``.ipynb`` file that was not in
    the folder before the click shows up afterwards.

    Args:
        driver: Selenium WebDriver that supports ``execute_cdp_cmd``.
        solution_url: Address of the notebook page.
        competition_folder: Directory that receives the download; created
            when missing.

    Returns:
        bool: True when a new notebook file appeared, False on any failure.
        Exceptions are reported on stdout instead of being raised.
    """
    try:
        # 1. Configure download path FIRST
        os.makedirs(competition_folder, exist_ok=True)
        driver.execute_cdp_cmd("Page.setDownloadBehavior", {
            "behavior": "allow",
            "downloadPath": os.path.abspath(competition_folder)
        })

        # 2. Load page
        print(f"🌐 Navigating to: {solution_url}")
        driver.get(solution_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # 3. Click "More options" with explicit verification
        try:
            more_options_btn = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((
                    By.XPATH,
                    "//button[contains(@aria-label, 'More options for this notebook')]"
                ))
            )
            print("🔵 Found more options button")
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", more_options_btn)
            more_options_btn.click()
            print("🟢 Clicked more options button")
        except Exception as e:
            print(f"❌ Failed at more options button: {str(e)}")
            return False

        # 4. Locate download menu item
        try:
            download_item = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((
                    By.XPATH, "//*[normalize-space()='Download code']"
                ))
            )
            print("🔵 Found download menu item")
        except Exception as e:
            print(f"❌ Failed to find download item: {str(e)}")
            print("Current page HTML:")
            print(driver.page_source[:2000])  # Print first 2000 chars of HTML
            return False

        # Snapshot the notebooks already in the folder. Checking for "any
        # .ipynb" would report success for every download after the first,
        # because earlier downloads live in the same folder.
        existing = _notebook_files(competition_folder)

        def new_notebook_arrived(_driver=None):
            return bool(_notebook_files(competition_folder) - existing)

        # 5. Execute click with multiple fallback methods
        try:
            print("⚡ Attempting click via ActionChains")
            ActionChains(driver).move_to_element(download_item).pause(0.5).click().perform()

            # Verify click worked
            WebDriverWait(driver, 5).until(new_notebook_arrived)
            print("✅ Download verified")
            return True
        except Exception as e:
            print(f"❌ Primary click failed: {str(e)}")
            print("🔄 Attempting JavaScript click as fallback")
            try:
                driver.execute_script("arguments[0].click();", download_item)
                time.sleep(5)
                if new_notebook_arrived():
                    print("✅ Fallback click succeeded")
                    return True
                raise Exception("No file appeared after fallback click")
            except Exception as e2:
                print(f"❌ All click attempts failed: {str(e2)}")
                return False

    except Exception:
        print(f"🔥 Critical failure: {traceback.format_exc()}")
        return False
def process_competition(driver, competition, target_count=5, base_folder="./data/notebooks"):
    """Collect and download the most-voted notebooks of one competition.

    Builds the competition's code listing URL (sorted by vote count), collects
    up to ``target_count`` solution URLs from it, saves them to
    ``<base_folder>/<slug>/solutions.json`` and then downloads each solution
    into that folder, pausing a random few seconds between downloads.

    Args:
        driver: Selenium WebDriver shared by all steps.
        competition: Mapping with a "slug" and a "code_url" entry and an
            optional "competitionId" entry.
        target_count: Number of solutions to request from the listing.
        base_folder: Directory under which the per-competition folder is
            created.

    Raises:
        ValueError: If "slug" or "code_url" is missing or empty. Raised
            before any folder is created or any page is loaded.
    """
    slug = competition.get("slug")
    base_code_url = competition.get("code_url")  # e.g. "https://www.kaggle.com/competitions/titanic/code"
    if not slug or not base_code_url:
        raise ValueError(
            f"Competition record needs non-empty 'slug' and 'code_url': {competition!r}"
        )

    # Create competition folder structure.
    comp_folder = os.path.join(base_folder, slug)
    os.makedirs(comp_folder, exist_ok=True)

    # Construct the code page URL.
    code_page_url = base_code_url + "?sortBy=voteCount&excludeNonAccessedDatasources=true"
    if "competitionId" in competition:
        code_page_url += "&competitionId=" + str(competition["competitionId"])
    print(f"\nProcessing competition '{slug}' using URL: {code_page_url}")

    # Scroll and collect solution URLs.
    solution_urls = scroll_and_collect_solutions(driver, code_page_url, target_count=target_count)

    # Save the collected solution URLs as JSON under the competition folder.
    solutions_json_path = os.path.join(comp_folder, "solutions.json")
    save_json(solution_urls, solutions_json_path)

    # For each collected solution, download the code.
    total = len(solution_urls)
    for idx, sol_url in enumerate(solution_urls, start=1):
        print(f"Downloading solution {idx}/{total}: {sol_url}")
        if not download_solution(driver, sol_url, comp_folder):
            print(f"Skipping solution: {sol_url}")
        # Randomized pause between downloads; abs() guards against a negative draw.
        time.sleep(abs(random.normalvariate(5, 1)))

def main(max_competitions=1):
    """Process competitions from the checkpoint file with one shared driver.

    Args:
        max_competitions: Upper bound on the number of competitions to
            process; ``None`` removes the bound. The default of 1 keeps the
            single-competition trial run this script was set up for.

    Processing stops at the first competition that raises. The driver is
    always shut down, including when an unexpected error escapes the loop.
    """
    competitions = load_competitions()
    if not competitions:
        print("No competitions to process. Exiting.")
        return

    driver = setup_driver()
    processed = 0
    try:
        # Iterate over each competition.
        for comp in competitions:
            if max_competitions is not None and processed >= max_competitions:
                break
            try:
                process_competition(driver, comp)
            except Exception as e:
                print(f"Error processing competition {comp.get('slug', 'unknown')}: {e}")
                break
            processed += 1
    finally:
        driver.quit()

    # Report the real count instead of claiming every competition was handled.
    print(f"Processed {processed} of {len(competitions)} competitions.")


if __name__ == '__main__':
    main()


Loaded 552 competitions from checkpoint.

Processing competition 'titanic' using URL: https://www.kaggle.com/competitions/titanic/code?sortBy=voteCount&excludeNonAccessedDatasources=true
Collecting solutions from https://www.kaggle.com/competitions/titanic/code?sortBy=voteCount&excludeNonAccessedDatasources=true...
Found 28 cards on the page.
Collecting solutions from https://www.kaggle.com/competitions/titanic/code?sortBy=voteCount&excludeNonAccessedDatasources=true...
Found 29 cards on the page.
Found 1 score elements in the card.
Collected solution: https://www.kaggle.com/code/gusthema/titanic-competition-w-tensorflow-decision-forests/comments (Score: 0.80143)
Found 1 score elements in the card.
Collected solution: https://www.kaggle.com/code/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy/comments (Score: 0.88516)
Found 1 score elements in the card.
Collected solution: https://www.kaggle.com/code/mrisdal/exploring-survival-on-the-titanic/comments (Score: 0.80382)
Found 1

In [None]:
import os
import random
import time
import json
import traceback  # <-- Add this
import re
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# File with the competitions data from the previous scraping run.
CHECKPOINT_FILE = './data/competitions_list.json'


def load_competitions():
    """Load the competitions list from the checkpoint JSON file.

    Returns:
        list: The decoded competition records, or an empty list when the
        checkpoint file is missing, cannot be read, is not valid JSON, or
        does not hold a JSON array.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return []

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
            comps = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print("Error loading checkpoint:", e)
        return []

    # The caller iterates over the records, so anything but a list is unusable.
    if not isinstance(comps, list):
        print("Error loading checkpoint:",
              f"expected a JSON array, got {type(comps).__name__}")
        return []

    print(f"Loaded {len(comps)} competitions from checkpoint.")
    return comps

def save_json(data, filepath):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    Args:
        data: Any JSON-serializable object.
        filepath: Destination path; an existing file is overwritten.
    """
    with open(filepath, 'w') as sink:
        sink.write(json.dumps(data, indent=4))
    print(f"Saved JSON data to {filepath}")

def setup_driver():
    """Create and return a configured undetected_chromedriver Chrome driver.

    The browser is started headless with a fixed desktop user-agent string
    and with the Blink "AutomationControlled" feature disabled.
    """
    chrome_arguments = (
        # Headless mode is active; remove this entry to watch the browser.
        '--headless',
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
        '--disable-blink-features=AutomationControlled',
    )

    chrome_options = uc.ChromeOptions()
    for argument in chrome_arguments:
        chrome_options.add_argument(argument)

    return uc.Chrome(options=chrome_options)

def scroll_and_collect_solutions(driver, url, target_count=100, min_score=0.75,
                                 max_idle_scrolls=3):
    """Scroll a code listing page and collect notebook URLs, scores and votes.

    Loads ``url``, then repeatedly scans the listing cards and scrolls to the
    bottom of the page so that more cards get loaded. A card is kept only when
    it shows a "Score: <decimal>" label of at least ``min_score`` and contains
    a link whose href includes "/code/".

    Args:
        driver: Selenium WebDriver used to load and scroll the page.
        url: Address of the listing page.
        target_count: Maximum number of solutions to collect.
        min_score: Lowest score a card may show and still be collected.
        max_idle_scrolls: Number of consecutive scan passes that may find
            neither a new solution nor additional cards before giving up.
            Without this limit the loop would never end on listings that
            hold fewer than ``target_count`` qualifying cards.

    Returns:
        dict: Maps solution URL to ``{"score": float, "votes": int or None}``
        in page order; ``votes`` is None when the card exposes no readable
        vote count. Holds at most ``target_count`` entries.

    Raises:
        selenium TimeoutException: If no listing card is present within
            5 seconds at the start of a scan pass.
    """
    card_xpath = "//li[contains(@class, 'MuiListItem-root')]"
    link_xpath = ".//a[contains(@class, 'sc-uYFMi') and contains(@href, '/code/')]"
    # Browsers evaluate XPath 1.0, which has no matches(); select candidates
    # with contains() and apply the regular expression in Python instead.
    votes_xpath = ".//span[contains(@aria-label, 'vote')]"
    votes_pattern = re.compile(r"(\d[\d,]*)\s+votes?")

    driver.get(url)
    collected = {}
    idle_scrolls = 0
    previous_card_count = 0

    while len(collected) < target_count:
        # Wait for the list items to be present.
        print(f"Collecting solutions from {url}...")
        WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, card_xpath))
        )
        cards = driver.find_elements(By.XPATH, card_xpath)
        print(f"Found {len(cards)} cards on the page.")
        collected_before = len(collected)

        for card in cards:
            if len(collected) >= target_count:
                break
            # Cheap pre-filter on the raw HTML before running further XPath queries.
            outer_html = card.get_attribute("outerHTML") or ""
            if "Score: " not in outer_html:
                continue
            score_elems = card.find_elements(By.XPATH, ".//span[contains(text(), 'Score: ')]")
            print(f"Found {len(score_elems)} score elements in the card.")
            if not score_elems:
                continue
            score_text = score_elems[0].text  # e.g., "Score: 0.80143"
            score_match = re.search(r"Score: (\d+\.\d+)", score_text)
            if not score_match:
                print("Could not extract score from text. Skipping card...")
                continue
            score_val = float(score_match.group(1))
            # NOTE(review): this filter presumes a higher-is-better metric;
            # confirm before using it on competitions scored by an error measure.
            if score_val < min_score:
                continue
            # find_elements returns an empty list instead of raising, so one
            # card without a link no longer aborts the whole collection.
            link_elems = card.find_elements(By.XPATH, link_xpath)
            if not link_elems:
                print("Could not find solution link in card. Skipping card...")
                continue
            solution_url = link_elems[0].get_attribute("href")
            if not solution_url or solution_url in collected:
                continue
            # Read the vote count from the first aria-label that carries one.
            votes = None
            for label_elem in card.find_elements(By.XPATH, votes_xpath):
                votes_match = votes_pattern.search(label_elem.get_attribute("aria-label") or "")
                if votes_match:
                    votes = int(votes_match.group(1).replace(",", ""))
                    break
            collected[solution_url] = {"score": score_val, "votes": votes}
            print(f"Collected solution: {solution_url} (Score: {score_val})")

        if len(collected) >= target_count:
            break

        # Stop once scrolling no longer yields new solutions or new cards.
        made_progress = (len(collected) > collected_before
                         or len(cards) > previous_card_count)
        previous_card_count = len(cards)
        idle_scrolls = 0 if made_progress else idle_scrolls + 1
        if idle_scrolls >= max_idle_scrolls:
            print(f"No new solutions after {idle_scrolls} scrolls. "
                  f"Stopping with {len(collected)} collected.")
            break

        # Scroll down to load more solutions.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    return collected

def _notebook_files(folder):
    """Return the set of ``.ipynb`` file names currently present in ``folder``."""
    return {fname for fname in os.listdir(folder) if fname.endswith('.ipynb')}


def download_solution(driver, solution_url, competition_folder):
    """Open a notebook page and download its code into ``competition_folder``.

    The function points the browser's download directory at the folder, opens
    the page, opens the "More options" menu and clicks "Download code". The
    download counts as successful only when an ``.ipynb`` file that was not in
    the folder before the click shows up afterwards.

    Args:
        driver: Selenium WebDriver that supports ``execute_cdp_cmd``.
        solution_url: Address of the notebook page.
        competition_folder: Directory that receives the download; created
            when missing.

    Returns:
        bool: True when a new notebook file appeared, False on any failure.
        Exceptions are reported on stdout instead of being raised.
    """
    try:
        # 1. Configure download path FIRST
        os.makedirs(competition_folder, exist_ok=True)
        driver.execute_cdp_cmd("Page.setDownloadBehavior", {
            "behavior": "allow",
            "downloadPath": os.path.abspath(competition_folder)
        })

        # 2. Load page
        print(f"🌐 Navigating to: {solution_url}")
        driver.get(solution_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # 3. Click "More options" with explicit verification
        try:
            more_options_btn = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((
                    By.XPATH,
                    "//button[contains(@aria-label, 'More options for this notebook')]"
                ))
            )
            print("🔵 Found more options button")
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", more_options_btn)
            more_options_btn.click()
            print("🟢 Clicked more options button")
        except Exception as e:
            print(f"❌ Failed at more options button: {str(e)}")
            return False

        # 4. Locate download menu item
        try:
            download_item = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((
                    By.XPATH, "//*[normalize-space()='Download code']"
                ))
            )
            print("🔵 Found download menu item")
        except Exception as e:
            print(f"❌ Failed to find download item: {str(e)}")
            print("Current page HTML:")
            print(driver.page_source[:2000])  # Print first 2000 chars of HTML
            return False

        # Snapshot the notebooks already in the folder. Checking for "any
        # .ipynb" would report success for every download after the first,
        # because earlier downloads live in the same folder.
        existing = _notebook_files(competition_folder)

        def new_notebook_arrived(_driver=None):
            return bool(_notebook_files(competition_folder) - existing)

        # 5. Execute click with multiple fallback methods
        try:
            print("⚡ Attempting click via ActionChains")
            ActionChains(driver).move_to_element(download_item).pause(0.5).click().perform()

            # Verify click worked
            WebDriverWait(driver, 5).until(new_notebook_arrived)
            print("✅ Download verified")
            return True
        except Exception as e:
            print(f"❌ Primary click failed: {str(e)}")
            print("🔄 Attempting JavaScript click as fallback")
            try:
                driver.execute_script("arguments[0].click();", download_item)
                time.sleep(5)
                if new_notebook_arrived():
                    print("✅ Fallback click succeeded")
                    return True
                raise Exception("No file appeared after fallback click")
            except Exception as e2:
                print(f"❌ All click attempts failed: {str(e2)}")
                return False

    except Exception:
        print(f"🔥 Critical failure: {traceback.format_exc()}")
        return False
def process_competition(driver, competition, target_count=5, base_folder="./data/test/notebooks/"):
    """Collect and download the most-voted notebooks of one competition.

    Builds the competition's code listing URL (sorted by vote count), collects
    up to ``target_count`` solution URLs from it, saves them to
    ``<base_folder>/<competition_name>/notebooks.json`` and then downloads
    each solution into that folder, pausing a random few seconds between
    downloads.

    Args:
        driver: Selenium WebDriver shared by all steps.
        competition: Mapping with a "competition_name" and a "code_url" entry.
        target_count: Number of solutions to request from the listing.
        base_folder: Directory under which the per-competition folder is
            created.

    Raises:
        ValueError: If "competition_name" or "code_url" is missing or empty.
            Raised before any folder is created or any page is loaded.
    """
    slug = competition.get("competition_name")
    base_code_url = competition.get("code_url")  # e.g. "https://www.kaggle.com/competitions/titanic/code"
    if not slug or not base_code_url:
        raise ValueError(
            "Competition record needs non-empty 'competition_name' and "
            f"'code_url': {competition!r}"
        )

    # Create competition folder structure.
    comp_folder = os.path.join(base_folder, slug)
    os.makedirs(comp_folder, exist_ok=True)

    # Construct the code page URL.
    code_page_url = base_code_url + "?sortBy=voteCount&excludeNonAccessedDatasources=true"
    print(f"\nProcessing competition '{slug}' using URL: {code_page_url}")

    # Scroll and collect solution URLs.
    solution_urls = scroll_and_collect_solutions(driver, code_page_url, target_count=target_count)

    # Save the collected solution URLs as JSON under the competition folder.
    solutions_json_path = os.path.join(comp_folder, "notebooks.json")
    save_json(solution_urls, solutions_json_path)

    # For each collected solution, download the code.
    total = len(solution_urls)
    for idx, sol_url in enumerate(solution_urls, start=1):
        print(f"Downloading solution {idx}/{total}: {sol_url}")
        if not download_solution(driver, sol_url, comp_folder):
            print(f"Skipping solution: {sol_url}")
        # Randomized pause between downloads; abs() guards against a negative draw.
        time.sleep(abs(random.normalvariate(5, 1)))

def main(max_competitions=1):
    """Process competitions from the checkpoint file with one shared driver.

    Args:
        max_competitions: Upper bound on the number of competitions to
            process; ``None`` removes the bound. The default of 1 keeps the
            single-competition trial run this script was set up for.

    Processing stops at the first competition that raises. The driver is
    always shut down, including when an unexpected error escapes the loop.
    """
    competitions = load_competitions()
    if not competitions:
        print("No competitions to process. Exiting.")
        return

    driver = setup_driver()
    processed = 0
    try:
        # Iterate over each competition.
        for comp in competitions:
            if max_competitions is not None and processed >= max_competitions:
                break
            try:
                process_competition(driver, comp)
            except Exception as e:
                # Records in this cell are keyed by "competition_name";
                # "slug" is kept as a fallback for older checkpoint files.
                label = comp.get('competition_name') or comp.get('slug', 'unknown')
                print(f"Error processing competition {label}: {e}")
                break
            processed += 1
    finally:
        driver.quit()

    # Report the real count instead of claiming every competition was handled.
    print(f"Processed {processed} of {len(competitions)} competitions.")


if __name__ == '__main__':
    main()
