In [15]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def init_browser():
    """Initialize the browser with required options and settings.

    Returns:
        A started Chrome WebDriver with a 30-second page-load timeout.

    Raises:
        Any exception raised while creating or configuring the driver. If
        configuration fails after Chrome has started, the browser is quit
        before the exception propagates so no process is left behind.
    """
    chrome_options = Options()
    for argument in (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--disable-extensions",
        "--disable-infobars",
        "--disable-notifications",
        "--start-maximized",
        "--window-size=1920,1080",
    ):
        chrome_options.add_argument(argument)

    # Remove the comment on this line if you want to run in headless mode
    # chrome_options.add_argument("--headless")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.set_page_load_timeout(30)  # Wait up to 30 seconds for page to load

        # Hide the "navigator.webdriver" property to avoid detection.
        # A plain execute_script() only patches the document that is loaded
        # right now and is lost on the next navigation, so register the
        # script to run at the start of every new document instead.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leak a running Chrome process if setup fails half-way.
        driver.quit()
        raise

    print("🚀 Browser started successfully.")
    return driver


def _first_text(parent, by, value, default='N/A'):
    """Return the stripped text of the first element matching (by, value) under parent, or default."""
    matches = parent.find_elements(by, value)
    return matches[0].text.strip() if matches else default


def extract_scholars_from_page(driver, url):
    """Extract scholars from a single page.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the results page to scrape.

    Returns:
        A list of dicts with the keys 'Rank', 'Name' and 'Institution', one
        per scholar card found. A field whose element is missing from a card
        is stored as 'N/A'. Returns None when loading the page or waiting for
        the cards raises (the error is printed, not propagated).
    """
    try:
        print(f"🌐 Navigating to URL: {url}")
        driver.get(url)

        # Wait until the scholar cards are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result_box_container'))
        )

        scholars_data = []

        # Find scholar cards on the page
        scholar_cards = driver.find_elements(By.CLASS_NAME, 'result_box_container')

        if not scholar_cards:
            print(f"⚠️ No scholars found on page: {url}")

        for card in scholar_cards:
            try:
                # find_element() raises instead of returning None, so the
                # 'N/A' fallback has to be driven by find_elements(), which
                # returns an empty list when nothing matches.
                rank = _first_text(card, By.CLASS_NAME, 'item1').replace('#', '')
                name = _first_text(card, By.CLASS_NAME, 'scholar_ranking_name')

                # The institution is the first link inside the rankings container
                containers = card.find_elements(By.CLASS_NAME, 'fds_rankings_result_container')
                if containers:
                    institution = _first_text(containers[0], By.TAG_NAME, 'a')
                else:
                    institution = 'N/A'

                # Store the scholar's data in a dictionary
                scholars_data.append({
                    'Rank': rank,
                    'Name': name,
                    'Institution': institution,
                })
            except Exception as e:
                # Best effort: a card that fails for another reason (e.g. it
                # went stale mid-read) is reported and skipped.
                print(f"⚠️ Error extracting data for a scholar card: {e}")
                continue

        return scholars_data

    except Exception as e:
        print(f"⚠️ Error in extract_scholars_from_page: {e}")
        return None


def crawl_scholargps(base_url, max_pages=100):
    """Crawl multiple pages of scholar data from ScholarGPS.

    Args:
        base_url: Listing URL without the page parameter. The page number is
            appended as ``p=<n>`` using '&' when the URL already has a query
            string and '?' otherwise.
        max_pages: Number of pages to request, starting at page 1.

    Returns:
        A list with the scholar dicts collected from every page that yielded
        results. Pages that fail or come back empty are skipped.
    """
    driver = init_browser()
    all_scholars = []

    # Pick the right separator so a base URL without a query string still
    # produces a valid address.
    separator = '&' if '?' in base_url else '?'

    try:
        for page_num in range(1, max_pages + 1):
            print(f"📄 Crawling page {page_num}...")
            url = f"{base_url}{separator}p={page_num}"

            page_scholars = extract_scholars_from_page(driver, url)

            if page_scholars:
                all_scholars.extend(page_scholars)

            # No further request follows the last page, so do not wait.
            if page_num == max_pages:
                break

            # Add a human-like delay between page requests
            sleep_time = random.uniform(5, 20)
            print(f"😴 Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)

    finally:
        # Always release the browser, even when the crawl is interrupted.
        driver.quit()
        print("🛑 Browser closed.")

    return all_scholars


# URL for "Highly Ranked Scholars" page (2022 for Computer Science)
base_url = "https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science"

# Number of result pages to crawl and the file the results are written to
MAX_PAGES = 50
OUTPUT_CSV = 'highly_ranked_scholars_2022_3.csv'

# Start crawling for the first MAX_PAGES pages
all_scholars = crawl_scholargps(base_url, max_pages=MAX_PAGES)

# Save the extracted scholar data to a CSV file
df = pd.DataFrame(all_scholars)
df.to_csv(OUTPUT_CSV, index=False)

print(f"✅ Data successfully saved to {OUTPUT_CSV}. Total scholars extracted: {len(all_scholars)}")


🚀 Browser started successfully.
📄 Crawling page 1...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=1
😴 Sleeping for 12.91 seconds...
📄 Crawling page 2...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=2
😴 Sleeping for 15.74 seconds...
📄 Crawling page 3...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=3
😴 Sleeping for 13.38 seconds...
📄 Crawling page 4...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=4
😴 Sleeping for 8.27 seconds...
📄 Crawling page 5...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&

In [19]:
# Save only the rank and name of each extracted scholar to a CSV file.
# Relies on `all_scholars` produced by the crawl cell above.
# Passing `columns=` keeps this working when the crawl returned nothing:
# selecting ['Rank', 'Name'] from a column-less frame would raise KeyError.
df = pd.DataFrame(all_scholars, columns=['Rank', 'Name'])
df.to_csv('highly_ranked_scholars_2022_4.csv', index=False)