### cs scholars

In [15]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def init_browser():
    """Initialize the browser with required options and settings."""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")

    # Remove the comment on this line if you want to run in headless mode
    # chrome_options.add_argument("--headless")  

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Wait up to 30 seconds for page to load

    # Remove the "navigator.webdriver" property to avoid detection
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    print("🚀 Browser started successfully.")
    return driver


def extract_scholars_from_page(driver, url):
    """Extract scholars from a single page."""
    try:
        print(f"🌐 Navigating to URL: {url}")
        driver.get(url)

        # Wait until the scholar cards are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result_box_container'))
        )

        scholars_data = []

        # Find scholar cards on the page
        scholar_cards = driver.find_elements(By.CLASS_NAME, 'result_box_container')
        
        if not scholar_cards:
            print(f"⚠️ No scholars found on page: {url}")
        
        for card in scholar_cards:
            try:
                # Extract scholar rank
                rank_element = card.find_element(By.CLASS_NAME, 'item1')
                rank = rank_element.text.strip().replace('#', '') if rank_element else 'N/A'
                
                # Extract scholar name
                name_element = card.find_element(By.CLASS_NAME, 'scholar_ranking_name')
                name = name_element.text.strip() if name_element else 'N/A'
                
                # Extract scholar institution
                institution_element = card.find_element(By.CLASS_NAME, 'fds_rankings_result_container').find_element(By.TAG_NAME, 'a')
                institution = institution_element.text.strip() if institution_element else 'N/A'

                # Store the scholar's data in a dictionary
                scholar_data = {
                    'Rank': rank,
                    'Name': name,
                    'Institution': institution,
                }
                scholars_data.append(scholar_data)
            except Exception as e:
                print(f"⚠️ Error extracting data for a scholar card: {e}")
                continue

        return scholars_data

    except Exception as e:
        print(f"⚠️ Error in extract_scholars_from_page: {e}")
        return None


def crawl_scholargps(base_url, max_pages=100):
    """Crawl multiple pages of scholar data from ScholarGPS."""
    driver = init_browser()
    all_scholars = []

    try:
        for page_num in range(1, max_pages + 1):
            print(f"📄 Crawling page {page_num}...")
            url = f"{base_url}&p={page_num}"
            
            page_scholars = extract_scholars_from_page(driver, url)
            
            if page_scholars:
                all_scholars.extend(page_scholars)

            # Add a human-like delay between page requests
            sleep_time = random.uniform(5, 20)
            print(f"😴 Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
    
    finally:
        driver.quit()
        print("🛑 Browser closed.")
    
    return all_scholars


# URL for "Highly Ranked Scholars" page (2022 for Computer Science)
base_url = "https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science"

# Start crawling for the first 3 pages
all_scholars = crawl_scholargps(base_url, max_pages=50)

# Save the extracted scholar data to a CSV file
df = pd.DataFrame(all_scholars)
df.to_csv('highly_ranked_scholars_2022_3.csv', index=False)

print(f"✅ Data successfully saved to highly_ranked_scholars_2022_3.csv. Total scholars extracted: {len(all_scholars)}")


🚀 Browser started successfully.
📄 Crawling page 1...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=1
😴 Sleeping for 12.91 seconds...
📄 Crawling page 2...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=2
😴 Sleeping for 15.74 seconds...
📄 Crawling page 3...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=3
😴 Sleeping for 13.38 seconds...
📄 Crawling page 4...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&p=4
😴 Sleeping for 8.27 seconds...
📄 Crawling page 5...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&field=Engineering+and+Computer+Science&discipline=Computer+Science&

In [19]:
# Save the extracted scholar data to a CSV file
df = pd.DataFrame(all_scholars)
df = df[['Rank', 'Name']].copy()
df.to_csv('highly_ranked_scholars_2022_4.csv', index=False)

## stat scholars

In [3]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def init_browser():
    """Initialize the browser with required options and settings."""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")

    # Remove the comment on this line if you want to run in headless mode
    # chrome_options.add_argument("--headless")  

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Wait up to 30 seconds for page to load

    # Remove the "navigator.webdriver" property to avoid detection
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    print("🚀 Browser started successfully.")
    return driver


def extract_scholars_from_page(driver, url):
    """Extract scholars from a single page."""
    try:
        print(f"🌐 Navigating to URL: {url}")
        driver.get(url)

        # Wait until the scholar cards are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result_box_container'))
        )

        scholars_data = []

        # Find scholar cards on the page
        scholar_cards = driver.find_elements(By.CLASS_NAME, 'result_box_container')
        
        if not scholar_cards:
            print(f"⚠️ No scholars found on page: {url}")
        
        for card in scholar_cards:
            try:
                # Extract scholar rank
                rank_element = card.find_element(By.CLASS_NAME, 'item1')
                rank = rank_element.text.strip().replace('#', '') if rank_element else 'N/A'
                
                # Extract scholar name
                name_element = card.find_element(By.CLASS_NAME, 'scholar_ranking_name')
                name = name_element.text.strip() if name_element else 'N/A'
                
                # Extract scholar institution
                institution_element = card.find_element(By.CLASS_NAME, 'fds_rankings_result_container').find_element(By.TAG_NAME, 'a')
                institution = institution_element.text.strip() if institution_element else 'N/A'

                # Store the scholar's data in a dictionary
                scholar_data = {
                    'Rank': rank,
                    'Name': name,
                    'Institution': institution,
                }
                scholars_data.append(scholar_data)
            except Exception as e:
                print(f"⚠️ Error extracting data for a scholar card: {e}")
                continue

        return scholars_data

    except Exception as e:
        print(f"⚠️ Error in extract_scholars_from_page: {e}")
        return None


def crawl_scholargps(base_url, max_pages=100):
    """Crawl multiple pages of scholar data from ScholarGPS."""
    driver = init_browser()
    all_scholars = []

    try:
        for page_num in range(1, max_pages + 1):
            print(f"📄 Crawling page {page_num}...")
            url = f"{base_url}&p={page_num}"
            
            page_scholars = extract_scholars_from_page(driver, url)
            
            if page_scholars:
                all_scholars.extend(page_scholars)

            # Add a human-like delay between page requests
            sleep_time = random.uniform(5, 20)
            print(f"😴 Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
    
    finally:
        driver.quit()
        print("🛑 Browser closed.")
    
    return all_scholars


# URL for "Highly Ranked Scholars" page (2022 for STAT)
base_url = "https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics"

# Start crawling for the first 3 pages
all_scholars = crawl_scholargps(base_url, max_pages=3)

# Save the extracted scholar data to a CSV file
df = pd.DataFrame(all_scholars)
df = df[['Rank', 'Name']].copy()
df.to_csv('highly_ranked_scholars_2022_stat_prior5.csv', index=False)

print(f"✅ Data successfully saved to highly_ranked_scholars_2022_stat_prior5.csv. Total scholars extracted: {len(all_scholars)}")


🚀 Browser started successfully.
📄 Crawling page 1...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics&p=1
😴 Sleeping for 18.86 seconds...
📄 Crawling page 2...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics&p=2
😴 Sleeping for 9.79 seconds...
📄 Crawling page 3...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics&p=3
😴 Sleeping for 19.33 seconds...
🛑 Browser closed.
✅ Data successfully saved to highly_ranked_scholars_2022_stat_prior5.csv. Total scholars extracted: 42


In [2]:
# Save the extracted scholar data to a CSV file
df = pd.DataFrame(all_scholars)
df = df[['Rank', 'Name']].copy()
df.to_csv('highly_ranked_scholars_2022_stat.csv', index=False)

#### ver2

In [5]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def init_browser():
    """Initialize the browser with required options and settings."""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")

    # Remove the comment on this line if you want to run in headless mode
    # chrome_options.add_argument("--headless")  

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Wait up to 30 seconds for page to load

    # Remove the "navigator.webdriver" property to avoid detection
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    print("🚀 Browser started successfully.")
    return driver


def extract_scholars_from_page(driver, url):
    """Extract scholars from a single page."""
    try:
        print(f"🌐 Navigating to URL: {url}")
        driver.get(url)

        # Wait until the scholar cards are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result_box_container'))
        )

        scholars_data = []

        # Find scholar cards on the page
        scholar_cards = driver.find_elements(By.CLASS_NAME, 'result_box_container')
        
        if not scholar_cards:
            print(f"⚠️ No scholars found on page: {url}")
        
        for card in scholar_cards:
            try:
                # Extract scholar rank
                rank_element = card.find_element(By.CLASS_NAME, 'item1')
                rank = rank_element.text.strip().replace('#', '') if rank_element else 'N/A'
                
                # Extract scholar name and profile link
                name_element = card.find_element(By.CLASS_NAME, 'scholar_ranking_name')
                name = name_element.text.strip() if name_element else 'N/A'
                profile_link = name_element.get_attribute('href') if name_element else 'N/A'
                
                # Extract scholar institution
                institution_element = card.find_element(By.CLASS_NAME, 'fds_rankings_result_container').find_element(By.TAG_NAME, 'a')
                institution = institution_element.text.strip() if institution_element else 'N/A'

                # Store the scholar's data in a dictionary
                scholar_data = {
                    'Rank': rank,
                    'Name': name,
                    'Profile Link': profile_link,
                    'Institution': institution,
                }
                scholars_data.append(scholar_data)
            except Exception as e:
                print(f"⚠️ Error extracting data for a scholar card: {e}")
                continue

        return scholars_data

    except Exception as e:
        print(f"⚠️ Error in extract_scholars_from_page: {e}")
        return None


def crawl_scholargps(base_url, max_pages=100):
    """Crawl multiple pages of scholar data from ScholarGPS."""
    driver = init_browser()
    all_scholars = []

    try:
        for page_num in range(1, max_pages + 1):
            print(f"📄 Crawling page {page_num}...")
            url = f"{base_url}&p={page_num}"
            
            page_scholars = extract_scholars_from_page(driver, url)
            
            if page_scholars:
                all_scholars.extend(page_scholars)

            # Add a human-like delay between page requests
            sleep_time = random.uniform(5, 20)
            print(f"😴 Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
    
    finally:
        driver.quit()
        print("🛑 Browser closed.")
    
    return all_scholars


# URL for "Highly Ranked Scholars" page (2022 for STAT)
base_url = "https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics"

# Start crawling for the first 3 pages
all_scholars = crawl_scholargps(base_url, max_pages=3)

# Save the extracted scholar data to a CSV file
df = pd.DataFrame(all_scholars)
df = df[['Rank', 'Name', 'Profile Link']].copy()
df.to_csv('highly_ranked_scholars_2022_stat_prior5.csv', index=False)

print(f"✅ Data successfully saved to highly_ranked_scholars_2022_stat_prior5.csv. Total scholars extracted: {len(all_scholars)}")


🚀 Browser started successfully.
📄 Crawling page 1...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics&p=1
😴 Sleeping for 6.95 seconds...
📄 Crawling page 2...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics&p=2
😴 Sleeping for 6.40 seconds...
📄 Crawling page 3...
🌐 Navigating to URL: https://scholargps.com/highly-ranked-scholars?year=2022&ranking_duration=LAST_5_YEARS&field=Physical+Sciences+and+Mathematics&discipline=Statistics&p=3
😴 Sleeping for 19.47 seconds...
🛑 Browser closed.
✅ Data successfully saved to highly_ranked_scholars_2022_stat_prior5.csv. Total scholars extracted: 42


#### publications per scholar

In [12]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def init_browser():
    """Initialize the browser with required options and settings."""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")

    # Remove the comment on this line if you want to run in headless mode
    # chrome_options.add_argument("--headless")  

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Wait up to 30 seconds for page to load

    # Remove the "navigator.webdriver" property to avoid detection
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    print("🚀 Browser started successfully.")
    return driver


def extract_publications_from_scholar(driver, scholar_profile_url, scholar_name):
    """Extract publications from a scholar's profile page."""
    try:
        print(f"🌐 Navigating to Scholar URL: {scholar_profile_url}")
        driver.get(scholar_profile_url)

        # Wait until the publication entries are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-content'))
        )

        publications_data = []

        # Find publication entries on the scholar's page
        publication_blocks = driver.find_elements(By.CLASS_NAME, 'result-content')
        
        if not publication_blocks:
            print(f"⚠️ No publications found on page: {scholar_profile_url}")
        
        for block in publication_blocks:
            try:
                # Extract publication title and link
                title_element = block.find_element(By.CLASS_NAME, 'publication_title').find_element(By.TAG_NAME, 'a')
                title = title_element.text.strip() if title_element else 'N/A'
                publication_link = title_element.get_attribute('href') if title_element else 'N/A'

                # Extract publication authors
                authors_element = block.find_element(By.CLASS_NAME, 'publication_block.authors')
                authors = authors_element.text.strip() if authors_element else 'N/A'
                
                # Extract publication journal/conference name and year
                journal_element = block.find_element(By.CLASS_NAME, 'publication_block.sub-title')
                journal_and_year = journal_element.text.strip() if journal_element else 'N/A'

                # Extract DOI link
                try:
                    doi_element = block.find_element(By.CLASS_NAME, 'doi_container').find_element(By.TAG_NAME, 'a')
                    doi_link = doi_element.get_attribute('href') if doi_element else 'N/A'
                except Exception:
                    doi_link = 'N/A'

                # Store the publication's data in a dictionary
                publication_data = {
                    'Scholar Name': scholar_name,
                    'Title': title,
                    'Publication Link': publication_link,
                    'Authors': authors,
                    'Journal/Conference': journal_and_year,
                    'DOI': doi_link,
                    'Scholar Profile': scholar_profile_url
                }
                publications_data.append(publication_data)
            except Exception as e:
                print(f"⚠️ Error extracting data for a publication block: {e}")
                continue

        return publications_data

    except Exception as e:
        print(f"⚠️ Error in extract_publications_from_scholar: {e}")
        return None


def crawl_publications_for_scholars(scholars_data, max_scholars=10):
    """Crawl publications for multiple scholars using their profile URLs."""
    driver = init_browser()
    all_publications = []

    try:
        for index, scholar in enumerate(scholars_data[:max_scholars]):
            print(f"📄 Extracting publications for Scholar {index + 1}/{max_scholars}: {scholar['Name']}")
            scholar_name = scholar['Name']
            scholar_profile_url = scholar['Profile Link']
            
            scholar_publications = extract_publications_from_scholar(driver, scholar_profile_url, scholar_name)
            
            if scholar_publications:
                all_publications.extend(scholar_publications)

            # Add a human-like delay between requests
            sleep_time = random.uniform(5, 20)
            print(f"😴 Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
    
    finally:
        driver.quit()
        print("🛑 Browser closed.")
    
    return all_publications

link_df = pd.read_csv('highly_ranked_scholars_2022_stat_prior5.csv')
scholars_data = []
for index in range(len(link_df)):
    current_scholar = link_df.iloc[index, :]
    current_dic = {}
    current_dic['Name'] = current_scholar['Name']
    current_dic['Profile Link'] = current_scholar['Profile Link']
    scholars_data.append(current_dic)

'''
# Sample scholar data
scholars_data = [
    {'Name': 'Guiwu Wei', 'Profile Link': 'https://scholargps.com/scholars/78021622765809/guiwu-wei'},
    {'Name': 'Ronald R. Yager', 'Profile Link': 'https://scholargps.com/scholars/86433654059655/ronald-r-yager'},
    {'Name': 'Lotfi A. Zadeh', 'Profile Link': 'https://scholargps.com/scholars/29688952374272/lotfi-a-zadeh'},
]
'''

# Crawl the first 3 scholars' publications
all_publications = crawl_publications_for_scholars(scholars_data, max_scholars=1000)

# Save the extracted publication data to a CSV file
df = pd.DataFrame(all_publications)
df.to_csv('stat_prior5year_scholars_publications.csv', index=False)

print(f"✅ Data successfully saved to scholars_publications.csv. Total publications extracted: {len(all_publications)}")


🚀 Browser started successfully.
📄 Extracting publications for Scholar 1/1000: Guiwu Wei
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei
😴 Sleeping for 5.18 seconds...
📄 Extracting publications for Scholar 2/1000: Mike Thelwall
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/37192170410417/mike-thelwall
😴 Sleeping for 10.07 seconds...
📄 Extracting publications for Scholar 3/1000: Andrew Gelman
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/54874277864582/andrew-gelman
😴 Sleeping for 16.08 seconds...
📄 Extracting publications for Scholar 4/1000: Malin Song
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/54475957797840/malin-song
😴 Sleeping for 14.50 seconds...
📄 Extracting publications for Scholar 5/1000: Daniel Rueckert
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/32739527174271/daniel-rueckert
😴 Sleeping for 6.36 seconds...
📄 Extracting publications for Scholar 6/1000: Jianzhou Wang
🌐 Navigati

In [10]:
link_df = pd.read_csv('highly_ranked_scholars_2022_stat_prior5.csv')
scholars_data = []
for index in range(len(link_df['Name'])):
    current_scholar = link_df.iloc[index, :]
    current_dic = {}
    current_dic['Name'] = current_scholar['Name']
    current_dic['Profile Link'] = current_scholar['Profile Link']
    scholars_data.append(current_dic)
len(scholars_data) == len(link_df)


True

##### fix page

In [None]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


def init_browser():
    """Initialize the browser with required options and settings."""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")

    # Uncomment this line if you want to run in headless mode
    # chrome_options.add_argument("--headless")  

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Wait up to 30 seconds for page to load

    # Remove the "navigator.webdriver" property to avoid detection
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    print("🚀 Browser started successfully.")
    return driver


def extract_publications_from_page(driver, scholar_profile_url, scholar_name, page_number):
    """Extract publications from a specific page of a scholar's profile."""
    try:
        url = f"{scholar_profile_url}?p={page_number}"
        print(f"🌐 Navigating to Scholar URL: {url}")
        driver.get(url)

        # Wait until the publication entries are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-content'))
        )

        publications_data = []

        # Find publication entries on the scholar's page
        publication_blocks = driver.find_elements(By.CLASS_NAME, 'result-content')
        
        if not publication_blocks:
            print(f"⚠️ No publications found on page: {url}")
        
        for block in publication_blocks:
            try:
                # Extract publication title and link
                title_element = block.find_element(By.CLASS_NAME, 'publication_title').find_element(By.TAG_NAME, 'a')
                title = title_element.text.strip() if title_element else 'N/A'
                publication_link = title_element.get_attribute('href') if title_element else 'N/A'

                # Extract publication authors
                authors_element = block.find_element(By.CLASS_NAME, 'publication_block.authors')
                authors = authors_element.text.strip() if authors_element else 'N/A'
                
                # Extract publication journal/conference name and year
                journal_element = block.find_element(By.CLASS_NAME, 'publication_block.sub-title')
                journal_and_year = journal_element.text.strip() if journal_element else 'N/A'

                # Extract DOI link
                try:
                    doi_element = block.find_element(By.CLASS_NAME, 'doi_container').find_element(By.TAG_NAME, 'a')
                    doi_link = doi_element.get_attribute('href') if doi_element else 'N/A'
                except Exception:
                    doi_link = 'N/A'
                    
                # Extract publication journal/conference name and year
                citation_element = block.find_element(By.CLASS_NAME, 'publication_block.source')
                citation_count = citation_element.text.strip() if citation_element else 'N/A'

                # Store the publication's data in a dictionary
                publication_data = {
                    'Scholar Name': scholar_name,
                    'Title': title,
                    'Publication Link': publication_link,
                    'Authors': authors,
                    'Journal/Conference': journal_and_year,
                    'DOI': doi_link,
                    'Scholar Profile': scholar_profile_url,
                    'Page': page_number,
                    'Citation': citation_count
                }
                publications_data.append(publication_data)
            except Exception as e:
                print(f"⚠️ Error extracting data for a publication block: {e}")
                continue

        return publications_data

    except Exception as e:
        print(f"⚠️ Error in extract_publications_from_page: {e}")
        return None


def crawl_publications_for_scholar(driver, scholar_name, scholar_profile_url, max_pages=5):
    """Crawl all pages of publications for a single scholar."""
    all_publications = []
    for page_num in range(1, max_pages + 1):
        print(f"📄 Extracting publications for {scholar_name} (Page {page_num})...")
        
        page_publications = extract_publications_from_page(driver, scholar_profile_url, scholar_name, page_num)
        
        if page_publications:
            all_publications.extend(page_publications)
        else:
            # If no publications are found on this page, it's likely the last page
            print(f"🚫 No more publications found for {scholar_name} on page {page_num}. Stopping.")
            break

        # Add a human-like delay between requests
        sleep_time = random.uniform(5, 15)
        print(f"😴 Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)
    
    return all_publications


def crawl_publications_for_all_scholars(scholars_data, max_pages=5):
    """Crawl publications for multiple scholars using their profile URLs."""
    driver = init_browser()
    all_publications = []

    try:
        for index, scholar in enumerate(scholars_data):
            print(f"📄 Extracting publications for Scholar {index + 1}/{len(scholars_data)}: {scholar['Name']}")
            scholar_name = scholar['Name']
            scholar_profile_url = scholar['Profile Link']
            
            scholar_publications = crawl_publications_for_scholar(driver, scholar_name, scholar_profile_url, max_pages)
            
            print(f"Found {len(scholar_publications)} papers for scholar: {scholar_name}")
            
            if scholar_publications:
                all_publications.extend(scholar_publications)

            # Add a human-like delay between scholars
            sleep_time = random.uniform(10, 30)
            print(f"😴 Sleeping for {sleep_time:.2f} seconds before next scholar...")
            time.sleep(sleep_time)
    
    finally:
        driver.quit()
        print("🛑 Browser closed.")
    
    return all_publications


link_df = pd.read_csv('highly_ranked_scholars_2022_stat_prior5.csv')
scholars_data = []
for index in range(len(link_df)):
    current_scholar = link_df.iloc[index, :]
    current_dic = {}
    current_dic['Name'] = current_scholar['Name']
    current_dic['Profile Link'] = current_scholar['Profile Link']
    scholars_data.append(current_dic)

# Crawl publications for all listed scholars
all_publications = crawl_publications_for_all_scholars(scholars_data, max_pages=20)

# Save the extracted publication data to a CSV file
df = pd.DataFrame(all_publications)
df.to_csv('stat_prior5year_scholars_publications_2.csv', index=False)

print(f"✅ Data successfully saved to stat_prior5year_scholars_publications_2.csv. Total publications extracted: {len(all_publications)}")


🚀 Browser started successfully.
📄 Extracting publications for Scholar 1/42: Guiwu Wei
📄 Extracting publications for Guiwu Wei (Page 1)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=1
😴 Sleeping for 9.49 seconds...
📄 Extracting publications for Guiwu Wei (Page 2)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=2
😴 Sleeping for 8.95 seconds...
📄 Extracting publications for Guiwu Wei (Page 3)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=3
😴 Sleeping for 7.75 seconds...
📄 Extracting publications for Guiwu Wei (Page 4)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=4
😴 Sleeping for 10.61 seconds...
📄 Extracting publications for Guiwu Wei (Page 5)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=5
😴 Sleeping for 10.56 seconds...
📄 Extracting publications for Guiwu Wei (Page 6

KeyboardInterrupt: 

###### ver2

In [2]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re


def init_browser():
    """Initialize the browser with required options and settings."""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")

    # Uncomment this line if you want to run in headless mode
    # chrome_options.add_argument("--headless")  

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)  # Wait up to 30 seconds for page to load

    # Remove the "navigator.webdriver" property to avoid detection
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    print("🚀 Browser started successfully.")
    return driver


def extract_publications_from_page(driver, scholar_profile_url, scholar_name, page_number):
    """Extract publications from a specific page of a scholar's profile."""
    try:
        url = f"{scholar_profile_url}?p={page_number}"
        print(f"🌐 Navigating to Scholar URL: {url}")
        driver.get(url)

        # Wait until the publication entries are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-content'))
        )

        publications_data = []

        # Find publication entries on the scholar's page
        publication_blocks = driver.find_elements(By.CLASS_NAME, 'result-content')
        
        if not publication_blocks:
            print(f"⚠️ No publications found on page: {url}")
        
        for block in publication_blocks:
            try:
                # Extract publication title and link
                title_element = block.find_element(By.CLASS_NAME, 'publication_title').find_element(By.TAG_NAME, 'a')
                title = title_element.text.strip() if title_element else 'N/A'
                publication_link = title_element.get_attribute('href') if title_element else 'N/A'

                # Extract publication authors
                authors_element = block.find_element(By.CLASS_NAME, 'publication_block.authors')
                authors = authors_element.text.strip() if authors_element else 'N/A'
                
                # Extract publication journal/conference name and year
                journal_element = block.find_element(By.CLASS_NAME, 'publication_block.sub-title')
                journal_and_year = journal_element.text.strip() if journal_element else 'N/A'

                # Extract DOI link
                try:
                    doi_element = block.find_element(By.CLASS_NAME, 'doi_container').find_element(By.TAG_NAME, 'a')
                    doi_link = doi_element.get_attribute('href') if doi_element else 'N/A'
                except Exception:
                    doi_link = 'N/A'

                # Extract citation count (only the number from 'Cited by (4)')
                try:
                    source_block = block.find_element(By.CLASS_NAME, 'publication_block.source')
                    cited_by_element = source_block.find_element(By.XPATH, ".//a[contains(text(), 'Cited by')]")
                    cited_by_text = cited_by_element.text.strip() if cited_by_element else 'N/A'
                    # Extract only the number from 'Cited by (4)'
                    citation_count_match = re.search(r'Cited by \((\d+)\)', cited_by_text)
                    citation_count = int(citation_count_match.group(1)) if citation_count_match else 0
                except Exception as e:
                    citation_count = 0

                # Store the publication's data in a dictionary
                publication_data = {
                    'Scholar Name': scholar_name,
                    'Title': title,
                    'Publication Link': publication_link,
                    'Authors': authors,
                    'Journal/Conference': journal_and_year,
                    'DOI': doi_link,
                    'Citation Count': citation_count,
                    'Scholar Profile': scholar_profile_url,
                    'Page': page_number
                }
                publications_data.append(publication_data)
            except Exception as e:
                print(f"⚠️ Error extracting data for a publication block: {e}")
                continue

        return publications_data

    except Exception as e:
        print(f"⚠️ Error in extract_publications_from_page: {e}")
        return None


def crawl_publications_for_scholar(driver, scholar_name, scholar_profile_url, max_pages=5):
    """Crawl all pages of publications for a single scholar."""
    all_publications = []
    for page_num in range(1, max_pages + 1):
        print(f"📄 Extracting publications for {scholar_name} (Page {page_num})...")
        
        page_publications = extract_publications_from_page(driver, scholar_profile_url, scholar_name, page_num)
        
        if page_publications:
            all_publications.extend(page_publications)
        else:
            print(f"🚫 No more publications found for {scholar_name} on page {page_num}. Stopping.")
            break

        sleep_time = random.uniform(5, 15)
        print(f"😴 Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)
    
    return all_publications


def crawl_publications_for_all_scholars(scholars_data, max_pages=5):
    """Crawl publications for multiple scholars using their profile URLs."""
    driver = init_browser()
    all_publications = []

    try:
        for index, scholar in enumerate(scholars_data):
            print(f"📄 Extracting publications for Scholar {index + 1}/{len(scholars_data)}: {scholar['Name']}")
            scholar_name = scholar['Name']
            scholar_profile_url = scholar['Profile Link']
            
            scholar_publications = crawl_publications_for_scholar(driver, scholar_name, scholar_profile_url, max_pages)
            
            # Save the extracted publication data to a CSV file
            scholar_df = pd.DataFrame(scholar_publications)
            scholar_df.to_csv(f'./stat/stat_{scholar_name}_prior5year_scholars_publications.csv', index=False)
            
            if scholar_publications:
                all_publications.extend(scholar_publications)

            sleep_time = random.uniform(10, 30)
            print(f"😴 Sleeping for {sleep_time:.2f} seconds before next scholar...")
            time.sleep(sleep_time)
    
    finally:
        driver.quit()
        print("🛑 Browser closed.")
    
    return all_publications


link_df = pd.read_csv('highly_ranked_scholars_2022_stat_prior5.csv')
scholars_data = []
for index in range(len(link_df)):
    current_scholar = link_df.iloc[index, :]
    current_dic = {}
    current_dic['Name'] = current_scholar['Name']
    current_dic['Profile Link'] = current_scholar['Profile Link']
    scholars_data.append(current_dic)

# Crawl publications for all listed scholars
all_publications = crawl_publications_for_all_scholars(scholars_data, max_pages=20)

# Save the extracted publication data to a CSV file
df = pd.DataFrame(all_publications)
df.to_csv('stat_prior5year_scholars_publications_2.csv', index=False)

print(f"✅ Data successfully saved to stat_prior5year_scholars_publications_2.csv. Total publications extracted: {len(all_publications)}")


🚀 Browser started successfully.
📄 Extracting publications for Scholar 1/42: Guiwu Wei
📄 Extracting publications for Guiwu Wei (Page 1)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=1
😴 Sleeping for 8.04 seconds...
📄 Extracting publications for Guiwu Wei (Page 2)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=2
😴 Sleeping for 5.35 seconds...
📄 Extracting publications for Guiwu Wei (Page 3)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=3
😴 Sleeping for 14.42 seconds...
📄 Extracting publications for Guiwu Wei (Page 4)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=4
😴 Sleeping for 6.63 seconds...
📄 Extracting publications for Guiwu Wei (Page 5)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=5
😴 Sleeping for 11.86 seconds...
📄 Extracting publications for Guiwu Wei (Page 6

KeyboardInterrupt: 

###### continue from Gary S. Collins

In [6]:
scholars_crawled = ['Guiwu Wei', 'Mike Thelwall', 'Andrew Gelman', 'Malin Song', 'Daniel Rueckert', 'Jianzhou Wang', 'Muhammad Aslam', 'Deyu Meng', 'Douglas G. Altman', 'Kyungdo Han']
link_df = pd.read_csv('highly_ranked_scholars_2022_stat_prior5.csv')
scholars_data = []
for index in range(len(link_df)):
    current_scholar = link_df.iloc[index, :]
    print(current_scholar['Name'])
    if current_scholar['Name'] in scholars_crawled:
        continue
    
    current_dic = {}
    current_dic['Name'] = current_scholar['Name']
    current_dic['Profile Link'] = current_scholar['Profile Link']
    scholars_data.append(current_dic)

# Crawl publications for all listed scholars
all_publications = crawl_publications_for_all_scholars(scholars_data, max_pages=20)

# Save the extracted publication data to a CSV file
df = pd.DataFrame(all_publications)
df.to_csv('stat_prior5year_scholars_publications_2.csv', index=False)

print(f"✅ Data successfully saved to stat_prior5year_scholars_publications_2.csv. Total publications extracted: {len(all_publications)}")


Guiwu Wei
Mike Thelwall
Andrew Gelman
Malin Song
Daniel Rueckert
Jianzhou Wang
Muhammad Aslam
Deyu Meng
Douglas G. Altman
Kyungdo Han
Gary S. Collins
Johann S. De Bono
Serge Hercberg
Trevor Hastie
Massimo Ciccozzi
Rob J. Hyndman
Andreas Holzinger
Peng Ding
Yong Du
Naomi Altman
Giovanni Sotgiu
Gerta Rücker
Martin J. Wainwright
Satya N. Majumdar
Dylan S. Small
Martin Krzywinski
Adam M. Phillippy
Abdul Haq
David B. Dunson
Stuart J. Pocock
Giovanni Corrao
Bin Xu
Marcel A. L. M. Van Assen
Antonio Gasparrini
Yvan Saeys
Maxim Finkelstein
Umapada Pal
Anna Chaimani
José Crossa
Sergey Koren
Jiu-Ying Dong
Jun Cheng
🚀 Browser started successfully.
📄 Extracting publications for Scholar 1/32: Gary S. Collins
📄 Extracting publications for Gary S. Collins (Page 1)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/97605358492583/gary-s-collins?p=1
⚠️ Error in extract_publications_from_page: Message: no such window: target window already closed
from unknown error: web view not found
  (Ses

KeyboardInterrupt: 

###### test: using proxy id

In [18]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re


# List of Proxies (replace with valid proxies)
PROXY_LIST = [
    'http://103.152.112.120:80',
    'http://34.93.180.113:8660',
    'http://34.100.138.252:8660',
    'http://103.56.206.65:4995',
    'http://35.215.216.90:80'
]


def get_random_proxy():
    """Return a random proxy from the list."""
    proxy = random.choice(PROXY_LIST)
    print(f"🔄 Switching to new proxy: {proxy}")
    return proxy


def init_browser(proxy=None):
    """Initialize the browser with proxy and required options."""
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")

    if proxy:
        chrome_options.add_argument(f'--proxy-server={proxy}')

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(30)

    # Remove the "navigator.webdriver" property to avoid detection
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    print("🚀 Browser started successfully.")
    return driver


def extract_publications_from_page(driver, scholar_profile_url, scholar_name, page_number):
    """Extract publications from a specific page of a scholar's profile."""
    try:
        url = f"{scholar_profile_url}?p={page_number}"
        print(f"🌐 Navigating to Scholar URL: {url}")
        driver.get(url)

        # Check for CAPTCHA or rate limit
        if "rate limit" in driver.page_source.lower() or "too many requests" in driver.page_source.lower() or "captcha" in driver.page_source.lower():
            print("⚠️ Rate limit or CAPTCHA detected! Retrying after a long wait...")
            return "rate_limit"

        # Wait until the publication entries are present on the page
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-content'))
        )

        publications_data = []
        publication_blocks = driver.find_elements(By.CLASS_NAME, 'result-content')
        
        for block in publication_blocks:
            try:
                title_element = block.find_element(By.CLASS_NAME, 'publication_title').find_element(By.TAG_NAME, 'a')
                title = title_element.text.strip()
                publication_link = title_element.get_attribute('href')
                publication_data = {
                    'Scholar Name': scholar_name,
                    'Title': title,
                    'Publication Link': publication_link
                }
                publications_data.append(publication_data)
            except Exception as e:
                continue

        return publications_data

    except Exception as e:
        print(f"⚠️ Error in extract_publications_from_page: {e}")
        return None


def handle_rate_limit():
    """Handle rate limits by waiting and switching proxy."""
    wait_time = random.uniform(120, 300)  # Wait 2 to 5 minutes
    print(f"⚠️ Rate limit detected! Sleeping for {wait_time:.2f} seconds...")
    time.sleep(wait_time)


def crawl_publications_for_scholar(scholar_name, scholar_profile_url, max_pages=5):
    """Crawl all pages of publications for a single scholar."""
    all_publications = []
    driver = None
    for page_num in range(1, max_pages + 1):
        try:
            if not driver:
                proxy = get_random_proxy()
                driver = init_browser(proxy)

            print(f"📄 Extracting publications for {scholar_name} (Page {page_num})...")
            result = extract_publications_from_page(driver, scholar_profile_url, scholar_name, page_num)
            
            if result == "rate_limit":
                driver.quit()
                handle_rate_limit()
                proxy = get_random_proxy()
                driver = init_browser(proxy)
                result = extract_publications_from_page(driver, scholar_profile_url, scholar_name, page_num)
            
            if result:
                all_publications.extend(result)
            else:
                print(f"🚫 No more publications found for {scholar_name} on page {page_num}. Stopping.")
                break

            time.sleep(random.uniform(5, 15))

        except Exception as e:
            print(f"❌ Error for scholar {scholar_name} on page {page_num}: {e}")
            break

    if driver:
        driver.quit()
        print("🛑 Browser closed.")
    
    return all_publications


def crawl_publications_for_all_scholars(scholars_data, max_pages=5):
    """Crawl publications for multiple scholars using their profile URLs."""
    all_publications = []

    for index, scholar in enumerate(scholars_data):
        scholar_name = scholar['Name']
        scholar_profile_url = scholar['Profile Link']
        print(f"📄 Extracting publications for Scholar {index + 1}/{len(scholars_data)}: {scholar_name}")
        
        scholar_publications = crawl_publications_for_scholar(scholar_name, scholar_profile_url, max_pages)
        
        if scholar_publications:
            all_publications.extend(scholar_publications)

        time.sleep(random.uniform(10, 30))
    
    return all_publications


# Read scholar list from CSV file
link_df = pd.read_csv('highly_ranked_scholars_2022_stat_prior5.csv')
scholars_data = link_df.to_dict(orient='records')

# Crawl publications for all listed scholars
all_publications = crawl_publications_for_all_scholars(scholars_data, max_pages=20)

# Save the extracted publication data to a CSV file
df = pd.DataFrame(all_publications)
df.to_csv('stat_prior5year_scholars_publications_2.csv', index=False)

print(f"✅ Data successfully saved to stat_prior5year_scholars_publications_2.csv. Total publications extracted: {len(all_publications)}")


📄 Extracting publications for Scholar 1/42: Guiwu Wei
🔄 Switching to new proxy: http://34.100.138.252:8660
🚀 Browser started successfully.
📄 Extracting publications for Guiwu Wei (Page 1)...
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=1
⚠️ Rate limit or CAPTCHA detected! Retrying after a long wait...
⚠️ Rate limit detected! Sleeping for 288.87 seconds...
🔄 Switching to new proxy: http://103.152.112.120:80
🚀 Browser started successfully.
🌐 Navigating to Scholar URL: https://scholargps.com/scholars/78021622765809/guiwu-wei?p=1
⚠️ Error in extract_publications_from_page: Message: unknown error: net::ERR_TUNNEL_CONNECTION_FAILED
  (Session info: chrome=128.0.6613.85)
Stacktrace:
0   chromedriver                        0x000000010483d208 cxxbridge1$str$ptr + 1927396
1   chromedriver                        0x000000010483566c cxxbridge1$str$ptr + 1895752
2   chromedriver                        0x0000000104430808 cxxbridge1$string$len + 89564
3   chr

KeyboardInterrupt: 