In [2]:
!pip install splinter
!pip install selenium
!pip install beautifulsoup4

Collecting splinter
  Downloading splinter-0.21.0-py3-none-any.whl.metadata (3.9 kB)
Downloading splinter-0.21.0-py3-none-any.whl (40 kB)
Installing collected packages: splinter
Successfully installed splinter-0.21.0
Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3~=2.5.0 (from urllib3[socks]~=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.6.15 (from selenium)
  Downloading certifi-2025.7.14-py3-none-any.whl.metadata (2.4 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.meta

In [6]:
!pip install webdriver-manager
!pip install pandas

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2


In [3]:
from splinter import Browser
from bs4 import BeautifulSoup
import time

# 1. launch browser (change to the path to your driver)
browser = Browser('firefox', headless=True)  # or 'chrome'

url = "https://rankings.newsweek.com/worlds-best-hospitals-2023"
browser.visit(url)
time.sleep(5)  # wait initial load

all_rows = []

while True:
    # parse current page
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find("table")  # or locate by CSS class/id if available
    if not table:
        print("Table not found")
        break
    rows = table.select("tr")[1:]  # skip header
    for tr in rows:
        cols = tr.find_all(["td","th"])
        values = [c.get_text(strip=True) for c in cols]
        all_rows.append(values)

    # find “Load more” or “Next page” button
    # adjust selector based on actual button attributes
    btn = browser.find_by_text("Load more") or browser.find_by_text("Next")
    if btn and btn.visible:
        btn.first.click()
        time.sleep(3)  # wait for new rows to load
    else:
        break

browser.quit()

# Now all_rows holds scraped rows
for r in all_rows:
    print(r)


['1', 'Mayo Clinic - Rochester', 'United States', 'Rochester', 'MN']
['2', 'Cleveland Clinic', 'United States', 'Cleveland', 'OH']
['3', 'Massachusetts General Hospital', 'United States', 'Boston', 'MA']
['4', 'The Johns Hopkins Hospital', 'United States', 'Baltimore', 'MD']
['5', 'Toronto General - University Health Network', 'Canada', 'Toronto', '']
['6', 'Karolinska Universitetssjukhuset', 'Sweden', 'Solna', '']
['7', 'Charit? - Universit?tsmedizin Berlin', 'Germany', 'Berlin', '']
['8', 'AP-HP - H?pital Universitaire Piti? Salp?tri?re', 'France', 'Paris', '']
['9', 'Singapore General Hospital', 'Singapore', 'Singapore', '']
['10', 'UCLA Health - Ronald Reagan Medical Center', 'United States', 'Los Angeles', 'CA']
['11', 'Sheba Medical Center', 'Israel', 'Ramat Gan', '']
['12', 'Universit?tsspital Z?rich', 'Switzerland', 'Z?rich', '']
['13', 'Universit?tsklinikum Heidelberg', 'Germany', 'Heidelberg', '']
['14', 'Centre Hospitalier Universitaire Vaudois', 'Switzerland', 'Lausanne', '

In [8]:
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import time

# Setup using Service (compatible with new Selenium versions)
service = Service(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', service=service, headless=False)

url = "https://rankings.newsweek.com/worlds-best-hospitals-2023"
browser.visit(url)
time.sleep(5)

all_data = []
headers = []

for i in range(5):  # Adjust number of pages
    print(f"Scraping page {i+1}...")

    # Parse with BeautifulSoup
    soup = BeautifulSoup(browser.html, "html.parser")
    table = soup.find("table")

    if i == 0:
        headers = [th.get_text(strip=True) for th in table.find("thead").find_all("th")]

    rows = table.find("tbody").find_all("tr")
    for row in rows:
        cells = [cell.get_text(strip=True) for cell in row.find_all("td")]
        all_data.append(cells)

    # Click next page
    next_button = browser.find_by_css('button[aria-label="Go to next page"]')
    if next_button and next_button.visible:
        next_button.click()
        time.sleep(3)
    else:
        break

browser.quit()

# Create DataFrame
df = pd.DataFrame(all_data, columns=headers)

# Show first few rows
print(df.head())


Scraping page 1...
  Rank                             Publication name        Country       City  \
0    1                      Mayo Clinic - Rochester  United States  Rochester   
1    2                             Cleveland Clinic  United States  Cleveland   
2    3               Massachusetts General Hospital  United States     Boston   
3    4                   The Johns Hopkins Hospital  United States  Baltimore   
4    5  Toronto General - University Health Network         Canada    Toronto   

  State (US only)  
0              MN  
1              OH  
2              MA  
3              MD  
4                  


In [10]:
len(df)

50

In [11]:
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up browser (Chrome with webdriver-manager)
service = Service(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', service=service, headless=False)

# Visit Newsweek's hospital rankings page
url = "https://rankings.newsweek.com/worlds-best-hospitals-2023"
browser.visit(url)
time.sleep(5)  # allow JavaScript to render

all_data = []
headers = []

# Loop through the first 5 pages
for i in range(5):
    print(f"Scraping page {i + 1}...")

    # Parse page content
    soup = BeautifulSoup(browser.html, "html.parser")
    table = soup.find("table")

    # Grab headers only on first page
    if i == 0:
        headers = [th.get_text(strip=True) for th in table.find("thead").find_all("th")]

    # Grab rows
    rows = table.find("tbody").find_all("tr")
    for row in rows:
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        all_data.append(cells)

    # Click "Next page" button using correct selector
    next_button = browser.find_by_css('button[aria-label="Next page"]')
    if next_button and next_button.visible:
        next_button.click()
        time.sleep(3)  # wait for new page to load
    else:
        print("Next button not found or no longer available.")
        break

browser.quit()

# Create DataFrame
df = pd.DataFrame(all_data, columns=headers)

# Display preview
print(df.head())


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
  Rank                             Publication name        Country       City  \
0    1                      Mayo Clinic - Rochester  United States  Rochester   
1    2                             Cleveland Clinic  United States  Cleveland   
2    3               Massachusetts General Hospital  United States     Boston   
3    4                   The Johns Hopkins Hospital  United States  Baltimore   
4    5  Toronto General - University Health Network         Canada    Toronto   

  State (US only)  
0              MN  
1              OH  
2              MA  
3              MD  
4                  


In [12]:
df

Unnamed: 0,Rank,Publication name,Country,City,State (US only)
0,1,Mayo Clinic - Rochester,United States,Rochester,MN
1,2,Cleveland Clinic,United States,Cleveland,OH
2,3,Massachusetts General Hospital,United States,Boston,MA
3,4,The Johns Hopkins Hospital,United States,Baltimore,MD
4,5,Toronto General - University Health Network,Canada,Toronto,
...,...,...,...,...,...
245,246,Yokohama Municipal Citizen's Hospital,Japan,Yokohama,
246,247,Chungnam National University Hospital,South Korea,Daejeon,
247,248,Centro M?dico ABC Campus Santa Fe,Mexico,Ciudad de M?xico,
248,249,National Taiwan University Hospital,Taiwan,Taipei City,
