In [12]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import time
import csv
import pandas as pd
import numpy as np
from sqlalchemy import create_engine


In [13]:
# Set URL and initiate beautiful soup
# `url` is a template: the `{}` placeholder takes the results page number.
url = 'https://www.findaphd.com/phds/united-kingdom/?g0w900&PG={}'
# Request the first page explicitly rather than sending the literal `{}`
# placeholder to the server; the timeout keeps a stalled connection from
# hanging the notebook.
response = requests.get(url.format(1), timeout=30)
# Fail here with a clear HTTP error instead of parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')


In [14]:

# Retrieve PhD data from URL
def _clean_text(element):
    """Return the element's text with non-breaking spaces replaced and ends stripped.

    Returns "N/A" when ``element`` is None (i.e. the selector matched nothing).
    """
    if element is None:
        return "N/A"
    return element.text.replace('\xa0', ' ').strip()


def get_data_from_url(url):
    """Scrape the listing cards found on a single results page.

    Parameters
    ----------
    url : str
        Fully formatted URL of the page to fetch.

    Returns
    -------
    list of dict or None
        One dict per ``div`` with class ``w-100 card shadow-sm p-4``, holding
        the keys ``project_title``, ``project_url``, ``university_name``,
        ``supervisors_info`` and ``applications_info``. Any field whose
        element or attribute is absent from the card is set to ``"N/A"``.
        ``None`` is returned when the response status code is not 200.

    Notes
    -----
    Exceptions raised by ``requests.get`` (including timeouts) are not caught
    here and propagate to the caller.
    """
    # The timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    data = []
    for div in soup.find_all('div', class_='w-100 card shadow-sm p-4'):
        # select_one returns None when nothing matches, so every lookup is
        # checked before use; a card with a missing element yields "N/A"
        # for that field instead of aborting the scrape.
        title_link = div.select_one('a.h4.text-dark')
        university = div.select_one('span.phd-result__dept-inst--title')
        supervisors = div.select_one('a.phd-result__key-info.super')
        applications = div.select_one('a.hoverTitle.subButton.badge.text-wrap.badge-light.card-badge.p-2.m-1.font-weight-light')

        if title_link is not None:
            # .get avoids a KeyError when the anchor lacks the attribute.
            project_url = title_link.get('href', "N/A")
            project_title = title_link.get('title', "N/A")
        else:
            project_url = "N/A"
            project_title = "N/A"

        # Create a dictionary of data
        data.append({
            'project_title': project_title,
            'project_url': project_url,
            # University text is kept unmodified, as before.
            'university_name': university.text if university is not None else "N/A",
            'supervisors_info': _clean_text(supervisors),
            'applications_info': _clean_text(applications),
        })
    return data


In [15]:
# Loop through all pages
def scrape_multiple_pages(start_page, end_page,
                          base_url='https://www.findaphd.com/phds/united-kingdom/?g0w900&PG={}',
                          delay=1):
    """Scrape a contiguous range of results pages and pool their listings.

    Parameters
    ----------
    start_page : int
        First page number to fetch.
    end_page : int
        Last page number to fetch (inclusive).
    base_url : str, optional
        URL template with one ``{}`` placeholder for the page number.
        Defaults to the UK FindAPhD search used throughout this notebook.
    delay : int or float, optional
        Seconds to wait between consecutive requests. Defaults to 1.

    Returns
    -------
    list of dict
        The records returned by ``get_data_from_url`` for every page that
        produced data, in page order.
    """
    all_data = []
    for page_number in range(start_page, end_page + 1):
        data = get_data_from_url(base_url.format(page_number))
        if data:
            all_data.extend(data)
            print(f"page {page_number} and {len(data)}")
        else:
            # Covers both a non-200 response (None) and a page with no cards
            # ([]), so skipped pages are visible in the log.
            print(f"page {page_number} returned no data")

        # Pause between requests to avoid getting shut out by servers; no
        # pause is needed once the final page has been fetched.
        if page_number < end_page:
            time.sleep(delay)

    return all_data


In [16]:
# Find last page number
# NOTE(review): the "on page-item" class may mark the currently selected page
# rather than the final one - confirm against the live pagination markup.
page_item = soup.find("li", class_="on page-item")
# find() returns None when nothing matches, so guard before chaining.
a_element = page_item.find("a", class_="page-link text-dark") if page_item is not None else None

if a_element is None:
    # Stop here with a clear message; otherwise later cells fail with a
    # NameError because end_page_number was never assigned.
    raise ValueError("Pagination element not found; cannot determine end_page_number.")

end_page_number = int(a_element.text)


In [17]:
# Page range to scrape, starting from the first results page
start_page_number = 1
# Every 30th page number from the start page onward, followed by the final page number
page_numbers = list(range(start_page_number, end_page_number + 1, 30))
page_numbers += [end_page_number]


In [18]:

# Scrape data! (N.B. we ran later tests using subsets of data, hence the results below. If you use the code now, you will receive all entries from FindAPhD)
# scrape_multiple_pages treats end_page as inclusive, so pass the last page
# number itself; adding 1 would request a page beyond the end.
result_data = scrape_multiple_pages(start_page_number, end_page_number)


page 1 and 15
page 2 and 15
page 3 and 15
page 4 and 15
page 5 and 15


In [19]:
# Build a DataFrame with one row per scraped record
df = pd.DataFrame(data=result_data)

# Write the table to disk as CSV, leaving out the index column
df.to_csv(path_or_buf='output.csv', index=False)


In [20]:
# Inspect dataframe (N.B. we ran later tests using subsets of data, hence the results below. If you use the code now, you will receive all entries from FindAPhD)
# The summary below reports count / unique / top / freq for each column.
df.describe()


Unnamed: 0,project_title,project_url,university_name,supervisors_info,applications_info
count,75,75,75,75,75
unique,75,75,13,59,12
top,PhD Research Project: Effect of accelerated ca...,/phds/project/effect-of-accelerated-carbonatio...,Kingston University,Supervisor: Dr I Centre for Additive Manufacture,Year round applications
freq,1,1,55,4,62
