In [1]:
! pip install beautifulsoup4
! pip install html5lib



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [3]:
# URL of the web page you want to extract tables from
url = 'https://intranet.londonmet.ac.uk/course-catalogue/courses/search-results'

# Send an HTTP GET request to the URL
response = requests.get(url)

In [4]:
# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the first table element in the parsed HTML
    table = soup.find('table')
    
    # Prepare to collect rows of data
    data = []
    
    if table:
        # Find all rows in the table
        rows = table.find_all('tr')
        
        # Get headers from the first row
        headers = [header.text.strip() for header in rows[0].find_all('th')]
        
        # Iterate over all rows (excluding the header row)
        for row in rows[1:]:
            cells = row.find_all('td')
            row_data = {}
            for index, cell in enumerate(cells):
                # Check if the cell contains an 'a' tag (hyperlink)
                a_tag = cell.find('a')
                if a_tag:
                    # Get the text and URL from the 'a' tag
                    row_data[headers[index]] = {'text': a_tag.text.strip(), 'url': a_tag['href']}
                else:
                    # Get text if no 'a' tag is present
                    row_data[headers[index]] = {'text': cell.text.strip()}
            data.append(row_data)
        
        # Convert the list of dictionaries into a DataFrame
        df = pd.DataFrame(data)
        display(df)
    else:
        print("No tables found in the HTML.")
else:
    print(f"Failed to retrieve data from URL. Status code: {response.status_code}")


Unnamed: 0,Code,Full title,Type,Info
0,"{'text': 'APARAPRB', 'url': '/course-catalogue...",{'text': 'Architect Apprenticeship (Level 7) (...,{'text': 'Postgraduate'},{'text': ''}
1,{'text': 'APMAAEPP'},{'text': 'Architecture Apprenticeship (RIBA 2 ...,{'text': 'Postgraduate'},{'text': ''}
2,{'text': 'APPAFEWL'},{'text': 'PGCE Primary Apprenticeship 5-11 (We...,{'text': 'Postgraduate'},{'text': ''}
3,{'text': 'APPAFSWL'},{'text': 'PGCE Primary Apprenticeship 5-11 SEN...,{'text': 'Postgraduate'},{'text': ''}
4,{'text': 'APPAHTSA'},{'text': 'PGCE Primary Apprenticeship (Hackney...,{'text': 'Postgraduate'},{'text': ''}
...,...,...,...,...
382,{'text': 'UOIFPSCI'},{'text': 'International Foundation Programme S...,{'text': 'Undergraduate'},{'text': ''}
383,{'text': 'UOIFPSSH'},{'text': 'International Foundation Programme S...,{'text': 'Undergraduate'},{'text': ''}
384,{'text': 'USCIMCPM'},{'text': 'CIM Certificate in Professional Mark...,{'text': 'Undergraduate'},{'text': ''}
385,{'text': 'USICMCM3'},{'text': 'CICM Diploma in Credit Management Le...,{'text': 'Undergraduate'},{'text': ''}


In [5]:
# Expand all the json cells in the DataFrame
df_expanded = df.applymap(lambda x: x['text'] if isinstance(x, dict) else x)
df_expanded['URL'] = df['Full title'].apply(lambda x: np.nan if not isinstance(x, dict) or 'url' not in x else x['url'])
df_expanded.columns = ['course_code', 'full_title', 'type', 'info', 'url']
display(df_expanded)

  df_expanded = df.applymap(lambda x: x['text'] if isinstance(x, dict) else x)


Unnamed: 0,course_code,full_title,type,info,url
0,APARAPRB,Architect Apprenticeship (Level 7) (RIBA Part ...,Postgraduate,,/course-catalogue/course-specifications/2023-2...
1,APMAAEPP,Architecture Apprenticeship (RIBA 2 and 3) - M...,Postgraduate,,
2,APPAFEWL,PGCE Primary Apprenticeship 5-11 (West London ...,Postgraduate,,
3,APPAFSWL,PGCE Primary Apprenticeship 5-11 SEND (West Lo...,Postgraduate,,
4,APPAHTSA,PGCE Primary Apprenticeship (Hackney Teaching ...,Postgraduate,,
...,...,...,...,...,...
382,UOIFPSCI,International Foundation Programme Sciences,Undergraduate,,
383,UOIFPSSH,International Foundation Programme Social Scie...,Undergraduate,,
384,USCIMCPM,CIM Certificate in Professional Marketing,Undergraduate,,
385,USICMCM3,CICM Diploma in Credit Management Level 3,Undergraduate,,


In [6]:
df_expanded.to_csv('courses.csv', index=False)

In [7]:
soup = BeautifulSoup(response.text, 'html.parser')
results_header = soup.find('div', {'id':'results_header'}).text.replace('\n\n', '\n').replace('\n\n', '\n').strip()

with open('scrapper_info.txt', 'w') as file:
  file.write(results_header)

-----

In [22]:
base_url = 'https://intranet.londonmet.ac.uk'
data = []

def find_entry_requirements(sp):
    for div in sp.find_all('div', {'class':"panel-default"}):
        if div.find('h3').text == 'Entry requirements':
            return div.find('div', {'class':'panel-body'})


for url in df_expanded[df_expanded.url.notnull()]['url']:
    response = requests.get(base_url + url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser').find('div', {'id':'course_specification_and_structures'})
        data.append({
            'url': url,
            'title' : BeautifulSoup(response.text, 'html.parser').find('h1').text.strip(),
            'course_requirements': soup.find('table', {'class':'specifications'}),
            'entry_requirements' : find_entry_requirements(soup)
        })
        
    else:
        data.append('Failed to retrieve course info.')

In [23]:
course_specifications = pd.DataFrame(data)
course_specifications.to_csv('course_specifications.csv', index=False)

---------

In [46]:
response = requests.get('https://www.londonmet.ac.uk/international/applying/entry-requirements-by-country/') 

In [72]:
entry_requirements_country = []

data = BeautifulSoup(response.text)

for country in data.find_all('div', {'class':'col-md-6'}):
    if country.find('h2'):
        country_url = "https://www.londonmet.ac.uk" + country.find('a')['href']
        
        new_soup = BeautifulSoup(requests.get(country_url).text).find_all('table')
        
        entry_requirements_country.append({
            'country': country.find('h2').text,
            'url': country_url,
            'academic' : new_soup[0],
            'mathematics and english' : new_soup[1],
        })

df = pd.DataFrame(entry_requirements_country)
df.to_csv('entry_requirements_country.csv', index=False)

------

In [74]:
! pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.3-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.3-py2.py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.3/251.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.3


In [77]:
data = pd.read_excel("2024-tariff-tables-march-24.xlsx", sheet_name="TARIFF POINTS BENCHMARK", header=9)
data

Unnamed: 0,Qualification Title,Awarding Body,GLH,Size band,Grade,Grade Band,TARIFF POINTS,Qualification offered in,Qualification Type,Awarding Body full title
0,A Level (Double Award) and Advanced VCE (Doub...,Various,720,(4+4),A*A*,"(14,14)",112,Multiple UK nations,Qualification Type unavailable,Various
1,A Level (Double Award) and Advanced VCE (Doub...,Various,720,(4+4),A*A,"(14,12)",104,Multiple UK nations,Qualification Type unavailable,Various
2,A Level (Double Award) and Advanced VCE (Doub...,Various,720,(4+4),AA,"(12,12)",96,Multiple UK nations,Qualification Type unavailable,Various
3,A Level (Double Award) and Advanced VCE (Doub...,Various,720,(4+4),AB,"(12,10)",88,Multiple UK nations,Qualification Type unavailable,Various
4,A Level (Double Award) and Advanced VCE (Doub...,Various,720,(4+4),BB,"(10,10)",80,Multiple UK nations,Qualification Type unavailable,Various
...,...,...,...,...,...,...,...,...,...,...
227,T level,Various,1000+,(4+4+4),P (A*-C),8,96,England only,Qualification Type unavailable,Various
228,T level,Various,1000+,(4+4+4),P (D or E),6,72,England only,Qualification Type unavailable,Various
229,Welsh Baccalaureate Advanced Diploma – Core (...,WJEC,180,2,P,6,12,Wales only,Qualification Type unavailable,WJEC - CBAC
230,WEW/PSE/WRE/II component,WJEC,350,4,P,9,36,Wales only,Qualification Type unavailable,WJEC - CBAC


In [78]:
data.to_csv('tariff_points.csv', index=False)