In [2]:
#import relevant packages
import requests
import pandas as pd
import csv
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

This scraping exerscise is carried out in two phases. In the first phase, I extract the names and URLs of universities in the first page of a universities ranking website. This is saved as a csv file using pandas DataFrame.Saving the file is an optional step, the generated dataframe can be used directly for the next phase of data extraction. In Phase Two, the file is read and the url of each school is accessed to extract specific details about the school

Phase One - get the contents of the first page using selenium and beautifulsoup

In [3]:
#The target page has options of displaying 10 or 25 entries on each page load. Set the page limit to the highest-25
url = "https://www.topuniversities.com/universities/?pagerlimit=[25]"
#define options for selenium
options = Options()
TIMEOUT = 5
options.add_argument("--start-maximized")  # Maximize the browser window
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")
options.add_argument("--disable-infobars")
options.add_argument("--disable-extensions")
options.add_experimental_option("prefs", 
                                {"profile.default_content_setting_values.notifications": 2 
                                })
driver = webdriver.Chrome(options=options)

# Timeout needed for Web page to render
time.sleep(TIMEOUT)

driver.get(url)
driver.implicitly_wait(10)
html_data = driver.page_source
soup = BeautifulSoup(html_data, 'html.parser')

#focus on the sections/div that contains the required information
sch = soup.find_all('div', class_='university-wrap col-lg-12')
#print(soup.prettify()) - the output is truncated because of the size
print(sch)

The chromedriver version (120.0.6099.109) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (121.0.6167.85); currently, chromedriver 121.0.6167.85 is recommended for chrome 121.*, so it is advised to delete the driver in PATH and retry


[<div class="university-wrap col-lg-12">
<div class="card" data-gtm-vis-first-on-screen309007_2357="57716" data-gtm-vis-has-fired309007_2357="1" data-gtm-vis-recent-on-screen309007_2357="57716" data-gtm-vis-total-visible-time309007_2357="100">
<div class="card-wrap">
<div class="left_img">
<div class="img-wrap overlay_dark" style="background-image: url('https://img.youtube.com/vi/fs5bgM54bMs/0.jpg')">
<div class="card-img-overlay">
<div class="recommended">Ad Feature</div>
<div class="def-img">
<div class="gallery_icn d-none" onclick="Drupal.behaviors.tu_d8.openGallerywithID('university', 296886, 'image')">
<a href="javascript:void(0)"><i aria-hidden="true" class="far fa-images"></i></a>
</div>
<div class="video_ply" onclick="Drupal.behaviors.tu_d8.openGallerywithID('university', 296886, 'video')">
<a href="javascript:void(0)"><i aria-hidden="true" class="fas fa-play"></i></a>
</div>
</div>
</div>
</div>
</div>
<div class="card-body">
<div class="uni-det">
<h2>
<a href="/universities/u

In [15]:
#create a list to hold the extracted information and proceed to extract the name and url of each school listed on the page
school_data = []

#navigate through retrieved content to extract name and url
for item in sch:
    school_data.append({
    "name": item.find('h2').find('span', class_='bold-text').get_text(),
    "url": "https://www.topuniversities.com" + item.find('h2').find('a').get('href')
    })
    
print(school_data)

#convert list to pandas dataframe
school_df = pd.DataFrame(school_data)

#write dataframe to a csv file. drop the index column
#the purpose of saving to csv file for possible re-use of the file. The dataframe can be used as it is directly for the next phase
school_df.to_csv("school_data.csv", index=False)
school_data
#school_df


[{'name': 'Universidad de Lima', 'url': 'https://www.topuniversities.com/universities/universidad-de-lima'}, {'name': 'Tecnológico de Monterrey', 'url': 'https://www.topuniversities.com/universities/tecnologico-de-monterrey'}, {'name': 'Rennes School of Business', 'url': 'https://www.topuniversities.com/universities/rennes-school-business'}, {'name': 'Alma Mater Studiorum - Università di Bologna', 'url': 'https://www.topuniversities.com/universities/alma-mater-studiorum-universita-di-bologna'}, {'name': 'emlyon business school', 'url': 'https://www.topuniversities.com/universities/emlyon-business-school'}, {'name': 'European University, Georgia', 'url': 'https://www.topuniversities.com/universities/european-university-georgia'}, {'name': 'Universität Heidelberg', 'url': 'https://www.topuniversities.com/universities/universitat-heidelberg'}, {'name': 'Penn State University: Smeal College of Business', 'url': 'https://www.topuniversities.com/universities/penn-state-university/penn-state-

[{'name': 'Universidad de Lima',
  'url': 'https://www.topuniversities.com/universities/universidad-de-lima'},
 {'name': 'Tecnológico de Monterrey',
  'url': 'https://www.topuniversities.com/universities/tecnologico-de-monterrey'},
 {'name': 'Rennes School of Business',
  'url': 'https://www.topuniversities.com/universities/rennes-school-business'},
 {'name': 'Alma Mater Studiorum - Università di Bologna',
  'url': 'https://www.topuniversities.com/universities/alma-mater-studiorum-universita-di-bologna'},
 {'name': 'emlyon business school',
  'url': 'https://www.topuniversities.com/universities/emlyon-business-school'},
 {'name': 'European University, Georgia',
  'url': 'https://www.topuniversities.com/universities/european-university-georgia'},
 {'name': 'Universität Heidelberg',
  'url': 'https://www.topuniversities.com/universities/universitat-heidelberg'},
 {'name': 'Penn State University: Smeal College of Business',
  'url': 'https://www.topuniversities.com/universities/penn-state

Phase Two - Read the saved file and use the url column to extract details of each university. Selenium is not necessary in this phase. Only BeautifulSoup is used. Alternatively, I proceeded to call the pandas dataframe generated in the last phase

In [5]:
#extract the url column of the dataframe. the colums has twenty-five entries
urls = school_df['url']
#convert pandas column to a pandas list to enable iteration
url_list = urls.tolist()
url_list


['https://www.topuniversities.com/universities/universidad-de-lima',
 'https://www.topuniversities.com/universities/tecnologico-de-monterrey',
 'https://www.topuniversities.com/universities/rennes-school-business',
 'https://www.topuniversities.com/universities/alma-mater-studiorum-universita-di-bologna',
 'https://www.topuniversities.com/universities/emlyon-business-school',
 'https://www.topuniversities.com/universities/european-university-georgia',
 'https://www.topuniversities.com/universities/universitat-heidelberg',
 'https://www.topuniversities.com/universities/penn-state-university/penn-state-university-smeal-college-business',
 'https://www.topuniversities.com/universities/london-school-economics-political-science-lse',
 'https://www.topuniversities.com/universities/universidad-europea-de-madrid',
 'https://www.topuniversities.com/universities/coventry-university-london',
 'https://www.topuniversities.com/universities/srh-berlin-university-applied-sciences',
 'https://www.topu

In [6]:
#initialize a empty python list to hold the extracted information for each of the twnty five universities
uni_data = []

#iterate over the urls in the url list
for url in url_list:
        html_text = requests.get(url).text
        soup = BeautifulSoup(html_text, 'lxml')

        #get the name of each univsersity or return an empty value when name is not available
        if soup.find(class_='univ-logo-n-name') is not None:
            name = soup.find(class_='univ-logo-n-name').find('h1').find('a').text
        else:
            name = ""
        #extract location(s) of each university
        campuses = soup.find(class_='campus-locations-outer-wrap').find_all('li', class_='campus-locations-links')
        campus_dict = {}
        for campus in campuses:
            name = campus.find('span').text.split(",")[0].strip()
            lat = campus['latitude']
            lon = campus['longitude']
            campus_dict[name] = {"latitude": lat, "longitude": lon}
        
        #extract list of undergraduate programs offered by each university
        if soup.find(id='ugtab') is not None:
            ugprog = soup.find(id='ugtab').find_all(class_='views-row')
            undergrad = []
            for program in ugprog:
                ugprog = program.find('h4').get_text(strip=True)
                undergrad.append(ugprog)
        else:
            undergrad = ''
        #extract list of masters programs offered and return empty when no master level program is offered by the university
        if soup.find(id='pgtab') is not None:    
            pgprog = soup.find(id='pgtab').find_all(class_='views-row')
            masters = []
            for program in pgprog:
                pgname = program.find('h4').get_text(strip=True)
                masters.append(pgname)
        else:
            masters = ''
        #extract list of MBA programs offered and return empty when no mba program is offered by the university
        if soup.find(id ='mbatab') is not None:
            mbaprog = soup.find(id='mbatab').find_all(class_='views-row')
            mba = []
            for program in mbaprog:
                mbaname = program.find('h4').get_text(strip=True)
                mba.append(mbaname)
        else:
            mba = ''
        #extract list of PhD programs offered and return empty when no phd program is offered by the university
        if soup.find(id ='phdtab') is not None:
            phdprog = soup.find(id='phdtab').find_all(class_='views-row')
            phd = []
            for program in phdprog:
                phdname = program.find('h4').get_text(strip=True)
                phd.append(phdname)
        else:
            phd = ''
        #extract the current raking of each university based on the available ranking systems
        if soup.find(class_='nav nav-pills rnk-list qs-ranking-p2 qs-ranking-p2-js no-scroll-bar') is not None:
            #create an empty dictionary to hold the all the current rankings for each university. A university can have zero or many ranking in different categories. the dictionary allows a key-value representation of the rankings
            current_ranking = {}
            ranking = soup.find(class_='nav nav-pills rnk-list qs-ranking-p2 qs-ranking-p2-js no-scroll-bar').find_all('li')
            for item in ranking:
                rank_system = item.find('h3').text
                rank = item.find('div', class_='latest_rank').get_text(strip=True).replace("#", "")
                #populate the dictionary
                current_ranking[rank_system] = rank
        else:
            current_ranking = ''
        
        #extract historical ranking information for years available
        if soup.find(id = 'rank-data') is not None:
            rankdata = soup.find('div', id = 'rank-data').find_all('li')
            #create a ductionary to hold the key/value pairs of the rankings; where the year is the value and the rank is the key
            rank_hist={}
            for item in rankdata:
                yr= item.text[:4]
                ranked = item.find(class_='d-rank-res').get_text(strip=True).replace("#", "")
                rank_hist[yr] = ranked
        else:
            rank_hist = ''

        #extract total school population including total students, international students and staff
        sch_population = {}
        staff_stud = soup.find_all(class_='studstaff-subsection')
        for item in staff_stud:
            description = item.find(class_='studstaff-subsection-title').text
            figure = item.find(class_='studstaff-subsection-count').text
            sch_population[description] = figure
         #extract each university website
        website = soup.find('div', class_='univ-logo-n-name').find('h1').find('a').get('href')

         #extract the first year tuition for each category of students
        tuition = {}
        fees = soup.find_all(class_='univ-subsection')
        for item in fees:
            student_type = item.find('h4', class_='univ-subsection-title').text
            amount = item.find('div', class_='univ-subsection-value').find('div').text
            tuition[student_type] = amount
        #populate the extracted details in a list named data
        data = [name, campus_dict, sch_population, undergrad, masters, mba, phd, tuition, website, current_ranking, rank_hist]
        #append the extracted details in data to the list that was created initially
        uni_data.append(data)
        
uni_data

[['Universidad de Lima',
  {'Universidad de Lima': {'latitude': '-12.0847243',
    'longitude': '-76.9710095'}},
  {'Total students': '14,445',
   'International students': '290',
   'Total faculty staff': '678'},
  ['Accounting',
   'Economics',
   'International Business',
   'Management',
   'Marketing',
   'Communication',
   'Architecture',
   'Civil Engineering',
   'Industrial Engineering',
   'Systems Engineering',
   'Law',
   'Psychology'],
  ['Master in Communication and Content Management',
   'Master of Business Administration (MBA)',
   'Master of Business Law',
   'Master of Operations and Projects Management',
   'Master of Taxation and Fiscal Policy',
   'Master’s Program in Banking and Finance',
   'Master’s Program in Innovation Management',
   'Master’s Program in Marketing and Business Management'],
  '',
  ['Doctoral Program in Business Administration'],
  {},
  'http://www.ulima.edu.pe',
  {'QS World University Rankings': '1001-1200',
   'Latin America and the Ca

In [7]:
#convert the list to a pandas DataFrame with header name for each column
uni_df = pd.DataFrame(uni_data, columns=['Name of School', 'Locations', 'Population', 'Undergraduate Programs', 'Masters Programs', 'MBA Programs', 'PhD Programs', 'First Year Tuition', 'School Website', 'Current Ranking', 'Historical Rankings'])
uni_df

Unnamed: 0,Name of School,Locations,Population,Undergraduate Programs,Masters Programs,MBA Programs,PhD Programs,First Year Tuition,School Website,Current Ranking,Historical Rankings
0,Universidad de Lima,{'Universidad de Lima': {'latitude': '-12.0847...,"{'Total students': '14,445', 'International st...","[Accounting, Economics, International Business...",[Master in Communication and Content Managemen...,,[Doctoral Program in Business Administration],{},http://www.ulima.edu.pe,"{'QS World University Rankings': '1001-1200', ...","{'2020': '801-1000', '2021': '1001+', '2022': ..."
1,Tecnológico de Monterrey,{'Tecnológico de Monterrey': {'latitude': '25....,"{'Total students': '50,441', 'International st...","[ARQ Architecture, CI Civil Engineer, LUB Bach...","[Master in Business Analytics, Master in Busin...","[EGADE - W. P. Carey Executive MBA, EGADE MBA ...","[Ph.D. in Business Administration, Ph.D. in Fi...","{'Domestic Students': '1,800 USD', 'Internatio...",https://tec.mx/en,"{'QS World University Rankings': '=184', 'QS W...","{'2012': '=306', '2014': '=279', '2015': '253'..."
2,Rennes School of Business - Paris Campus,{'Rennes School of Business - Rennes Campus': ...,{},[Bachelor in Management 1st year],"[Master in Culture, Creative and Luxury Indust...","[Executive MBA, Sustainable EMBA Paris, IMBA]","[Doctoral Program in Management (PhD), Global ...","{'Domestic Students': '-', 'International Stud...",https://www.rennes-sb.com/,{'World University Rankings - Masters in Suppl...,"{'2022': '51+', '2023': '51+', '2024': '51+'}"
3,Representacion en Buenos Aires,"{'Bologna': {'latitude': '44.4984', 'longitude...","{'Total students': '71,509', 'International st...",[Bachelor of Marketing and Economics of the Ag...,[Master in Plant and Agricultural Biotechnolog...,[Master in Business Administration],,{},http://www.unibo.it/en/,"{'QS World University Rankings': '=154', 'QS W...","{'2012': '194', '2014': '188', '2015': '=182',..."
4,emlyon business school Shanghai campus,{'emlyon business school Lyon Campus': {'latit...,{},[Bachelor of Arts in Acting & Entrepreneurship...,[MSc in Data Science & Artificial Intelligence...,"[Executive MBA, International MBA Full Time, I...",[Global DBA (Asia Track)],"{'Domestic Students': '11,500 EUR', 'Internati...",https://masters.em-lyon.com/en?utm_source=qs&u...,"{'QS WUR Ranking By Subject': '56', 'World Uni...","{'2015': '151-200', '2019': '401-450', '2020':..."
5,European University - Campus II,{'European University': {'latitude': '41.77229...,{},"[Bachelor Program in Business Administration, ...",[Integrated (Bachelor-Master) Educational Prog...,,,{},https://eu.edu.ge/en,,
6,Grabengasse 1,"{'Grabengasse 1': {'latitude': '49.4112774', '...","{'Total students': '19,632', 'International st...","[Bachelor's Degree in Anthropology, Bachelor's...","[Master's Degree in Anthropology, Master's Deg...",,,"{'Domestic Students': '151 EUR', 'Internationa...",https://www.uni-heidelberg.de/en,"{'QS World University Rankings': '=87', 'QS WU...","{'2012': '55', '2014': '50', '2015': '49', '20..."
7,University Park Campus,{'University Park Campus': {'latitude': '40.80...,{},,"[Master of Supply Chain Management, Penn State...","[Penn State Smeal MBA, Penn State Online MBA]",,{},https://www.smeal.psu.edu/,,
8,Main Campus,"{'Main Campus': {'latitude': '51.5146', 'longi...","{'Total students': '11,976', 'International st...","[BSc Accounting and Finance, BA Anthropology a...","[MSc Accounting and Finance, MSc Accounting, O...",[TRIUM Global Executive MBA],"[MRes/PhD Accounting (Accounting, Organisation...",{},http://www.lse.ac.uk/study-at-lse?utm_source=q...,"{'QS World University Rankings': '45', 'QS WUR...","{'2012': '69', '2014': '68', '2015': '=71', '2..."
9,Alcobendas Campus,{'Villaviciosa de Odón Campus': {'latitude': '...,{},[Bachelor’s Degree in Fundamentals of the Arch...,"[Master’s in Architecture, International Maste...","[MBA in Sports Management, Master in Internati...",,{},https://universidadeuropea.com/en/,"{'QS Sustainability Ranking': '1051-1100', 'Eu...",{'2024': '1051-1100'}


In [8]:
#save the extracted information as both json and csv files

#save to .csv without the index colums that is default with dataframes
uni_df.to_csv('University_Data.csv', index=False)

#there are two approaches to saving to json. the first retains the index and separate each records according to columns and appends the index as key
uni_df.to_json('University_Data.json')
#the second approach where orient parameter is defined, saves each record as a row from the dataframe
uni_df.to_json('University_Data_1.json', orient="records")

The following section is all about packaging the above step as one function that get the url list as a parameter, extracts the information needed and saves file

In [9]:
def sch_data(url_list):
    #create a header of the csv file that the information will be written
    header = ['Name of School', 'Locations', 'Population', 'Undergraduate Programs', 'Masters Programs', 'MBA Programs', 'PhD Programs', 'First Year Tuition', 'School Website', 'Current Ranking', 'Historocal Rankings']
    #create a csv file and attach the headers
    with open('School_Info.csv', 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
    #n = 1
    for url in url_list:
        html_text = requests.get(url).text
        soup = BeautifulSoup(html_text, 'lxml')

        #get the name of each univsersity
        if soup.find(class_='univ-logo-n-name') is not None:
            name = soup.find(class_='univ-logo-n-name').find('h1').find('a').text
        else:
            name = ""
        #extract location(s) of each university
        campuses = soup.find(class_='campus-locations-outer-wrap').find_all('li', class_='campus-locations-links')
        campus_dict = {}
        for campus in campuses:
            name = campus.find('span').text.split(",")[0].strip()
            lat = campus['latitude']
            lon = campus['longitude']
            campus_dict[name] = {"latitude": lat, "longitude": lon}
        
        #extract list of undergraduate programs offered by each university
        if soup.find(id='ugtab') is not None:
            ugprog = soup.find(id='ugtab').find_all(class_='views-row')
            undergrad = []
            for program in ugprog:
                ugprog = program.find('h4').get_text(strip=True)
                undergrad.append(ugprog)
        else:
            undergrad = 'na'
        #extract list of masters programs offered 
        if soup.find(id='pgtab') is not None:    
            pgprog = soup.find(id='pgtab').find_all(class_='views-row')
            masters = []
            for program in pgprog:
                pgname = program.find('h4').get_text(strip=True)
                masters.append(pgname)
        else:
            masters = 'na'
        #extract list of MBA programs offereed
        if soup.find(id ='mbatab') is not None:
            mbaprog = soup.find(id='mbatab').find_all(class_='views-row')
            mba = []
            for program in mbaprog:
                mbaname = program.find('h4').get_text(strip=True)
                mba.append(mbaname)
        else:
            mba = 'na'
        #extract list of PhD programs offered
        if soup.find(id ='phdtab') is not None:
            phdprog = soup.find(id='phdtab').find_all(class_='views-row')
            phd = []
            for program in phdprog:
                phdname = program.find('h4').get_text(strip=True)
                phd.append(phdname)
        else:
            phd = 'na'
        #extract each university website
        website = soup.find('div', class_='univ-logo-n-name').find('h1').find('a').get('href')

       #extract the current raking of each university based on the available ranking systems
        if soup.find(class_='nav nav-pills rnk-list qs-ranking-p2 qs-ranking-p2-js no-scroll-bar') is not None:
            current_ranking = {}
            ranking = soup.find(class_='nav nav-pills rnk-list qs-ranking-p2 qs-ranking-p2-js no-scroll-bar').find_all('li')
            for item in ranking:
                rank_system = item.find('h3').text
                rank = item.find('div', class_='latest_rank').get_text(strip=True).replace("#", "")
                current_ranking[rank_system] = rank
        else:
            current_ranking = ''
        #extract historical ranking information for years available
        if soup.find(id = 'rank-data') is not None:
            rankdata = soup.find('div', id = 'rank-data').find_all('li')
            rank_hist={}
            for item in rankdata:
                yr= item.text[:4]
                ranked = item.find(class_='d-rank-res').get_text(strip=True).replace("#", "")
                rank_hist[yr] = ranked
        else:
            rank_hist = 'NOT AVAIALBLE'
        #extract total school population including total students, international students and staff
        sch_population = {}
        staff_stud = soup.find_all(class_='studstaff-subsection')
        for item in staff_stud:
            description = item.find(class_='studstaff-subsection-title').text
            figure = item.find(class_='studstaff-subsection-count').text
            sch_population[description] = figure
        #extract the first year tuition for each category of students
        tuition = {}
        fees = soup.find_all(class_='univ-subsection')
        for item in fees:
            student_type = item.find('h4', class_='univ-subsection-title').text
            amount = item.find('div', class_='univ-subsection-value').find('div').text
            tuition[student_type] = amount

        data = [name, campus_dict, sch_population, undergrad, masters, mba, phd, tuition, website, current_ranking, rank_hist,]
        #
        #append the extracted information to the open csv file
        with open('School_Info.csv', 'a', newline='', encoding='UTF8') as f:
            writer = csv.writer(f)
            writer.writerow(data)
        #global n
        #n = n+1
    print('Extracted Data has been saved successfully')
    return data

Extracted Data has been saved successfully


In [10]:
#execute the function
sch_data(url_list)

Extracted Data has been saved successfully


['Les Roches Jin Jiang',
 {'Les Roches Crans-Montana': {'latitude': '46.3089972',
   'longitude': '7.4968109'},
  'Les Roches Marbella': {'latitude': '36.5045387', 'longitude': '-4.9413413'},
  'Les Roches Jin Jiang': {'latitude': '30.835243',
   'longitude': '121.519928'}},
 {},
 ['BBA in Global Hospitality Management'],
 ['Master’s in Hospitality Strategy and Digital Transformation',
  'Executive Master’s in International Hotel Management',
  'MBA in Global Hospitality Management',
  'Master’s in International Hotel Management',
  'Master’s in Marketing and Management for Luxury Tourism'],
 'na',
 'na',
 {'Domestic Students': '30,850 CHF', 'International Students': '30,850 CHF'},
 'https://lesroches.edu/',
 {'QS WUR Ranking By Subject': '4'},
 {'2020': '3', '2021': '4', '2022': '5', '2023': '4'}]