In [1]:
import pandas as pd
import numpy as np
import json
import re

In [2]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup



### EXPLORING THE DATA

In [3]:
# Raw dump of one ranking year, stored as JSON.
filename = '../data_raw/qs_data/2022.txt'
# Pin the encoding so decoding does not depend on the platform's locale default.
# NOTE(review): assumes the raw dumps were saved as UTF-8 -- confirm.
with open(filename, encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Build a frame from the records stored under the 'data' key of the loaded JSON
df = pd.DataFrame(data=data['data'])

#### SUBSET AND RENAME COLUMNS

In [6]:
# Keep the university metadata fields plus every column whose name contains 'ind_'
sel_cols = df.columns[
    df.columns.isin(['nid', 'uni', 'region', 'location', 'city', 'overall'])
    | df.columns.str.contains('ind_')
]

In [7]:
def parse_col_name(x):
    """Turn a raw column title into a clean, title-cased column name.

    If ``x`` contains markup with a ``<div class="td-wrap">`` element, the text
    of that element is used; otherwise ``x`` itself is used. In both cases the
    text is stripped of surrounding whitespace and title-cased.

    Parameters
    ----------
    x : str
        Raw column title, either plain text or an HTML fragment.

    Returns
    -------
    str
        The cleaned, title-cased column name.
    """
    # A string without '<' cannot contain a tag, so skip the HTML parser entirely.
    if '<' in x:
        # Name the parser explicitly: without it bs4 emits GuessedAtParserWarning
        # and picks whichever parser happens to be installed.
        soup = BeautifulSoup(x, 'html.parser')
        tag = soup.find('div', class_="td-wrap")
        if tag is not None:
            return tag.text.strip().title()

    return x.strip().title()

In [12]:
# Map raw column keys to readable titles; the mapping is seeded with 'nid'
data_columns = {'nid': 'NID'}
data_columns.update(
    (col_raw['data'], parse_col_name(col_raw['title']))
    for col_raw in data['columns']
    if col_raw['data'] in sel_cols
)

In [13]:
# Display the raw-key -> readable-name mapping
data_columns

{'nid': 'NID',
 'region': 'Region',
 'location': 'Location',
 'uni': 'University',
 'overall': 'Overall Score',
 'city': 'City',
 'ind_14': 'International Students Ratio',
 'ind_18': 'International Faculty Ratio',
 'ind_36': 'Faculty Student Ratio',
 'ind_73': 'Citations Per Faculty',
 'ind_76': 'Academic Reputation',
 'ind_77': 'Employer Reputation'}

In [14]:
# Keep only the selected columns and swap the raw keys for their readable names
sub_data = df[sel_cols]
sub_data.columns = [data_columns[raw_key] for raw_key in sub_data.columns]

In [16]:
# Peek at the first rows; the values are still wrapped in HTML at this point
sub_data.head(n=5)

Unnamed: 0,City,International Students Ratio,International Faculty Ratio,Faculty Student Ratio,Citations Per Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University
0,Cambridge,"<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...",United States,294850,"<div class=""td-wrap""><div class=""td-wrap-in"">1...",North America,"<div class=""td-wrap""><div class=""td-wrap-in""><..."
1,Oxford,"<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...",United Kingdom,294654,"<div class=""td-wrap""><div class=""td-wrap-in"">9...",Europe,"<div class=""td-wrap""><div class=""td-wrap-in""><..."
2,Stanford,"<div class=""td-wrap""><div class=""td-wrap-in"">6...","<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...",United States,297282,"<div class=""td-wrap""><div class=""td-wrap-in"">9...",North America,"<div class=""td-wrap""><div class=""td-wrap-in""><..."
3,Cambridge,"<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...",United Kingdom,294561,"<div class=""td-wrap""><div class=""td-wrap-in"">9...",Europe,"<div class=""td-wrap""><div class=""td-wrap-in""><..."
4,Cambridge,"<div class=""td-wrap""><div class=""td-wrap-in"">7...","<div class=""td-wrap""><div class=""td-wrap-in"">8...","<div class=""td-wrap""><div class=""td-wrap-in"">9...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...","<div class=""td-wrap""><div class=""td-wrap-in"">1...",United States,294270,"<div class=""td-wrap""><div class=""td-wrap-in"">9...",North America,"<div class=""td-wrap""><div class=""td-wrap-in""><..."


#### PARSE DATA VALUES

In [17]:
def parse_value(x):
    """Extract the text of a table cell and normalise it to upper case.

    If ``x`` contains markup with a ``<div class="td-wrap-in">`` element, the
    text of that element is used; otherwise ``x`` itself is used. The text is
    stripped of surrounding whitespace and upper-cased.

    Parameters
    ----------
    x : object
        Cell value. Strings are parsed; any other value (numbers, ``None``,
        NaN) is returned unchanged.

    Returns
    -------
    object
        The cleaned, upper-cased string, or ``x`` itself when it is not a string.
    """
    # Non-string cells carry no markup and have no .strip(); pass them through
    # instead of raising when this is mapped over a mixed-type frame.
    if not isinstance(x, str):
        return x

    # A string without '<' cannot contain a tag, so skip the HTML parser entirely.
    if '<' in x:
        # Name the parser explicitly: without it bs4 emits GuessedAtParserWarning
        # and picks whichever parser happens to be installed.
        soup = BeautifulSoup(x, 'html.parser')
        tag = soup.find('div', class_="td-wrap-in")
        if tag is not None:
            return tag.text.strip().upper()

    return x.strip().upper()

In [18]:
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use whichever element-wise method this pandas version provides.
elementwise = sub_data.map if hasattr(pd.DataFrame, 'map') else sub_data.applymap
sub_data = elementwise(parse_value)

In [19]:
# Check the parsed values on the first three rows
sub_data.head(n=3)

Unnamed: 0,City,International Students Ratio,International Faculty Ratio,Faculty Student Ratio,Citations Per Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University
0,CAMBRIDGE,91.4,100.0,100,100.0,100,100,UNITED STATES,294850,100.0,NORTH AMERICA,MASSACHUSETTS INSTITUTE OF TECHNOLOGY (MIT)
1,OXFORD,98.5,99.5,100,96.0,100,100,UNITED KINGDOM,294654,99.5,EUROPE,UNIVERSITY OF OXFORD
2,STANFORD,67.0,99.8,100,99.9,100,100,UNITED STATES,297282,98.7,NORTH AMERICA,STANFORD UNIVERSITY


### PROCEDURE FOR BUILDING THE DATASET

In [20]:
def parse_col_name(x):
    """Turn a raw column title into a clean, title-cased column name.

    If ``x`` contains markup with a ``<div class="td-wrap">`` element, the text
    of that element is used; otherwise ``x`` itself is used. In both cases the
    text is stripped of surrounding whitespace and title-cased.

    Parameters
    ----------
    x : str
        Raw column title, either plain text or an HTML fragment.

    Returns
    -------
    str
        The cleaned, title-cased column name.
    """
    # A string without '<' cannot contain a tag, so skip the HTML parser entirely.
    if '<' in x:
        # Name the parser explicitly: without it bs4 emits GuessedAtParserWarning
        # and picks whichever parser happens to be installed.
        soup = BeautifulSoup(x, 'html.parser')
        tag = soup.find('div', class_="td-wrap")
        if tag is not None:
            return tag.text.strip().title()

    return x.strip().title()

def parse_value(x):
    """Extract the text of a table cell and normalise it to upper case.

    If ``x`` contains markup with a ``<div class="td-wrap-in">`` element, the
    text of that element is used; otherwise ``x`` itself is used. The text is
    stripped of surrounding whitespace and upper-cased.

    Parameters
    ----------
    x : object
        Cell value. Strings are parsed; any other value (numbers, ``None``,
        NaN) is returned unchanged.

    Returns
    -------
    object
        The cleaned, upper-cased string, or ``x`` itself when it is not a string.
    """
    # Non-string cells carry no markup and have no .strip(); pass them through
    # instead of raising when this is mapped over a mixed-type frame.
    if not isinstance(x, str):
        return x

    # A string without '<' cannot contain a tag, so skip the HTML parser entirely.
    if '<' in x:
        # Name the parser explicitly: without it bs4 emits GuessedAtParserWarning
        # and picks whichever parser happens to be installed.
        soup = BeautifulSoup(x, 'html.parser')
        tag = soup.find('div', class_="td-wrap-in")
        if tag is not None:
            return tag.text.strip().upper()

    return x.strip().upper()

In [21]:
# Ranking editions to process: 2012, then 2014 through 2022.
# NOTE(review): 2013 is skipped -- presumably no raw file exists for it; confirm.
years = [2012, *range(2014, 2023)]

In [24]:
# One parsed frame per ranking year, concatenated in a later cell.
dataset = []

for year in years:

    filename = '../data_raw/qs_data/{year}.txt'.format(year=year)

    print('Processing:', filename)

    # Load data. Pin the encoding so decoding does not depend on the platform's
    # locale default.
    # NOTE(review): assumes the raw dumps were saved as UTF-8 -- confirm.
    with open(filename, encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data['data'])

    # Select columns with data indicators and university data
    sel_cols = df.columns[
        df.columns.str.contains('ind_') | df.columns.isin(['nid', 'uni', 'region', 'location', 'city', 'overall'])
    ]

    # Columns dictionary: raw column key -> readable title (seeded with 'nid')
    data_columns = {
        'nid': 'NID'
    }
    for col_raw in data['columns']:
        if col_raw['data'] in sel_cols:
            data_columns[col_raw['data']] = parse_col_name(col_raw['title'])

    # Subset data; take an explicit copy so the renames below act on an
    # independent frame rather than on a selection of `df`
    sub_data = df[sel_cols].copy()
    sub_data.columns = sub_data.columns.map(lambda x: data_columns[x])

    # Unify column names across years: spell out '&' and drop the 'Ratio' suffix
    sub_data.columns = sub_data.columns.map(
        lambda x: x.replace('&', 'And').replace('Ratio', '').strip()
    )

    # Parse data. DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever element-wise method this pandas version provides.
    elementwise = sub_data.map if hasattr(pd.DataFrame, 'map') else sub_data.applymap
    sub_data = elementwise(parse_value)

    # Tag every row with its ranking year (added after parsing, so it stays an int)
    sub_data['Year'] = year

    dataset.append(sub_data)

Processing: qs_data/2012.txt
Processing: qs_data/2014.txt
Processing: qs_data/2015.txt
Processing: qs_data/2016.txt
Processing: qs_data/2017.txt
Processing: qs_data/2018.txt
Processing: qs_data/2019.txt
Processing: qs_data/2020.txt
Processing: qs_data/2021.txt
Processing: qs_data/2022.txt


In [25]:
# First processed year in the list
dataset[0].head(n=5)

Unnamed: 0,City,Natural Sciences,Life Sciences And Medicine,Engineering And Technology,Arts And Humanities,International Students,Social Sciences And Management,Citations Per Faculty,Faculty Student,International Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University,Year
0,CAMBRIDGE,95.06,60.89,100.0,46.27,96.5,64.62,99.3,99.9,86.4,100.0,100.0,UNITED STATES,294850,100.0,NORTH AMERICA,MASSACHUSETTS INSTITUTE OF TECHNOLOGY (MIT),2012
1,CAMBRIDGE,98.5,82.75,66.51,95.21,96.0,72.97,97.0,98.3,98.2,100.0,100.0,UNITED KINGDOM,294561,99.78,EUROPE,UNIVERSITY OF CAMBRIDGE,2012
2,CAMBRIDGE,100.0,100.0,46.14,100.0,78.4,100.0,100.0,98.6,90.0,100.0,100.0,UNITED STATES,294270,99.15,NORTH AMERICA,HARVARD UNIVERSITY,2012
3,LONDON,40.24,49.02,33.13,54.56,99.9,42.15,94.0,98.4,96.3,99.6,95.6,UNITED KINGDOM,294014,98.69,EUROPE,UCL,2012
4,OXFORD,89.84,78.11,52.7,97.66,95.8,76.47,89.4,100.0,98.0,100.0,100.0,UNITED KINGDOM,294654,98.57,EUROPE,UNIVERSITY OF OXFORD,2012


In [26]:
# Fifth processed year in the list
dataset[4].head(n=5)

Unnamed: 0,City,International Students,Citations Per Faculty,Faculty Student,International Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University,Year
0,CAMBRIDGE,96.6,99.9,100.0,100.0,100.0,100.0,UNITED STATES,294850,100.0,NORTH AMERICA,MASSACHUSETTS INSTITUTE OF TECHNOLOGY (MIT),2017
1,STANFORD,74.0,99.7,100.0,99.7,100.0,100.0,UNITED STATES,297282,98.7,NORTH AMERICA,STANFORD UNIVERSITY,2017
2,CAMBRIDGE,70.4,100.0,98.5,100.0,100.0,100.0,UNITED STATES,294270,98.3,NORTH AMERICA,HARVARD UNIVERSITY,2017
3,CAMBRIDGE,97.8,86.5,100.0,97.6,100.0,100.0,UNITED KINGDOM,294561,97.2,EUROPE,UNIVERSITY OF CAMBRIDGE,2017
4,PASADENA,87.7,100.0,100.0,91.2,99.4,80.7,UNITED STATES,294562,96.9,NORTH AMERICA,CALIFORNIA INSTITUTE OF TECHNOLOGY (CALTECH),2017


In [27]:
# Eighth processed year in the list
dataset[7].head(n=5)

Unnamed: 0,City,International Students,International Faculty,Faculty Student,Citations Per Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University,Year
0,CAMBRIDGE,94.1,100.0,100.0,99.8,100.0,100.0,UNITED STATES,294850,100.0,NORTH AMERICA,MASSACHUSETTS INSTITUTE OF TECHNOLOGY (MIT),2020
1,STANFORD,67.7,99.8,100.0,98.6,100.0,100.0,UNITED STATES,297282,98.4,NORTH AMERICA,STANFORD UNIVERSITY,2020
2,CAMBRIDGE,62.2,86.3,98.7,99.6,100.0,100.0,UNITED STATES,294270,97.4,NORTH AMERICA,HARVARD UNIVERSITY,2020
3,OXFORD,98.5,99.7,100.0,84.7,100.0,100.0,UNITED KINGDOM,294654,97.2,EUROPE,UNIVERSITY OF OXFORD,2020
4,PASADENA,87.3,99.4,100.0,100.0,97.8,81.2,UNITED STATES,294562,96.9,NORTH AMERICA,CALIFORNIA INSTITUTE OF TECHNOLOGY (CALTECH),2020


In [28]:
# Stack the per-year frames into one table with a fresh 0..n-1 index;
# columns are kept in order of first appearance (no sorting)
dataset_df = pd.concat(objs=dataset, sort=False, ignore_index=True)

In [29]:
# First three rows of the combined table
dataset_df.head(n=3)

Unnamed: 0,City,Natural Sciences,Life Sciences And Medicine,Engineering And Technology,Arts And Humanities,International Students,Social Sciences And Management,Citations Per Faculty,Faculty Student,International Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University,Year
0,CAMBRIDGE,95.06,60.89,100.0,46.27,96.5,64.62,99.3,99.9,86.4,100,100,UNITED STATES,294850,100.0,NORTH AMERICA,MASSACHUSETTS INSTITUTE OF TECHNOLOGY (MIT),2012
1,CAMBRIDGE,98.5,82.75,66.51,95.21,96.0,72.97,97.0,98.3,98.2,100,100,UNITED KINGDOM,294561,99.78,EUROPE,UNIVERSITY OF CAMBRIDGE,2012
2,CAMBRIDGE,100.0,100.0,46.14,100.0,78.4,100.0,100.0,98.6,90.0,100,100,UNITED STATES,294270,99.15,NORTH AMERICA,HARVARD UNIVERSITY,2012


In [30]:
# Last three rows of the combined table
dataset_df.tail(n=3)

Unnamed: 0,City,Natural Sciences,Life Sciences And Medicine,Engineering And Technology,Arts And Humanities,International Students,Social Sciences And Management,Citations Per Faculty,Faculty Student,International Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University,Year
10044,IRBID,,,,,11.0,,2.2,1.9,1.2,9.3,10.2,JORDAN,297384,,ASIA,YARMOUK UNIVERSITY,2022
10045,ISTANBUL,,,,,8.9,,5.4,6.1,1.1,5.6,10.5,TURKEY,309595,,ASIA,YILDIZ TECHNICAL UNIVERSITY,2022
10046,ZAGAZIG,,,,,1.9,,2.2,3.4,1.0,4.7,2.8,EGYPT,297612,,AFRICA,ZAGAZIG UNIVERSITY,2022


In [35]:
# All rows of a single institution across years (NID values are strings)
dataset_df[dataset_df['NID'] == '294014']

Unnamed: 0,City,Natural Sciences,Life Sciences And Medicine,Engineering And Technology,Arts And Humanities,International Students,Social Sciences And Management,Citations Per Faculty,Faculty Student,International Faculty,Academic Reputation,Employer Reputation,Location,NID,Overall Score,Region,University,Year
3,LONDON,40.24,49.02,33.13,54.56,99.9,42.15,94.0,98.4,96.3,99.6,95.6,UNITED KINGDOM,294014,98.69,EUROPE,UCL,2012
869,LONDON,,85.4,75.9,85.6,100.0,81.2,95.6,98.9,96.5,99.9,98.7,UNITED KINGDOM,294014,98.9,EUROPE,UCL,2014
1771,LONDON,,89.2,81.7,84.4,100.0,81.8,97.4,99.0,96.8,99.9,99.4,UNITED KINGDOM,294014,99.2,EUROPE,UCL,2015
2657,LONDON,,87.6,80.4,84.7,99.9,81.9,88.0,98.6,95.5,99.9,99.8,UNITED KINGDOM,294014,97.2,EUROPE,UCL,2016
3571,LONDON,,,,,100.0,,79.1,98.7,99.1,99.8,99.3,UNITED KINGDOM,294014,95.6,EUROPE,UCL,2017
4504,LONDON,,,,,100.0,,74.7,99.1,96.6,99.7,99.5,UNITED KINGDOM,294014,94.6,EUROPE,UCL,2018
5484,LONDON,,,,,100.0,,66.2,99.2,98.7,99.3,99.2,UNITED KINGDOM,294014,92.9,EUROPE,UCL,2019
6500,LONDON,,,,,100.0,,76.7,98.1,99.1,99.3,98.7,UNITED KINGDOM,294014,94.8,EUROPE,UCL,2020
7571,LONDON,,,,,100.0,,65.4,98.4,99.3,99.4,98.3,UNITED KINGDOM,294014,92.9,EUROPE,UCL,2021
8755,LONDON,,,,,100.0,,78.0,99.0,99.5,99.4,98.9,UNITED KINGDOM,294014,95.4,EUROPE,UCL,2022


In [31]:
# Write the combined table to disk; the row index is written as the first column
dataset_df.to_csv(path_or_buf='../data_raw/QS_World_Rankings.csv')