In [1]:
# Import needed libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn
import re
import pickle
%matplotlib inline

# 01. Scraping info from topuniversities.com

In [2]:
qs_url = 'https://www.topuniversities.com'

## Initial postman/parsing
Trying to get the url which contains the actual data that we want to parse. Using Postman we can see that the actual ranking data which is shown on the page is generated with a request to `rank_url`, therefore it is this
link that we'll need to GET to extract all the data we're interested in.

In [3]:
resp = requests.get(qs_url + '/university-rankings/world-university-rankings/2018')

In [4]:
start = resp.text.find('rank_url')
start = resp.text.find('http', start)
stop = resp.text.find('.txt', start)
qs_data_url = resp.text[start:stop+len('.txt')]
print(qs_data_url)

https:\/\/www.topuniversities.com\/sites\/default\/files\/qs-rankings-data\/357051.txt


We've still got all the escape characters, in this case backslashes, so we'll have to replace them:

In [5]:
qs_data_url = qs_data_url.replace('\\' , '')
print(qs_data_url)

https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt


## Scraping the main data and putting everything into a `DataFrame`
Scraping everything that is contained in the `rank_url`. This is the majority of what we are interested in, the faculty and student data are contained on another page that is specific to each university. This will be scraped afterwards.
Handily enough, the data from `rank_url` is in `JSON` format, so we'll use the `JSON` parsing capabilities of 
`requests`

In [6]:
rank_data = requests.get(qs_data_url)
parsed_data = rank_data.json()
parsed_data.keys()

dict_keys(['data'])

We've got a `Dict` with only one key, so let's have a look into it.

In [7]:
print(type(parsed_data['data']))
print(len(parsed_data['data']))
parsed_data['data'][0]

<class 'list'>
959


{'cc': 'US',
 'core_id': '410',
 'country': 'United States',
 'guide': '<a href="/where-to-study/north-america/united-states/guide" class="guide-link" target="_blank">United States</a>',
 'logo': '<img src="https://www.topuniversities.com/sites/default/files/massachusetts-institute-of-technology-mit_410_small_0.jpg" alt="Massachusetts Institute of Technology (MIT)  Logo">',
 'nid': '294850',
 'rank_display': '1',
 'region': 'North America',
 'score': '100',
 'stars': '6',
 'title': 'Massachusetts Institute of Technology (MIT)',
 'url': '/universities/massachusetts-institute-technology-mit'}

We've got a list of 959 entries. Not too surprisingly the list we get in the `data` key is conveniently organised from highest to lowest ranked.

Let's now put all of this data into a single `DataFrame`. We're only interested in the top 200 universities, so we'll ignore the rest of the set.

In [8]:
qs_df = pd.DataFrame()
for i in range(0,200):
    qs_df = qs_df.append(parsed_data['data'][i], ignore_index=True)
print(qs_df.shape)
qs_df.head()

(200, 12)


Unnamed: 0,cc,core_id,country,guide,logo,nid,rank_display,region,score,stars,title,url
0,US,410,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294850,1,North America,100.0,6,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...
1,US,573,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",297282,2,North America,98.7,5,Stanford University,/universities/stanford-university
2,US,253,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294270,3,North America,98.4,5,Harvard University,/universities/harvard-university
3,US,94,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294562,4,North America,97.7,5,California Institute of Technology (Caltech),/universities/california-institute-technology-...
4,GB,95,United Kingdom,"<a href=""/where-to-study/europe/united-kingdom...","<img src=""https://www.topuniversities.com/site...",294561,5,Europe,95.6,5,University of Cambridge,/universities/university-cambridge


We've now got a `DataFrame` with the main information, but we still need to scrape a page for each individual university that contains the number of total and international, faculty and students.

## Scraping the specific page for each university

We will first define a handy little function to extract numbers from strings with newlines and commas, for example from `\n1,300` we want to extract only the `1300`

In [9]:
def xtract_number(str_in):
    str_in = str_in.replace(',' , '')
    str_in = re.search(r'\d+', str_in).group()
    return str_in

Let's add the extra columns that we're going to populate

In [10]:
columns_to_add = ['total faculty','inter faculty','total student','total inter']
qs_df = pd.concat([qs_df, pd.DataFrame(columns=columns_to_add)], axis=1)

The page containing the additional data we're looking for is already contained in the `url` field of our `DataFrame`. Each value we're looking for is contained in a `<div>` tag with `class=` the data we're looking for, within this tag is another `<div>` tag with `class="number"` which has the actual numeric value. We're therefore going to parse the page for each university and use `BeautifulSoup` to find all these tags. As there are several of them on each page, we'll double-check that they're  all the same.
The following step is very slow, it has to parse a lot of html for 200 entries.

In [11]:
for idx in qs_df.index:
#for idx in range(30,200): # you can uncomment this to only parse the first few universities
    page = requests.get(qs_url + qs_df.loc[idx]['url']) # GET the page for one university
    soup = BeautifulSoup(page.text, 'html.parser') # parse it with bs4

    for column in columns_to_add:
        try:
            wrapper = soup.find_all('div', class_=column) # find the tag of interest
            if not wrapper:
                print('No data for', qs_df.loc[idx]['title'], 'concerning', column)
            values = np.zeros(len(wrapper))
            for i in range(0,len(wrapper)): # if there are several tags, we'll check they have the same values
                values[i] = xtract_number(wrapper[0].find('div', class_='number').string)
                if i>0 and values[i] != values[i-1]:
                   raise Exception('Numerical values for', qs_df.loc[idx]['title'], 'are different throughout the HTML') 
                else:
                    qs_df.loc[idx][column] = values[0]
            
        except IndexError:
            print('No data for', qs_df.loc[idx]['title'], 'concerning', column)

No data for New York University (NYU) concerning total faculty
No data for New York University (NYU) concerning inter faculty
No data for New York University (NYU) concerning total student
No data for New York University (NYU) concerning total inter
No data for Indian Institute of Science (IISc) Bangalore concerning inter faculty


Furthermore, we can see that some data is missing for New York University and the Indian Institute of Science.
Going to the website and checking this by hand does indeed show that these pieces of information are missing. We'll therefore leave these as NaN to signify the missing data.

Given the time to GET and parse all this HTML, we've stored the `DataFrame` in a pickle for convenience:

In [12]:
pickle.dump( qs_df, open( "qs_dataframe.p", "wb" ) )
qs_df = pickle.load( open( "qs_dataframe.p", "rb" ) )

Let's drop some of the extra columns that we don't really need, they're still in the pickle if we need them later. We'll also change the columns to floats for our calculations.

In [13]:
qs_df.drop(['core_id', 'guide', 'logo', 'nid', 'url', 'cc', 'score', 'stars'], axis=1, inplace=True)
qs_df.rank_display = qs_df.rank_display.str.replace('=','')
qs_df.rename(columns={'title':'name'}, inplace=True) # to merge on a column they need the same name in both dfs

In [14]:
int_cols = ['rank_display', 'total faculty', 'inter faculty', 'total student', 'total inter']
for col in int_cols:
    qs_df.loc[:,col] = qs_df.loc[:,col].astype(float)

## Which are the best universities?

We are now going to compare the best universities in terms of ratio between faculty members: students and % of international students. Let's add these columns, they are merely operations involving the other columns

In [15]:
qs_df['faculty:students ratio'] = qs_df['total faculty']/qs_df['total student']
qs_df['% international students'] = 100*qs_df['total inter']/qs_df['total student']

In [16]:
qs_df.sort_values('faculty:students ratio' , ascending=False).head()

Unnamed: 0,country,rank_display,region,name,total faculty,inter faculty,total student,total inter,faculty:students ratio,% international students
3,United States,4.0,North America,California Institute of Technology (Caltech),953.0,350.0,2255.0,647.0,0.422616,28.691796
15,United States,16.0,North America,Yale University,4940.0,1708.0,12402.0,2469.0,0.398323,19.908079
5,United Kingdom,6.0,Europe,University of Oxford,6750.0,2964.0,19720.0,7353.0,0.342292,37.287018
4,United Kingdom,5.0,Europe,University of Cambridge,5490.0,2278.0,18770.0,6699.0,0.292488,35.689931
16,United States,17.0,North America,Johns Hopkins University,4462.0,1061.0,16146.0,4105.0,0.276353,25.424254


In [17]:
qs_df.sort_values('% international students' , ascending=False).head()

Unnamed: 0,country,rank_display,region,name,total faculty,inter faculty,total student,total inter,faculty:students ratio,% international students
34,United Kingdom,35.0,Europe,London School of Economics and Political Scien...,1088.0,687.0,9760.0,6748.0,0.111475,69.139344
11,Switzerland,12.0,Europe,Ecole Polytechnique Fédérale de Lausanne (EPFL),1695.0,1300.0,10343.0,5896.0,0.163879,57.004738
7,United Kingdom,8.0,Europe,Imperial College London,3930.0,2071.0,16090.0,8746.0,0.244251,54.356743
198,Netherlands,200.0,Europe,Maastricht University,1277.0,502.0,16385.0,8234.0,0.077937,50.25328
47,United States,47.0,North America,Carnegie Mellon University,1342.0,425.0,13356.0,6385.0,0.100479,47.806229


## Aggregating by Country

We'll first create a new `DataFrame` which will have info aggregated by country

In [18]:
qs_country = pd.DataFrame(columns=['country'] + columns_to_add )
qs_country['country'] = qs_df['country'].unique()
qs_country.set_index('country', inplace=True)

Let's calculate the totals per country

In [19]:
for country in qs_df['country'].unique():
    sums = qs_df[qs_df['country'] == country][columns_to_add].sum()
    qs_country.loc[country][columns_to_add] = sums

Now let's calculate the same two stats that we did per university before

In [20]:
qs_country['faculty:students ratio'] = qs_country['total faculty']/qs_country['total student']
qs_country['% international students'] = 100*qs_country['total inter']/qs_country['total student']

In [21]:
qs_country.sort_values('faculty:students ratio' , ascending=False).head()

Unnamed: 0_level_0,total faculty,inter faculty,total student,total inter,faculty:students ratio,% international students
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Russia,6709,373,30233,5098,0.22191,16.8624
Denmark,11916,3904,67223,9543,0.177261,14.196
Saudi Arabia,1062,665,6040,989,0.175828,16.3742
Singapore,9444,6079,58466,16168,0.16153,27.6537
Malaysia,2755,655,17902,3476,0.153893,19.4168


In [22]:
qs_country.sort_values('% international students' , ascending=False).head()

Unnamed: 0_level_0,total faculty,inter faculty,total student,total inter,faculty:students ratio,% international students
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Australia,22034,11382,301994,106359,0.0729617,35.2189
United Kingdom,79934,30216,583621,199426,0.136962,34.1705
Hong Kong,10166,6296,78838,24499,0.128948,31.0751
Austria,4117,1572,63446,19667,0.0648898,30.998
Switzerland,15323,9208,109112,32995,0.140434,30.2396


## Aggregating by region
Let's do the same thing but grouping per region now, we'll do this in the same way as before.

In [23]:
qs_region = pd.DataFrame(columns=['region'] + columns_to_add )
qs_region['region'] = qs_df['region'].unique()
qs_region.set_index('region', inplace=True)
qs_region.head()

for region in qs_df['region'].unique():
    sums = qs_df[qs_df['region'] == region][columns_to_add].sum()
    qs_region.loc[region][columns_to_add] = sums
    
qs_region['faculty:students ratio'] = qs_region['total faculty']/qs_region['total student']
qs_region['% international students'] = 100*qs_region['total inter']/qs_region['total student']

In [24]:
qs_region.sort_values('faculty:students ratio' , ascending=False)

Unnamed: 0_level_0,total faculty,inter faculty,total student,total inter,faculty:students ratio,% international students
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Asia,106734,25462,807003.0,110100,0.13226,13.6431
North America,182123,43836,1546350.0,292116,0.117776,18.8906
Europe,218358,67598,1957250.0,449364,0.111564,22.9589
Latin America,45382,5648,435750.0,36871,0.104147,8.4615
Africa,1733,379,19593.0,3325,0.08845,16.9703
Oceania,25347,12786,350167.0,118798,0.0723855,33.9261


In [25]:
qs_region.sort_values('% international students' , ascending=False)

Unnamed: 0_level_0,total faculty,inter faculty,total student,total inter,faculty:students ratio,% international students
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Oceania,25347,12786,350167.0,118798,0.0723855,33.9261
Europe,218358,67598,1957250.0,449364,0.111564,22.9589
North America,182123,43836,1546350.0,292116,0.117776,18.8906
Africa,1733,379,19593.0,3325,0.08845,16.9703
Asia,106734,25462,807003.0,110100,0.13226,13.6431
Latin America,45382,5648,435750.0,36871,0.104147,8.4615


# 02. Scraping top 200 universities from Times Higher Education

We have the same issue as before, the HTML from the given url doesn't contain the data that we actually
want, rather it is loaded with a jQuery to a `json` somewhere else on the site. Using Postman and inspecting the html, there is only one `json` loaded on the ranking page, so we'll simply do some string handling to extract
the url of interest from the HTML.

In [26]:
times_url = 'https://www.timeshighereducation.com/world-university-rankings/2018/world-ranking'
resp = requests.get(times_url)

In order to extract the url we want, we're first going to find where the "json" at the end of the url is located. We'll then use `rfind` to find the "http" at the beginning of this url.

In [27]:
stop = resp.text.find('json')
start = resp.text.rfind('http', 0 , stop)
times_data_url = resp.text[start:stop+len('json')]
print(times_data_url)

https:\/\/www.timeshighereducation.com\/sites\/default\/files\/the_data_rankings\/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json


As before, we need to filter out all the backlashes

In [28]:
times_data_url = times_data_url.replace('\\' , '')
print(times_data_url)

https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json


In [29]:
times_data = requests.get(times_data_url)
times_parsed = times_data.json()

We've got some other keys than just the `data` one, but they don't seem of use for what we're looking for. `location` is already contained in the main `data` key-value pair

In [30]:
times_parsed.keys()

dict_keys(['data', 'locations', 'subjects', 'pillars'])

Again, it looks like the `list` we get in the `data` key is conveniently organised from highest to lowest ranked:

In [31]:
times_parsed['data'][0]

{'aliases': 'University of Oxford',
 'location': 'United Kingdom',
 'member_level': '0',
 'name': 'University of Oxford',
 'nid': 468,
 'rank': '1',
 'rank_order': '10',
 'record_type': 'master_account',
 'scores_citations': '99.1',
 'scores_citations_rank': '15',
 'scores_industry_income': '63.7',
 'scores_industry_income_rank': '169',
 'scores_international_outlook': '95.0',
 'scores_international_outlook_rank': '24',
 'scores_overall': '94.3',
 'scores_overall_rank': '10',
 'scores_research': '99.5',
 'scores_research_rank': '1',
 'scores_teaching': '86.7',
 'scores_teaching_rank': '5',
 'stats_female_male_ratio': '46 : 54',
 'stats_number_students': '20,409',
 'stats_pc_intl_students': '38%',
 'stats_student_staff_ratio': '11.2',
 'subjects_offered': 'Archaeology,Art, Performing Arts & Design,Biological Sciences,Business & Management,Chemical Engineering,Chemistry,Civil Engineering,Computer Science,Economics & Econometrics,Electrical & Electronic Engineering,General Engineering,Geo

Let's create our `DataFrame` containing the top 200

In [32]:
times_df = pd.DataFrame()
for i in range(0,200):
    times_df = times_df.append(times_parsed['data'][i], ignore_index=True)
print(times_df.shape)
times_df.head()

(200, 26)


Unnamed: 0,aliases,location,member_level,name,nid,rank,rank_order,record_type,scores_citations,scores_citations_rank,...,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank,stats_female_male_ratio,stats_number_students,stats_pc_intl_students,stats_student_staff_ratio,subjects_offered,url
0,University of Oxford,United Kingdom,0,University of Oxford,468.0,1,10,master_account,99.1,15,...,99.5,1,86.7,5,46 : 54,20409,38%,11.2,"Archaeology,Art, Performing Arts & Design,Biol...",/world-university-rankings/university-oxford
1,University of Cambridge,United Kingdom,0,University of Cambridge,470.0,2,20,master_account,97.5,29,...,97.8,3,87.8,3,45 : 55,18389,35%,10.9,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/university-cambridge
2,California Institute of Technology caltech,United States,0,California Institute of Technology,128779.0,=3,30,private,99.5,10,...,97.5,4,90.3,1,31 : 69,2209,27%,6.5,"Architecture,Biological Sciences,Business & Ma...",/world-university-rankings/california-institut...
3,Stanford University,United States,11,Stanford University,467.0,=3,40,private,99.9,4,...,96.7,5,89.1,2,42 : 58,15845,22%,7.5,"Archaeology,Architecture,Art, Performing Arts ...",/world-university-rankings/stanford-university
4,Massachusetts Institute of Technology,United States,0,Massachusetts Institute of Technology,471.0,5,50,private,100.0,1,...,91.9,9,87.3,4,37 : 63,11177,34%,8.7,"Architecture,Art, Performing Arts & Design,Bio...",/world-university-rankings/massachusetts-insti...


We've got a lot more information this time, let's get rid of the columns we're not interested in, after backing it up to a pickle. We'll rename some of the columns too, to make it consistent with the previous `df`

In [33]:
pickle.dump( times_df, open( "times_dataframe.p", "wb" ) )
times_df = pickle.load( open( "times_dataframe.p", "rb" ) )
times_df = times_df[['location','name','rank','stats_student_staff_ratio','stats_number_students','stats_pc_intl_students']]
times_df.rename(columns={'location':'country', 'stats_number_students':'total student','rank':'rank_display','stats_pc_intl_students':'% international students'}, inplace=True)
times_df.rank_display = times_df.rank_display.str.replace('=','')

Let's now add region information based on what we have from the previous `DataFrame`

In [34]:
times_df['region'] = np.nan
for country in times_df['country'].unique():
    try:
        times_df.loc[times_df['country'] == country, 'region'] = qs_df[qs_df['country'] == country]['region'].iloc[0]
    except IndexError:
        print('No region info for', country)    

No region info for Luxembourg
No region info for Russian Federation


We're missing some region info about Luxembourg and the Russian Federation, so we'll add this by hand

In [35]:
times_df.loc[times_df['country'] == 'Luxembourg', 'region'] = 'Europe'
times_df.loc[times_df['country'] == 'Russian Federation', 'region'] = 'Europe' 
# in the previous data Russia is assigned to the Europe region

In [36]:
times_df.head()

Unnamed: 0,country,name,rank_display,stats_student_staff_ratio,total student,% international students,region
0,United Kingdom,University of Oxford,1,11.2,20409,38%,Europe
1,United Kingdom,University of Cambridge,2,10.9,18389,35%,Europe
2,United States,California Institute of Technology,3,6.5,2209,27%,North America
3,United States,Stanford University,3,7.5,15845,22%,North America
4,United States,Massachusetts Institute of Technology,5,8.7,11177,34%,North America


## Best universities

We'll change types to floats where we need it. We also need to do a little bit of string cleaning before handing it over to `Pandas`

In [37]:
times_df.loc[:,'total student'] = times_df.loc[:,'total student'].str.replace(',' , '')
times_df.loc[:,'% international students'] = times_df.loc[:,'% international students'].str.replace('%' , '')

columns = ['rank_display', 'stats_student_staff_ratio', 'total student', '% international students']
for col in columns:
    times_df.loc[:,col] = times_df.loc[:,col].astype(float)
times_df['faculty:students ratio'] = 1/times_df['stats_student_staff_ratio']

In [38]:
times_df.sort_values('faculty:students ratio' , ascending=False).head()

Unnamed: 0,country,name,rank_display,stats_student_staff_ratio,total student,% international students,region,faculty:students ratio
105,United States,Vanderbilt University,105.0,3.3,12011.0,13.0,North America,0.30303
109,Denmark,University of Copenhagen,109.0,4.1,30395.0,14.0,Europe,0.243902
153,United States,University of Rochester,153.0,4.3,9636.0,29.0,North America,0.232558
11,United States,Yale University,12.0,4.3,12155.0,21.0,North America,0.232558
12,United States,Johns Hopkins University,13.0,4.3,15498.0,24.0,North America,0.232558


In [39]:
times_df.sort_values('% international students' , ascending=False).head()

Unnamed: 0,country,name,rank_display,stats_student_staff_ratio,total student,% international students,region,faculty:students ratio
24,United Kingdom,London School of Economics and Political Science,25.0,12.2,10065.0,71.0,Europe,0.081967
178,Luxembourg,University of Luxembourg,179.0,14.6,4969.0,57.0,Europe,0.068493
37,Switzerland,École Polytechnique Fédérale de Lausanne,38.0,11.2,9928.0,55.0,Europe,0.089286
7,United Kingdom,Imperial College London,8.0,11.4,15857.0,55.0,Europe,0.087719
102,Netherlands,Maastricht University,103.0,18.0,16727.0,50.0,Europe,0.055556


We've only got the % international students and students:staff ratio in the data from the Times, so we'll calculate the number of staff and international students from this data. Note that we don't have any info concerning % international faculty from the Times data.

In [40]:
times_df['total inter'] = times_df['total student']*times_df['% international students']/100
times_df['total inter'] = times_df['total inter'].astype(int) # rounding it off to an integer
times_df['total faculty'] = times_df['total student']*times_df['faculty:students ratio']
times_df['total faculty'] = times_df['total faculty'].astype(int)

In [41]:
times_df.head()

Unnamed: 0,country,name,rank_display,stats_student_staff_ratio,total student,% international students,region,faculty:students ratio,total inter,total faculty
0,United Kingdom,University of Oxford,1.0,11.2,20409.0,38.0,Europe,0.089286,7755,1822
1,United Kingdom,University of Cambridge,2.0,10.9,18389.0,35.0,Europe,0.091743,6436,1687
2,United States,California Institute of Technology,3.0,6.5,2209.0,27.0,North America,0.153846,596,339
3,United States,Stanford University,3.0,7.5,15845.0,22.0,North America,0.133333,3485,2112
4,United States,Massachusetts Institute of Technology,5.0,8.7,11177.0,34.0,North America,0.114943,3800,1284


## Grouping by country

In [42]:
columns_to_add = ['total faculty', 'total student', 'total inter']
times_country = pd.DataFrame(columns=['country'] + columns_to_add )
times_country['country'] = times_df['country'].unique()
times_country.set_index('country', inplace=True)

for country in times_df['country'].unique():
    sums = times_df[times_df['country'] == country][columns_to_add].sum()
    times_country.loc[country][columns_to_add] = sums

times_country['faculty:students ratio'] = times_country['total faculty']/times_country['total student']
times_country['% international students'] = 100*times_country['total inter']/times_country['total student']
times_country.head()

Unnamed: 0_level_0,total faculty,total student,total inter,faculty:students ratio,% international students
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United Kingdom,44425,596449.0,213045,0.0744825,35.7189
United States,152317,1705530.0,280338,0.0893076,16.437
Switzerland,10048,107852.0,32743,0.0931647,30.3592
Singapore,3364,56101.0,17084,0.0599633,30.4522
Canada,13236,249401.0,55902,0.0530712,22.4145


In [43]:
times_country.sort_values('faculty:students ratio' , ascending=False).head()

Unnamed: 0_level_0,total faculty,total student,total inter,faculty:students ratio,% international students
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Denmark,10596,64110,9326,0.165278,14.5469
Russian Federation,4004,29236,6431,0.136954,21.9969
Japan,6464,48481,4398,0.133331,9.0716
Italy,157,1205,132,0.13029,10.9544
Switzerland,10048,107852,32743,0.0931647,30.3592


In [44]:
times_country.sort_values('% international students' , ascending=False).head()

Unnamed: 0_level_0,total faculty,total student,total inter,faculty:students ratio,% international students
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Luxembourg,340,4969,2832,0.0684242,56.9934
United Kingdom,44425,596449,213045,0.0744825,35.7189
Hong Kong,4140,77663,25157,0.0533072,32.3925
Australia,9937,268630,83807,0.0369914,31.1979
Singapore,3364,56101,17084,0.0599633,30.4522


## Grouping by region

In [45]:
columns_to_add = ['total faculty', 'total student', 'total inter']
times_region = pd.DataFrame(columns=['region'] + columns_to_add )
times_region['region'] = times_df['region'].unique()
times_region.set_index('region', inplace=True)

for region in times_df['region'].unique():
    sums = times_df[times_df['region'] == region][columns_to_add].sum()
    times_region.loc[region, columns_to_add] = sums
    
times_region['faculty:students ratio'] = times_region['total faculty']/times_region['total student']
times_region['% international students'] = 100*times_region['total inter']/times_region['total student']
times_region.head()

Unnamed: 0_level_0,total faculty,total student,total inter,faculty:students ratio,% international students
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Europe,126816,2143623,481530,0.05916,22.463372
North America,165553,1954934,336240,0.084685,17.199558
Asia,42628,530949,78827,0.080286,14.846435
Oceania,11551,298978,92607,0.038635,30.97452
Africa,1775,20775,3739,0.085439,17.997593


In [46]:
times_region.sort_values('faculty:students ratio' , ascending=False).head()

Unnamed: 0_level_0,total faculty,total student,total inter,faculty:students ratio,% international students
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,1775,20775,3739,0.085439,17.997593
North America,165553,1954934,336240,0.084685,17.199558
Asia,42628,530949,78827,0.080286,14.846435
Europe,126816,2143623,481530,0.05916,22.463372
Oceania,11551,298978,92607,0.038635,30.97452


In [47]:
times_region.sort_values('% international students' , ascending=False).head()

Unnamed: 0_level_0,total faculty,total student,total inter,faculty:students ratio,% international students
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Oceania,11551,298978,92607,0.038635,30.97452
Europe,126816,2143623,481530,0.05916,22.463372
Africa,1775,20775,3739,0.085439,17.997593
North America,165553,1954934,336240,0.084685,17.199558
Asia,42628,530949,78827,0.080286,14.846435


# 03. Merging both `DataFrames`

As we've been looking at the number of students and faculty as well as whether they're international or not, we'll only keep this data in the merged `DataFrame`. It already looks like we only recover about half of the universities when we try to match them by name, 105 out of 200.

In [48]:
len(set(times_df['name'].unique()).intersection(qs_df['name'].unique()))

105

Let's try to see if we can increase this match by using the str.contains() method. If one of the `dfs` has a name that is extended from the other, we'll simplify this back to the shorter name. We can see that we recover more than 30 universities this way.

In [49]:
diff1 = list(set(times_df['name'].unique()).difference(qs_df['name'].unique()))
diff2 = list(set(qs_df['name'].unique()).difference(times_df['name'].unique()))

for i in diff1:
    if len(qs_df[qs_df['name'].str.contains(i)]) > 0:
        qs_df.loc[qs_df['name'].str.contains(i), 'name'] = i
for i in diff2:
    if len(times_df[times_df['name'].str.contains(i)]) > 0:
        times_df.loc[times_df['name'].str.contains(i),'name'] = i



In [50]:
len(set(times_df['name'].unique()).intersection(qs_df['name'].unique()))

138

In [51]:
mrg_df = times_df.merge(qs_df, how='inner', on='name')
print(mrg_df.shape)
mrg_df.head()

(140, 19)


Unnamed: 0,country_x,name,rank_display_x,stats_student_staff_ratio,total student_x,% international students_x,region_x,faculty:students ratio_x,total inter_x,total faculty_x,country_y,rank_display_y,region_y,total faculty_y,inter faculty,total student_y,total inter_y,faculty:students ratio_y,% international students_y
0,United Kingdom,University of Oxford,1.0,11.2,20409.0,38.0,Europe,0.089286,7755,1822,United Kingdom,6.0,Europe,6750.0,2964.0,19720.0,7353.0,0.342292,37.287018
1,United Kingdom,University of Cambridge,2.0,10.9,18389.0,35.0,Europe,0.091743,6436,1687,United Kingdom,5.0,Europe,5490.0,2278.0,18770.0,6699.0,0.292488,35.689931
2,United States,California Institute of Technology,3.0,6.5,2209.0,27.0,North America,0.153846,596,339,United States,4.0,North America,953.0,350.0,2255.0,647.0,0.422616,28.691796
3,United States,Stanford University,3.0,7.5,15845.0,22.0,North America,0.133333,3485,2112,United States,2.0,North America,4285.0,2042.0,15878.0,3611.0,0.26987,22.742159
4,United States,Massachusetts Institute of Technology,5.0,8.7,11177.0,34.0,North America,0.114943,3800,1284,United States,1.0,North America,2982.0,1679.0,11067.0,3717.0,0.26945,33.586338


In [52]:
mrg_df.sort_index(axis=1, inplace=True)

# 04. Exploratory Analysis

We can simply perform a correlation between all columns to get an overal look at our data. We'll use Spearman's correlation coefficient, as we want to capture more than just linear relationships. Inspection (not shown here) with Pearson's showed similar trends.

This actually gives us two pieces of information. Fist of all we can identify where our datasets diverge by comparing the same metric between both sets. Keep in mind this is correlation, so it will show a difference in trend of these numbers, not absolute value. Firstly we see that most data between sets have correlation of > 90%, however it seems that what consists of 'faculty' is rather different between the two. Indeed the topuniversities data mentions 'number of academic faculty staff' whereas the times only says 'staff'.

In [53]:
mrg_df.corr(method='spearman')

Unnamed: 0,% international students_x,% international students_y,faculty:students ratio_x,faculty:students ratio_y,inter faculty,rank_display_x,rank_display_y,stats_student_staff_ratio,total faculty_x,total faculty_y,total inter_x,total inter_y,total student_x,total student_y
% international students_x,1.0,0.931329,-0.059531,0.100353,0.354761,-0.175264,-0.286133,0.059531,-0.326781,-0.207809,0.577136,0.469042,-0.361895,-0.350192
% international students_y,0.931329,1.0,-0.08672,0.077108,0.415221,-0.190697,-0.301954,0.08672,-0.284045,-0.145849,0.591203,0.578346,-0.284074,-0.265703
faculty:students ratio_x,-0.059531,-0.08672,1.0,0.647152,-0.018739,-0.18807,-0.291408,-1.0,0.467761,0.180242,-0.304808,-0.305367,-0.313525,-0.312144
faculty:students ratio_y,0.100353,0.077108,0.647152,1.0,0.271529,-0.288466,-0.400755,-0.647152,0.161875,0.364687,-0.232246,-0.281514,-0.40872,-0.41526
inter faculty,0.354761,0.415221,-0.018739,0.271529,1.0,-0.31585,-0.496271,0.018739,0.250683,0.543532,0.53433,0.526956,0.252108,0.249042
rank_display_x,-0.175264,-0.190697,-0.18807,-0.288466,-0.31585,1.0,0.756258,0.18807,-0.246681,-0.352147,-0.2594,-0.228037,-0.084346,-0.083272
rank_display_y,-0.286133,-0.301954,-0.291408,-0.400755,-0.496271,0.756258,1.0,0.291408,-0.275974,-0.388989,-0.276068,-0.246296,-0.03486,-0.032094
stats_student_staff_ratio,0.059531,0.08672,-1.0,-0.647152,0.018739,0.18807,0.291408,1.0,-0.467761,-0.180242,0.304808,0.305367,0.313525,0.312144
total faculty_x,-0.326781,-0.284045,0.467761,0.161875,0.250683,-0.246681,-0.275974,-0.467761,1.0,0.755035,0.215256,0.252142,0.611886,0.606451
total faculty_y,-0.207809,-0.145849,0.180242,0.364687,0.543532,-0.352147,-0.388989,-0.180242,0.755035,1.0,0.345909,0.386382,0.632728,0.640425


One of the stronger tends (Spearman's between roughly 0.4-0.6) is between the % of international students and total international students, suggesting that
more international students there are in a university, the higher the fraction of international students. This could
be due to a sort of "sheep" effect, where there are already large established international student communities is where they also represent a majority of the student population.

There are a few weak trends we can observe, with either Spearman's coefficients roughly between 0.3 and 0.4

* Spearman's suggests that there is some relationship between the % of international students and the number of international faculty members, the various correlation coefficients range between 0.29 and 0.41.
* They also show some inverse relationship between the number of students and the & international students, suggesting that universities with more students have less of a fraction international students

Again, these are rather weak trends, and would need further investigating to conclude something from them

# 05. Finding the 'best' university

I believe that the ranking is the number one metric for trying to judge if a university is best. These two datasets have already synthesized lots of information into coming up with this ranking, so we might as well use it. We should try to take into account the difference in rankings in the two data sets. 

The quality of education students receive is also linked to how many staff are available to teach. Of course, a high faculty:student ratio does not automatically mean that all these extra faculty members are helping to teach students, but it's at least an opportunity for students to receive a better education.

We'll therefore use a weighted sum of means to estimate the 'best' university based on both sets. We'll take the mean of the rank from both data sets, and add the mean of the student:staff ratio with a lower weight to re-rank the universities.

In [54]:
mrg_df['mean_rank'] = mrg_df[['rank_display_x','rank_display_y']].mean(axis=1)
mrg_df['mean_%_inter'] = mrg_df[['% international students_x','% international students_x']].mean(axis=1)/100

In [55]:
mrg_df['new_rank'] = mrg_df['mean_rank'] - mrg_df['mean_%_inter']
mrg_df.set_index('new_rank', inplace=True)
mrg_df.sort_index().head()

Unnamed: 0_level_0,% international students_x,% international students_y,country_x,country_y,faculty:students ratio_x,faculty:students ratio_y,inter faculty,name,rank_display_x,rank_display_y,...,region_y,stats_student_staff_ratio,total faculty_x,total faculty_y,total inter_x,total inter_y,total student_x,total student_y,mean_rank,mean_%_inter
new_rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.28,22.0,22.742159,United States,United States,0.133333,0.26987,2042.0,Stanford University,3.0,2.0,...,North America,7.5,2112,4285.0,3485,3611.0,15845.0,15878.0,2.5,0.22
2.66,34.0,33.586338,United States,United States,0.114943,0.26945,1679.0,Massachusetts Institute of Technology,5.0,1.0,...,North America,8.7,1284,2982.0,3800,3717.0,11177.0,11067.0,3.0,0.34
3.12,38.0,37.287018,United Kingdom,United Kingdom,0.089286,0.342292,2964.0,University of Oxford,1.0,6.0,...,Europe,11.2,1822,6750.0,7755,7353.0,20409.0,19720.0,3.5,0.38
3.15,35.0,35.689931,United Kingdom,United Kingdom,0.091743,0.292488,2278.0,University of Cambridge,2.0,5.0,...,Europe,10.9,1687,5490.0,6436,6699.0,18389.0,18770.0,3.5,0.35
3.23,27.0,28.691796,United States,United States,0.153846,0.422616,350.0,California Institute of Technology,3.0,4.0,...,North America,6.5,339,953.0,596,647.0,2209.0,2255.0,3.5,0.27
