In [None]:
# Import needed libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn
import re
import pickle
%matplotlib inline

# 01. Scraping info from topuniversities.com

In [None]:
# Root of the QS Top Universities site; the relative paths used below are appended to it.
qs_url = "https://www.topuniversities.com"

## Initial postman/parsing
We first need the URL that actually holds the data we want to parse. Using Postman we can see that the ranking data shown on the page is generated by a request to `rank_url`, so this is the
link that we'll need to GET to extract all the data we're interested in.

In [None]:
# Fetch the 2018 ranking landing page; the data URL is extracted from its source below.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() stops
# here on an HTTP error instead of letting an error page reach the string parsing.
resp = requests.get(qs_url + '/university-rankings/world-university-rankings/2018', timeout=30)
resp.raise_for_status()

In [None]:
# Cut the data URL out of the page source: it starts at the first 'http' after the
# 'rank_url' key and runs through the end of the following '.txt'.
# str.find returns -1 when nothing matches, which would silently produce a garbage
# slice, so every lookup is checked.
start = resp.text.find('rank_url')
if start == -1:
    raise ValueError("'rank_url' not found in the ranking page; the page layout may have changed")
start = resp.text.find('http', start)
stop = resp.text.find('.txt', start)
if start == -1 or stop == -1:
    raise ValueError("could not locate an http...txt URL after 'rank_url' in the ranking page")
qs_data_url = resp.text[start:stop+len('.txt')]
print(qs_data_url)

We've still got all the escape characters, in this case backslashes, so we'll have to replace them:

In [None]:
# Remove every backslash left over from the escaping in the page source.
qs_data_url = ''.join(qs_data_url.split('\\'))
print(qs_data_url)

## Scraping the main data and putting everything into a `DataFrame`
Scraping everything that is contained in the `rank_url`. This is the majority of what we are interested in, the faculty and student data are contained on another page that is specific to each university. This will be scraped afterwards.
Handily enough, the data from `rank_url` is in `JSON` format, so we'll use the `JSON` parsing capabilities of 
`requests`

In [None]:
# Download the ranking data file and decode its JSON body.
rank_data = requests.get(qs_data_url, timeout=30)
rank_data.raise_for_status()  # fail here rather than on a confusing JSON decode error
parsed_data = rank_data.json()
parsed_data.keys()

We've got a `Dict` with only one key, so let's have a look into it.

In [None]:
# Show the container type and the number of entries, then display the first record.
print(type(parsed_data['data']), len(parsed_data['data']), sep='\n')
parsed_data['data'][0]

We've got a list of 959 entries. Not too surprisingly the list we get in the `data` key is conveniently organised from highest to lowest ranked.

Let's now put all of this data into a single `DataFrame`. We're only interested in the top 200 universities, so we'll ignore the rest of the set.

In [None]:
# Build the frame from the first 200 records in a single call.
# Growing a DataFrame with DataFrame.append inside a loop copies the whole frame on
# every iteration, and DataFrame.append no longer exists in pandas 2.0.
qs_df = pd.DataFrame(parsed_data['data'][:200])
print(qs_df.shape)
qs_df.head()

We've now got a `DataFrame` with the main information, but we still need to scrape a page for each individual university that contains the number of total and international, faculty and students.

## Scraping the specific page for each university

We will first define a handy little function to extract numbers from strings with newlines and commas, for example from `\n1,300` we want to extract only the `1300`

In [None]:
def xtract_number(str_in):
    """Return the first run of digits found in `str_in`, as a string.

    Commas are removed before searching, so thousands separators do not
    split the number: a newline followed by '1,300' yields '1300'.

    Raises AttributeError when `str_in` contains no digit, because the
    regex search then returns None.
    """
    first_digits = re.search(r'\d+', str_in.replace(',', ''))
    return first_digits.group()

Let's add the extra columns that we're going to populate

In [None]:
# Names of the four statistics scraped from each university page; they are used both
# as the new column labels and as the CSS class searched for in the page.
columns_to_add = [
    'total faculty',
    'inter faculty',
    'total student',
    'total inter',
]
# Append the (still empty) columns to the right of the existing frame.
empty_columns = pd.DataFrame(columns=columns_to_add)
qs_df = pd.concat([qs_df, empty_columns], axis=1)

The page containing the additional data we're looking for is already referenced in the `url` field of our `DataFrame`. Each value we're looking for is contained in a `<div>` tag whose `class` is the name of the statistic (e.g. `total faculty`); within this tag is another `<div>` tag with `class="number"` which holds the actual numeric value. We're therefore going to parse the page for each university and use `BeautifulSoup` to find all these tags. As there are several of them on each page, we'll double-check that they all hold the same value.
The following step is very slow, as it has to fetch and parse a lot of HTML for 200 entries.

In [None]:
for idx in qs_df.index:
    # GET and parse the detail page of one university (one request per row, hence slow).
    page = requests.get(qs_url + qs_df.loc[idx, 'url'], timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    title = qs_df.loc[idx, 'title']

    for column in columns_to_add:
        # Each statistic sits in <div class="<column>"> tags wrapping a <div class="number">.
        wrapper = soup.find_all('div', class_=column)
        if not wrapper:
            print('No data for', title, 'concerning', column)
            continue  # leave the cell as NaN to mark the missing value

        # The same statistic may appear several times on the page. Read every
        # occurrence (wrapper[i], not always wrapper[0]) so that the comparison
        # below really checks that they agree.
        values = np.zeros(len(wrapper))
        for i, tag in enumerate(wrapper):
            values[i] = xtract_number(tag.find('div', class_='number').string)
            if i > 0 and values[i] != values[i - 1]:
                raise Exception(
                    'Numerical values for {} are different throughout the HTML'.format(title)
                )

        # One .loc[row, column] assignment writes into qs_df itself; the chained
        # form qs_df.loc[idx][column] = ... can write to a temporary copy instead.
        qs_df.loc[idx, column] = values[0]

Furthermore, we can see that some data is missing for New York University and the Indian Institute of Science.
Going to the website and checking this by hand does indeed show that these pieces of information are missing. We'll therefore leave these as NaN to signify the missing data.

Given the time to GET and parse all this HTML, we've stored the `DataFrame` in a pickle for convenience:

In [None]:
# Cache the scraped frame on disk and read it back. The `with` blocks close the
# file handles (the bare open() calls left them open until garbage collection).
with open("qs_dataframe.p", "wb") as fh:
    pickle.dump(qs_df, fh)
with open("qs_dataframe.p", "rb") as fh:
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
    qs_df = pickle.load(fh)

Let's drop some of the extra columns that we don't really need, they're still in the pickle if we need them later. We'll also change the columns to floats for our calculations.

In [None]:
# Drop the columns that are not used in the analysis.
unused_columns = ['core_id', 'guide', 'logo', 'nid', 'url', 'cc', 'score', 'stars']
qs_df = qs_df.drop(columns=unused_columns)
# Remove the '=' marker from the rank strings.
qs_df['rank_display'] = qs_df['rank_display'].str.replace('=','')
# To merge on a column it needs the same name in both frames.
qs_df = qs_df.rename(columns={'title':'name'})

In [None]:
int_cols = ['rank_display', 'total faculty', 'inter faculty', 'total student', 'total inter']
# Cast through DataFrame.astype so the columns are replaced and really become float64.
# Assigning via qs_df.loc[:, col] = ... sets the values in place on pandas >= 2.0,
# which keeps the existing object dtype.
qs_df = qs_df.astype({col: float for col in int_cols})

## Which are the best universities?

We are now going to compare the best universities in terms of the faculty-to-student ratio and the % of international students. Let's add these columns; they are simple operations on the other columns.

In [None]:
# Faculty members per student, and international students as a percentage of all students.
qs_df['faculty:students ratio'] = qs_df['total faculty'].div(qs_df['total student'])
qs_df['% international students'] = qs_df['total inter'].mul(100).div(qs_df['total student'])

In [None]:
# First rows after sorting by faculty:students ratio, highest first.
(
    qs_df
    .sort_values(by='faculty:students ratio', ascending=False)
    .head()
)

In [None]:
# First rows after sorting by share of international students, highest first.
(
    qs_df
    .sort_values(by='% international students', ascending=False)
    .head()
)

## Aggregating by Country

We'll first create a new `DataFrame` which will have info aggregated by country

In [None]:
# Empty table with one row per country found in the QS data, indexed by country name;
# the head-count columns are filled in afterwards.
country_columns = ['country'] + columns_to_add
qs_country = pd.DataFrame(columns=country_columns)
qs_country['country'] = qs_df['country'].unique()
qs_country = qs_country.set_index('country')

Let's calculate the totals per country

In [None]:
# Sum the head-count columns over all universities of each country.
for country in qs_df['country'].unique():
    sums = qs_df[qs_df['country'] == country][columns_to_add].sum()
    # Write with a single .loc[row, columns] call: the chained form
    # qs_country.loc[country][columns_to_add] = ... may assign to a temporary copy.
    qs_country.loc[country, columns_to_add] = sums.values

Now let's calculate the same two stats that we did per university before

In [None]:
# Same two statistics as per university, computed on the per-country totals.
qs_country['faculty:students ratio'] = qs_country['total faculty'].div(qs_country['total student'])
qs_country['% international students'] = qs_country['total inter'].mul(100).div(qs_country['total student'])

In [None]:
# First rows after sorting countries by faculty:students ratio, highest first.
(
    qs_country
    .sort_values(by='faculty:students ratio', ascending=False)
    .head()
)

In [None]:
# First rows after sorting countries by share of international students, highest first.
(
    qs_country
    .sort_values(by='% international students', ascending=False)
    .head()
)

## Aggregating by region
Let's do the same thing but grouping per region now, we'll do this in the same way as before.

In [None]:
# Empty table with one row per region found in the QS data, indexed by region name.
qs_region = pd.DataFrame(columns=['region'] + columns_to_add )
qs_region['region'] = qs_df['region'].unique()
qs_region.set_index('region', inplace=True)

# Sum the head-count columns over all universities of each region.
for region in qs_df['region'].unique():
    sums = qs_df[qs_df['region'] == region][columns_to_add].sum()
    # Single .loc[row, columns] assignment; the chained form
    # qs_region.loc[region][columns_to_add] = ... may assign to a temporary copy.
    qs_region.loc[region, columns_to_add] = sums.values

# Same two statistics as per university, computed on the per-region totals.
qs_region['faculty:students ratio'] = qs_region['total faculty']/qs_region['total student']
qs_region['% international students'] = 100*qs_region['total inter']/qs_region['total student']

In [None]:
# All regions, sorted by faculty:students ratio, highest first.
(
    qs_region
    .sort_values(by='faculty:students ratio', ascending=False)
)

In [None]:
# All regions, sorted by share of international students, highest first.
(
    qs_region
    .sort_values(by='% international students', ascending=False)
)

# 02. Scraping top 200 universities from Times Higher Education

We have the same issue as before: the HTML from the given url doesn't contain the data that we actually
want; rather, it is loaded via jQuery from a `json` file elsewhere on the site. Using Postman and inspecting the HTML, there is only one `json` loaded on the ranking page, so we'll simply do some string handling to extract
the url of interest from the HTML.

In [None]:
# Fetch the Times Higher Education 2018 ranking page; the data URL is extracted from
# its source below. The timeout prevents an indefinite hang and raise_for_status()
# stops on an HTTP error instead of parsing an error page.
times_url = 'https://www.timeshighereducation.com/world-university-rankings/2018/world-ranking'
resp = requests.get(times_url, timeout=30)
resp.raise_for_status()

In order to extract the url we want, we're first going to find where the "json" at the end of the url is located. We'll then use `rfind` to find the "http" at the beginning of this url.

In [None]:
# The data URL ends at the first occurrence of 'json' in the page source and starts
# at the last 'http' before it. find/rfind return -1 when nothing matches, which
# would silently produce a garbage slice, so both lookups are checked.
stop = resp.text.find('json')
if stop == -1:
    raise ValueError("no 'json' reference found in the ranking page; the page layout may have changed")
start = resp.text.rfind('http', 0 , stop)
if start == -1:
    raise ValueError("no 'http' found before the 'json' reference in the ranking page")
times_data_url = resp.text[start:stop+len('json')]
print(times_data_url)

As before, we need to filter out all the backslashes

In [None]:
# Remove every backslash left over from the escaping in the page source.
times_data_url = ''.join(times_data_url.split('\\'))
print(times_data_url)

In [None]:
# Download the Times ranking data file and decode its JSON body.
times_data = requests.get(times_data_url, timeout=30)
times_data.raise_for_status()  # fail here rather than on a confusing JSON decode error
times_parsed = times_data.json()

We've got some other keys than just the `data` one, but they don't seem of use for what we're looking for. `location` is already contained in the main `data` key-value pair

In [None]:
# Display the top-level keys of the decoded Times payload.
times_parsed.keys()

Again, it looks like the `list` we get in the `data` key is conveniently organised from highest to lowest ranked:

In [None]:
# Display the first record of the 'data' list.
times_parsed['data'][0]

Let's create our `DataFrame` containing the top 200

In [None]:
# Build the frame from the first 200 records in a single call.
# Growing a DataFrame with DataFrame.append inside a loop copies the whole frame on
# every iteration, and DataFrame.append no longer exists in pandas 2.0.
times_df = pd.DataFrame(times_parsed['data'][:200])
print(times_df.shape)
times_df.head()

We've got a lot more information this time, let's get rid of the columns we're not interested in, after backing it up to a pickle. We'll rename some of the columns too, to make it consistent with the previous `df`

In [None]:
# Back up the full frame before trimming it. The `with` blocks close the file
# handles (the bare open() calls left them open until garbage collection).
with open("times_dataframe.p", "wb") as fh:
    pickle.dump(times_df, fh)
with open("times_dataframe.p", "rb") as fh:
    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
    times_df = pickle.load(fh)

# Keep the columns of interest and rename them to match the QS frame.
# rename() without inplace returns a new frame, so nothing is written to a
# column slice of the original (which triggers SettingWithCopyWarning).
times_df = (
    times_df[['location','name','rank','stats_student_staff_ratio','stats_number_students','stats_pc_intl_students']]
    .rename(columns={'location':'country', 'stats_number_students':'total student','rank':'rank_display','stats_pc_intl_students':'% international students'})
)
# Remove the '=' marker from the rank strings.
times_df['rank_display'] = times_df['rank_display'].str.replace('=','')

Let's now add region information based on what we have from the previous `DataFrame`

In [None]:
# Country -> region lookup taken from the QS frame (first region listed per country).
region_by_country = qs_df.drop_duplicates('country').set_index('country')['region']

# Map every Times country to its region in one step. Countries absent from the QS
# data come out as NaN. This avoids creating a float NaN column and then writing
# strings into it, which newer pandas flags as an incompatible-dtype assignment.
times_df['region'] = times_df['country'].map(region_by_country)

for country in times_df.loc[times_df['region'].isna(), 'country'].unique():
    print('No region info for', country)

We're missing some region info about Luxembourg and the Russian Federation, so we'll add this by hand

In [None]:
# Regions filled in by hand for the countries the lookup could not resolve.
# In the previous data Russia is assigned to the Europe region.
for manual_country in ('Luxembourg', 'Russian Federation'):
    times_df.loc[times_df['country'] == manual_country, 'region'] = 'Europe'

In [None]:
# Preview the frame now that the region column is filled in.
times_df.head(5)

## Best universities

We'll change types to floats where we need it. We also need to do a little bit of string cleaning before handing it over to `Pandas`

In [None]:
# Strip the thousands separators and the percent sign so the strings parse as numbers.
times_df['total student'] = times_df['total student'].str.replace(',' , '')
times_df['% international students'] = times_df['% international students'].str.replace('%' , '')

columns = ['rank_display', 'stats_student_staff_ratio', 'total student', '% international students']
# Cast through DataFrame.astype so the columns are replaced and really become float64.
# Assigning via times_df.loc[:, col] = ... sets the values in place on pandas >= 2.0,
# which keeps the existing object dtype.
times_df = times_df.astype({col: float for col in columns})
# The Times gives students per staff member; invert it to match the QS ratio.
times_df['faculty:students ratio'] = 1/times_df['stats_student_staff_ratio']

In [None]:
# First rows after sorting by faculty:students ratio, highest first.
(
    times_df
    .sort_values(by='faculty:students ratio', ascending=False)
    .head()
)

In [None]:
# First rows after sorting by share of international students, highest first.
(
    times_df
    .sort_values(by='% international students', ascending=False)
    .head()
)

We've only got the % international students and students:staff ratio in the data from the Times, so we'll calculate the number of staff and international students from this data. Note that we don't have any info concerning % international faculty from the Times data.

In [None]:
# Derive head-counts from the percentage and the ratio.
# round() before the integer cast: astype(int) alone truncates towards zero, so a
# value such as 999.9999 (floating-point error) would become 999 instead of 1000.
times_df['total inter'] = times_df['total student']*times_df['% international students']/100
times_df['total inter'] = times_df['total inter'].round().astype(int)
times_df['total faculty'] = times_df['total student']*times_df['faculty:students ratio']
times_df['total faculty'] = times_df['total faculty'].round().astype(int)

In [None]:
# Preview the frame with the derived head-count columns.
times_df.head(5)

## Grouping by country

In [None]:
# Head-count columns to total per country.
columns_to_add = ['total faculty', 'total student', 'total inter']
# Empty table with one row per country found in the Times data, indexed by country.
times_country = pd.DataFrame(columns=['country'] + columns_to_add )
times_country['country'] = times_df['country'].unique()
times_country.set_index('country', inplace=True)

for country in times_df['country'].unique():
    sums = times_df[times_df['country'] == country][columns_to_add].sum()
    # Single .loc[row, columns] assignment; the chained form
    # times_country.loc[country][columns_to_add] = ... may assign to a temporary copy.
    times_country.loc[country, columns_to_add] = sums.values

# Same two statistics as per university, computed on the per-country totals.
times_country['faculty:students ratio'] = times_country['total faculty']/times_country['total student']
times_country['% international students'] = 100*times_country['total inter']/times_country['total student']
times_country.head()

In [None]:
# First rows after sorting countries by faculty:students ratio, highest first.
(
    times_country
    .sort_values(by='faculty:students ratio', ascending=False)
    .head()
)

In [None]:
# First rows after sorting countries by share of international students, highest first.
(
    times_country
    .sort_values(by='% international students', ascending=False)
    .head()
)

## Grouping by region

In [None]:
# Head-count columns to total per region.
columns_to_add = ['total faculty', 'total student', 'total inter']

# Empty table with one row per region found in the Times data, indexed by region.
times_region = pd.DataFrame(columns=['region'] + columns_to_add)
times_region['region'] = times_df['region'].unique()
times_region = times_region.set_index('region')

# Sum the head-counts of every university in a region into that region's row.
for region in times_df['region'].unique():
    in_region = times_df['region'] == region
    sums = times_df.loc[in_region, columns_to_add].sum()
    times_region.loc[region, columns_to_add] = sums

# Same two statistics as per university, computed on the per-region totals.
times_region['faculty:students ratio'] = times_region['total faculty'].div(times_region['total student'])
times_region['% international students'] = times_region['total inter'].mul(100).div(times_region['total student'])
times_region.head()

In [None]:
# First rows after sorting regions by faculty:students ratio, highest first.
(
    times_region
    .sort_values(by='faculty:students ratio', ascending=False)
    .head()
)

In [None]:
# First rows after sorting regions by share of international students, highest first.
(
    times_region
    .sort_values(by='% international students', ascending=False)
    .head()
)

# 03. Merging both `DataFrames`

As we've been looking at the number of students and faculty as well as whether they're international or not, we'll only keep this data in the merged `DataFrame`. It already looks like we only recover about half of the universities when we try to match them by name, 105 out of 200.

In [None]:
# Number of university names that appear verbatim in both rankings.
len(
    set(times_df['name'].unique())
    & set(qs_df['name'].unique())
)

Let's try to see if we can increase this match by using the str.contains() method. If one of the `dfs` has a name that is extended from the other, we'll simplify this back to the shorter name. We can see that we recover more than 30 universities this way.

In [None]:
# Names present in only one of the two rankings. Sorting makes the iteration
# order (and therefore the outcome of the renaming below) reproducible; plain
# set iteration order changes between interpreter runs.
diff1 = sorted(set(times_df['name'].unique()).difference(qs_df['name'].unique()))
diff2 = sorted(set(qs_df['name'].unique()).difference(times_df['name'].unique()))

# When a name from one ranking is contained in a longer name in the other ranking,
# shorten the longer one so both frames use the same string.
# regex=False matches the name literally: by default str.contains treats it as a
# regular expression, so characters such as '(' ')' '.' would be misinterpreted.
# NOTE(review): plain substring matching can pair up different institutions whose
# names happen to nest; worth spot-checking the renamed rows.
for i in diff1:
    qs_df.loc[qs_df['name'].str.contains(i, regex=False), 'name'] = i
for i in diff2:
    times_df.loc[times_df['name'].str.contains(i, regex=False), 'name'] = i

In [None]:
# Number of names shared by both rankings after the substring-based renaming.
len(
    set(times_df['name'].unique())
    & set(qs_df['name'].unique())
)

In [None]:
# Inner join on the university name: only universities present in both rankings are
# kept. Columns that exist in both frames get the suffixes _x (Times) and _y (QS).
mrg_df = pd.merge(times_df, qs_df, on='name', how='inner')
print(mrg_df.shape)
mrg_df.head()

In [None]:
# Order the columns by label so each _x/_y pair ends up side by side.
mrg_df = mrg_df.sort_index(axis=1)

# 04. Exploratory Analysis

We can simply perform a correlation between all columns to get an overall look at our data. We'll use Spearman's correlation coefficient, as we want to capture more than just linear relationships. Inspection (not shown here) with Pearson's showed similar trends.

This actually gives us two pieces of information. First of all, we can identify where our datasets diverge by comparing the same metric between both sets. Keep in mind this is correlation, so it will show a difference in the trend of these numbers, not in their absolute values. We see that most data between sets have a correlation of > 90%; however, it seems that what counts as 'faculty' is rather different between the two. Indeed, the topuniversities data mentions 'number of academic faculty staff' whereas the Times only says 'staff'.

In [None]:
# Correlate the numeric columns only. The merged frame also holds text columns
# (name, country, region), and DataFrame.corr raises on those in pandas >= 2.0
# instead of silently skipping them.
mrg_df.select_dtypes(include='number').corr(method='spearman')

One of the stronger trends (Spearman's between roughly 0.4-0.6) is between the % of international students and total international students, suggesting that
the more international students there are in a university, the higher the fraction of international students. This could
be due to a sort of "sheep" effect: places that already have large, established international student communities are also where these students represent a majority of the student population.

There are a few weak trends we can observe, with Spearman's coefficients roughly between 0.3 and 0.4:

* Spearman's suggests that there is some relationship between the % of international students and the number of international faculty members, the various correlation coefficients range between 0.29 and 0.41.
* They also show some inverse relationship between the number of students and the % of international students, suggesting that universities with more students have a smaller fraction of international students

Again, these are rather weak trends, and would need further investigating to conclude something from them

# 05. Finding the 'best' university

I believe that the ranking is the number one metric for trying to judge if a university is best. These two datasets have already synthesized lots of information into coming up with this ranking, so we might as well use it. We should try to take into account the difference in rankings in the two data sets. 

The quality of education students receive is also linked to how many staff are available to teach. Of course, a high faculty:student ratio does not automatically mean that all these extra faculty members are helping to teach students, but it's at least an opportunity for students to receive a better education.

We'll therefore use a weighted sum of means to estimate the 'best' university based on both sets. We'll take the mean of the rank from both data sets, and add the mean of the student:staff ratio with a lower weight to re-rank the universities.

In [None]:
# Average each metric over the two rankings (_x = Times, _y = QS after the merge).
mrg_df['mean_rank'] = mrg_df[['rank_display_x','rank_display_y']].mean(axis=1)
# Use both the _x and the _y column: listing '% international students_x' twice
# just returned the Times value and ignored the QS one. Divided by 100 to get a
# fraction between 0 and 1.
# NOTE(review): the accompanying text talks about the student:staff ratio, while
# this computes the share of international students; confirm which was intended.
mrg_df['mean_%_inter'] = mrg_df[['% international students_x','% international students_y']].mean(axis=1)/100

In [None]:
# Lower is better: the mean rank, reduced by the mean international fraction.
mrg_df['new_rank'] = mrg_df['mean_rank'].sub(mrg_df['mean_%_inter'])
# Index the frame by the new score and show the best-placed universities.
mrg_df = mrg_df.set_index('new_rank')
mrg_df.sort_index().head()