In [None]:
# Import needed libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn
import re
import pickle
%matplotlib inline

# 01. Scraping info from topuniversities.com

In [None]:
# Root of the QS website; the paths requested later in the notebook are appended to it.
qs_url = 'https://www.topuniversities.com'

## Initial postman/parsing
We first need the URL that holds the data we actually want to parse. Using Postman, we can see that the ranking data shown on the page is loaded through a request to the address stored under `rank_url`, so this is the link we need to GET in order to extract all the data we're interested in.

In [None]:
# Download the 2018 QS ranking page; its HTML is searched afterwards for the
# address of the file that holds the ranking data.
resp = requests.get(qs_url + '/university-rankings/world-university-rankings/2018')
# Stop here on an HTTP error (4xx/5xx) rather than silently searching an error page.
resp.raise_for_status()

In [None]:
# Locate the data-file address embedded in the page: it starts at the first
# 'http' following the 'rank_url' key and ends with the next '.txt'.
# str.index is used instead of str.find: find returns -1 when a marker is
# missing, which would silently produce a garbage slice, whereas index raises
# ValueError so a change in the page layout is noticed immediately.
start = resp.text.index('rank_url')
start = resp.text.index('http', start)
stop = resp.text.index('.txt', start)
qs_data_url = resp.text[start:stop+len('.txt')]
print(qs_data_url)

We've still got all the escape characters, in this case backslashes, so we'll have to replace them:

In [None]:
# Delete every backslash (escape character) from the extracted address.
qs_data_url = qs_data_url.translate({ord('\\'): None})
print(qs_data_url)

## Scraping the main data and putting everything into a `DataFrame`
We now scrape everything that is served at the `rank_url` address. This is the majority of what we are interested in; the faculty and student data are contained on a separate page that is specific to each university, which will be scraped afterwards.
Handily enough, the data from `rank_url` is in `JSON` format, so we'll use the `JSON` parsing capabilities of `requests`.

In [None]:
# Hard-coded address of the data file, kept for reference only: it is NOT used
# by the request below, which relies on the address extracted from the page
# (qs_data_url). NOTE(review): presumably the value seen when the notebook was
# written -- confirm before relying on it.
rank_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt'
rank_data = requests.get(qs_data_url)
# Surface HTTP errors here instead of as an obscure JSON decoding failure.
rank_data.raise_for_status()
parsed_data = rank_data.json()
parsed_data.keys()

We've got a `Dict` with only one key, so let's have a look into it.

In [None]:
# Show the type and the length of the 'data' payload (one per line), then its first record.
print(type(parsed_data['data']), len(parsed_data['data']), sep='\n')
parsed_data['data'][0]

We've got a list of 959 entries. Not too surprisingly the list we get in the `data` key is conveniently organised from highest to lowest ranked.

Let's now put all of this data into a single `DataFrame`. We're only interested in the top 200 universities, so we'll ignore the rest of the set.

In [None]:
# Build the frame in a single call from the first 200 records.
# Growing a DataFrame row by row with DataFrame.append copies the whole frame
# on every iteration (quadratic) and the method no longer exists in pandas 2.x;
# passing the list of dicts to the constructor gives the same rows with a
# 0..199 index.
qs_df = pd.DataFrame(parsed_data['data'][:200])
print(qs_df.shape)
qs_df.head()

We've now got a `DataFrame` with the main information, but we still need to scrape the individual page of each university, which contains the total and international numbers of both faculty members and students.

## Scraping the specific page for each university

We will first define a handy little function to extract numbers from strings with newlines and commas, for example from `\n1,300` we want to extract only the `1300`

In [None]:
def xtract_number(str_in):
    """Return the first run of digits found in ``str_in``, ignoring commas.

    Commas are removed before searching, so thousands separators do not
    split the number: ``'\\n1,300'`` gives ``'1300'``.

    Parameters
    ----------
    str_in : str
        Text containing a number, possibly surrounded by other characters.

    Returns
    -------
    str
        The matched digits, as a string (not converted to a numeric type).

    Raises
    ------
    AttributeError
        If ``str_in`` contains no digit (the regex search returns ``None``).
    """
    digits_found = re.search(r'\d+', ''.join(str_in.split(',')))
    return digits_found.group()

Let's add the extra columns that we're going to populate

In [None]:
# Names of the per-university figures to scrape; they double as the CSS class
# names searched for in each university page.
columns_to_add = ['total faculty','inter faculty','total student','total inter']
# Append the columns as empty placeholders. Only the names not already present
# are added, so re-running this cell does not create duplicate columns.
qs_df = pd.concat(
    [qs_df, pd.DataFrame(columns=[name for name in columns_to_add if name not in qs_df.columns])],
    axis=1,
)
qs_df.head()

In [None]:
#Additional information is contained in the following tags
#<h3> Number of international students
#<h3> Number of students
#<h3> Number of academic faculty staff --> <div class="anno">In total & <div class="anno">International

The page containing the additional data we're looking for is already referenced in the `url` field of our `DataFrame`. Each value we're looking for is contained in a `<div>` tag whose `class` is named after the quantity in question (e.g. `total faculty`); within this tag is another `<div>` tag with `class="number"` which holds the actual numeric value. We're therefore going to fetch the page of each university and use `BeautifulSoup` to find all these tags. As there are several of them on each page, we'll double-check that they all hold the same value.
The following step is very slow: it has to download and parse a lot of HTML for 200 entries.

In [None]:
# For every university, download its own page and fill in the columns listed
# in columns_to_add. Entries for which the page holds no figure are reported
# and left untouched (NaN).
for idx in qs_df.index:
#for idx in qs_df.index[:5]: # use this line instead of the one above to only parse the first few universities
    university = qs_df.loc[idx, 'title']
    page = requests.get(qs_url + qs_df.loc[idx, 'url']) # GET the page for one university
    soup = BeautifulSoup(page.text, 'html.parser') # parse it with bs4

    for column in columns_to_add:
        # Collect the figure from EVERY <div class="<column>"> on the page
        # (indexing the first match each time would make the comparison
        # below meaningless).
        values = []
        for wrapper in soup.find_all('div', class_=column): # find the tags of interest
            number_tag = wrapper.find('div', class_='number')
            if number_tag is None:
                continue # this occurrence carries no figure
            text = number_tag.get_text()
            if re.search(r'\d', text): # skip occurrences without any digit
                values.append(float(xtract_number(text)))

        if not values:
            print('No data for', university, 'concerning', column)
            continue

        # If there are several tags, they must all agree.
        if any(value != values[0] for value in values[1:]):
            raise Exception('Numerical values for', university, 'are different throughout the HTML')

        # Single .loc[row, column] assignment: chained indexing
        # (.loc[idx][column] = ...) may write to a temporary copy.
        qs_df.loc[idx, column] = values[0]

Furthermore, we can see that some data is missing for New York University and the Indian Institute of Science.
Going to the website and checking this by hand does indeed show that these pieces of information are missing. We'll therefore leave these as NaN to signify the missing data.

Given the time to GET and parse all this HTML, we've stored the `DataFrame` in a pickle for convenience:

In [None]:
#pickle.dump( qs_df, open( "qs_dataframe.p", "wb" ) )
#qs_df = pickle.load( open( "qs_dataframe.p", "rb" ) )

Let's drop some of the extra columns that we don't really need:

In [None]:
# Remove, in place, the columns that are not needed for the analysis.
qs_df.drop(columns=['core_id', 'guide', 'logo', 'nid', 'url'], inplace=True)

## Which are the best universities?

We are now going to compare the best universities in terms of (a) the ratio of faculty members to students and (b) the percentage of international students. Let's add these two columns; they are simple operations on the existing columns.

In [None]:
# Faculty members per student.
qs_df['faculty:students ratio'] = qs_df['total faculty'].div(qs_df['total student'])
# Share of international students among all students, in percent.
qs_df['% international students'] = qs_df['total inter'].mul(100).div(qs_df['total student'])

In [None]:
# Preview the first rows, including the two columns computed from the scraped figures.
qs_df.head()

In [None]:
# Rank the universities by faculty:students ratio, highest first (returns a sorted copy; qs_df itself is not reordered).
qs_df.sort_values('faculty:students ratio' , ascending=False)

In [None]:
# Rank the universities by percentage of international students, highest first (returns a sorted copy; qs_df itself is not reordered).
qs_df.sort_values('% international students' , ascending=False)

In [None]:
# List the distinct values found in the 'country' column.
qs_df['country'].unique()

# 02. Scraping top 200 universities from Times Higher Education

We have the same issue as before: the HTML of the main page doesn't contain the data that we actually want; rather, it is loaded by jQuery from a `json` file located elsewhere on the site. Using Postman and inspecting the HTML, we see that only one `json` file is loaded on the ranking page, so we'll simply do some string handling to extract the URL of interest from the HTML.

In [None]:
# Download the 2018 Times Higher Education ranking page; its HTML is searched
# afterwards for the address of the json file that holds the data.
times_url = 'https://www.timeshighereducation.com/world-university-rankings/2018/world-ranking'
resp = requests.get(times_url)
# Stop here on an HTTP error (4xx/5xx) rather than silently searching an error page.
resp.raise_for_status()

In order to extract the url we want, we're first going to find where the "json" at the end of the url is located. We'll then use `rfind` to find the "http" at the beginning of this url.

In [None]:
# The address ends at the first 'json' in the page and starts at the last
# 'http' that precedes it.
# str.index / str.rindex are used instead of find / rfind: the latter return
# -1 when the marker is missing, which would silently produce a garbage
# slice, whereas index / rindex raise ValueError.
stop = resp.text.index('json')
start = resp.text.rindex('http', 0 , stop)
times_data_url = resp.text[start:stop+len('json')]
print(times_data_url)

As before, we need to filter out all the backslashes:

In [None]:
# Delete every backslash (escape character) from the extracted address.
times_data_url = times_data_url.translate({ord('\\'): None})
print(times_data_url)

In [None]:
# Download the json file that holds the Times ranking data.
times_data = requests.get(times_data_url)
# Surface HTTP errors here instead of as an obscure JSON decoding failure later on.
times_data.raise_for_status()

In [None]:
# Peek at the first 300 characters of the raw response body.
times_data.text[:300]

In [None]:
# Decode the response body as JSON.
times_parsed = times_data.json()

We've got some other keys than just the `data` one, but they don't seem of use for what we're looking for. `location` is already contained in the main `data` key-value pair

In [None]:
# List the top-level keys of the decoded payload.
times_parsed.keys()

Again, it looks like the `list` we get in the `data` key is conveniently organised from highest to lowest ranked:

In [None]:
# Display the first record of the 'data' list to see which fields each entry carries.
times_parsed['data'][0]

In [None]:
# Build the frame in a single call from the first 200 records.
# Growing a DataFrame row by row with DataFrame.append copies the whole frame
# on every iteration (quadratic) and the method no longer exists in pandas 2.x;
# passing the list of dicts to the constructor gives the same rows with a
# 0..199 index.
times_df = pd.DataFrame(times_parsed['data'][:200])
print(times_df.shape)
times_df.head()

In [None]:
# Show the column labels of the Times frame.
times_df.columns