In [None]:
# Import needed libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import seaborn
import re
import pickle
%matplotlib inline

# 01. Scraping info from topuniversities.com

In [None]:
# Base address of the QS site; relative paths are appended to it when requesting pages.
qs_url = 'https://www.topuniversities.com'

## Initial postman/parsing
Trying to get the url which contains the actual data that we want to parse

In [None]:
# Download the 2018 QS world-ranking landing page and parse its HTML.
ranking_page_url = qs_url + '/university-rankings/world-university-rankings/2018'
resp = requests.get(ranking_page_url)
soup = BeautifulSoup(resp.text, 'html.parser')

In [None]:
# Collect every <script type="text/javascript"> element of the parsed page.
scripts = soup.find_all('script', type='text/javascript')
# Number of scripts found.
len(scripts)

In [None]:
# Print the position of every <script> whose text mentions `rank_url`,
# so the script that references the ranking data can be inspected by index.
for i, script in enumerate(scripts):
    if 'rank_url' in script.text:
        print(i)

In [None]:
# Display the script at index 28 -- presumably the index printed by the search
# loop above; it depends on the live page, so re-check it if the site changes.
scripts[28]

In [None]:
# NOTE: only the final expression of a cell is echoed, so the slice on the
# next line is evaluated but its result is not shown.
scripts[28].text[58:]
# Number of characters in the script's text.
len(scripts[28].text)

In [None]:
# Character offset of the first 'rank_url' occurrence in the script text (-1 if absent).
scripts[28].text.find('rank_url')

In [None]:
# Show the 400 characters starting at offset 14778 -- presumably the offset
# reported by the find('rank_url') call above; confirm if the page changes.
scripts[28].text[14778:15178]

In [None]:
# How to parse headers to a python dict?
# `resp.headers` is already a dict-like mapping; `dict(resp.headers)` would give a plain dict.
resp.headers

The actual ranking data shown on the page is generated with a request to `rank_url`; it is therefore this
link that we'll need to GET to extract all the data we're interested in.

## Scraping the main data and putting everything into a `DataFrame`
Scraping everything that is contained in the `rank_url`. This is the majority of what we are interested in, the faculty and student data are contained on another page that is specific to each university. This will be scraped afterwards.
Handily enough, the data from `rank_url` is in `JSON` format, so we'll use the `JSON` parsing capabilities of 
`requests`

In [None]:
# Address of the file holding the ranking table (a .txt file whose body is JSON).
rank_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt'
rank_data = requests.get(rank_url)
# Decode the response body as JSON.
parsed_data = rank_data.json()
# Echo the parsed structure. NOTE(review): this displays the whole payload, which may be very long.
parsed_data

We've got a `dict` with only one key, so let's have a look into it.

In [None]:
# List the top-level keys of the parsed JSON.
parsed_data.keys()

Not too surprisingly the list we get in the `data` key is conveniently organised from highest to lowest ranked:

In [None]:
# NOTE: the two type() calls are not the cell's last expression, so their
# results are not displayed; the trailing comments record the expected types.
type(parsed_data) # dict
type(parsed_data['data']) #list
# List is already organised based on rank (with indexing starting at 0):
parsed_data['data'][3]['rank_display']

In [None]:
# Look at one full record near the end of the first 200 entries (index 197).
parsed_data['data'][197]

Let's now put all of this data into a single `DataFrame`. We're only interested in the top 200 universities, so we'll ignore the rest of the set.

In [None]:
# Build the table in one step from the first 200 records (the list is already
# ordered by rank). Constructing the frame once from a list of dicts replaces
# row-by-row DataFrame.append, which copied the whole frame on every iteration
# and no longer exists in pandas >= 2.0.
qs_data = pd.DataFrame(parsed_data['data'][:200])
print(qs_data.shape)
qs_data.head()

## Scraping the specific page for each university

We will first define a handy little function to extract numbers from strings with newlines and commas:

In [None]:
def xtract_number(str):
    """Return the first run of digits found in ``str`` once commas are removed.

    The result is a string of digits, not an int. If the text holds no digit
    at all, ``re.search`` yields ``None`` and an AttributeError is raised.
    """
    without_commas = ''.join(str.split(','))
    first_digits = re.search(r'\d+', without_commas)
    return first_digits.group()

Let's add the extra columns that we're going to populate

In [None]:
# Labels of the per-university figures that are filled in from each detail page.
columns_to_add = ['total faculty','inter faculty','total student','total inter']
# Widen the table by concatenating an empty frame that carries only those labels.
empty_extra_columns = pd.DataFrame(columns=columns_to_add)
qs_data = pd.concat([qs_data, empty_extra_columns], axis=1)
qs_data.head()

Additional information is contained in the following tags:
- `<h3>` Number of international students
- `<h3>` Number of students
- `<h3>` Number of academic faculty staff --> `<div class="anno">` In total & `<div class="anno">` International

The following step is very slow: it has to parse a lot of HTML for 200 entries.

In [None]:
# Visit every university's detail page and fill in the staff/student columns.
# This is slow (one HTTP request plus one HTML parse per row); while
# experimenting, iterate over range(0, 11) instead of the full index.
for idx in qs_data.index:
    page = requests.get(qs_url + qs_data.loc[idx, 'url'])
    soup = BeautifulSoup(page.text, 'html.parser')

    for column in columns_to_add:
        try:
            # Each figure sits in a <div> whose class attribute equals the
            # column name; the second match is the one read here.
            wrapper = soup.find_all('div', class_=column)
            value = xtract_number(wrapper[1].find('div', class_='number').string)
        except (IndexError, AttributeError):
            # IndexError: fewer than two matching <div>s on the page.
            # AttributeError: no inner "number" <div>, or no digits in its text.
            # Either way the cell is left as NaN to mark the missing figure.
            print('No data for', qs_data.loc[idx, 'title'], 'concerning', column)
        else:
            # One .loc[row, col] call: the chained form .loc[idx][column] can
            # assign to a temporary copy and leave qs_data unchanged.
            qs_data.loc[idx, column] = value

Furthermore, we can see that some data is missing for New York University and the Indian Institute of Science.
Going to the website and checking this by hand does indeed show that these pieces of information are missing. We'll therefore leave these as NaN to signify the missing data.

In [None]:
# Show the first 12 rows to check the newly filled columns.
qs_data.head(12)

In [None]:
# Persist the scraped table so the slow scraping step need not be repeated.
# The context manager closes the file handle, so the data is fully flushed to disk.
with open("qs_dataframe.p", "wb") as pickle_file:
    pickle.dump(qs_data, pickle_file)

In [None]:
# Reload the cached table; the context manager closes the file handle afterwards.
# Only unpickle files you created yourself: pickle.load can run arbitrary code
# embedded in an untrusted file.
with open("qs_dataframe.p", "rb") as pickle_file:
    qs_df = pickle.load(pickle_file)

In [None]:
# Quick check that the reloaded table looks as expected.
qs_df.head()

# 02. Scraping top 200 universities from Times Higher Education

We have the same issue as before, the HTML from the main page given doesn't contain the data that we actually
want, rather it is loaded with a jQuery from a `json` somewhere else on the site. Using Postman and inspecting the html, there is only one `json` loaded on the ranking page, so we'll simply do some string handling to extract
the url of interest from the HTML.

In [None]:
# Landing page of the Times Higher Education 2018 world ranking.
times_url = 'https://www.timeshighereducation.com/world-university-rankings/2018/world-ranking'
# NOTE: this rebinds `resp`, which previously held the QS ranking page response.
resp = requests.get(times_url)

In order to extract the url we want, we're first going to find where the "json" at the end of the url is located. We'll then use `rfind` to find the "http" at the beginning of this url.

In [None]:
# Offset of the first occurrence of "json" in the page source (-1 if absent).
stop = resp.text.find('json')
# Last "http" that begins before that offset, i.e. the start of the URL that ends in "json".
start = resp.text.rfind('http', 0 , stop)
# Slice out the URL, keeping the trailing "json".
times_data_url = resp.text[start:stop+len('json')]
print(times_data_url)

We've still got all the escape characters, in this case backslashes, so we'll have to replace them:

In [None]:
# Drop every backslash from the extracted URL (the slashes are escaped in the page source).
times_data_url = times_data_url.replace('\\' , '')
print(times_data_url)

In [None]:
# Download the JSON document that backs the Times Higher Education ranking table.
times_data = requests.get(times_data_url)
# The body is decoded with .json() in a later cell, after a quick look at the raw text.

In [None]:
# Peek at the first 300 characters of the raw response body.
times_data.text[:300]

In [None]:
# Decode the response body as JSON.
times_parsed = times_data.json()

We've got some other keys than just the `data` one, but they don't seem of use for what we're looking for. `location` is already contained in the main `data` key-value pair

In [None]:
# List the top-level keys of the parsed JSON.
times_parsed.keys()

Again, it looks like the `list` we get in the `data` key is conveniently organised from highest to lowest ranked:

In [None]:
# Inspect the first record of the `data` list.
times_parsed['data'][0]

In [None]:
# Build the table in one step from the first 200 records of the `data` list.
# Constructing the frame once from a list of dicts replaces row-by-row
# DataFrame.append, which copied the whole frame on every iteration and no
# longer exists in pandas >= 2.0.
times_df = pd.DataFrame(times_parsed['data'][:200])
print(times_df.shape)
times_df.head()

In [None]:
# List the column labels of the Times table (the original attribute name was
# a typo and raised AttributeError).
times_df.columns