In [None]:
# Import needed libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import seaborn
import re
%matplotlib inline

# Scraping info from topuniversities.com

In [None]:
# Root URL of the site; relative paths are appended to it when building request URLs
base_url = 'https://www.topuniversities.com'

## Initial postman/parsing
We first look for the URL that serves the actual data we want to parse.

In [None]:
# Download the 2018 world-ranking landing page and parse its HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of parsing an error page.
resp = requests.get(base_url + '/university-rankings/world-university-rankings/2018', timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

In [None]:
# Gather every <script> tag declared as JavaScript, then show how many were found
scripts = soup.find_all('script', attrs={'type': 'text/javascript'})
len(scripts)

In [None]:
# Print the position of every script whose text mentions 'rank_url'
for position, script_tag in enumerate(scripts):
    if 'rank_url' in script_tag.text:
        print(position)

In [None]:
# Inspect one <script> tag by position.
# NOTE(review): 28 is hard-coded — presumably the index printed by the 'rank_url' search; confirm, since it depends on the page layout at scrape time
scripts[28]

In [None]:
# NOTE: only a cell's last expression is displayed, so this slice is evaluated but its value is not shown
scripts[28].text[58:]
# Length (in characters) of the script's text
len(scripts[28].text)

In [None]:
# Character offset of the first 'rank_url' occurrence in the script text (-1 if absent)
scripts[28].text.find('rank_url')

In [None]:
# Show a 400-character window of the script text.
# NOTE(review): the offsets are hard-coded — presumably chosen around the 'rank_url' position; confirm
scripts[28].text[14778:15178]

In [None]:
# `resp.headers` is already a dict-like mapping (requests' case-insensitive dict),
# so passing it to dict() is all that is needed to get a plain Python dict.
dict(resp.headers)

The actual ranking data shown on the page is generated by a request to `rank_url`; it is therefore this
link that we need to GET in order to extract all the data we're interested in.

## Scraping the main data and putting everything into a `DataFrame`
Scraping everything that is contained in the `rank_url`. This is the majority of what we are interested in, the faculty and student data are contained on another page that is specific to each university. This will be scraped afterwards in the next section

In [None]:
# URL that serves the ranking table as JSON
rank_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt'
# The timeout guards against a hung connection; raise_for_status() fails fast on an
# HTTP error rather than letting .json() choke on an error page.
rank_data = requests.get(rank_url, timeout=30)
rank_data.raise_for_status()
parsed_data = rank_data.json()
# Preview a few entries instead of dumping the whole payload into the notebook output
parsed_data['data'][:3]

In [None]:
# NOTE: only the last expression of the cell is displayed; the two type() calls are evaluated but not shown
type(parsed_data) # dict
type(parsed_data['data']) #list
# List is already organised based on rank (with indexing starting at 0):
parsed_data['data'][3]['rank_display']

In [None]:
# Look at a single entry (list position 197) to see which fields each record carries
parsed_data['data'][197]

Let's now put all of this data into a single `DataFrame`. We're only interested in the top 200 universities, so we'll ignore the rest of the set.

In [None]:
# Keep only the first 200 records of the ranking list.
# Building the frame in a single call from the list of records replaces the
# row-by-row DataFrame.append loop, which is quadratic and was removed in pandas 2.0.
# Slicing also copes with a list shorter than 200 instead of raising IndexError.
qs_data = pd.DataFrame(parsed_data['data'][:200])
print(qs_data.shape)
qs_data.head()

## Scraping the specific page for each university

We will first define a handy little function to extract numbers from strings with newlines and commas:

In [None]:
def xtract_number(str):
    """Return the first run of digits found in `str`, as a string.

    Commas are stripped first, so a thousands-separated figure such as
    '1,234' comes back as '1234'. Surrounding text and newlines are ignored.
    """
    without_commas = str.replace(',', '')
    first_digits = re.search(r'\d+', without_commas)
    return first_digits.group()

Let's add the extra columns that we're going to populate

In [None]:
# Extra columns that the per-university scrape fills in
columns_to_add = ['total faculty','inter faculty','total student','total inter']
# Append only the columns that are not present yet, so re-running this cell
# does not create duplicate column names.
missing_columns = [column for column in columns_to_add if column not in qs_data.columns]
qs_data = pd.concat([qs_data, pd.DataFrame(columns=missing_columns)], axis=1)
qs_data.head(12)

Additional information is contained in the following tags:

- `<h3>` Number of international students
- `<h3>` Number of students
- `<h3>` Number of academic faculty staff --> `<div class="anno">` In total & `<div class="anno">` International

The following step is very slow: it has to fetch and parse a lot of HTML for 200 entries.

In [None]:
# Visit each university's own page and pull out the figures for the extra columns.
for idx in qs_data.index:
    # Timeout so that one stalled request cannot hang the whole (already slow) loop
    page = requests.get(base_url + qs_data.loc[idx, 'url'], timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    for column in columns_to_add:
        try:
            # The column name doubles as the CSS class of the <div> wrapping the figure;
            # the second match is the one read here.
            wrapper = soup.find_all('div', class_=column)
            value = xtract_number(wrapper[1].find('div', class_='number').string)
            # Single .loc[row, col] assignment: the chained form
            # qs_data.loc[idx][column] = ... may write to a temporary copy
            # and leave qs_data itself unchanged.
            qs_data.loc[idx, column] = value
        except (IndexError, AttributeError):
            # IndexError: fewer than two matching wrappers.
            # AttributeError: no 'number' div, or no digits could be extracted.
            print('No data for', qs_data.loc[idx, 'title'], 'concerning', column)

Furthermore, we can see that some data is missing for New York University and the Indian Institute of Science.
Going to the website and checking this by hand does indeed show that these pieces of information are missing. We'll therefore leave these as NaN to signify the missing data.

In [None]:
# Show the last rows of the frame, including the newly added columns
qs_data.tail()