# Imports

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

## Task 1

Obtain the 200 top-ranking universities in www.topuniversities.com (ranking 2018). In particular, extract the following fields for each university: name, rank, country and region, number of faculty members (international and total) and number of students (international and total). Some information is not available in the main list and you have to find them in the details page. Store the resulting dataset in a pandas DataFrame and answer the following questions:
- Which are the best universities in terms of: (a) the ratio between faculty members and students, (b) the ratio of international students?
- Answer the previous question aggregating the data by (c) country and (d) region.

## Solution

### Explanation and assumptions

**Explanation**

For this task, we have to retrieve the information in 2 separate steps.

The first step is to retrieve the data (name of university, rank, country and region) using a simple GET query on a file. We used the Postman extension with the interceptor in order to find this file. The file contains a list of the 1000 best ranked universities sorted by rank. This file contains data in the JSON format.

The second step is to retrieve the number of faculty members and the number of students (international and total) for each university by following the details-page URL found in the JSON data. Once on the details page, we simply parse the HTML and extract the needed information.

### Code

In [None]:
# Constants for the QS (topuniversities.com) ranking
TOP_UNIVERSITIES_BASE = 'https://www.topuniversities.com'
TOP_UNIVERSITIES_RANKING = (
    TOP_UNIVERSITIES_BASE
    + '/sites/default/files/qs-rankings-data/357051.txt'
)

# Columns filled straight from the ranking JSON.
column_names = [
    'name',
    'rank_top',
    'country',
    'region',
    'score',
]

# CSS class looked up on the details page -> DataFrame column it fills.
names_map = {
    'total faculty': 'faculty_tot',
    'inter faculty': 'faculty_int',
    'total student': 'student_tot',
    'total inter': 'student_int',
}

In [None]:
# Download the QS ranking file and keep the first 200 entries of its 'data' list.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it show
# up later as a JSON decoding failure.
ranking = requests.get(TOP_UNIVERSITIES_RANKING, timeout=30)
ranking.raise_for_status()
top_200 = ranking.json()['data'][:200]

In [None]:
# Empty frame that fixes the column order: JSON fields first, scraped counters after.
universities_top = pd.DataFrame(columns=[*column_names, *names_map.values()])

def get_details_info(div_name, page=None):
    """Read one integer counter from a university details page.

    Finds the first ``<div>`` whose class matches ``div_name`` and parses the
    text of its nested ``<div class="number">``, ignoring newlines, thousands
    separators and surrounding whitespace.

    Args:
        div_name: CSS class of the outer ``<div>`` (e.g. ``'total student'``).
        page: Parsed document exposing a BeautifulSoup-style ``find`` method.
            When omitted, the module-level ``soup`` is used so that existing
            one-argument calls keep working.

    Returns:
        The counter as an ``int``, or ``float('NaN')`` when the outer div,
        the nested number div, or a parsable number is missing.
    """
    if page is None:
        # Legacy behaviour: the scraping loop rebinds the global `soup` per page.
        page = soup

    div = page.find('div', class_=div_name)
    number = div.find('div', class_='number') if div is not None else None
    if number is None:
        return float('NaN')

    text = number.text.replace("\n", "").replace(",", "").strip()
    try:
        return int(text)
    except ValueError:
        # Empty or non-numeric content is treated like a missing value.
        return float('NaN')

# Collect one record per university, then build the frame in a single step.
# (The original called `.append` on the undefined name `universities`, and
# DataFrame.append no longer exists in pandas >= 2.0.)
rows = []
for university in top_200:
    row = {
        'name': university['title'],
        'rank_top': university['rank_display'],
        'country': university['country'],
        'region': university['region'],
        'score': university['score'],
    }

    # Faculty and student counters are only available on the details page.
    details_page_url = TOP_UNIVERSITIES_BASE + university['url']
    r = requests.get(details_page_url, timeout=30)
    # get_details_info reads the module-level `soup`, so rebind it for each page.
    soup = BeautifulSoup(r.text, 'html.parser')

    for div_class, column_name in names_map.items():
        row[column_name] = get_details_info(div_class)

    rows.append(row)

# Reuse the column order of the empty frame created above, and keep the
# indexed result (set_index returns a new frame rather than modifying in place).
universities_top = pd.DataFrame(rows, columns=universities_top.columns)
universities_top = universities_top.set_index(['name'])

In [None]:
# Bare expression: rendered as the cell output when run in a notebook.
universities_top

## Task 2

Obtain the 200 top-ranking universities in www.timeshighereducation.com (ranking 2018). Repeat the analysis of the previous point and discuss briefly what you observed.

## Solution

### Explanation and assumptions

### Code

In [None]:
# Constants for the Times Higher Education ranking
TIMES_BASE = 'https://www.timeshighereducation.com'
TIMES_RANKING = (
    TIMES_BASE
    + '/sites/default/files/the_data_rankings/'
    + 'world_university_rankings_2018_limit0_'
    + '369a9045a203e176392b9fb8f8c1cb2a.json'
)

# Columns copied from the ranking JSON.
column_names = [
    'name',
    'rank_times',
    'country',
    'score',
]

# Only the values (column names) are used when laying out the Task 2 frame.
names_map = {
    'total faculty': 'faculty_tot',
    'total student': 'student_tot',
    'total inter': 'student_int',
}

In [None]:
# Download the Times Higher Education ranking file and keep the first 200
# entries of its 'data' list (the original slice stopped at 20).
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it show
# up later as a JSON decoding failure.
ranking = requests.get(TIMES_RANKING, timeout=30)
ranking.raise_for_status()
top_200 = ranking.json()['data'][:200]

In [None]:
# Empty frame that fixes the column order: JSON fields first, derived counters after.
universities_times = pd.DataFrame(columns=[*column_names, *names_map.values()])

def to_int(value):
    """Parse a formatted number such as ``'1,234'`` or ``'27%'`` into an int.

    Commas and percent signs are removed before the conversion.
    """
    digits = value
    for symbol in (",", "%"):
        digits = digits.replace(symbol, "")
    return int(digits)

# Collect one record per university, then build the frame in a single step.
# (The original called `.append` on the undefined name `universities`, and
# DataFrame.append no longer exists in pandas >= 2.0.)
rows = []
for university in top_200:
    # These figures are handled as formatted strings here, so parse them once
    # up front and do all arithmetic on numbers.
    student_tot = to_int(university['stats_number_students'])
    intl_percent = to_int(university['stats_pc_intl_students'])

    rows.append({
        'name': university['name'],
        'rank_times': university['rank'],
        'country': university['location'],
        'score': university['scores_overall'],
        # Staff count is derived from the student/staff ratio, rounded up.
        'faculty_tot': np.ceil(student_tot / float(university['stats_student_staff_ratio'])),
        # Stored as a number (not the raw formatted string) so ratios can be computed.
        'student_tot': student_tot,
        # Multiply before dividing: n * (p / 100) can land just above an exact
        # integer in floating point, which ceil would then round up by one.
        'student_int': np.ceil(student_tot * intl_percent / 100),
    })

# Reuse the column order of the empty frame created above.
universities_times = pd.DataFrame(rows, columns=universities_times.columns)
universities_times = universities_times.set_index(['name'])

In [None]:
# Bare expression: rendered as the cell output when run in a notebook.
universities_times