In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from urllib.parse import urlparse
%matplotlib inline

# Collecting a comprehensive list of universities

## Find list of all countries over the world

In [None]:
# List of all countries of the world, taken from https://en.wikipedia.org/wiki/Education_Index.
# header=None tells pandas not to use any row as a header, so the columns get
# integer labels (the country names are read from column 0 further down).
df_ei = pd.read_excel('world_list_education_index.xlsx', header=None)

In [None]:
# Plain Python list of the values in column 0 of the education-index sheet.
# NOTE(review): this snapshot is taken before the manual overwrites of rows
# 168 and 122 in the following cell, so it keeps the original entries.
countries_ei = df_ei[0].tolist()

In [None]:
# Manually overwrite two entries of the country column (row labels 168 and 122)
# -- presumably to normalise their spelling; confirm against the spreadsheet.
# A single .loc[row, column] assignment is used instead of chained indexing
# (df_ei[0][168] = ...): the chained form may write to a temporary copy and
# silently leaves df_ei unchanged when pandas Copy-on-Write is active.
df_ei.loc[168, 0] = 'Switzerland'
df_ei.loc[122, 0] = 'Nepal'

## Find databases of universities over the world

### List of all countries from univ.cc

In [None]:
# Download the raw HTML of the univ.cc world search page; its <option>
# elements are parsed for country names in the cells below.
html_univ_world = requests.get("http://univ.cc/world.php").text

In [None]:
# Parse the downloaded page with Python's built-in HTML parser.
univ_soup = BeautifulSoup(html_univ_world, 'html.parser')

In [None]:
# One name per <option>, skipping the first option; everything from the first
# "(" onwards is cut off and surrounding whitespace removed.
countries_univ = [
    entry.text.partition('(')[0].strip()
    for entry in univ_soup.find_all('option')[1:]
]

### List of all countries in Shanghai ranking

In [None]:
# Download the Shanghai ranking search page and parse it; the country names
# are read from its <option> elements in the next cell.
html_shanghai = requests.get("http://www.shanghairanking.com/Search.html").text
shanghai_soup = BeautifulSoup(html_shanghai, 'html.parser')

In [None]:
# One name per <option>, leaving out the first and the last option.
countries_shanghai = [
    entry.text.strip()
    for entry in shanghai_soup.find_all('option')[1:-1]
]

## Are all universities over the world represented?

Since univ.cc contains the most countries, we first check whether all countries contained in the Shanghai ranking are present in it. Afterwards we verify that the list is very comprehensive by cross-checking it against the Wikipedia Education Index list.

In [None]:
# Report how many entries each of the three country lists holds.
print(
    "The education index list contains {} countries".format(len(countries_ei)),
    "The univ.cc list contains {} countries".format(len(countries_univ)),
    "The Shanghai list contains {} countries".format(len(countries_shanghai)),
    sep="\n",
)

In [None]:
# Shanghai-ranking entries whose name does not occur in the univ.cc list.
df_shanghai = pd.DataFrame({'country': countries_shanghai})
df_shanghai.loc[~df_shanghai['country'].isin(countries_univ)]

In [None]:
# Check whether these spellings occur in the univ.cc list (one result per line).
print(
    'Czech Republic' in countries_univ,
    'Hong Kong' in countries_univ,
    'Macau' in countries_univ,
    'Korea, South' in countries_univ,
    'Taiwan' in countries_univ,
    sep='\n',
)
print('United States is missing because univ.cc has a seperate list for the united states')

In [None]:
# Education-index rows whose column-0 value does not occur in the univ.cc list.
df_ei.loc[~df_ei[0].isin(countries_univ)]

In [None]:
# Check whether these spellings occur in the univ.cc list (one result per line).
print(
    'Brunei' in countries_univ,
    'Congo, Republic of the' in countries_univ,
    'Congo, Democratic Republic of the' in countries_univ,
    'Guinea' in countries_univ,
    'Korea, North' in countries_univ,
    'Korea, South' in countries_univ,
    'Laos' in countries_univ,
    'Moldova' in countries_univ,
    'Burma' in countries_univ,
    'Palestine' in countries_univ,
    'Russia' in countries_univ,
    'Syria' in countries_univ,
    'Macedonia' in countries_univ,
    sep='\n',
)

# Free-text notes about the entries that remain unmatched.
print('\nPalau Community College is a two-year college in the Republic of Palau, and the only school of higher education in the nation, not necesarry to include\n')
print('The University of the South Pacific is a regional university serving 12 member countries: Cook Islands, Fiji Islands, Kiribati, Marshall Islands, Nauru, Niue, Samoa, Solomon Islands, Tokelau, Tonga, Tuvalu and Vanuatu.\n')
print('United States is missing because univ.cc has a seperate list for the united states\n')
print('Only for Timor-Leste and Sao Tome and Principe, which both have 1 university, the country was not found in the univ.cc list which makes it a very comprehensive list')

## Create comprehensive university list

In [2]:
def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed with the built-in html.parser."""
    # A timeout keeps one unresponsive request from stalling the whole crawl.
    return BeautifulSoup(requests.get(url, timeout=30).text, 'html.parser')


def _university_rows(soup, country):
    """Return one ``[country, name, host]`` row per linked ``<li>`` in ``soup``."""
    rows = []
    for item in soup.find_all('li'):
        link = item.find('a')
        # A list item without a link carries no university entry; skip it
        # rather than failing on ``None``.
        if link is None or not link.get('href'):
            continue
        rows.append([country, link.text, urlparse(link['href']).netloc])
    return rows


def _result_count(soup):
    """Return the integer found as second word of the first ``<p>``, or 0 if it cannot be read."""
    summary = soup.find('p')
    if summary is None:
        return 0
    try:
        return int(summary.text.split(' ')[1])
    except (IndexError, ValueError):
        return 0


def create_university_df():
    """Scrape univ.cc and return one row per distinct university website.

    The country/state ``<option>`` lists of the world and the US-states page
    are read first; every option is then searched separately so that each
    university can be stored together with its country. Entries coming from
    the states page are all labelled "United States".

    Returns:
        pandas.DataFrame with the columns ``Country``, ``University`` and
        ``Website`` (the host part of the university link). Rows with an
        already seen ``Website`` are dropped.
    """
    base_urls = ["http://univ.cc/world.php", "http://univ.cc/states.php"]
    search_url = "http://univ.cc/search.php?dom="
    next_p = "&start="
    step = 50  # number of results univ.cc is expected to show per page

    # Pairs of (country name stored with each row, option value used in the search URL).
    search_targets = []
    for i, url in enumerate(base_urls):
        for option in _fetch_soup(url).find_all('option')[1:]:
            label = option.text.split(sep='(')[0].strip()
            country = "United States" if i == 1 else label
            search_targets.append((country, option['value']))

    unis = []
    for country, code in search_targets:
        url = search_url + code
        first_page = _fetch_soup(url)
        unis.extend(_university_rows(first_page, country))

        total = _result_count(first_page)
        # Offsets appear to be 1-based (page two starts at step + 1), so the
        # range has to run up to and including ``total``; stopping at ``total``
        # skips a last page that holds exactly one entry (total == k*step + 1).
        for start in range(step + 1, total + 1, step):
            page = _fetch_soup(url + next_p + str(start))
            unis.extend(_university_rows(page, country))

    df = pd.DataFrame(unis, columns=['Country', 'University', 'Website'])
    return df.drop_duplicates(subset=['Website'])

In [3]:
# Run the full univ.cc crawl (one or more HTTP requests per country/state option).
uni_df = create_university_df()

In [25]:
# Persist the crawl result; with no index argument given, pandas also writes
# the DataFrame index as the first column of the file.
uni_df.to_csv('university_list_countries.csv')

# Is it OK to only consider English search terms?
Check the official languages of each country, or the languages in which the university websites are offered.

In [121]:
def provided_languages(website):
    """Return the language tags declared in the markup of ``website``.

    The page is downloaded and the value of every attribute whose name
    contains ``lang`` (e.g. ``lang``, ``xml:lang``, ``hreflang``) is collected
    in document order. Duplicates are kept.

    Args:
        website: host name or URL; ``http://`` is prepended when no
            ``http://`` / ``https://`` scheme is present.

    Returns:
        The collected values joined with "," -- an empty string when nothing
        was found or the request failed. The request error (if any) and the
        result are also printed.
    """
    # Only add a scheme when none is given. The former substring test
    # ('http://' not in website) turned "https://..." into "http://https://...".
    if not website.startswith(('http://', 'https://')):
        website = 'http://' + website

    languages = []
    try:
        # A timeout is raised as a RequestException subclass, so a hanging
        # host is reported and skipped like any other failed request.
        r = requests.get(website, timeout=30).text
        h = BeautifulSoup(r, 'html.parser')
        for tag in h.find_all(True):
            for name, value in tag.attrs.items():
                if 'lang' in name:
                    languages.append(value)
    except requests.exceptions.RequestException as e:
        print(e)

    result = ",".join(languages)
    print("website {}: {}".format(website, result))
    return result

In [None]:
# Probe every collected website (one HTTP request per row); the resulting
# Series is only displayed, not stored.
uni_df['Website'].apply(provided_languages)

In [4]:
def add_country_info(unis_df):
    """Attach language, population and continent data to each university row.

    Reads the semicolon-separated file ``CountryInfo.csv`` from the working
    directory and left-joins its ``Languages``, ``Population`` and
    ``Continent`` columns onto ``unis_df`` via the shared ``Country`` column,
    so universities without a matching country are kept with missing values.
    """
    wanted_columns = ['Country', 'Languages', 'Population', 'Continent']
    country_table = pd.read_csv('CountryInfo.csv', delimiter=";")
    return pd.merge(unis_df, country_table[wanted_columns], how='left', on="Country")

In [88]:
# University list enriched with the Languages / Population / Continent columns of its country.
extra_uni_info = add_country_info(uni_df)

In [104]:
# Boolean array, True for universities whose country does NOT list English.
# "en" is matched only as a complete code ("en" or "en-XX"): it must not be
# preceded by a letter or hyphen nor followed by a letter. The former plain
# substring test also fired on any code that merely contains the letters "en".
# Assumes Languages holds language codes such as "en-US,es" -- TODO confirm
# against CountryInfo.csv.
# na=True keeps the previous treatment of rows without language information:
# they count as English-speaking and are therefore left out of the filter.
language_filter = np.logical_not(
    extra_uni_info['Languages']
    .str.contains(r'(?<![A-Za-z-])en(?![A-Za-z])', na=True)
    .tolist()
)

In [106]:
# Keep only the rows selected by the boolean language mask.
language_filtered = extra_uni_info.loc[language_filter]

In [None]:
# Probe the website of every university kept by the language filter.
# NOTE(review): this assignment rebinds the name `provided_languages` from the
# function defined earlier to the resulting Series, so the function can no
# longer be called (and this cell cannot be re-run) until the defining cell
# is executed again.
provided_languages = language_filtered['Website'].apply(provided_languages)

website http://www.afghanuniversity.edu.af: 
website http://www.au.edu.af: en-US
website http://www.auaf.edu.af: 
website http://www.aria.edu.af: en-US
website http://www.ariana.edu.af: 
website http://www.badakhshan.edu.af: 
website http://www.baghlan.edu.af: fa,fa
website http://www.bakhtar.edu.af: 
HTTPConnectionPool(host='www.ba.edu.af', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x116eb3e10>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))
website http://www.ba.edu.af: 
website http://www.bu.edu.af: fa,fa
website http://www.bost.edu.af: en-US,en-US
website http://www.dawat.edu.af: fa-ir
website http://www.dunya.edu.af: en-US
website http://www.faryab.edu.af: 
