In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from urllib.parse import urlparse

# Collecting a comprehensive list of universities

## Find a list of all countries in the world

In [None]:
# Country list taken from the Wikipedia Education Index article
# (https://en.wikipedia.org/wiki/Education_Index).
# The workbook is read without a header row, so columns are labelled 0, 1, ...
df_ei = pd.read_excel(io='world_list_education_index.xlsx', header=None)

In [None]:
# Plain Python list of the values held in the column labelled 0.
countries_ei = df_ei.loc[:, 0].tolist()

In [None]:
# Overwrite the entries at row labels 168 and 122 of column 0.
# NOTE(review): presumably these rows hold spellings that differ from the
# other country lists -- confirm against the spreadsheet.
#
# A single .loc call replaces the chained form `df_ei[0][168] = ...`:
# chained assignment may write to a temporary copy and leave df_ei untouched
# (and never updates the frame when pandas Copy-on-Write is active).
df_ei.loc[168, 0] = 'Switzerland'
df_ei.loc[122, 0] = 'Nepal'

# Rebuild the plain list so it carries the corrected names instead of the
# values that were captured before this correction ran.
countries_ei = df_ei[0].tolist()

## Find databases of universities around the world

### List of all countries from univ.cc

In [None]:
# Download the univ.cc world page and keep its HTML text.
# The timeout prevents the notebook from hanging on an unresponsive server,
# and raise_for_status() aborts on an HTTP error instead of silently handing
# an error page to the parser.
response_univ_world = requests.get("http://univ.cc/world.php", timeout=30)
response_univ_world.raise_for_status()
html_univ_world = response_univ_world.text

In [None]:
# Parse the downloaded univ.cc HTML with the 'html.parser' backend.
univ_soup = BeautifulSoup(markup=html_univ_world, features='html.parser')

In [None]:
# One name per <option> element, skipping the first one: keep the text that
# precedes the first '(' and trim surrounding whitespace.
countries_univ = [
    entry.text.partition('(')[0].strip()
    for entry in univ_soup.find_all('option')[1:]
]

### List of all countries in Shanghai ranking

In [None]:
# Download the Shanghai ranking search page and parse it.
# The timeout prevents the notebook from hanging on an unresponsive server,
# and raise_for_status() aborts on an HTTP error instead of silently parsing
# an error page (which would yield an empty country list further down).
response_shanghai = requests.get("http://www.shanghairanking.com/Search.html", timeout=30)
response_shanghai.raise_for_status()
html_shanghai = response_shanghai.text
shanghai_soup = BeautifulSoup(html_shanghai, 'html.parser')

In [None]:
# Trimmed text of every <option> element except the first and the last one.
countries_shanghai = [
    entry.text.strip()
    for entry in shanghai_soup.find_all('option')[1:-1]
]

## Verify that these two lists cover all universities around the world

Since univ.cc contains the most countries, we first check whether all countries contained in the Shanghai ranking are present in it. Afterwards, we verify that the list is very comprehensive by cross-checking it against the Wikipedia Education Index list.

In [None]:
# Report how many entries each of the three country lists holds.
for template, names in (
    ("The education index list contains {} countries", countries_ei),
    ("The univ.cc list contains {} countries", countries_univ),
    ("The Shanghai list contains {} countries", countries_shanghai),
):
    print(template.format(len(names)))

In [None]:
# Put the Shanghai names in a one-column frame, then display the rows whose
# name does not appear verbatim in the univ.cc list.
df_shanghai = pd.DataFrame(countries_shanghai, columns=['country'])
absent_from_univ = ~df_shanghai['country'].isin(countries_univ)
df_shanghai.loc[absent_from_univ]

In [None]:
# Check whether each of these spellings is present in the univ.cc list.
for alias in ('Czech Republic', 'Hong Kong', 'Macau', 'Korea, South', 'Taiwan'):
    print(alias in countries_univ)

print('United States is missing because univ.cc has a seperate list for the united states')

In [None]:
# Display the education-index rows whose column-0 value is not in the univ.cc list.
df_ei.loc[~df_ei[0].isin(countries_univ)]

In [None]:
# Check whether each of these spellings is present in the univ.cc list.
univcc_spellings = (
    'Brunei',
    'Congo, Republic of the',
    'Congo, Democratic Republic of the',
    'Guinea',
    'Korea, North',
    'Korea, South',
    'Laos',
    'Moldova',
    'Burma',
    'Palestine',
    'Russia',
    'Syria',
    'Macedonia',
)
for spelling in univcc_spellings:
    print(spelling in countries_univ)

# Free-text remarks about the remaining mismatches, printed in order.
remarks = (
    '\nPalau Community College is a two-year college in the Republic of Palau, and the only school of higher education in the nation, not necesarry to include\n',
    'The University of the South Pacific is a regional university serving 12 member countries: Cook Islands, Fiji Islands, Kiribati, Marshall Islands, Nauru, Niue, Samoa, Solomon Islands, Tokelau, Tonga, Tuvalu and Vanuatu.\n',
    'United States is missing because univ.cc has a seperate list for the united states\n',
    'Only for Timor-Leste and Sao Tome and Principe, which both have 1 university, the country was not found in the univ.cc list which makes it a very comprehensive list',
)
for remark in remarks:
    print(remark)

## Create comprehensive university list

In [None]:
# Upper bounds for the `start` offset requested from univ.cc.
# NOTE(review): presumably the number of entries the site listed per domain
# when the notebook was written -- confirm, the site may have changed since.
n_uni_world = 7406
n_uni_edu = 2072
step = 50  # increment of the `start` offset between consecutive requests


def univcc_par_url(start):
    """Return the query-string fragment selecting the result offset `start`."""
    return '&start={}'.format(start)


univcc_base_url_world = "http://univ.cc/search.php?dom=world"
univcc_base_url_edu = "http://univ.cc/search.php?dom=edu"


def _scrape_univcc(base_url, n_universities, page_step, timeout=30):
    """Collect [link text, link host] pairs from the result pages of one URL.

    Pages are requested with `start` = 1, 1 + page_step, ... while the offset
    stays below `n_universities`.

    Parameters
    ----------
    base_url : str
        Search URL to which the `&start=` fragment is appended.
    n_universities : int
        Exclusive upper bound for the `start` offset.
    page_step : int
        Increment of the `start` offset between requests.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list of [str, str]
        Anchor text and the network location of the anchor's href, one pair
        per <li> element that contains a link.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    """
    rows = []
    for start in range(1, n_universities, page_step):
        response = requests.get(base_url + univcc_par_url(start), timeout=timeout)
        # Fail loudly rather than silently parsing an error page.
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        for item in page.find_all('li'):
            # <li> elements without a usable link would otherwise crash on
            # `.text` / `['href']`; skip them.
            link = item.find('a', href=True)
            if link is None:
                continue
            rows.append([link.text, urlparse(link['href']).netloc])
    return rows


# World entries first, then the .edu entries, in one combined list.
data_list = _scrape_univcc(univcc_base_url_world, n_uni_world, step)
data_list.extend(_scrape_univcc(univcc_base_url_edu, n_uni_edu, step))

df = pd.DataFrame(data_list, columns=['University', 'Website'])

In [None]:
# Keep the first row seen for each website host, then write the result
# (index included) to a CSV file.
df = df.drop_duplicates(subset=['Website'], keep='first')
df.to_csv(path_or_buf='university_list_inclusive.csv')