In [259]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import numpy as np
from urllib.parse import urlparse

# Collecting a comprehensive list of universities

## Find list of all countries over the world

In [186]:
# List of all countries of the world from https://en.wikipedia.org/wiki/Education_Index
df_ei = pd.read_excel('world_list_education_index.xlsx', header=None)

In [187]:
countries_ei = df_ei[0].tolist()

In [188]:
df_ei[0][168] = 'Switzerland'
df_ei[0][122] = 'Nepal'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


## Find databases of universities over the world

### List of all countries from univ.cc

In [84]:
html_univ_world = requests.get("http://univ.cc/world.php").text

In [85]:
univ_soup = BeautifulSoup(html_univ_world, 'html.parser')

In [86]:
countries_univ = []
for option in univ_soup.find_all('option')[1:]:
    countries_univ.append(option.text.split(sep='(')[0].strip())

### List of all countries in Shanghai ranking

In [87]:
html_shanghai = requests.get("http://www.shanghairanking.com/Search.html").text
shanghai_soup = BeautifulSoup(html_shanghai, 'html.parser')

In [88]:
countries_shanghai = []
for option in shanghai_soup.find_all('option')[1:-1]:
    countries_shanghai.append(option.text.strip())

## Verify if these two list give all universities over the world

Since univ.cc contains the most countries, we first check if all countries contained in the Shanghai ranking are present. Afterwards we verify that the list if very comprehensive by cross checking with the wikipedia education index list.

In [214]:
print("The education index list contains {} countries".format(len(countries_ei)))
print("The univ.cc list contains {} countries".format(len(countries_univ)))
print("The Shanghai list contains {} countries".format(len(countries_shanghai)))

The education index list contains 195 countries
The univ.cc list contains 205 countries
The Shanghai list contains 62 countries


In [215]:
df_shanghai = pd.DataFrame(countries_shanghai, columns=['country'])
df_shanghai[~df_shanghai.country.isin(countries_univ)]

Unnamed: 0,country
13,Czech
21,"Hong Kong, China"
33,"Macau, China"
51,South Korea
55,"Taiwan, China"
59,United States


In [216]:
print('Czech Republic' in countries_univ)
print('Hong Kong' in countries_univ)
print('Macau' in countries_univ)
print('Korea, South' in countries_univ)
print('Taiwan' in countries_univ)
print('United States is missing because univ.cc has a seperate list for the united states')

True
True
True
True
True
United States is missing because univ.cc has a seperate list for the united states


In [217]:
df_ei[~df_ei[0].isin(countries_univ)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
24,Brunei Darussalam,0.519,0.541,0.589,0.626,0.646,0.672,0.682,0.681,0.677,0.682,0.678,0.681,0.692,0.692
38,Congo,0.44,0.464,0.456,0.445,0.41,0.459,0.467,0.475,0.483,0.491,0.499,0.505,0.511,0.511
39,Democratic Republic of the Congo,0.241,0.249,0.253,0.272,0.297,0.316,0.32,0.323,0.346,0.349,0.355,0.361,0.372,0.372
69,Guinea-Bissau,,,,,,0.314,0.325,0.325,0.325,0.325,0.325,0.325,0.325,0.325
88,Kiribati,,,,,,,,,,,0.602,0.602,0.602,0.602
89,Democratic People's Republic of Korea,,,,,,,,,,,,,,
90,Republic of Korea,0.565,0.638,0.679,0.743,0.797,0.837,0.843,0.848,0.854,0.856,0.862,0.865,0.865,0.865
93,Lao People's Democratic Republic,0.246,0.256,0.288,0.31,0.352,0.385,0.39,0.395,0.403,0.414,0.422,0.436,0.436,0.436
108,Marshall Islands,,,,,,,,,,,,,,
112,Federated States of Micronesia,,,,,,,,,,,0.611,0.611,0.611,0.611


In [218]:
print('Brunei' in countries_univ)
print('Congo, Republic of the' in countries_univ)
print('Congo, Democratic Republic of the' in countries_univ)
print('Guinea' in countries_univ)
print('Korea, North' in countries_univ)
print('Korea, South' in countries_univ)
print('Laos' in countries_univ)
print('Moldova' in countries_univ)
print('Burma' in countries_univ)
print('Palestine' in countries_univ)
print('Russia' in countries_univ)
print('Syria' in countries_univ)
print('Macedonia' in countries_univ)

print('\nPalau Community College is a two-year college in the Republic of Palau, and the only school of higher education in the nation, not necesarry to include\n')

print('The University of the South Pacific is a regional university serving 12 member countries: Cook Islands, Fiji Islands, Kiribati, Marshall Islands, Nauru, Niue, Samoa, Solomon Islands, Tokelau, Tonga, Tuvalu and Vanuatu.\n')

print('United States is missing because univ.cc has a seperate list for the united states\n')

print('Only for Timor-Leste and Sao Tome and Principe, which both have 1 university, the country was not found in the univ.cc list which makes it a very comprehensive list')

True
True
True
True
True
True
True
True
True
True
True
True
True

Palau Community College is a two-year college in the Republic of Palau, and the only school of higher education in the nation, not necesarry to include

The University of the South Pacific is a regional university serving 12 member countries: Cook Islands, Fiji Islands, Kiribati, Marshall Islands, Nauru, Niue, Samoa, Solomon Islands, Tokelau, Tonga, Tuvalu and Vanuatu.

United States is missing because univ.cc has a seperate list for the united states

Only for Timor-Leste and Sao Tome and Principe, which both have 1 university, the country was not found in the univ.cc list which makes it a very comprehensive list


## Create comprehensive university list

In [264]:
n_uni = 7395
step = 50
def univcc_par_url(start):
    return '&start={}'.format(start)

univcc_base_url = "http://univ.cc/search.php?dom=world"

data_list = []
for i in range(1,n_uni,step):
    r = requests.get(univcc_base_url + univcc_par_url(i)).text
    h = BeautifulSoup(r, 'html.parser')
    
    for l in h.find_all('li'):
        a = l.find('a')
        data_list.append([a.text, urlparse(a['href']).netloc])
    
df = pd.DataFrame(data_list,columns=['University', 'Website'])

In [279]:
df.drop_duplicates(subset=['Website'], inplace=True)
df.to_csv('university_list.csv')