# Shanghai rankings based on country
#### The goal of this notebook is to build a graphic showing where the world's best _universities_ are located.
#### http://www.shanghairanking.com/ARWU2019.html

In [1]:
# Pull the University data from Shanghai Ranking html and use regex to obtain university data and which country it's in

import requests
from bs4 import BeautifulSoup

import re

# Values that were previously inlined in the code below.
ARWU_URL = 'http://www.shanghairanking.com/ARWU2019.html'
MAX_RANK = 500       # only the first 500 university links are recorded
PREVIEW_COUNT = 25   # how many names to echo as a quick sanity check

# Raw strings keep regex escapes such as \w away from Python's own
# string-escape handling (a non-raw '\w' triggers an invalid-escape warning).
UNI_LINK_RE = re.compile(r'World-University-Rankings/')
COUNTRY_TITLE_RE = re.compile(r'View universities in [\w]+')

# Fail fast on a hung connection or an HTTP error page instead of parsing it.
data = requests.get(ARWU_URL, timeout=30)
data.raise_for_status()
soup = BeautifulSoup(data.text, 'html.parser')

university = soup.find('div', {'id': 'rankingarea'})
if university is None:
    raise RuntimeError(
        "Could not find the 'rankingarea' <div>; the page layout may have changed."
    )

# First matching university link / country link. Neither name is read again by
# the cells visible in this notebook; they are kept for interactive inspection.
tbody = university.find('a', {'href': UNI_LINK_RE})
country = university.find('a', {'title': COUNTRY_TITLE_RE})


# Maps university name -> rank (1-based position of its link on the page).
uni_rankings = {}

print("Top %d universities" % PREVIEW_COUNT)
for counter, tr in enumerate(university.find_all('a', {'href': UNI_LINK_RE}, limit=MAX_RANK), 1):
    if counter <= PREVIEW_COUNT:
        print(counter, "--", tr.text)
    uni_rankings[tr.text] = counter

Top 25 universities
1 -- Harvard University
2 -- Stanford University
3 -- University of Cambridge
4 -- Massachusetts Institute of Technology (MIT)
5 -- University of California, Berkeley
6 -- Princeton University
7 -- University of Oxford
8 -- Columbia University
9 -- California Institute of Technology
10 -- University of Chicago
11 -- University of California, Los Angeles
12 -- Yale University
13 -- Cornell University
14 -- University of Washington
15 -- University College London
16 -- Johns Hopkins University
17 -- University of Pennsylvania
18 -- University of California, San Diego
19 -- Swiss Federal Institute of Technology Zurich
20 -- University of California, San Francisco
21 -- University of Michigan-Ann Arbor
22 -- Washington University in St. Louis
23 -- Imperial College London
24 -- University of Toronto
25 -- The University of Tokyo


One idea is to create a global map of every country that has a university in the top 100, 200, or 500, color-coded by the number of such universities located in that country.

In [2]:
# Build uni_loc = {rank: [university name, country]} and country_counter =
# {country label as shown on the site: number of universities}.
# Every uni_loc entry starts with the placeholder country "US" and is
# overwritten below when a country link is found for that rank.

uni_loc = {}
for counter, tr in enumerate(university.find_all('a', {'href': re.compile(r'World-University-Rankings/')}), 1):
    uni_loc[counter] = [tr.text, "US"]

    if counter == 500:
        break

# Raw strings keep \s away from Python's string-escape handling; compiling once
# avoids re-parsing the same pattern on every iteration.
country_title_re = re.compile(r"View universities in [A-Za-z\s]+.")
country_name_re = re.compile(r'View universities in ([A-Za-z\s]+).')

# The site's abbreviations, expanded only in uni_loc. country_counter keeps the
# site's own labels (a later cell renames them in the DataFrame).
full_country_names = {'USA': 'United States', 'UK': 'United Kingdom'}

country_counter = {}
# NOTE(review): country links are paired with universities purely by position.
# This assumes the page has exactly one country link per university, in the
# same order -- TODO confirm against the page markup.
for counter, tr in enumerate(university.find_all('a', {'title': country_title_re}), 1):
    site_label = country_name_re.findall(str(tr))[0]
    uni_loc[counter][1] = full_country_names.get(site_label, site_label)
    country_counter[site_label] = country_counter.get(site_label, 0) + 1

    if counter == 500:
        break

## TODO (starting with the Aug 15 assignment): create a dictionary or list holding each country and the number of universities it has

In [3]:
# Re-order country_counter (a plain dict, not a DataFrame) so the countries
# with the most ranked universities come first. Ties keep their existing order
# because sorted() is stable.

ranked_pairs = sorted(country_counter.items(), key=lambda pair: pair[1], reverse=True)
country_counter = dict(ranked_pairs)
print(country_counter)

{'USA': 137, 'China': 66, 'UK': 36, 'Germany': 30, 'Australia': 23, 'France': 21, 'Canada': 18, 'Italy': 16, 'Japan': 14, 'Spain': 13, 'Netherlands': 12, 'Sweden': 11, 'South Korea': 11, 'Switzerland': 8, 'Belgium': 7, 'Israel': 6, 'Brazil': 6, 'Austria': 6, 'Denmark': 5, 'Finland': 5, 'South Africa': 5, 'Russia': 4, 'Saudi Arabia': 4, 'Portugal': 4, 'New Zealand': 4, 'Norway': 3, 'Ireland': 3, 'Iran': 3, 'Singapore': 2, 'Poland': 2, 'Greece': 2, 'Czech Republic': 1, 'Mexico': 1, 'Argentina': 1, 'Egypt': 1, 'Malaysia': 1, 'Estonia': 1, 'India': 1, 'Turkey': 1, 'Thailand': 1, 'Serbia': 1, 'Chile': 1, 'Iceland': 1, 'Croatia': 1}


In [4]:
# Convert dictionary to pandas dataframe to easily graph 

import pandas as pd

# One row per country: its label, its number of ranked universities, and an
# empty CODE column that a later cell fills in.
country_dataset = pd.DataFrame(
    {
        'COUNTRY': list(country_counter.keys()),
        'SCHOOLS': list(country_counter.values()),
    }
).assign(CODE=None)
print(country_dataset)


           COUNTRY  SCHOOLS  CODE
0              USA      137  None
1            China       66  None
2               UK       36  None
3          Germany       30  None
4        Australia       23  None
5           France       21  None
6           Canada       18  None
7            Italy       16  None
8            Japan       14  None
9            Spain       13  None
10     Netherlands       12  None
11          Sweden       11  None
12     South Korea       11  None
13     Switzerland        8  None
14         Belgium        7  None
15          Israel        6  None
16          Brazil        6  None
17         Austria        6  None
18         Denmark        5  None
19         Finland        5  None
20    South Africa        5  None
21          Russia        4  None
22    Saudi Arabia        4  None
23        Portugal        4  None
24     New Zealand        4  None
25          Norway        3  None
26         Ireland        3  None
27            Iran        3  None
28       Singa

In [5]:
# This block is to clean data

# Reference table that provides a CODE for each COUNTRY.
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')

# Make both tables spell country names the same way.
# Series.replace simply leaves the column unchanged when a label is absent, so
# this cell can be re-run safely even after country_dataset has already been
# renamed (a positional `.index[0]` lookup would raise IndexError in that case).
df['COUNTRY'] = df['COUNTRY'].replace({'Korea, South': 'South Korea'})
country_dataset['COUNTRY'] = country_dataset['COUNTRY'].replace(
    {'USA': 'United States', 'UK': 'United Kingdom'}
)
# NOTE(review): 'Czech Republic' is left as-is; a rename to 'Czech' was tried
# and disabled, presumably because both tables already agree -- confirm.

# Next: look each country up in both DataFrames to put the country code into country_dataset.

In [6]:
# Apply the country code from the reference table (df) to each ranked country.
countries = country_dataset  # same object, not a copy: edits are visible under both names

# Lookup Series COUNTRY -> CODE; if a country appears more than once in df,
# keep its first row.
code_by_country = df.drop_duplicates(subset='COUNTRY').set_index('COUNTRY')['CODE']

# Name the offending countries instead of failing with a bare IndexError.
unmatched = countries.loc[~countries['COUNTRY'].isin(code_by_country.index), 'COUNTRY']
if not unmatched.empty:
    raise ValueError(
        "No country code found for: %s" % ", ".join(map(str, unmatched))
    )

# Vectorised lookup replaces the row-by-row scan of both DataFrames.
countries['CODE'] = countries['COUNTRY'].map(code_by_country)

print(countries)


           COUNTRY  SCHOOLS CODE
0    United States      137  USA
1            China       66  CHN
2   United Kingdom       36  GBR
3          Germany       30  DEU
4        Australia       23  AUS
5           France       21  FRA
6           Canada       18  CAN
7            Italy       16  ITA
8            Japan       14  JPN
9            Spain       13  ESP
10     Netherlands       12  NLD
11          Sweden       11  SWE
12     South Korea       11  KOR
13     Switzerland        8  CHE
14         Belgium        7  BEL
15          Israel        6  ISR
16          Brazil        6  BRA
17         Austria        6  AUT
18         Denmark        5  DNK
19         Finland        5  FIN
20    South Africa        5  ZAF
21          Russia        4  RUS
22    Saudi Arabia        4  SAU
23        Portugal        4  PRT
24     New Zealand        4  NZL
25          Norway        3  NOR
26         Ireland        3  IRL
27            Iran        3  IRN
28       Singapore        2  SGP
29        

In [7]:
import plotly.graph_objects as go
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)

# World map coloured by the raw number of ranked universities per country.
# `colorscale` only selects the palette; z is passed through untransformed.
count_trace = go.Choropleth(
    locations=countries['CODE'],
    z=countries['SCHOOLS'],
    text=countries['COUNTRY'],
    colorscale='Blues',
    autocolorscale=False,
    reversescale=False,
    marker=dict(line=dict(color='darkgray', width=0.2)),
    colorbar=dict(title='Universities in top 500'),
)

# Source credit placed in paper coordinates. The run of spaces inside the link
# text is reproduced exactly as in the original string.
source_note = dict(
    x=0.55,
    y=0.1,
    xref='paper',
    yref='paper',
    text=(
        'Source: <a href="http://www.shanghairanking.com/ARWU2019.html">'
        '            Shanghai Global University Rankings</a>'
    ),
    showarrow=False,
)

map_geo = dict(
    showframe=False,
    showcoastlines=False,
    projection_type='equirectangular',
)

fig = go.Figure(data=count_trace)
fig.update_layout(
    title_text='2019 Global University Rankings',
    geo=map_geo,
    annotations=[source_note],
)

# Render inline through plotly.offline rather than fig.show().
iplot(fig)

In [8]:
# Label of the form "<country> - <number of schools>"; a later cell passes it
# to the map as the `text` of each country.
school_counts_as_text = countries['SCHOOLS'].astype(str)
countries['FINAL'] = countries['COUNTRY'].str.cat(school_counts_as_text, sep=' - ')
print(countries)

           COUNTRY  SCHOOLS CODE                FINAL
0    United States      137  USA  United States - 137
1            China       66  CHN           China - 66
2   United Kingdom       36  GBR  United Kingdom - 36
3          Germany       30  DEU         Germany - 30
4        Australia       23  AUS       Australia - 23
5           France       21  FRA          France - 21
6           Canada       18  CAN          Canada - 18
7            Italy       16  ITA           Italy - 16
8            Japan       14  JPN           Japan - 14
9            Spain       13  ESP           Spain - 13
10     Netherlands       12  NLD     Netherlands - 12
11          Sweden       11  SWE          Sweden - 11
12     South Korea       11  KOR     South Korea - 11
13     Switzerland        8  CHE      Switzerland - 8
14         Belgium        7  BEL          Belgium - 7
15          Israel        6  ISR           Israel - 6
16          Brazil        6  BRA           Brazil - 6
17         Austria        6 

In [9]:
# The map below has the colors we want. Goal: keep these colors while displaying the real numbers.
# Values are converted to logarithmic form for coloring, but the text still shows each country with its number of universities in the top 500.

import plotly.graph_objects as go
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Colour by log(count) so the largest counts do not flatten every other
# country into the same shade, while every label the reader sees stays in
# real university counts.
log_schools = np.log(countries['SCHOOLS'])

# Colour-bar ticks sit at log(count) but are labelled with the count itself,
# so the bar matches its title instead of showing raw log values.
max_schools = countries['SCHOOLS'].max()
tick_counts = [c for c in (1, 2, 5, 10, 20, 50, 100, 200, 500) if c <= max_schools]

fig = go.Figure(data=go.Choropleth(
    locations=countries['CODE'],
    z=log_schools,
    text=countries['FINAL'],
    # Hover shows only the "<country> - <count>" label, not the log value in z.
    hoverinfo='text',
    colorscale='Blues',
    autocolorscale=False,
    showscale=True,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.2,
    colorbar=dict(
        title='Universities in top 500',
        tickmode='array',
        tickvals=[float(np.log(c)) for c in tick_counts],
        ticktext=[str(c) for c in tick_counts],
    ),
))

fig.update_layout(
    title_text='2019 Global University Rankings',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
    annotations=[dict(
        x=0.55,
        y=0.1,
        xref='paper',
        yref='paper',
        text='Source: <a href="http://www.shanghairanking.com/ARWU2019.html">\
            Shanghai Global University Rankings</a>',
        showarrow=False
    )]
)

# Render inline through plotly.offline rather than fig.show().
iplot(fig)

# Idea: cap the color scale at a maximum value so that the US and China don't create extremes in the color.
# With a cap we may not have to convert the data into logarithmic values, and the true values are retained.

In [10]:
# This block is used to show a list of top ranked universities in desired country


# Abbreviations a user is likely to type, mapped to the full names stored in
# uni_loc (keys and values are already case-folded).
country_aliases = {
    'usa': 'united states',
    'us': 'united states',
    'uk': 'united kingdom',
}

check_country = input("Which country do you want to check? ")

# Normalise the query: trim, collapse inner whitespace, ignore case, and
# expand aliases. Relying on str.title() alone fails on stray spaces and turns
# "USA" into "Usa", which matches nothing.
wanted = " ".join(check_country.split()).casefold()
wanted = country_aliases.get(wanted, wanted)

# University name -> rank for every university located in the requested country.
# uni_loc holds at most 500 entries, so no extra counter or cut-off is needed.
lists = {}
for uni_name, located_in in uni_loc.values():
    if located_in.casefold() == wanted:
        lists[uni_name] = uni_rankings[uni_name]

print("\nBest Universities:")
if not lists:
    print("No universities in top 500, or check your spelling.")
for item, rank in lists.items():
    print(item + " -- #" + str(rank))

Which country do you want to check? switzerland

Best Universities:
Swiss Federal Institute of Technology Zurich -- #19
University of Geneva -- #58
University of Zurich -- #61
Swiss Federal Institute of Technology Lausanne -- #78
University of Basel -- #88
University of Bern -- #133
University of Lausanne -- #180
University of Fribourg -- #469
