In [1]:
import pandas as pd
import json

### EXPLORING DATA

In [2]:
filename = '../data_raw/arwu_data/2017.json'
with open(filename) as f:
    data = json.load(f)
    
data = data['data']

In [3]:
# https://www.shanghairanking.com/methodology/arwu/2021
data['indicators']

[{'code': '84', 'nameEn': 'Alumni'},
 {'code': '85', 'nameEn': 'Award'},
 {'code': '86', 'nameEn': 'HiCi'},
 {'code': '87', 'nameEn': 'N&S'},
 {'code': '88', 'nameEn': 'PUB'},
 {'code': '89', 'nameEn': 'PCP'}]

In [18]:
indicators = { rec['code']:rec['nameEn'] for rec in data['indicators'] }

In [20]:
indicators

{'84': 'Alumni',
 '85': 'Award',
 '86': 'HiCi',
 '87': 'N&S',
 '88': 'PUB',
 '89': 'PCP'}

In [24]:
ind_df = pd.DataFrame(
    [ { indicators[k]: v for k,v in rec['indData'].items()} for rec in data['rankings'] ]
)

In [28]:
uni_df = pd.DataFrame(
    data['rankings'],
    columns = [
        'ranking',
        'univNameEn',
        'univUp',
        'region',
        'regionLogo',
        'regionRanking',
        'score',
    ]
)

In [29]:
df = pd.concat([uni_df, ind_df], axis=1)

In [30]:
df.head()

Unnamed: 0,ranking,univNameEn,univUp,region,regionLogo,regionRanking,score,Alumni,Award,HiCi,N&S,PCP,PUB
0,1,Harvard University,harvard-university,United States,us,1,100.0,100.0,100.0,100.0,100.0,79.5,100.0
1,2,Stanford University,stanford-university,United States,us,2,76.5,44.5,88.5,87.3,74.4,56.8,73.7
2,3,University of Cambridge,university-of-cambridge,United Kingdom,gb,1,70.9,81.4,95.4,54.6,57.3,59.1,69.8
3,4,Massachusetts Institute of Technology (MIT),massachusetts-institute-of-technology-mit,United States,us,3,70.4,68.7,82.3,56.7,72.4,70.8,63.5
4,5,"University of California, Berkeley",university-of-california-berkeley,United States,us,4,69.1,64.4,78.4,63.6,68.1,58.7,66.7


### MAKING THE DATASET - PROCEDURE

In [31]:
dataset_df = pd.DataFrame([])

for year in range(2003, 2022):
    filename = '../data_raw/arwu_data/{year}.json'.format(year=year)
    print('Processing:', filename)
    
    with open(filename) as f:
        data = json.load(f)
        
    data = data['data']
    
    indicators = { rec['code']:rec['nameEn'] for rec in data['indicators'] }
    
    ind_df = pd.DataFrame(
        [ { indicators[k]: v for k,v in rec['indData'].items()} for rec in data['rankings'] ]
    )
    
    sel_cols = [ 'ranking', 'univNameEn', 'univUp', 'region', 'regionLogo', 'regionRanking', 'score' ]
    
    uni_df = pd.DataFrame(
        data['rankings'],
        columns = sel_cols
    )
    
    df = pd.concat([uni_df, ind_df], axis=1)
    df.loc[:, 'year'] = year
    
    dataset_df = pd.concat([dataset_df, df])

Processing: arwu_data/2003.json
Processing: arwu_data/2004.json
Processing: arwu_data/2005.json


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Processing: arwu_data/2006.json
Processing: arwu_data/2007.json
Processing: arwu_data/2008.json
Processing: arwu_data/2009.json
Processing: arwu_data/2010.json
Processing: arwu_data/2011.json
Processing: arwu_data/2012.json
Processing: arwu_data/2013.json
Processing: arwu_data/2014.json
Processing: arwu_data/2015.json
Processing: arwu_data/2016.json
Processing: arwu_data/2017.json
Processing: arwu_data/2018.json
Processing: arwu_data/2019.json
Processing: arwu_data/2020.json
Processing: arwu_data/2021.json


In [32]:
dataset_df.head()

Unnamed: 0,Alumni,Award,HiCi,N&S,PCP,PUB,ranking,region,regionLogo,regionRanking,score,univNameEn,univUp,year
0,,100.0,100.0,100.0,68.7,100.0,1,United States,us,1,100.0,Harvard University,harvard-university,2003
1,,76.2,88.2,73.8,80.5,72.2,2,United States,us,2,83.5,Stanford University,stanford-university,2003
2,,72.9,68.0,64.1,100.0,52.0,3,United States,us,3,76.3,California Institute of Technology,california-institute-of-technology,2003
3,,75.0,70.3,76.1,51.8,72.8,4,United States,us,4,74.0,"University of California, Berkeley",university-of-california-berkeley,2003
4,,91.1,58.0,56.4,68.7,69.3,5,United Kingdom,gb,1,73.4,University of Cambridge,university-of-cambridge,2003


In [33]:
dataset_df.tail()

Unnamed: 0,Alumni,Award,HiCi,N&S,PCP,PUB,ranking,region,regionLogo,regionRanking,score,univNameEn,univUp,year
995,0.0,0.0,0.0,2.5,11.1,26.9,901-1000,Poland,pl,8-10,,Warsaw University of Technology,warsaw-university-of-technology,2021
996,17.7,0.0,0.0,5.0,16.6,11.2,901-1000,United States,us,187-200,,Williams College,williams-college,2021
997,0.0,0.0,0.0,3.5,10.7,25.4,901-1000,Poland,pl,8-10,,Wroclaw University of Science and Technology,wroclaw-university-of-science-and-technology,2021
998,0.0,0.0,0.0,2.0,9.5,25.6,901-1000,China,cn,143-157,,Wuhan University of Science and Technology,wuhan-university-of-science-and-technology,2021
999,0.0,0.0,7.3,1.4,9.0,20.6,901-1000,China,cn,143-157,,Yantai University,yantai-university,2021


In [34]:
dataset_df.to_csv('../data_raw/ARWU_World_Rankings.csv')