In [2]:
import pandas as pd
import numpy as np
import json

### EXPLORING DATA

In [31]:
# Load one year's raw ranking file to explore its structure.
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# non-ASCII university names intact on platforms whose default locale
# encoding is not UTF-8 (e.g. Windows).
with open('../data_raw/the_data/2012.json', encoding='utf-8') as f:
    data = json.load(f)

In [32]:
# Build a DataFrame from the value stored under the top-level 'data' key of the
# parsed JSON (presumably a list of per-university records — TODO confirm).
df = pd.DataFrame(data['data'])

In [33]:
# Keep the indicator columns (names containing 'scores_' or 'stats_') together
# with the columns that identify each university.
is_score_col = df.columns.str.contains('scores_')
is_stat_col = df.columns.str.contains('stats_')
is_identity_col = df.columns.isin(['name', 'rank', 'nid', 'location'])

sel_cols = df.columns[is_score_col | is_stat_col | is_identity_col]

In [34]:
# Preview the first rows of the selected columns.
df.loc[:, sel_cols].head()

Unnamed: 0,location,name,nid,rank,scores_citations,scores_citations_rank,scores_industry_income,scores_industry_income_rank,scores_international_outlook,scores_international_outlook_rank,scores_overall,scores_overall_rank,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank
0,United States,California Institute of Technology,128779,1,99.9,3,97.0,26,56.0,172,94.8,1,98.2,4,95.7,2
1,United States,Harvard University,466,2,99.8,7,35.9,212,67.5,115,93.9,2,97.4,5,95.8,1
2,United States,Stanford University,467,2,99.8,8,63.8,74,57.2,162,93.9,3,98.9,3,94.8,3
3,United Kingdom,University of Oxford,468,4,97.9,15,62.1,81,91.9,12,93.6,4,96.6,6,89.5,8
4,United States,Princeton University,469,5,100.0,2,81.0,43,49.6,198,92.9,5,99.1,2,91.5,6


In [35]:
# Preview the last rows of the selected columns.
df.loc[:, sel_cols].tail()

Unnamed: 0,location,name,nid,rank,scores_citations,scores_citations_rank,scores_industry_income,scores_industry_income_rank,scores_international_outlook,scores_international_outlook_rank,scores_overall,scores_overall_rank,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank
397,Australia,La Trobe University,835,350-400,13.6,395,28.9,307,71.3,96,-,252,19.5,297,19.0,351
398,United States,Georgia State University,824,350-400,34.4,326,39.1,180,27.9,316,-,232,20.2,287,19.6,344
399,United States,Old Dominion University,847,350-400,46.9,253,-,383,27.3,319,-,279,12.5,378,17.1,374
400,Spain,University of Zaragoza,866,350-400,38.4,303,42.9,142,28.8,312,-,389,9.3,399,14.5,392
401,Ireland,"National University of Ireland, Galway",842,350-400,31.3,337,32.7,254,72.6,89,-,273,12.2,384,17.9,363


### MAKING THE DATASET - PROCEDURE

In [40]:
# Collect one DataFrame per ranking year; they are concatenated afterwards.
dataset = []

for year in range(2011, 2023):
    filename = '../data_raw/the_data/{year}.json'.format(year=year)
    print('Processing:', filename)

    # JSON text is UTF-8 by specification; being explicit avoids depending on
    # the platform's default locale encoding.
    with open(filename, encoding='utf-8') as f:
        data = json.load(f)

    df = pd.DataFrame(data['data'])

    # Indicator columns ('scores_*', 'stats_*') plus the identifying columns.
    sel_cols = df.columns[
        df.columns.str.contains('scores_')
        | df.columns.str.contains('stats_')
        | df.columns.isin(['name', 'rank', 'nid', 'location'])
    ]

    # assign() returns a new frame with the extra 'year' column instead of
    # writing into a slice of `df`, which raised SettingWithCopyWarning and
    # has ambiguous view-vs-copy semantics.
    sub_data = df[sel_cols].assign(year=year)

    dataset.append(sub_data)

Processing: the_data/2011.json
Processing: the_data/2012.json
Processing: the_data/2013.json
Processing: the_data/2014.json
Processing: the_data/2015.json
Processing: the_data/2016.json
Processing: the_data/2017.json
Processing: the_data/2018.json
Processing: the_data/2019.json
Processing: the_data/2020.json
Processing: the_data/2021.json
Processing: the_data/2022.json


In [41]:
# Stack the per-year frames into one table with a fresh 0..n-1 index; columns
# missing in some years are kept and column order is not sorted.
dataset_df = pd.concat(
    dataset,
    sort=False,
    ignore_index=True,
)

In [42]:
# Show the first three rows of the combined table.
dataset_df.iloc[:3]

Unnamed: 0,location,name,nid,rank,scores_citations,scores_citations_rank,scores_industry_income,scores_industry_income_rank,scores_international_outlook,scores_international_outlook_rank,...,scores_overall_rank,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank,year,stats_female_male_ratio,stats_number_students,stats_pc_intl_students,stats_student_staff_ratio
0,United States,Harvard University,466,1,98.8,8,34.5,105,72.4,49,...,1,98.7,2,99.7,1,2011,,,,
1,United States,California Institute of Technology,128779,2,99.9,1,83.7,24,54.6,93,...,2,98.0,4,97.7,4,2011,,,,
2,United States,Massachusetts Institute of Technology,471,3,99.9,2,87.5,21,82.3,36,...,3,91.4,11,97.8,3,2011,,,,


In [39]:
# Inspect every yearly row for the institution whose nid is 466.
dataset_df[dataset_df['nid'] == 466]

Unnamed: 0,location,name,nid,rank,scores_citations,scores_citations_rank,scores_industry_income,scores_industry_income_rank,scores_international_outlook,scores_international_outlook_rank,...,scores_overall_rank,scores_research,scores_research_rank,scores_teaching,scores_teaching_rank,year,stats_female_male_ratio,stats_number_students,stats_pc_intl_students,stats_student_staff_ratio
0,United States,Harvard University,466,1,98.8,8,34.5,105,72.4,49,...,1,98.7,2,99.7,1,2011,,,,
201,United States,Harvard University,466,2,99.8,7,35.9,212,67.5,115,...,2,97.4,5,95.8,1,2012,,,,
605,United States,Harvard University,466,4,99.2,10,39.9,212,63.7,128,...,4,98.6,5,94.9,3,2013,,,,
1003,United States,Harvard University,466,2,99.1,9,40.6,208,66.2,125,...,2,98.5,1,95.3,1,2014,,,,
1403,United States,Harvard University,466,2,98.9,12,44.0,184,67.6,115,...,2,98.6,1,92.9,1,2015,,,,
1808,United States,Harvard University,466,6,99.8,5,45.2,276,77.2,134,...,6,99.0,1,83.6,10,2016,,20152.0,25%,8.9
2608,United States,Harvard University,466,6,99.7,8,47.3,287,77.9,139,...,60,98.3,2,87.5,9,2017,,19890.0,25%,8.8
3589,United States,Harvard University,466,6,99.7,8,46.4,330,79.7,143,...,60,98.4,2,84.2,9,2018,,20326.0,26%,8.9
4692,United States,Harvard University,466,6,99.6,8,48.7,345,79.7,164,...,60,98.4,3,90.1,8,2019,48 : 52,20595.0,26%,9.1
5951,United States,Harvard University,466,7,99.1,19,47.3,395,76.3,217,...,70,98.6,3,89.2,8,2020,49 : 51,20823.0,24%,9.2


In [45]:
# Persist the combined table as CSV (the row index is written as well, since
# no `index` argument is passed).
dataset_df.to_csv(
    path_or_buf='../data_raw/THE_World_Rankings.csv',
)