In [139]:
import pandas as pd

In [149]:
# read the student-exchange records (CSV) into a DataFrame
df = pd.read_csv(filepath_or_buffer="../dataset/erasmus-only-study-exchange.csv")

In [150]:
# Translate the existing "Field of Education" entries into categories of larger granularity.
# The mapping has been done manually: field_of_study_mapping.csv assigns a numeric "Label"
# to each original field entry, and the dict below gives the category name for each label.
field_of_study_mapping = {
    1: "Natural Sciences and Mathematics",
    2: "Engineering and Technology",
    3: "Humanities, Arts and Social Science",
    4: "Medicine and Health",
    5: "Economics and Business",
    6: "Languages",
    7: "Other",
    8: "Not specified",
}

# A separator longer than one character is only supported by the Python parsing engine.
# Requesting it explicitly keeps the parsing identical and avoids the ParserWarning pandas
# emits when it has to fall back from the C engine on its own.
foe_labels = pd.read_csv("../dataset/field_of_study_mapping.csv", sep="; ", engine="python")

# Inner join: every exchange row gets the "Label" of its field; rows whose field does not
# appear in the mapping file are dropped.
# NOTE(review): this overwrites df and then replaces the join column, so the cell is
# presumably not re-runnable on its own -- reload df first before running it again.
df = pd.merge(df, foe_labels, on="Field of Education")

# Vectorized dict lookup instead of a per-row lambda. Series.map yields NaN for labels that
# are missing from the dict, so check explicitly and fail loudly (as the dict lookup did).
field_names = df["Label"].map(field_of_study_mapping)
unknown_labels = df.loc[field_names.isna(), "Label"].unique()
if len(unknown_labels) > 0:
    raise KeyError(f"labels without a category name: {list(unknown_labels)}")
df["Field of Education"] = field_names

  foe_labels = pd.read_csv("../dataset/field_of_study_mapping.csv", sep="; ")


In [151]:
# sanity check: number of dataset rows per field category (row counts, not summed "Participants")
df["Field of Education"].value_counts()

Economics and Business                 281383
Humanities, Arts and Social Science    279115
Engineering and Technology             190865
Languages                              138740
Other                                  102934
Medicine and Health                     65730
Natural Sciences and Mathematics        49649
Not specified                           13949
Name: Field of Education, dtype: int64

In [152]:
# Prepare dataset for visualization 1: participants per organization and country, broken
# down by academic year, gender, field of education and direction (sending/receiving).
# We want the following tabular dataset ("Participants" is summed per combination):
# | Academic Year | Participant Gender | Field of Education         | Organization                      | Country | Participants | Direction |
# =================================================================================================================================
# | 2017-2018     | Male               | Engineering and Technology | UNIVERSIDAD POLITECNICA DE MADRID | ES      | 726          | Sending   |

grouping_columns = ['Academic Year', "Participant Gender", "Field of Education"]


def _participants_by_direction(exchanges, direction, group_by):
    """Sum participants per organization/country for one side of the exchange.

    Parameters
    ----------
    exchanges : pandas.DataFrame
        Frame containing the `group_by` columns, "Participants", and the
        "<direction> Organization" / "<direction> Country Code" columns.
    direction : str
        "Sending" or "Receiving"; selects the organization/country columns and is
        stored verbatim in the resulting "Direction" column.
    group_by : list of str
        Additional columns to group by (placed before organization and country).

    Returns
    -------
    pandas.DataFrame
        Columns: `group_by` columns, "Organization", "Country", "Participants", "Direction".
    """
    organization_column = f"{direction} Organization"
    country_column = f"{direction} Country Code"
    keys = group_by + [organization_column, country_column]
    return (
        exchanges[keys + ["Participants"]]
        .groupby(keys)
        .agg({"Participants": "sum"})
        .reset_index()
        .rename(columns={organization_column: "Organization", country_column: "Country"})
        .assign(Direction=direction)
    )


sending = _participants_by_direction(df, "Sending", grouping_columns)
receiving = _participants_by_direction(df, "Receiving", grouping_columns)

# stack both directions, largest groups first, with a fresh 0..n-1 index
visualization_1 = (
    pd.concat([sending, receiving], axis=0)
    .sort_values("Participants", ascending=False)
    .reset_index(drop=True)
)
# to_csv keeps its default index=True, so the row index is written as the first (unnamed) column
visualization_1.to_csv("../dataset/vizualization_1.csv")

In [153]:
# display the aggregated frame for visualization 1 (sorted by "Participants", descending)
visualization_1

Unnamed: 0,Academic Year,Participant Gender,Field of Education,Organization,Country,Participants,Direction
0,2017-2018,Male,Engineering and Technology,UNIVERSIDAD POLITECNICA DE MADRID,ES,726,Sending
1,2016-2017,Male,Engineering and Technology,UNIVERSIDAD POLITECNICA DE MADRID,ES,694,Sending
2,2016-2017,Male,Engineering and Technology,UNIVERSITAT POLITECNICA DE CATALUNYA,ES,654,Sending
3,2019-2020,Female,"Humanities, Arts and Social Science",ALMA MATER STUDIORUM - UNIVERSITA DI BOLOGNA,IT,636,Sending
4,2015-2016,Male,Engineering and Technology,UNIVERSITAT POLITECNICA DE CATALUNYA,ES,615,Sending
...,...,...,...,...,...,...,...
163858,2017-2018,Female,Other,UNIWERSYTET ARTYSTYCZNY W POZNANIU,PL,1,Sending
163859,2016-2017,Male,Languages,NORD UNIVERSITET,NO,1,Receiving
163860,2018-2019,Female,Languages,Manisa Celal Bayar University,TR,1,Receiving
163861,2017-2018,Female,Other,UNIWERSYTET HUMANISTYCZNO-PRZYRODNICZY IM. JAN...,PL,1,Sending


In [154]:
# visualization 2 - chord diagram: one row per (sending country, receiving country) pair
# holding the total number of participants, i.e. {from: c1, to: c2, participants: X}
country_pair_keys = ["Sending Country Code", "Receiving Country Code"]
visualization_2 = (
    df[country_pair_keys + ["Participants"]]
    .groupby(country_pair_keys)
    .agg({"Participants": "sum"})
    .sort_values("Participants", ascending=False)
    .reset_index()
)
visualization_2.to_csv("../dataset/vizualization_2.csv")
visualization_2

Unnamed: 0,Sending Country Code,Receiving Country Code,Participants
0,IT,ES,48445
1,ES,IT,34643
2,DE,ES,27863
3,FR,ES,27188
4,DE,FR,24358
...,...,...,...
981,TR,RS,1
982,MT,BG,1
983,AT,MK,1
984,LI,SI,1


In [155]:
# visualization 3: popularity of fields per year, i.e. total participants for every
# (academic year, field of education) combination, ordered by year and then field
year_field_keys = ["Academic Year", "Field of Education"]
visualization_3 = (
    df[year_field_keys + ["Participants"]]
    .groupby(year_field_keys)
    .agg({"Participants": "sum"})
    .sort_values(year_field_keys)
    .reset_index()
)
visualization_3.to_csv("../dataset/vizualization_3.csv")
visualization_3

Unnamed: 0,Academic Year,Field of Education,Participants
0,2014-2015,Economics and Business,35818
1,2014-2015,Engineering and Technology,23552
2,2014-2015,"Humanities, Arts and Social Science",37220
3,2014-2015,Languages,20361
4,2014-2015,Medicine and Health,7600
5,2014-2015,Natural Sciences and Mathematics,6431
6,2014-2015,Not specified,402
7,2014-2015,Other,10683
8,2015-2016,Economics and Business,53693
9,2015-2016,Engineering and Technology,35840


In [156]:
# Visualization 4: bubble chart - bubble sizes based on total participants or PageRank scores
# TODO