In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
# Build a lookup from country code to country name.
# Country codes taken from: https://datahub.io/core/country-list
with open("../dataset/country_codes.json", "r") as codes_file:
    country_codes = json.load(codes_file)

code_to_country = {}
for record in country_codes:
    code_to_country[record["Code"]] = record["Name"]

# "UK" and "EL" were missing in the country codes list, so add them by hand.
code_to_country.update({"UK": "Utd. Kingdom", "EL": "Greece"})

In [None]:
# load the dataset of student exchanges
# (the later cells derive all of their tables from this `df` frame)
df = pd.read_csv("../dataset/erasmus-only-study-exchange.csv")

In [None]:
# Translate the existing "Field of Education" entries into broader categories.
# The raw-entry -> numeric label assignment was done manually (see
# field_of_study_mapping.csv); the dict below names each numeric label.
field_of_study_mapping = {
    1: "Natural Sciences and Mathematics",
    2: "Engineering and Technology",
    3: "Humanities, Arts and Social Science",
    4: "Medicine and Health",
    5: "Economics and Business",
    6: "Languages",
    7: "Other",
    8: "Not specified",
}

# The file uses the two-character separator "; ". pandas interprets separators
# longer than one character as regular expressions, which only the Python
# parsing engine supports, so request that engine explicitly instead of
# relying on the implicit fallback (which emits a ParserWarning).
foe_labels = pd.read_csv("../dataset/field_of_study_mapping.csv", sep="; ", engine="python")

# Attach the numeric "Label" to every exchange row (default inner join on the
# shared "Field of Education" column), then overwrite that column with the
# category name. An unknown label raises KeyError rather than passing silently.
# NOTE: this cell overwrites `df` and its "Field of Education" column, so it is
# meant to run once per load; re-run the loading cell before re-running it.
df = pd.merge(df, foe_labels, on="Field of Education")
df["Field of Education"] = df["Label"].apply(lambda label: field_of_study_mapping[label])

In [None]:
# Sanity check: number of rows per "Field of Education" value after the mapping.
df["Field of Education"].value_counts()

In [None]:
# Prepare the dataset for visualization 1: students per country and academic
# year, split by direction (sending/receiving) and gender.
# The table built here has one row per (Academic Year, Country) and the columns:
# | Academic Year | Country | Sending-Female | Sending-Male | Sending-All |
# | Receiving-Female | Receiving-Male | Receiving-All | All-Female | All-Male | All-All |
# It is then nested per country and written to ../dataset/viz1.json.

grouping_columns = ['Academic Year']
merge_keys = grouping_columns + ['Country']


def _participants_by_country(frame, country_column, value_name):
    """Sum "Participants" per academic year and country.

    Parameters
    ----------
    frame : pd.DataFrame
        Exchange rows containing the grouping columns, `country_column`
        and "Participants".
    country_column : str
        Column with the country code to group by; renamed to "Country".
    value_name : str
        Name given to the summed "Participants" column.

    Returns
    -------
    pd.DataFrame
        One row per (academic year, country) with columns
        grouping columns, "Country", `value_name`.
    """
    keys = grouping_columns + [country_column]
    return (
        frame[keys + ["Participants"]]
        .groupby(keys)
        .agg({"Participants": "sum"})
        .reset_index()
        .rename(columns={country_column: "Country", "Participants": value_name})
    )


def _outer_merge(left, right):
    """Outer-join two count tables on year and country; missing counts become 0."""
    return pd.merge(left, right, on=merge_keys, how='outer').reset_index(drop=True).fillna(0)


df_female = df[df["Participant Gender"] == "Female"]
df_male = df[df["Participant Gender"] == "Male"]

# The "-All" tables are aggregated from every row of `df`, not computed as
# female + male, so rows with any other gender value are counted there too.
sending_female = _participants_by_country(df_female, "Sending Country Code", "Sending-Female")
sending_male = _participants_by_country(df_male, "Sending Country Code", "Sending-Male")
sending_all = _participants_by_country(df, "Sending Country Code", "Sending-All")

receiving_female = _participants_by_country(df_female, "Receiving Country Code", "Receiving-Female")
receiving_male = _participants_by_country(df_male, "Receiving Country Code", "Receiving-Male")
receiving_all = _participants_by_country(df, "Receiving Country Code", "Receiving-All")

sending_merge = _outer_merge(_outer_merge(sending_female, sending_male), sending_all)
receiving_merge = _outer_merge(_outer_merge(receiving_female, receiving_male), receiving_all)
visualization_1 = _outer_merge(sending_merge, receiving_merge)

visualization_1["All-Female"] = visualization_1["Sending-Female"] + visualization_1["Receiving-Female"]
visualization_1["All-Male"] = visualization_1["Sending-Male"] + visualization_1["Receiving-Male"]
visualization_1["All-All"] = visualization_1["Sending-All"] + visualization_1["Receiving-All"]

# Map country id to name, then manually adjust names to match the world map.
# Assign the result back instead of calling replace(..., inplace=True) on
# visualization_1["Country"]: that form mutates a temporary Series, which
# pandas warns about and which does not update the frame under Copy-on-Write.
visualization_1["Country"] = (
    visualization_1["Country"]
    .replace(code_to_country)
    .replace({"Czechia" : "Czech Republic", "Serbia" : "Republic of Serbia", "Utd. Kingdom" : "England"})
)

# Nest the yearly rows under each country and write them as JSON records:
# [{"Country": ..., "Yearly-Data": [{"Academic Year": ..., ...}, ...]}, ...]
yearly_columns = ['Academic Year', 'Sending-Female', 'Sending-Male', 'Sending-All',
                  'Receiving-Female', 'Receiving-Male', 'Receiving-All',
                  'All-Female', 'All-Male', 'All-All']
yearly_records = (
    visualization_1.groupby(['Country'])
    .apply(lambda country_rows: country_rows[yearly_columns].to_dict('records'))
    .reset_index()
    .rename(columns={0: 'Yearly-Data'})
)
# to_json returns None when given a path, so there is nothing worth binding.
yearly_records.to_json('../dataset/viz1.json', indent=2, orient='records')

In [None]:
# Display the per-country, per-year table built in the previous cell.
visualization_1

In [None]:
# visualization 2 - chord diagram - for each pair of countries (c1,c2) we want {from: c1, to: c2, participants: X}
country_pair = ["Sending Country Code", "Receiving Country Code"]
visualization_2 = (
    df[country_pair + ["Participants"]]
    .groupby(country_pair)
    .agg({"Participants": "sum"})
    .sort_values("Participants", ascending=False)
    .reset_index()
)

# fix the problem with missing row for RS: add a zero-participant RS -> UK row
# so that RS shows up as a sending country.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the row and works on older versions as well.
rs = {"Sending Country Code": 'RS', "Receiving Country Code": 'UK', "Participants": 0}
visualization_2 = pd.concat([visualization_2, pd.DataFrame([rs])], ignore_index=True)

# visualization_2.to_csv("../dataset/visualization_2.csv")

In [None]:
# Display the (sending country, receiving country, participants) pair table.
visualization_2

In [None]:
# We want the total number of participants (sent + received) for each country,
# and we select only the top 10 countries for the visualization.
sent = (
    visualization_2[["Sending Country Code", "Participants"]]
    .groupby("Sending Country Code")
    .agg({"Participants": "sum"})
    .sort_values("Participants", ascending=False)
    .rename(columns={"Participants": "Sent"})
)

received = (
    visualization_2[["Receiving Country Code", "Participants"]]
    .groupby("Receiving Country Code")
    .agg({"Participants": "sum"})
    .sort_values("Participants", ascending=False)
    .rename(columns={"Participants": "Received"})
)

# DataFrame.join defaults to a left join on the index: only countries that
# appear as senders are kept, and a sender that never receives gets NaN for
# "Received" (and therefore "Total"), which sort_values places last.
counts = sent.join(received)
counts["Total"] = counts["Sent"] + counts["Received"]
counts = counts.sort_values("Total", ascending=False)
top_countries = counts.index.values[:10]

In [None]:
# Keep only the pairs whose sending AND receiving country are both in the top list.
sender_in_top = visualization_2["Sending Country Code"].isin(top_countries)
receiver_in_top = visualization_2["Receiving Country Code"].isin(top_countries)
visualization_2 = visualization_2[sender_in_top & receiver_in_top]
visualization_2

In [None]:
# Reshape the pair list into a table with one row per sending country and one
# column per receiving country; pairs that do not occur are filled with 0.
visualization_2_matrix = visualization_2.pivot(
    index=["Sending Country Code"],
    columns=["Receiving Country Code"],
    values=["Participants"],
).fillna(0)
visualization_2_matrix

In [None]:
# Divide every row by its own total so each sending country's row sums to 1.
participants_per_sender = visualization_2_matrix.sum(axis=1)
visualization_2_matrix_norm = visualization_2_matrix.div(participants_per_sender, axis = 0)
visualization_2_matrix_norm

In [None]:
# Raw (unnormalized) matrix values plus one readable label per matrix row,
# looked up from the sending-country codes in the matrix index.
sending_codes = visualization_2_matrix.index.values
matrix_labels = [code_to_country[country_code] for country_code in sending_codes]
matrix_values = visualization_2_matrix.values

In [None]:
# save the data to json so it can be read in the visualization
# (json is already imported in the notebook's first cell, so no import here)
data = {"matrix": matrix_values.tolist(), "labels": matrix_labels}

# Plain write mode is enough: the file is only written here, never read back.
with open("../docs/data/viz2.json", "w") as f:
    json.dump(data, f)

In [None]:
# visualization 3: popularity of fields per year
# Sum participants per (academic year, field of education), ordered by both keys.
field_keys = ["Academic Year", "Field of Education"]
participants_per_field = (
    df[field_keys + ["Participants"]]
    .groupby(field_keys)
    .agg({"Participants": "sum"})
)
visualization_3 = participants_per_field.sort_values(field_keys).reset_index()
visualization_3.to_csv("../dataset/visualization_3.csv")
visualization_3

In [None]:
# One row per academic year, one column per field of education.
viz3 = visualization_3.pivot(
    index="Academic Year",
    columns="Field of Education",
    values="Participants",
)
# Index by the integer in front of the first "-" of each academic-year label.
viz3.index = [int(academic_year.split("-")[0]) for academic_year in viz3.index]
viz3.index.name = "Year"

In [None]:
# Convert the counts to per-year shares: divide each row by that row's total.
participants_per_year = viz3.sum(axis=1)
viz3 = viz3.div(participants_per_year, axis=0)

In [None]:
# Display the per-year share of each field of education.
viz3

In [None]:
# Write the per-year shares to CSV; the "Year" index is written as the first column.
viz3.to_csv("../docs/data/viz3.csv")

In [None]:
%pwd

In [None]:
# Visualization 4: bubble - bubble sizes based on total participants or pageranks
# `links` has one row per (sending, receiving) organization pair with the summed
# participants in "links", ordered from the largest pair to the smallest.
exchanges = df[["Sending Organization", "Receiving Organization", "Participants"]]
organization_pair = ["Sending Organization", "Receiving Organization"]
links = (
    exchanges.groupby(organization_pair)
    .agg('sum')
    .reset_index()
    .sort_values("Participants", ascending=False)
    .rename(columns={"Sending Organization":"from", "Receiving Organization":"to", "Participants": "links"})
    .reset_index(drop=True)
)

In [None]:
import networkx as nx

# Create a directed graph with one edge per row of `links` ("from" -> "to");
# the summed participants are stored on each edge under the "links" attribute.
G = nx.from_pandas_edgelist(
    links,
    source="from",
    target="to",
    edge_attr="links",
    create_using=nx.DiGraph(),
)

In [None]:
# PageRank score per organization (node) of the exchange graph.
# NOTE(review): no `weight` argument is passed and the edges only carry a
# "links" attribute, so the participant counts are presumably not used as
# edge weights here - confirm whether weight="links" was intended.
pagerank = nx.pagerank(G)

In [None]:
# Betweenness centrality per organization (node) of the exchange graph.
# NOTE(review): likely the slowest cell of the notebook on a large graph - TODO confirm.
betweenness = nx.betweenness_centrality(G)

In [None]:
# Display the computed betweenness values (one entry per organization).
betweenness

In [None]:
# calculate total number of incoming/outgoing students per university
outgoing = links[["from", "links"]].groupby("from").agg("sum").sort_values("links", ascending=False).reset_index().rename(columns={"from": "university", "links":"outgoing"})
incoming = links[["to", "links"]].groupby("to").agg("sum").sort_values("links", ascending=False).reset_index().rename(columns={"to": "university", "links":"incoming"})

In [None]:
# add rankings
# Outer join keeps universities that only send or only receive; the missing
# side is counted as 0.
ranking_df = pd.merge(outgoing, incoming, on="university", how="outer").fillna(0)
ranking_df["total"] = ranking_df["incoming"] + ranking_df["outgoing"]

# Look up both graph metrics for every university, in row order.
universities = ranking_df["university"]
ranking_df["pagerank"] = [pagerank[university] for university in universities]
ranking_df["betweenness"] = [betweenness[university] for university in universities]


In [None]:
# Order universities from the largest to the smallest total (incoming + outgoing)
# and renumber the rows so the index reflects that order.
ranking_df = ranking_df.sort_values("total", ascending=False).reset_index(drop=True)


In [None]:
# Display the ranked university table.
ranking_df

In [None]:
# Write the ranked table to CSV; with the default settings the row index is
# written as an unnamed first column.
ranking_df.to_csv("../dataset/visualization_4.csv")