In [2]:
import json
import networkx as nx
# Count flights per directed route.
#
# The JSON data is grouped by arrival airport: iterating `data` yields one
# group of flight records per arrival airport (assumed to be a list of lists
# of dicts -- TODO confirm against the script that produced data.json).
#
# A route is a directional, weighted relationship, stored as
#     routes[arrival_airport][departure_airport] = number_of_flights
routes = {}

# Read the whole file first so the handle is closed before processing starts.
with open("data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

for arrival_flights in data:
    for flight in arrival_flights:
        dept_airport = flight['estDepartureAirport']
        arr_airport = flight['estArrivalAirport']

        # Skip flights with an unknown departure, a missing/empty arrival,
        # or identical endpoints (took off and landed at the same airport).
        if dept_airport is None or not arr_airport or dept_airport == arr_airport:
            continue

        sources = routes.setdefault(arr_airport, {})
        sources[dept_airport] = sources.get(dept_airport, 0) + 1

print(routes)

{'LOWG': {'LOAG': 1, 'EDDM': 11, 'LZKZ': 1, 'LOWW': 11, 'LTAI': 3, 'EDDF': 13, 'EDFB': 1, 'LOGK': 1, 'LCLK': 1, 'EDLW': 1, 'LEPA': 1, 'LSZH': 4, 'EDGS': 1, 'LJBL': 2, 'EDLA': 1, 'EDDL': 2, 'EDDB': 2, 'EGKB': 1, 'LHBP': 1, 'LGKO': 1, 'LHSM': 1, 'LOXZ': 2, 'LOWS': 1, 'LOAN': 1, 'EDDH': 2, 'EDTM': 1, 'EDQM': 1, 'LFMN': 1, 'LJMB': 1, 'LOKW': 1, 'LFMD': 1, 'LFKC': 1, 'EKSB': 1, 'LSZR': 1, 'EDDC': 1, 'LIPZ': 1, 'LGRP': 1, 'LIPV': 1}, 'LOWI': {'LSZM': 1, 'EDDK': 1, 'LGKO': 1, 'LIPX': 1, 'EHAM': 3, 'LOWW': 10, 'ETHN': 1, 'LSZB': 1, 'LFKJ': 1, 'LGRP': 1, 'LKPM': 1, 'EDFE': 1, 'EDML': 1, 'EGKK': 2, 'EDDH': 2, 'LEBL': 2, 'LFLB': 1, 'LSGG': 1, 'LKPR': 1, 'EBAW': 1, 'LFMD': 1, 'LSZH': 2, 'LIRA': 1, 'EDJA': 1, 'LIPB': 1, 'LDLO': 1, 'LOWS': 1, 'LFKC': 1, 'EGGW': 1, 'EDSB': 1, 'LFBO': 1, 'EGBB': 1, 'EDDP': 1, 'EGCC': 1, 'ETHA': 1, 'EGPH': 1, 'EDDB': 1}, 'LOWK': {'LEAL': 1, 'LZIB': 1, 'LOWW': 2}, 'LOWL': {'LOLO': 1, 'LOLH': 1, 'LTAI': 3, 'EDDP': 2, 'LHBP': 2, 'EDLV': 1, 'EGSS': 1, 'EDDK': 1, 'EDDG': 1,

In [3]:
# Build a directed graph with one node per airport and one edge per route.
# Edges point departure -> arrival, so routes[arrival][departure] becomes the
# edge (departure, arrival), weighted by the number of flights on that route.
g = nx.DiGraph()

for arr_airport, sources in routes.items():
    g.add_node(arr_airport, label=arr_airport)
    for source, count in sources.items():
        g.add_node(source, label=source)
        # (source, arr_airport) is unique because both are dict keys in
        # `routes`, so no duplicate-edge bookkeeping is required.
        g.add_edge(source, arr_airport, weight=count, label=f"{count}")

nx.write_graphml(g, "graph.graphml")

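Apparent error: in the GraphML file it looked as if the edge directions were reversed. After some thought, it seems that networkx simply organizes nodes and edges differently when it lists them or writes them to a file. For example, printing `routes` shows that flights from EDDM arrived at LOWG 11 times (`routes['LOWG']['EDDM'] == 11`, i.e. the edge EDDM -> LOWG with weight 11). However, when I printed the edges with data, the first thing I saw was LOWG -> EDDM with a different value. That is a separate edge in the opposite direction (flights from LOWG arriving at EDDM, taken from `routes['EDDM']['LOWG']`); it presumably appears first because edges are listed grouped by their source node and LOWG was the first node added. I initially thought the graph was not being written correctly, but upon further inspection the nodes and edges are just organized differently.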

In [11]:
"""why degree centrality (in degree)?
The question is which city gets the busiest.
Regular degree centrality does not account for weights (e.g. the number of arrivals).
Note that nx.in_degree_centrality is likewise unweighted: it measures the share of other airports that have at least one flight into this airport, not the number of arrivals.
Eigenvector centrality does not account for weighting by default in networkx, but PageRank does.
PageRank gives more weight to neighbors of important nodes, but if there was an airport that had a lot of layovers on the way to the final
destination, this would be giving more weight to airports that happen to have direct connections to more populous airports. Since we're considering
which cities are most popular, not which airports, whether an airport is neighboring an important airport seems redundant.
Betweenness centrality - considering the fact that layovers are common, an airport with high betweenness centrality could just mean there are a lot of layovers there, not a lot of actual visitors.
Closeness centrality - more closely indicates how easily each city can reach other cities.
"""

# Score every airport in the route graph by in-degree centrality and list
# each airport alongside its score.
centrality = nx.in_degree_centrality(g)
for airport, score in centrality.items():
    print(airport, score)

# Then show only the `top` highest-scoring airports, best first.
top = 3
ranked = sorted(centrality.items(), key=lambda item: item[1], reverse=True)
for node, score in ranked[:top]:
    print(node, score)

LOWG 0.03653846153846154
LOAG 0.0
EDDM 0.16153846153846155
LZKZ 0.014423076923076924
LOWW 0.14807692307692308
LTAI 0.1076923076923077
EDDF 0.18942307692307694
EDFB 0.0
LOGK 0.0
LCLK 0.06538461538461539
EDLW 0.057692307692307696
LEPA 0.11153846153846154
LSZH 0.1701923076923077
EDGS 0.0
LJBL 0.0
EDLA 0.0
EDDL 0.11730769230769231
EDDB 0.15096153846153848
EGKB 0.0
LHBP 0.11057692307692309
LGKO 0.07019230769230769
LHSM 0.0
LOXZ 0.0
LOWS 0.06346153846153846
LOAN 0.0
EDDH 0.11442307692307693
EDTM 0.0
EDQM 0.0
LFMN 0.15961538461538463
LJMB 0.0
LOKW 0.0
LFMD 0.0
LFKC 0.0
EKSB 0.0
LSZR 0.0
EDDC 0.032692307692307694
LIPZ 0.08653846153846154
LGRP 0.10576923076923078
LIPV 0.0
LOWI 0.035576923076923075
LSZM 0.0
EDDK 0.1125
LIPX 0.05
EHAM 0.19230769230769232
ETHN 0.0
LSZB 0.0
LFKJ 0.03173076923076923
LKPM 0.0
EDFE 0.0
EDML 0.0
EGKK 0.11923076923076924
LEBL 0.1528846153846154
LFLB 0.0
LSGG 0.1375
LKPR 0.13365384615384615
EBAW 0.05
LIRA 0.06826923076923078
EDJA 0.058653846153846154
LIPB 0.0
LDLO 0.0
EG

"Most important" nodes: EHAM, EDDF, EGSS

What airports are these?
EHAM - AMS - Amsterdam Schiphol
EDDF - FRA - Frankfurt 
EGSS - STN - London Stansted

In [8]:
# Keep the 100 airports with the highest centrality score and export the
# subgraph induced on them.
# NOTE(review): this cell depends on `g` and `centrality` from other cells;
# run the notebook top to bottom so both exist before this cell executes.
ranked_airports = sorted(
    centrality,
    key=lambda airport: centrality[airport],
    reverse=True,
)
top_100 = ranked_airports[:100]

# Copy the subgraph so the exported graph is independent of `g`.
g_top100 = g.subgraph(top_100).copy()

nx.write_graphml(g_top100, "top100.graphml")