# Final project

## Loading data

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import scipy
from scipy import sparse

In [2]:
# Column layouts of the two header-less data files.
airport_columns = ["Airport ID", "Name", "City", "Country",
                   "IATA", "ICAO", "Latitude", "Longitude", "Altitude",
                   "Timezone", "DST", "Tz db time zone", "Type", "Source"]
route_columns = ["Airline", "Airline_ID", "Source_airport", "Source_airport_ID",
                 "Destination_airport", "Destination_airport_ID",
                 "Codeshare", "Stops", "Equipment"]

airports_ds = pd.read_csv('data/airports.dat', header=None, encoding='utf-8',
                          names=airport_columns)
# In routes.dat the literal \N is read as a missing value.
routes = pd.read_csv('data/routes.dat', sep=',', encoding='utf-8', engine='python',
                     names=route_columns, na_values=['\\N'])

# We only keep the "active" airports for our nodes: every airport ID listed
# in the routes table, either as a source or as a destination.
airports = pd.concat(
    [routes["Source_airport_ID"], routes["Destination_airport_ID"]], axis=0
).drop_duplicates()

# Rows of the airport table that correspond to the airports above.
is_active = airports_ds['Airport ID'].isin(airports)
features = airports_ds[is_active]

# Renumber the kept rows 0..n-1 and store the same numbering in `newidx`,
# which is used later as the node index in the adjacency matrices.
row_numbers = pd.Series(range(len(features)))
features = features.set_index(row_numbers).assign(newidx=row_numbers.values)


print("Number of airports in the airports.dat file:", len(airports_ds))
print("Number of \"active\" airports in the routes.dat file:", len(features))
features.head(4)

Number of airports in the airports.dat file: 7184
Number of "active" airports in the routes.dat file: 3186


Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz db time zone,Type,Source,newidx
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports,0
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports,1
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports,2
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports,3


In [3]:
# Build, for every route, the node index (`newidx`) of its source and of its
# destination airport, looked up through the airports' IATA codes.
idx_IATA = features.loc[:, ['IATA', 'newidx']]
routes_srcdest = routes.loc[:, ['Source_airport', 'Destination_airport']]

# Left joins: every route is preserved even when no airport in `features`
# carries its code; the matching index is then NaN.
idx_src = pd.merge(routes_srcdest, idx_IATA, how='left',
                   left_on='Source_airport', right_on='IATA')
idx_src_dest = (
    pd.merge(idx_src, idx_IATA, how='left',
             left_on='Destination_airport', right_on='IATA',
             suffixes=['_src', '_dest'])
    .drop(columns=["IATA_src", "IATA_dest"])
)

In [4]:
n_nodes = len(features)

# Keep only the routes whose two endpoints were matched to an active airport
# (the left joins leave NaN where no match was found).
matched_routes = idx_src_dest[['newidx_src', 'newidx_dest']].dropna()
src_nodes = matched_routes['newidx_src'].astype(int).values
dest_nodes = matched_routes['newidx_dest'].astype(int).values

# Weighted adjacency: adjacency[i, j] is the number of routes from airport i
# to airport j. `np.add.at` accumulates correctly when the same (i, j) pair
# occurs several times, which a plain `adjacency[i, j] += 1` on index arrays
# would not.
adjacency = np.zeros((n_nodes, n_nodes), dtype=int)
np.add.at(adjacency, (src_nodes, dest_nodes), 1)

# Unweighted adjacency: 1 wherever at least one route exists.
adjacency_uw = (adjacency > 0).astype(int)

adjacency.max()

20

In [5]:
# adjacency[i, j] counts the routes from airport i to airport j, so `a`
# (row sums) holds outgoing route counts and `b` (column sums) incoming ones.
a = adjacency.sum(axis=1)
b = adjacency.sum(axis=0)

# Despite the names, `zero_in` lists the nodes with a zero row sum and
# `zero_out` the nodes with a zero column sum. Only their intersection is
# used below, so the result does not depend on the naming.
zero_in = np.argwhere(a == 0)
zero_out = np.argwhere(b == 0)

print("The following values are indices of airports which are connected to nothing :")

# A node is isolated when it appears in both lists.
no_connection = [node[0] for node in zero_in if np.isin(node[0], zero_out)]

print(no_connection)

print("\nTheses airports are the following:")

is_isolated = features['newidx'].isin(no_connection)
print(features.loc[is_isolated, ['IATA', 'City', 'Country']])

The following values are indices of airports which are connected to nothing :
[1522, 1523, 2643, 3027, 3029, 3040, 3088]

Theses airports are the following:
     IATA              City         Country
1522  RJA       Rajahmundry           India
1523  TIR          Tirupeti           India
2643  LPS             Lopez   United States
3027  AKI             Akiak   United States
3029  TKJ               Tok   United States
3040  SPB  Charlotte Amalie  Virgin Islands
3088  AGM      Angmagssalik       Greenland


In [6]:
# Remove the isolated airports from `features` and renumber the remaining rows.
# `n_nodes` is the number of rows before the removal, so the guard makes this
# cell safe to run several times: once the rows are gone the condition is
# false and no further rows are dropped.
if len(features) == n_nodes:
    features = features.drop(features.index[no_connection])
    features = features.reset_index(drop=True)
    features = features.assign(newidx=pd.Series(range(len(features))).values)

In [7]:
# Remove the rows and the columns of the isolated airports from both matrices.
# Each matrix is trimmed only while it still has its original size (`n_nodes`),
# so running this cell several times does not delete further rows and columns.
if adjacency.shape[0] == n_nodes:
    adjacency = np.delete(np.delete(adjacency, no_connection, 0), no_connection, 1)
if adjacency_uw.shape[0] == n_nodes:
    adjacency_uw = np.delete(np.delete(adjacency_uw, no_connection, 0), no_connection, 1)

In [18]:
# Remove self-loops: set the diagonal of both matrices to zero, in place.
np.fill_diagonal(adjacency, 0)
np.fill_diagonal(adjacency_uw, 0)

In [19]:
# Build the symmetric matrices.
# Weighted version: routes between two airports, both directions summed.
adjacency_sym = adjacency + adjacency.T
# Unweighted version: 1 wherever at least one route exists in either direction.
# It is built as a new array: assigning `adjacency_sym` to a second name and
# then writing into it in place would also overwrite the weights stored in
# `adjacency_sym`, because both names would refer to the same array.
adjacency_uw_sym = (adjacency_sym > 0).astype(adjacency_sym.dtype)

In [101]:
# The adjacency matrix used for the rest of the project: symmetric and unweighted.
# This binds a second name to the same array as `adjacency_uw_sym`; no copy is made.
adj = adjacency_uw_sym

Extract the adjacency matrix of the largest connected component, either by: <br>
using point 8 of milestone 1, or <br>
using nx: create an nx graph, extract the largest component, and find a way to keep track of which node belongs to which airport.

## Sample part of the graph for interesting insights

In [108]:
def sample_by_degree(adj = adj, threshold = 0, way='gt', induced=False) :
    """Select the nodes of a graph whose degree passes a threshold.

    Parameters
    ----------
    adj : 2-D numpy array
        Adjacency matrix; the degree of a node is the sum of its row.
        The default is the module-level `adj` as it was when this function
        was defined (re-run the definition after changing `adj`).
    threshold : number
        Degree value the nodes are compared against.
    way : {'gt', 'lt', 'exact'}
        'gt' keeps nodes with degree >= threshold, 'lt' keeps nodes with
        degree <= threshold, 'exact' keeps nodes with degree == threshold.
    induced : bool, default False
        If False (original behaviour), only the rows of the selected nodes
        are kept, so the result has one row per selected node and one column
        per node of `adj`. If True, the columns are filtered as well and the
        result is the square adjacency matrix restricted to the selected
        nodes.

    Returns
    -------
    new_adj : numpy array
        The filtered adjacency matrix (see `induced`).
    new_deg : numpy array
        Row sums of `new_adj`.
    airports : numpy array
        Index labels, taken from the module-level `features` frame, of the
        selected nodes.

    Raises
    ------
    ValueError
        If `way` is not one of 'gt', 'lt', 'exact'.

    The number of selected nodes is printed as a side effect.
    """
    degrees = np.sum(adj, axis = 1)

    # One boolean mask drives both the index list and the row selection, so
    # the two can never disagree.
    if way == 'gt' :
        keep = degrees >= threshold
    elif way == 'lt' :
        keep = degrees <= threshold
    elif way == 'exact' :
        keep = degrees == threshold
    else :
        raise ValueError("'way' parameter should be either gt or lt or exact, "
                         + "respectively for greater than, less than, or exactly the threshold")

    select_idx = np.where(keep)[0]
    new_adj = adj[select_idx]
    if induced :
        new_adj = new_adj[:, select_idx]

    airports = features.index[select_idx].values
    new_deg = np.sum(new_adj, axis = 1)
    print(new_adj.shape[0])

    return new_adj, new_deg, airports

### Very connected (deg >= 20)

In [128]:
# Airports whose degree is at least 20.
adj_gt20, deg_gt20, airports_gt20 = sample_by_degree(threshold=20, way='gt')
# Kept for later (sparse versions of the sampled matrices):
# adjacency_csr = sparse.csr_matrix(adj_gt20)
# degree_matrix_csc = sparse.diags(deg_gt20, format="csc")

451


### Very very connected (deg >= 170)

In [130]:
# Airports whose degree is at least 170, displayed with all their features.
adj_gt170, deg_gt170, airports_gt170 = sample_by_degree(threshold=170, way='gt')
features.iloc[airports_gt170]

12


Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz db time zone,Type,Source,newidx
191,340,Frankfurt am Main International Airport,Frankfurt,Germany,FRA,EDDF,50.033333,8.570556,364,1,E,Europe/Berlin,airport,OurAirports,191
196,346,Munich International Airport,Munich,Germany,MUC,EDDM,48.353802,11.7861,1487,1,E,Europe/Berlin,airport,OurAirports,196
255,507,London Heathrow Airport,London,United Kingdom,LHR,EGLL,51.4706,-0.461941,83,0,E,Europe/London,airport,OurAirports,255
282,580,Amsterdam Airport Schiphol,Amsterdam,Netherlands,AMS,EHAM,52.308601,4.76389,-11,1,E,Europe/Amsterdam,airport,OurAirports,282
628,1382,Charles de Gaulle International Airport,Paris,France,CDG,LFPG,49.012798,2.55,392,1,E,Europe/Paris,airport,OurAirports,628
770,1701,Atatürk International Airport,Istanbul,Turkey,IST,LTBA,40.976898,28.8146,163,3,E,Europe/Istanbul,airport,OurAirports,770
1017,2188,Dubai International Airport,Dubai,United Arab Emirates,DXB,OMDB,25.2528,55.364399,62,4,U,Asia/Dubai,airport,OurAirports,1017
1642,3364,Beijing Capital International Airport,Beijing,China,PEK,ZBAA,40.080101,116.584999,116,8,U,Asia/Shanghai,airport,OurAirports,1642
1800,3670,Dallas Fort Worth International Airport,Dallas-Fort Worth,United States,DFW,KDFW,32.896801,-97.038002,607,-6,A,America/Chicago,airport,OurAirports,1800
1809,3682,Hartsfield Jackson Atlanta International Airport,Atlanta,United States,ATL,KATL,33.6367,-84.428101,1026,-5,A,America/New_York,airport,OurAirports,1809


### Very weakly connected (deg <= 1)

In [134]:
# Airports whose degree is at most 1 (maybe not so useful); first 20 shown.
adj_lt1, deg_lt1, airports_lt1 = sample_by_degree(threshold=1, way='lt')
# Kept for later (sparse versions of the sampled matrices):
# adjacency_csr = sparse.csr_matrix(adj_lt1)
# degree_matrix_csc = sparse.diags(deg_lt1, format="csc")
features.iloc[airports_lt1].head(20)

709


Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz db time zone,Type,Source,newidx
9,10,Thule Air Base,Thule,Greenland,THU,BGTL,76.531197,-68.703201,251,-4,E,America/Thule,airport,OurAirports,9
10,11,Akureyri Airport,Akureyri,Iceland,AEY,BIAR,65.660004,-18.072701,6,0,N,Atlantic/Reykjavik,airport,OurAirports,10
11,12,Egilsstaðir Airport,Egilsstadir,Iceland,EGS,BIEG,65.283302,-14.4014,76,0,N,Atlantic/Reykjavik,airport,OurAirports,11
12,15,Ísafjörður Airport,Isafjordur,Iceland,IFJ,BIIS,66.058098,-23.1353,8,0,N,Atlantic/Reykjavik,airport,OurAirports,12
21,31,Brandon Municipal Airport,Brandon,Canada,YBR,CYBR,49.91,-99.951897,1343,-6,A,America/Winnipeg,airport,OurAirports,21
37,60,Fort Simpson Airport,Fort Simpson,Canada,YFS,CYFS,61.760201,-121.237,555,-7,A,America/Edmonton,airport,OurAirports,37
38,61,Kingston Norman Rogers Airport,Kingston,Canada,YGK,CYGK,44.2253,-76.596901,305,-5,A,America/Toronto,airport,OurAirports,38
54,85,Lloydminster Airport,Lloydminster,Canada,YLL,CYLL,53.3092,-110.072998,2193,-7,A,America/Edmonton,airport,OurAirports,54
63,108,Prince Rupert Airport,Prince Pupert,Canada,YPR,CYPR,54.286098,-130.445007,116,-8,A,America/Vancouver,airport,OurAirports,63
69,116,Lethbridge County Airport,Lethbridge,Canada,YQL,CYQL,49.630299,-112.800003,3048,-7,A,America/Edmonton,airport,OurAirports,69


## Create labels

In [None]:
# 1. South-North
# 2. Continent
# 3. Laplacian clustering
# 4. Degree of nodes (importance of airports)
# 5. Use your imagination!

In [None]:
# Build a solid embedding: e.g. from the nodes with degree > 20
# Plot a signal on it and check

In [127]:
# Build an embedded version in 2D
# Build the clusters with scikit-learn (with 3 or 6 clusters, see correction)
# Put signals on the graph and see how they light up the clustered plot,
# and try to recover the same plot (with colors) with a good label (signal) > try several

# Small tips:
# try an equivalent of seaborn for plots
# (...)

# Our originality will be in a good choice of labels!

### Find hypothesis