# An example of Network Optimization (Airlines)

## Import the ***libraries*** and ***dataset***

In [1]:
# import the libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Load the flights table from the CSV that sits next to the notebook.
# No 'header=None' is passed, so pandas uses the first row of the file as the column names.
data = pd.read_csv(filepath_or_buffer='airlines_network_optimization.csv')
# Show the raw table before any preprocessing.
print(data)

    year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0   2013      2   26    1807.0            1630       97.0    1956.0   
1   2013      8   17    1459.0            1445       14.0    1801.0   
2   2013      2   13    1812.0            1815       -3.0    2055.0   
3   2013      4   11    2122.0            2115        7.0    2339.0   
4   2013      8    5    1832.0            1835       -3.0    2145.0   
5   2013      6   30    1500.0            1505       -5.0    1751.0   
6   2013      2   14    1442.0            1445       -3.0    1833.0   
7   2013      7   25     752.0             755       -3.0    1037.0   
8   2013      7   10     557.0             600       -3.0     725.0   
9   2013     12   13    1907.0            1915       -8.0    2155.0   
10  2013      1   28    1455.0            1500       -5.0    1647.0   
11  2013      9    6     903.0             912       -9.0    1051.0   
12  2013      8   19       NaN             620        NaN       NaN   
13  20

## Preprocessing the dataset/ Feature Engineering

In [2]:
def _hhmm_to_clock(hhmm):
    """Format a Series of HHMM-style integers as 'H:MM:00' strings.

    The last two digits are taken as the minutes and whatever precedes them as
    the hour, e.g. 1630 -> '16:30:00' and 755 -> '7:55:00'.

    Parameters
    ----------
    hhmm : pandas.Series
        Integer clock times such as 1630 or 755.

    Returns
    -------
    pandas.Series
        Strings of the form 'H:MM:00'. A value with fewer than two digits
        (e.g. the 0 substituted for a missing time) has no two-digit tail, so
        the minutes are NaN and the whole result for that row is NaN.
    """
    digits = hhmm.astype(str)
    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the minutes in the hour part
    # (e.g. '1630:30:00'). Raw strings avoid the invalid '\d' escape sequence.
    hours = digits.str.replace(r'(\d{2}$)', '', regex=True)
    minutes = digits.str.extract(r'(\d{2}$)', expand=False)
    return hours + ':' + minutes + ':00'


# converting sched_dep_time to 'std' - Scheduled time of departure
data['std'] = _hhmm_to_clock(data.sched_dep_time)
# converting sched_arr_time to 'sta' - Scheduled time of arrival
data['sta'] = _hhmm_to_clock(data.sched_arr_time)

# Actual times can be missing, so NaN is filled with 0 first to allow the
# integer cast; those rows end up as NaN in the new columns.
# converting dep_time to 'atd' - Actual time of departure
data['atd'] = _hhmm_to_clock(data.dep_time.fillna(0).astype(np.int64))
# converting arr_time to 'ata' - Actual time of arrival
data['ata'] = _hhmm_to_clock(data.arr_time.fillna(0).astype(np.int64))

# combine the three calendar columns into a single datetime column
data['date'] = pd.to_datetime(data[['year', 'month', 'day']])
# finally we drop the columns we don't need
data = data.drop(columns=['year', 'month', 'day'])

data['std']  creates a new column of data named 'std'

sched_dep_time.astype(str)   take values from 'sched_dep_time' as string type

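The syntax of Python's built-in replace() is: str.replace(old, new, [count]). The pandas method used here, Series.str.replace(pat, repl), applies the replacement to every element of the column and can treat pat as a regular expression.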

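str.replace('(\d{2}$)', '')  replaces the last 2 digits (the minutes; '$' anchors the match to the end of the string) with '', i.e. removes them so that only the hour digits remain. For example '1630' becomes '16'. The pattern has to be interpreted as a regular expression for this to work.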

+':' add a ':' after that

str.extract('(\d{2}$)')    extracts the last 2 digits (the minutes) and adds them after the ':'

expand=False  means it will return a Series. expand=True would return a DataFrame with one column

add :00 after that

fillna(0)    replaces all NaN elements with 0. It was absent in the previous lines of code because those columns hold the scheduled departure and arrival times, which cannot be null. But if a flight is cancelled or fails to arrive, there is no actual departure or arrival time, so the value is N/A. fillna(0) replaces those missing values with zero so that the column can be converted to integers.

pd.to_datetime(data[['year', 'month', 'day']]) converts to datetime format

data.drop(columns = ['year', 'month', 'day'])   removes the year, month and day column

In [3]:
# Print the table again to inspect the result of the preprocessing cell.
print(data)

    dep_time  sched_dep_time  dep_delay  arr_time  sched_arr_time  arr_delay  \
0     1807.0            1630       97.0    1956.0            1837       79.0   
1     1459.0            1445       14.0    1801.0            1747       14.0   
2     1812.0            1815       -3.0    2055.0            2125      -30.0   
3     2122.0            2115        7.0    2339.0            2353      -14.0   
4     1832.0            1835       -3.0    2145.0            2155      -10.0   
5     1500.0            1505       -5.0    1751.0            1650       61.0   
6     1442.0            1445       -3.0    1833.0            1747       46.0   
7      752.0             755       -3.0    1037.0            1057      -20.0   
8      557.0             600       -3.0     725.0             715       10.0   
9     1907.0            1915       -8.0    2155.0            2219      -24.0   
10    1455.0            1500       -5.0    1647.0            1655       -8.0   
11     903.0             912       -9.0 

## Formulate the Network which will be optimized

In [4]:
import networkx as nx
# Build the route graph: each row of `data` contributes an edge between its
# 'origin' and 'dest' airports, and edge_attr=True attaches the remaining
# columns of that row to the edge as attributes.
# No create_using is passed, so networkx picks its default graph type.
# detail documentation of networkx https://networkx.github.io/documentation/networkx-1.7/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
FG = nx.from_pandas_edgelist(
    data,
    source='origin',
    target='dest',
    edge_attr=True,
)

In [5]:
# Jupyter only echoes the last expression of a cell, so the node and edge
# listings are printed explicitly instead of being computed and discarded.
print('Nodes:', list(FG.nodes()))
print('Edges:', list(FG.edges()))
# Quick view of the Graph. As expected we see 3 very busy airports
nx.draw_networkx(FG, with_labels=True, node_size=600, node_color='y')

In [6]:
# Summary statistics of the flight network. Jupyter only echoes the last
# expression of a cell, so the first three metrics are printed explicitly
# instead of being computed and silently discarded.

# Notice the 3 airports from which all of our 100 rows of data originate
print('Degree centrality:', nx.algorithms.degree_centrality(FG))
# Average edge density of the Graph
print('Density:', nx.density(FG))
# Average shortest path length for ALL paths in the Graph
print('Average shortest path length:', nx.average_shortest_path_length(FG))
# For a node of degree k - What is the average of its neighbours' degree?
# Kept as the last expression so it remains the cell's displayed result.
nx.average_degree_connectivity(FG)

{20: 1.95, 1: 19.307692307692307, 2: 19.0625, 17: 2.0588235294117645, 3: 19.0}

## Shortest path between JAX to DFW

In [7]:
# Let us find all the paths available
for path in nx.all_simple_paths(FG, source='JAX', target='DFW'):
    print(path)

# Let us find the dijkstra path from JAX to DFW.
# You can read more in-depth on how dijkstra works from this resource - https://courses.csail.mit.edu/6.006/fall11/lectures/lecture16.pdf
# No weight= argument is given, so networkx falls back to its default edge weight
# (presumably this amounts to the fewest hops here - TODO confirm no 'weight' column exists).
dijpath = nx.dijkstra_path(FG, source='JAX', target='DFW')
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('Dijkstra path (default weight):', dijpath)

# Let us try to find the dijkstra path weighted by airtime (approximate case)
# NOTE(review): edges built from rows with a missing air_time would carry NaN weights - verify.
shortpath = nx.dijkstra_path(FG, source='JAX', target='DFW', weight='air_time')
# Kept as the last expression so it remains the cell's displayed result.
shortpath

['JAX', 'JFK', 'DEN', 'LGA', 'ORD', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'PBI', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'IAD', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'MIA', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'RDU', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'TPA', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'MSP', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'MCO', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'CVG', 'EWR', 'DFW']
['JAX', 'JFK', 'DEN', 'LGA', 'IAH', 'EWR', 'DFW']
['JAX', 'JFK', 'SEA', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'ORD', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'PBI', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'IAD', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'MIA', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'RDU', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'TPA', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'MSP', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'CVG', 'EWR', 'DFW']
['JAX', 'JFK', 'MCO', 'LGA', 'IAH', 'EWR', 'DFW']
['JAX', 'JFK',

['JAX', 'JFK', 'BOS', 'EWR', 'DFW']

Note:  this code is inspired by www.analyticsvidhya.com/blog/2018/04/introduction-to-graph-theory-network-analysis-python-codes/