In [1]:
import pandas as pd
import numpy as np

# helper functions
import cpt_calculator as cpt
TINY = np.finfo(float).tiny

To compute the conditional probability tables, we will obesrve each node in a BFS-like fashion, starting with nodes that do not have parents and then working our way through nodes whose parents we have already observed.

# 0. Nodes Without Parents

In [2]:
print("These are the list of parentless nodes :", cpt.get_parentless_nodes(),". We start by defining a function to reduce redundancy.")

These are the list of parentless nodes : ['MONTH', 'DAY_OF_WEEK'] . We start by defining a function to reduce redundancy.


In [3]:
cpt.data_df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_DELAY,TAXI_OUT,TAXI_IN,CRS_ARR_TIME,ARR_DEL15,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,LATE_AIRCRAFT_DELAY,DelayedDepartureSFO,ScheduledDepartureSFO,DelayedArrivalPHL,ScheduledArrivalPHL
0,11,3,0,-1,1,1,1,0,0,0,0,0,0,0,0,0
1,8,2,0,1,1,0,1,1,0,0,1,1,0,0,0,0
2,8,6,0,1,1,0,1,1,1,0,0,0,0,0,0,1
3,3,7,1,-1,2,0,0,0,0,0,0,0,0,0,0,0
4,5,7,1,1,0,0,0,1,1,0,0,1,0,3,0,2


In [4]:
list(set(cpt.data_df.DEP_DELAY.tolist()))

[0, 1, -1]

## 1. Month

In [5]:
month_df = cpt.parentless_CPT('MONTH')
month_df

Unnamed: 0,MONTH,num,prob,log_p
0,1,50,0.064599,-2.739549
1,2,29,0.037468,-3.284276
2,3,36,0.046512,-3.068053
3,4,49,0.063307,-2.759752
4,5,64,0.082687,-2.492689
5,6,84,0.108527,-2.220755
6,7,99,0.127907,-2.056452
7,8,93,0.120155,-2.118972
8,9,92,0.118863,-2.129783
9,10,72,0.093023,-2.374906


## 2. Day of Week

In [6]:
day_of_week_df = cpt.parentless_CPT('DAY_OF_WEEK')
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,num,prob,log_p
0,1,119,0.153747,-1.872448
1,2,101,0.130491,-2.036451
2,3,94,0.121447,-2.108277
3,4,138,0.178295,-1.724318
4,5,135,0.174419,-1.746297
5,6,73,0.094315,-2.361112
6,7,114,0.147287,-1.915373


# Depth 1: Nodes with Parents

First, we find all the nodes whose values we can compute based on those nodes whose conditional probability tables we have already computed.

In [7]:
cpt.available_nodes()

['ScheduledArrivalPHL', 'CRS_ARR_TIME']

## 3. CRS Arrival Time

In [8]:
arr_time_df = cpt.nonparentless_CPT('CRS_ARR_TIME')

Note: There are 158 rows in the above table, but we should have 168 , which means that 10 row(s) are missing.


In [9]:
# make a complete CPT
temp = [[i, j, k] for i in range(1,13) for j in range(1,8) for k in range(2)]
temp = pd.DataFrame(temp,columns=arr_time_df.columns.tolist()[:-2])
on_li = arr_time_df.columns.tolist()[:-2]
arr_time_df = temp.merge(arr_time_df, on=on_li, how='outer')
# replace NaN values with very small values
arr_time_df['prob'].fillna(TINY, inplace=True)
arr_time_df['log_p'].fillna(np.log(TINY), inplace=True)
arr_time_df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,CRS_ARR_TIME,prob,log_p
0,1,1,0,0.125,-2.079442
1,1,1,1,0.875,-0.133531
2,1,2,0,0.5,-0.693147
3,1,2,1,0.5,-0.693147
4,1,3,0,0.142857,-1.94591


## 4. ScheduledArrivalPHL

0 represents less than 10, 1 represents less than 20, 2 represents less than 30, 3 represents less than 40, 4 represents less than 50.

In [10]:
scheduled_PHL_df = cpt.nonparentless_CPT('ScheduledArrivalPHL')

Note: There are 32 rows in the above table, but we should have 35 , which means that 3 row(s) are missing.


In [11]:
# make a complete CPT
temp = [[i, j] for i in range(1,8) for j in range(2)]
temp = pd.DataFrame(temp,columns=scheduled_PHL_df.columns.tolist()[:-2])
on_li = scheduled_PHL_df.columns.tolist()[:-2]
scheduled_PHL_df = temp.merge(scheduled_PHL_df, on=on_li, how='outer')
# replace NaN values with very small values
scheduled_PHL_df['prob'].fillna(TINY, inplace=True)
scheduled_PHL_df['log_p'].fillna(np.log(TINY), inplace=True)
scheduled_PHL_df.head()

Unnamed: 0,DAY_OF_WEEK,ScheduledArrivalPHL,prob,log_p
0,1,0,0.655462,-0.422415
1,1,1,0.226891,-1.483287
2,2,0,0.663366,-0.410428
3,2,1,0.158416,-1.842532
4,3,0,0.702128,-0.35364


# Depth 2

In [12]:
cpt.available_nodes()

['CRS_DEP_TIME']

## 5. CRS Departure Time

In [13]:
departure_time_df = cpt.nonparentless_CPT('CRS_DEP_TIME')

Note: There are 214 rows in the above table, but we should have 336 , which means that 122 row(s) are missing.


In [14]:
# make a complete CPT
temp = [[i, j, k, l] for i in range(1,13) for j in range(2) for k in range(1,8) for l in range(2)]
temp = pd.DataFrame(temp,columns=departure_time_df.columns.tolist()[:-2])
on_li = departure_time_df.columns.tolist()[:-2]
departure_time_df = temp.merge(departure_time_df, on=on_li, how='outer')
# replace NaN values with very small values
departure_time_df['prob'].fillna(TINY, inplace=True)
departure_time_df['log_p'].fillna(np.log(TINY), inplace=True)
departure_time_df.head()

Unnamed: 0,CRS_ARR_TIME,MONTH,DAY_OF_WEEK,CRS_DEP_TIME,prob,log_p
0,1,0,1,0,2.225074e-308,-708.396419
1,1,0,1,1,2.225074e-308,-708.396419
2,1,0,2,0,2.225074e-308,-708.396419
3,1,0,2,1,2.225074e-308,-708.396419
4,1,0,3,0,2.225074e-308,-708.396419


# Depth 3

In [15]:
cpt.available_nodes()

['ScheduledDepartureSFO']

## 6. Number of Scheduled Departures from SFO

In [16]:
scheduled_SFO_df = cpt.nonparentless_CPT('ScheduledDepartureSFO')
scheduled_SFO_df.head(10)

Note: There are 169 rows in the above table, but we should have 1200 , which means that 1031 row(s) are missing.


Unnamed: 0,CRS_DEP_TIME,ScheduledArrivalPHL,CRS_ARR_TIME,MONTH,ScheduledDepartureSFO,prob,log_p
0,0,0,0,5,0,1.0,0.0
1,0,0,0,6,0,1.0,0.0
2,0,0,0,7,0,1.0,0.0
3,0,0,0,8,0,1.0,0.0
4,0,0,0,9,0,1.0,0.0
5,0,0,1,1,0,0.952381,-0.04879
6,0,0,1,1,4,0.047619,-3.044522
7,0,0,1,2,0,1.0,0.0
8,0,0,1,3,0,1.0,0.0
9,0,0,1,4,0,1.0,0.0


# Depth 4

In [17]:
cpt.available_nodes()

['LATE_AIRCRAFT_DELAY', 'DelayedDepartureSFO']

## 7. Delayed Departure SFO

In [18]:
delayed_SFO_df = cpt.nonparentless_CPT('DelayedDepartureSFO')
delayed_SFO_df.head(10)

Note: There are 13 rows in the above table, but we should have 50 , which means that 37 row(s) are missing.


Unnamed: 0,ScheduledArrivalPHL,ScheduledDepartureSFO,DelayedDepartureSFO,prob,log_p
0,0,0,0,1.0,0.0
1,0,4,0,0.679012,-0.387116
2,0,4,1,0.320988,-1.136353
3,1,0,0,1.0,0.0
4,1,3,0,0.774194,-0.255933
5,1,3,1,0.225806,-1.488077
6,1,4,0,1.0,0.0
7,2,2,1,1.0,0.0
8,2,3,0,0.711111,-0.340927
9,2,3,1,0.288889,-1.241713


## 8. Late Aircraft Delay

In [19]:
late_aircraft_delay_df = cpt.nonparentless_CPT('LATE_AIRCRAFT_DELAY')
late_aircraft_delay_df.head(10)

Note: There are 32 rows in the above table, but we should have 40 , which means that 8 row(s) are missing.


Unnamed: 0,CRS_DEP_TIME,CRS_ARR_TIME,ScheduledDepartureSFO,LATE_AIRCRAFT_DELAY,prob,log_p
0,0,0,0,0,0.9375,-0.064539
1,0,0,0,1,0.0625,-2.772589
2,0,0,1,0,1.0,0.0
3,0,0,2,0,0.6,-0.510826
4,0,0,2,1,0.4,-0.916291
5,0,1,0,0,0.857595,-0.153623
6,0,1,0,1,0.142405,-1.94908
7,0,1,1,0,1.0,0.0
8,0,1,2,0,0.75,-0.287682
9,0,1,2,1,0.25,-1.386294


# Depth 5

In [20]:
cpt.available_nodes()

['NAS_DELAY']

## 9. NAS Delay

In [21]:
nas_delay_df = cpt.nonparentless_CPT('NAS_DELAY')
nas_delay_df.head(10)

Note: There are 15 rows in the above table, but we should have 16 , which means that 1 row(s) are missing.


Unnamed: 0,CRS_DEP_TIME,DelayedDepartureSFO,CRS_ARR_TIME,NAS_DELAY,prob,log_p
0,0,0,0,0,0.789474,-0.236389
1,0,0,0,1,0.210526,-1.558145
2,0,0,1,0,0.635678,-0.453063
3,0,0,1,1,0.364322,-1.009718
4,0,1,0,0,1.0,0.0
5,0,1,1,0,0.625,-0.470004
6,0,1,1,1,0.375,-0.980829
7,1,0,0,0,0.613169,-0.489115
8,1,0,0,1,0.386831,-0.949767
9,1,0,1,0,0.634615,-0.454736


# Depth 6

In [22]:
cpt.available_nodes()

['TAXI_IN', 'WEATHER_DELAY']

## 10. Taxi In

In [23]:
taxi_in_df = cpt.nonparentless_CPT('TAXI_IN')
taxi_in_df.head(10)

Note: There are 21 rows in the above table, but we should have 104 , which means that 83 row(s) are missing.


Unnamed: 0,DelayedDepartureSFO,NAS_DELAY,CRS_ARR_TIME,TAXI_IN,prob,log_p
0,0,0,0,0,0.939024,-0.062914
1,0,0,0,1,0.054878,-2.902642
2,0,0,0,2,0.006098,-5.099866
3,0,0,1,0,0.933566,-0.068743
4,0,0,1,1,0.066434,-2.711553
5,0,1,0,0,0.969388,-0.031091
6,0,1,0,1,0.010204,-4.584967
7,0,1,0,2,0.020408,-3.89182
8,0,1,1,0,0.871951,-0.137022
9,0,1,1,1,0.109756,-2.209495


## 11. Weather Delay

In [24]:
weather_delay_df = cpt.nonparentless_CPT('WEATHER_DELAY')
weather_delay_df.head(10)

Unnamed: 0,NAS_DELAY,WEATHER_DELAY,prob,log_p
0,0,0,0.997967,-0.002035
1,0,1,0.002033,-6.198479
2,1,0,0.985816,-0.014286
3,1,1,0.014184,-4.255613


# Depth 7

In [25]:
cpt.available_nodes()

['CARRIER_DELAY']

## 12. Carrier Delay

In [26]:
carrier_delay_df = cpt.nonparentless_CPT('CARRIER_DELAY')
carrier_delay_df.head(10)

Note: There are 11 rows in the above table, but we should have 16 , which means that 5 row(s) are missing.


Unnamed: 0,CRS_DEP_TIME,WEATHER_DELAY,CRS_ARR_TIME,CARRIER_DELAY,prob,log_p
0,0,0,0,0,0.863636,-0.146603
1,0,0,0,1,0.136364,-1.99243
2,0,0,1,0,0.806527,-0.215018
3,0,0,1,1,0.193473,-1.642616
4,0,1,1,0,1.0,0.0
5,1,0,0,0,0.69112,-0.369442
6,1,0,0,1,0.30888,-1.174801
7,1,0,1,0,0.542373,-0.611802
8,1,0,1,1,0.457627,-0.781701
9,1,1,0,0,1.0,0.0


# Depth 8

In [27]:
cpt.available_nodes()

['ARR_DEL15', 'DEP_DELAY']

## 13. Departure Delay

In [28]:
dep_delay_df = cpt.nonparentless_CPT('DEP_DELAY')
dep_delay_df.head(10)

Note: There are 447 rows in the above table, but we should have 2016 , which means that 1569 row(s) are missing.


Unnamed: 0,MONTH,CARRIER_DELAY,DAY_OF_WEEK,LATE_AIRCRAFT_DELAY,CRS_ARR_TIME,DEP_DELAY,prob,log_p
0,1,0,1,0,0,-1,1.0,0.0
1,1,0,1,0,1,-1,0.2,-1.609438
2,1,0,1,0,1,0,0.6,-0.510826
3,1,0,1,0,1,1,0.2,-1.609438
4,1,0,1,1,1,1,1.0,0.0
5,1,0,2,0,0,-1,1.0,0.0
6,1,0,2,0,1,0,0.5,-0.693147
7,1,0,2,0,1,1,0.5,-0.693147
8,1,0,2,1,0,1,1.0,0.0
9,1,0,3,0,0,1,1.0,0.0


## 14. Arrival Delay

In [29]:
arr_del15_df = cpt.nonparentless_CPT('ARR_DEL15')
arr_del15_df.head(10)

Note: There are 118 rows in the above table, but we should have 640 , which means that 522 row(s) are missing.


Unnamed: 0,NAS_DELAY,CRS_DEP_TIME,CARRIER_DELAY,DelayedDepartureSFO,ScheduledDepartureSFO,LATE_AIRCRAFT_DELAY,CRS_ARR_TIME,ARR_DEL15,prob,log_p
0,0,0,0,0,0,0,0,0,1.0,0.0
1,0,0,0,0,0,0,1,0,1.0,0.0
2,0,0,0,0,0,1,0,1,1.0,0.0
3,0,0,0,0,0,1,1,1,1.0,0.0
4,0,0,0,0,1,0,1,0,1.0,0.0
5,0,0,0,0,2,0,0,0,1.0,0.0
6,0,0,0,0,2,0,1,0,1.0,0.0
7,0,0,0,0,3,0,1,0,1.0,0.0
8,0,0,0,0,3,1,1,1,1.0,0.0
9,0,0,0,0,4,0,1,0,1.0,0.0
