In [1]:
import pandas as pd
import numpy as np

# helper functions
import cpt_calculator as cpt

To compute the conditional probability tables, we will observe each node in a BFS-like fashion, starting with nodes that do not have parents and then working our way through nodes whose parents we have already observed.

# 0. Nodes Without Parents

In [2]:
# Fetch the nodes that have no parents from the helper module, then report them.
# The message text is kept verbatim so the printed output is unchanged.
parentless_nodes = cpt.get_parentless_nodes()
print("These are the list of parentless nodes :", parentless_nodes, ". We start by defining a function to reduce redundancy.")

These are the list of parentless nodes : ['DAY_OF_WEEK', 'MONTH'] . We start by defining a function to reduce redundancy.


In [3]:
# Preview the first rows of the dataset held by the helper module (cpt.data_df)
# to see which columns are available.
cpt.data_df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_DELAY,TAXI_OUT,TAXI_IN,CRS_ARR_TIME,ARR_DEL15,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,LATE_AIRCRAFT_DELAY,DelayedDepartureSFO,ScheduledDepartureSFO,DelayedArrivalPHL,ScheduledArrivalPHL
0,11,3,0,-1,1,1,1,0,0,0,0,0,0,0,0,0
1,8,2,0,1,1,0,1,1,0,0,1,1,0,0,0,0
2,8,6,0,1,1,0,1,1,1,0,0,0,0,0,0,1
3,3,7,1,-1,2,0,0,0,0,0,0,0,0,0,0,0
4,5,7,1,1,0,0,0,1,1,0,0,1,0,3,0,2


In [4]:
# Distinct DEP_DELAY codes present in the dataset.
# sorted() replaces list(set(...)): set iteration order is an implementation
# detail, so the displayed order would otherwise be arbitrary.
sorted(set(cpt.data_df.DEP_DELAY.tolist()))

[-1, 0, 1]

## 1. Month

In [5]:
# Build the table for the parentless node MONTH. As the output shows, the helper
# returns one row per observed value with its count (num), probability (prob)
# and log-probability (log_p).
month_df = cpt.parentless_CPT('MONTH')
month_df

Unnamed: 0,MONTH,num,prob,log_p
0,1,50,0.064599,-2.739549
1,2,29,0.037468,-3.284276
2,3,36,0.046512,-3.068053
3,4,49,0.063307,-2.759752
4,5,64,0.082687,-2.492689
5,6,84,0.108527,-2.220755
6,7,99,0.127907,-2.056452
7,8,93,0.120155,-2.118972
8,9,92,0.118863,-2.129783
9,10,72,0.093023,-2.374906


## 2. Day of Week

In [6]:
# Build the table for the parentless node DAY_OF_WEEK: one row per observed
# value with its count (num), probability (prob) and log-probability (log_p).
day_of_week_df = cpt.parentless_CPT('DAY_OF_WEEK')
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,num,prob,log_p
0,1,119,0.153747,-1.872448
1,2,101,0.130491,-2.036451
2,3,94,0.121447,-2.108277
3,4,138,0.178295,-1.724318
4,5,135,0.174419,-1.746297
5,6,73,0.094315,-2.361112
6,7,114,0.147287,-1.915373


# Depth 1: Nodes with Parents

First, we find all the nodes whose conditional probability tables we can compute next, i.e. the nodes whose parents' conditional probability tables we have already computed.

In [7]:
# Ask the helper which nodes can be processed next.
# NOTE(review): the result differs between calls in this notebook, so it
# presumably depends on which CPTs have been built so far -- run cells in order.
cpt.available_nodes()

['ScheduledArrivalPHL', 'CRS_ARR_TIME']

## 3. CRS Arrival Time

In [8]:
# Build the conditional probability table for CRS_ARR_TIME and preview its
# first 10 rows. The helper also prints a note when expected rows are missing
# from the data and have been filled with a tiny placeholder probability.
arr_time_df = cpt.nonparentless_CPT('CRS_ARR_TIME')
arr_time_df.head(10)

Note: There are 158 rows in the above table, but we should have 168 , which means that 10 row(s) missing values have been replaced with the probability 2.2250738585072014e-308


## 4. ScheduledArrivalPHL

0 represents less than 10, 1 represents less than 20, 2 represents less than 30, 3 represents less than 40, 4 represents less than 50.

In [9]:
# Build the conditional probability table for ScheduledArrivalPHL and preview
# its first 10 rows. The helper prints a note when expected rows are missing
# and have been filled with a tiny placeholder probability.
scheduled_PHL_df = cpt.nonparentless_CPT('ScheduledArrivalPHL')
scheduled_PHL_df.head(10)

Note: There are 32 rows in the above table, but we should have 35 , which means that 3 row(s) missing values have been replaced with the probability 2.2250738585072014e-308


# Depth 2

In [10]:
# Ask the helper which nodes can be processed at this depth.
# NOTE(review): presumably depends on the CPTs built by the preceding cells.
cpt.available_nodes()

['CRS_DEP_TIME']

## 5. CRS Departure Time

In [11]:
# Build the conditional probability table for CRS_DEP_TIME and preview its
# first 10 rows. The helper prints a note when expected rows are missing and
# have been filled with a tiny placeholder probability.
departure_time_df = cpt.nonparentless_CPT('CRS_DEP_TIME')
departure_time_df.head(10)

Note: There are 214 rows in the above table, but we should have 336 , which means that 122 row(s) missing values have been replaced with the probability 2.2250738585072014e-308


# Depth 3

In [None]:
# Ask the helper which nodes can be processed at this depth.
# NOTE(review): presumably depends on the CPTs built by the preceding cells.
cpt.available_nodes()

## 6. Number of Scheduled Departures from SFO

In [None]:
# Build the conditional probability table for ScheduledDepartureSFO and
# preview its first 10 rows.
scheduled_SFO_df = cpt.nonparentless_CPT('ScheduledDepartureSFO')
scheduled_SFO_df.head(10)

# Depth 4

In [None]:
# Ask the helper which nodes can be processed at this depth.
# NOTE(review): presumably depends on the CPTs built by the preceding cells.
cpt.available_nodes()

## 7. Delayed Departure SFO

In [None]:
# Build the conditional probability table for DelayedDepartureSFO and preview
# its first 10 rows.
delayed_SFO_df = cpt.nonparentless_CPT('DelayedDepartureSFO')
delayed_SFO_df.head(10)

## 8. Late Aircraft Delay

In [None]:
# Build the conditional probability table for LATE_AIRCRAFT_DELAY and preview
# its first 10 rows.
late_aircraft_delay_df = cpt.nonparentless_CPT('LATE_AIRCRAFT_DELAY')
late_aircraft_delay_df.head(10)

# Depth 5

In [None]:
# Ask the helper which nodes can be processed at this depth.
# NOTE(review): presumably depends on the CPTs built by the preceding cells.
cpt.available_nodes()

## 9. NAS Delay

In [None]:
# Build the conditional probability table for NAS_DELAY and preview its first
# 10 rows.
nas_delay_df = cpt.nonparentless_CPT('NAS_DELAY')
nas_delay_df.head(10)

# Depth 6

In [None]:
# Ask the helper which nodes can be processed at this depth.
# NOTE(review): presumably depends on the CPTs built by the preceding cells.
cpt.available_nodes()

## 10. Taxi In

In [None]:
# Build the conditional probability table for TAXI_IN and preview its first
# 10 rows.
taxi_in_df = cpt.nonparentless_CPT('TAXI_IN')
taxi_in_df.head(10)

## 11. Weather Delay

In [None]:
# Build the conditional probability table for WEATHER_DELAY and preview its
# first 10 rows.
weather_delay_df = cpt.nonparentless_CPT('WEATHER_DELAY')
weather_delay_df.head(10)

# Depth 7

In [None]:
# Ask the helper which nodes can be processed at this depth.
# NOTE(review): presumably depends on the CPTs built by the preceding cells.
cpt.available_nodes()

## 12. Carrier Delay

In [None]:
# Build the conditional probability table for CARRIER_DELAY and preview its
# first 10 rows.
carrier_delay_df = cpt.nonparentless_CPT('CARRIER_DELAY')
carrier_delay_df.head(10)

# Depth 8

In [None]:
# Ask the helper which nodes can be processed at this depth.
# NOTE(review): presumably depends on the CPTs built by the preceding cells.
cpt.available_nodes()

## 13. Departure Delay

In [None]:
# Build the conditional probability table for DEP_DELAY and preview its first
# 10 rows.
dep_delay_df = cpt.nonparentless_CPT('DEP_DELAY')
dep_delay_df.head(10)

## 14. Arrival Delay

In [None]:
# Build the conditional probability table for ARR_DEL15 and preview its first
# 10 rows.
arr_del15_df = cpt.nonparentless_CPT('ARR_DEL15')
arr_del15_df.head(10)