In [1]:
import pandas as pd
import numpy as np

# helper functions
import cpt_calculator as cpt

To compute the conditional probability tables, we will observe each node in a BFS-like fashion, starting with nodes that do not have parents and then working our way through nodes whose parents we have already observed.

# 0. Nodes Without Parents

In [2]:
# Ask the helper module which nodes have no parents; these are handled first.
parentless_nodes = cpt.get_parentless_nodes()
print(
    "These are the list of parentless nodes :",
    parentless_nodes,
    ". We start by defining a function to reduce redundancy.",
)

These are the list of parentless nodes : ['MONTH', 'DAY_OF_WEEK'] . We start by defining a function to reduce redundancy.


## 1. Month

In [3]:
month_df = cpt.parentless_CPT('MONTH')
month_df

Unnamed: 0,MONTH,num,prob,log_p
0,1,50,0.064599,-2.739549
1,2,29,0.037468,-3.284276
2,3,36,0.046512,-3.068053
3,4,49,0.063307,-2.759752
4,5,64,0.082687,-2.492689
5,6,84,0.108527,-2.220755
6,7,99,0.127907,-2.056452
7,8,93,0.120155,-2.118972
8,9,92,0.118863,-2.129783
9,10,72,0.093023,-2.374906


## 2. Day of Week

In [4]:
day_of_week_df = cpt.parentless_CPT('DAY_OF_WEEK')
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,num,prob,log_p
0,1,119,0.153747,-1.872448
1,2,101,0.130491,-2.036451
2,3,94,0.121447,-2.108277
3,4,138,0.178295,-1.724318
4,5,135,0.174419,-1.746297
5,6,73,0.094315,-2.361112
6,7,114,0.147287,-1.915373


# Depth 1: Nodes with Parents

First, we find all the nodes whose conditional probability tables we can compute now, i.e. the nodes whose parents already have conditional probability tables.

In [5]:
cpt.available_nodes()

['ScheduledArrivalPHL', 'CRS_ARR_TIME']

## 3. CRS Arrival Time

In [6]:
cpt.node_values

{'ARR_DEL15': 2,
 'CARRIER_DELAY': 2,
 'CRS_ARR_TIME': 2,
 'CRS_DEP_TIME': 2,
 'DAY_OF_WEEK': 7,
 'DEP_DELAY': 2,
 'DelayedArrivalPHL': 2,
 'DelayedDepartureSFO': 2,
 'LATE_AIRCRAFT_DELAY': 2,
 'MONTH': 12,
 'NAS_DELAY': 2,
 'ScheduledArrivalPHL': 5,
 'ScheduledDepartureSFO': 5,
 'TAXI_IN': 13,
 'TAXI_OUT': 7,
 'WEATHER_DELAY': 2}

In [7]:
# Build the conditional probability table for CRS_ARR_TIME. Judging from the
# columns of the result, it is conditioned on MONTH and DAY_OF_WEEK.
# NOTE(review): the call also appears to print a summary of expected vs.
# missing rows (see the cell output) -- confirm in cpt_calculator.
arr_time_df = cpt.nonparentless_CPT('CRS_ARR_TIME')
# Preview only the first 10 rows rather than dumping the whole table.
arr_time_df.head(10)

There are 158 rows in the above table, but we should have 168 , which means that 10 rows are missing.


Unnamed: 0,MONTH,DAY_OF_WEEK,CRS_ARR_TIME,prob,log_p
0,1,1,0,0.125,-2.079442
1,1,1,1,0.875,-0.133531
2,1,2,0,0.5,-0.693147
3,1,2,1,0.5,-0.693147
4,1,3,0,0.142857,-1.94591
5,1,3,1,0.857143,-0.154151
6,1,4,0,0.545455,-0.606136
7,1,4,1,0.454545,-0.788457
8,1,5,0,0.428571,-0.847298
9,1,5,1,0.571429,-0.559616


In [8]:
# Fill in every (MONTH, DAY_OF_WEEK, CRS_ARR_TIME) combination that is absent
# from arr_time_df, so that each parent configuration has one row per value of
# CRS_ARR_TIME. Absent combinations get a small placeholder probability
# (presumably so that log_p stays finite instead of becoming -inf).
#
# NOTE(review): the placeholder rows are not renormalized, so the probabilities
# of an affected (MONTH, DAY_OF_WEEK) group no longer sum to exactly 1, and a
# group with no data at all sums to 0.0002 -- confirm this is acceptable for
# whatever consumes this table.
SMOOTHING_PROB = 0.0001
ARR_TIME_VALUES = (0, 1)  # the values CRS_ARR_TIME takes in this table
n_days = cpt.node_values['DAY_OF_WEEK']
rows_per_month = n_days * len(ARR_TIME_VALUES)  # a complete month has this many rows

to_add = []
for month in range(1, cpt.node_values['MONTH'] + 1):
    month_rows = arr_time_df[arr_time_df.MONTH == month]
    if len(month_rows) >= rows_per_month:
        continue  # every (day, value) combination is present for this month

    # print 'incomplete' months
    print('MONTH', month)
    for day in range(1, n_days + 1):
        present = set(month_rows.loc[month_rows.DAY_OF_WEEK == day, 'CRS_ARR_TIME'])
        missing = [value for value in ARR_TIME_VALUES if value not in present]
        if not missing:
            continue

        # print DAY OF WEEK and its missing value(s)
        if not present:
            print('\tDAY_OF_WEEK', day, '\tno data at all.')
        else:
            for value in missing:
                print('\tDAY_OF_WEEK', day, ':', value)

        to_add.extend(
            [month, day, value, SMOOTHING_PROB, np.log(SMOOTHING_PROB)]
            for value in missing
        )

# Add the missing rows with the placeholder probability. DataFrame.append was
# removed in pandas 2.0, so use pd.concat; skip the concat entirely when there
# is nothing to add (e.g. when this cell is re-run) to avoid dtype changes
# caused by concatenating an empty frame.
if to_add:
    arr_time_df = pd.concat(
        [arr_time_df, pd.DataFrame(to_add, columns=arr_time_df.columns)],
        ignore_index=True,
    )
arr_time_df = (
    arr_time_df
    .sort_values(['MONTH', 'DAY_OF_WEEK', 'CRS_ARR_TIME'])
    .reset_index(drop=True)
)

MONTH 2
	DAY_OF_WEEK 2 : 0
	DAY_OF_WEEK 6 : 0
	DAY_OF_WEEK 7 	no data at all.
MONTH 3
	DAY_OF_WEEK 2 : 1
MONTH 4
	DAY_OF_WEEK 7 : 0
MONTH 6
	DAY_OF_WEEK 3 : 0
MONTH 11
	DAY_OF_WEEK 1 : 1
	DAY_OF_WEEK 6 : 0
MONTH 12
	DAY_OF_WEEK 6 : 1


## 4. ScheduledArrivalPHL

0 represents less than 10, 1 represents less than 20, 2 represents less than 30, 3 represents less than 40, 4 represents less than 50.

In [9]:
scheduled_PHL_df = cpt.nonparentless_CPT('ScheduledArrivalPHL')
scheduled_PHL_df.head(10)

There are 32 rows in the above table, but we should have 35 , which means that 3 rows are missing.


Unnamed: 0,DAY_OF_WEEK,ScheduledArrivalPHL,prob,log_p
0,1,0,0.655462,-0.422415
1,1,1,0.226891,-1.483287
2,1,2,0.092437,-2.381228
3,1,4,0.02521,-3.680511
4,2,0,0.663366,-0.410428
5,2,1,0.158416,-1.842532
6,2,2,0.089109,-2.417896
7,2,4,0.089109,-2.417896
8,3,0,0.702128,-0.35364
9,3,1,0.234043,-1.452252


In [10]:
# Compare the number of rows in the ScheduledArrivalPHL table with the number
# of possible (DAY_OF_WEEK, ScheduledArrivalPHL) combinations. The expected
# count comes from the node cardinalities instead of hard-coded literals.
expected_rows = cpt.node_values['DAY_OF_WEEK'] * cpt.node_values['ScheduledArrivalPHL']
actual_rows = scheduled_PHL_df.shape[0]
print('There are', actual_rows, 'rows in the above table, but we should have', expected_rows,
      ', which means that', expected_rows - actual_rows, 'rows are missing.')

There are 32 rows in the above table, but we should have 35 , which means that 3 rows are missing.


# Depth 2

In [11]:
cpt.available_nodes()

['CRS_DEP_TIME']

## 5. CRS Departure Time

In [12]:
departure_time_df = cpt.nonparentless_CPT('CRS_DEP_TIME')
departure_time_df.head(10)

There are 214 rows in the above table, but we should have 336 , which means that 122 rows are missing.


Unnamed: 0,CRS_ARR_TIME,MONTH,DAY_OF_WEEK,CRS_DEP_TIME,prob,log_p
0,0,1,1,1,1.0,0.0
1,0,1,2,1,1.0,0.0
2,0,1,3,1,1.0,0.0
3,0,1,4,1,1.0,0.0
4,0,1,5,1,1.0,0.0
5,0,1,6,1,1.0,0.0
6,0,1,7,1,1.0,0.0
7,0,2,1,1,1.0,0.0
8,0,2,3,1,1.0,0.0
9,0,2,4,1,1.0,0.0


# Depth 3

In [13]:
cpt.available_nodes()

['ScheduledDepartureSFO']

## 6. Number of Scheduled Departures from SFO

In [14]:
scheduled_SFO_df = cpt.nonparentless_CPT('ScheduledDepartureSFO')
scheduled_SFO_df.head(10)

There are 169 rows in the above table, but we should have 1200 , which means that 1031 rows are missing.


Unnamed: 0,CRS_ARR_TIME,ScheduledArrivalPHL,MONTH,CRS_DEP_TIME,ScheduledDepartureSFO,prob,log_p
0,0,0,1,1,0,0.538462,-0.619039
1,0,0,1,1,4,0.461538,-0.77319
2,0,0,2,1,0,1.0,0.0
3,0,0,3,1,0,0.909091,-0.09531
4,0,0,3,1,4,0.090909,-2.397895
5,0,0,4,1,0,0.857143,-0.154151
6,0,0,4,1,4,0.142857,-1.94591
7,0,0,5,0,0,1.0,0.0
8,0,0,5,1,0,0.818182,-0.200671
9,0,0,5,1,4,0.181818,-1.704748


# Depth 4

In [15]:
cpt.available_nodes()

['LATE_AIRCRAFT_DELAY', 'DelayedDepartureSFO']

## 7. Delayed Departure SFO

In [16]:
delayed_SFO_df = cpt.nonparentless_CPT('DelayedDepartureSFO')
delayed_SFO_df.head(10)

There are 13 rows in the above table, but we should have 50 , which means that 37 rows are missing.


Unnamed: 0,ScheduledArrivalPHL,ScheduledDepartureSFO,DelayedDepartureSFO,prob,log_p
0,0,0,0,1.0,0.0
1,0,4,0,0.679012,-0.387116
2,0,4,1,0.320988,-1.136353
3,1,0,0,1.0,0.0
4,1,3,0,0.774194,-0.255933
5,1,3,1,0.225806,-1.488077
6,1,4,0,1.0,0.0
7,2,2,1,1.0,0.0
8,2,3,0,0.711111,-0.340927
9,2,3,1,0.288889,-1.241713


## 8. Late Aircraft Delay

In [17]:
late_aircraft_delay_df = cpt.nonparentless_CPT('LATE_AIRCRAFT_DELAY')
late_aircraft_delay_df.head(10)

There are 32 rows in the above table, but we should have 40 , which means that 8 rows are missing.


Unnamed: 0,CRS_ARR_TIME,ScheduledDepartureSFO,CRS_DEP_TIME,LATE_AIRCRAFT_DELAY,prob,log_p
0,0,0,0,0,0.9375,-0.064539
1,0,0,0,1,0.0625,-2.772589
2,0,0,1,0,0.796875,-0.227057
3,0,0,1,1,0.203125,-1.593934
4,0,1,0,0,1.0,0.0
5,0,1,1,0,1.0,0.0
6,0,2,0,0,0.6,-0.510826
7,0,2,0,1,0.4,-0.916291
8,0,2,1,0,0.6,-0.510826
9,0,2,1,1,0.4,-0.916291


# Depth 5

In [18]:
cpt.available_nodes()

['NAS_DELAY']

## 9. NAS Delay

In [19]:
nas_delay_df = cpt.nonparentless_CPT('NAS_DELAY')
nas_delay_df.head(10)

There are 15 rows in the above table, but we should have 16 , which means that 1 rows are missing.


Unnamed: 0,CRS_ARR_TIME,DelayedDepartureSFO,CRS_DEP_TIME,NAS_DELAY,prob,log_p
0,0,0,0,0,0.789474,-0.236389
1,0,0,0,1,0.210526,-1.558145
2,0,0,1,0,0.613169,-0.489115
3,0,0,1,1,0.386831,-0.949767
4,0,1,0,0,1.0,0.0
5,0,1,1,0,0.684211,-0.37949
6,0,1,1,1,0.315789,-1.15268
7,1,0,0,0,0.635678,-0.453063
8,1,0,0,1,0.364322,-1.009718
9,1,0,1,0,0.634615,-0.454736


# Depth 6

In [20]:
cpt.available_nodes()

['WEATHER_DELAY', 'TAXI_IN']

## 10. Taxi In

In [21]:
taxi_in_df = cpt.nonparentless_CPT('TAXI_IN')
taxi_in_df.head(10)

There are 21 rows in the above table, but we should have 104 , which means that 83 rows are missing.


Unnamed: 0,CRS_ARR_TIME,NAS_DELAY,DelayedDepartureSFO,TAXI_IN,prob,log_p
0,0,0,0,0,0.939024,-0.062914
1,0,0,0,1,0.054878,-2.902642
2,0,0,0,2,0.006098,-5.099866
3,0,0,1,0,0.9375,-0.064539
4,0,0,1,1,0.0625,-2.772589
5,0,1,0,0,0.969388,-0.031091
6,0,1,0,1,0.010204,-4.584967
7,0,1,0,2,0.020408,-3.89182
8,0,1,1,0,1.0,0.0
9,1,0,0,0,0.933566,-0.068743


## 11. Weather Delay

In [22]:
weather_delay_df = cpt.nonparentless_CPT('WEATHER_DELAY')
weather_delay_df.head(10)

There are 4 rows in the above table, but we should have 4 , which means that 0 rows are missing.


Unnamed: 0,NAS_DELAY,WEATHER_DELAY,prob,log_p
0,0,0,0.997967,-0.002035
1,0,1,0.002033,-6.198479
2,1,0,0.985816,-0.014286
3,1,1,0.014184,-4.255613


# Depth 7

In [23]:
cpt.available_nodes()

['CARRIER_DELAY']

## 12. Carrier Delay

In [24]:
carrier_delay_df = cpt.nonparentless_CPT('CARRIER_DELAY')
carrier_delay_df.head(10)

There are 11 rows in the above table, but we should have 16 , which means that 5 rows are missing.


Unnamed: 0,CRS_ARR_TIME,WEATHER_DELAY,CRS_DEP_TIME,CARRIER_DELAY,prob,log_p
0,0,0,0,0,0.863636,-0.146603
1,0,0,0,1,0.136364,-1.99243
2,0,0,1,0,0.69112,-0.369442
3,0,0,1,1,0.30888,-1.174801
4,0,1,1,0,1.0,0.0
5,1,0,0,0,0.806527,-0.215018
6,1,0,0,1,0.193473,-1.642616
7,1,0,1,0,0.542373,-0.611802
8,1,0,1,1,0.457627,-0.781701
9,1,1,0,0,1.0,0.0


# Depth 8

In [25]:
cpt.available_nodes()

['DEP_DELAY', 'ARR_DEL15']

## 13. Departure Delay

In [26]:
dep_delay_df = cpt.nonparentless_CPT('DEP_DELAY')
dep_delay_df.head(10)

There are 447 rows in the above table, but we should have 1344 , which means that 897 rows are missing.


Unnamed: 0,MONTH,LATE_AIRCRAFT_DELAY,DAY_OF_WEEK,CARRIER_DELAY,CRS_ARR_TIME,DEP_DELAY,prob,log_p
0,1,0,1,0,0,-1,1.0,0.0
1,1,0,1,0,1,-1,0.2,-1.609438
2,1,0,1,0,1,0,0.6,-0.510826
3,1,0,1,0,1,1,0.2,-1.609438
4,1,0,2,0,0,-1,1.0,0.0
5,1,0,2,0,1,0,0.5,-0.693147
6,1,0,2,0,1,1,0.5,-0.693147
7,1,0,3,0,0,1,1.0,0.0
8,1,0,3,0,1,-1,0.25,-1.386294
9,1,0,3,0,1,1,0.75,-0.287682


## 14. Arrival Delay

In [27]:
arr_del15_df = cpt.nonparentless_CPT('ARR_DEL15')
arr_del15_df.head(10)

There are 118 rows in the above table, but we should have 640 , which means that 522 rows are missing.


Unnamed: 0,LATE_AIRCRAFT_DELAY,NAS_DELAY,CARRIER_DELAY,CRS_DEP_TIME,DelayedDepartureSFO,CRS_ARR_TIME,ScheduledDepartureSFO,ARR_DEL15,prob,log_p
0,0,0,0,0,0,0,0,0,1.0,0.0
1,0,0,0,0,0,0,2,0,1.0,0.0
2,0,0,0,0,0,1,0,0,1.0,0.0
3,0,0,0,0,0,1,1,0,1.0,0.0
4,0,0,0,0,0,1,2,0,1.0,0.0
5,0,0,0,0,0,1,3,0,1.0,0.0
6,0,0,0,0,0,1,4,0,1.0,0.0
7,0,0,0,0,1,0,2,0,1.0,0.0
8,0,0,0,0,1,1,2,0,1.0,0.0
9,0,0,0,0,1,1,3,0,1.0,0.0
