In [1]:
import pandas as pd

# Load the training observations and the network's edge list.
data_df = pd.read_csv('./train_data.csv')
edges_df = pd.read_csv('./edges.csv')

# Total number of training rows; used as the denominator for probabilities.
NUM_DELAYS = len(data_df)

To compute the conditional probability tables, we will observe each node in a BFS-like fashion: we start with the nodes that have no parents, then work our way through the nodes whose parents have already been observed.

# Nodes without parent nodes

In [2]:
# A node is parentless when it appears as an edge source but never as an edge target.
parentless_nodes = set(edges_df['Source']) - set(edges_df['Target'])
print(
    "These are the list of parentless nodes :",
    parentless_nodes,
    ". We start by defining a function to reduce redundancy.",
)

These are the list of parentless nodes : {'CRS_DEP_TIME', 'MONTH', 'DAY_OF_WEEK', 'ScheduledArrivalPHL', 'ScheduledDepartureSFO', 'CRS_ARR_TIME'} . We start by defining a function to reduce redundancy.


In [3]:
def parentless(col):
    '''
    Find the probability table of a node which does not have parents.

    Groups the module-level ``data_df`` by ``col`` and counts the rows in
    each group, then divides each count by the module-level ``NUM_DELAYS``.

    Parameters
    ----------
    col : str
        Name of the column in ``data_df`` holding the node's values.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``col``, with the columns
        ``[col, 'count', 'probability']`` and a fresh integer index.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``data_df``.
    '''
    # size() counts the rows in each group directly.  The previous approach
    # counted the non-null entries of an unrelated column ('QUARTER' or
    # 'DAY_OF_WEEK'), which under-counted whenever that column had missing
    # values and failed outright when that column was absent.
    df = data_df.groupby(col).size().reset_index(name='count')
    # The denominator is the total number of rows, as before.  Rows whose
    # `col` value is missing fall into no group (pandas' default), so the
    # probabilities then sum to less than 1.
    df['probability'] = df['count'] / NUM_DELAYS
    return df

## 1. Month

In [4]:
# Probability table for MONTH: per-value row counts divided by NUM_DELAYS.
month_df = parentless('MONTH')
month_df

Unnamed: 0,MONTH,count,probability
0,1,53,0.068475
1,2,28,0.036176
2,3,36,0.046512
3,4,41,0.052972
4,5,66,0.085271
5,6,73,0.094315
6,7,97,0.125323
7,8,92,0.118863
8,9,93,0.120155
9,10,80,0.103359


## 2. Day of Week

In [5]:
# Probability table for DAY_OF_WEEK: per-value row counts divided by NUM_DELAYS.
day_of_week_df = parentless('DAY_OF_WEEK')
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,count,probability
0,1,129,0.166667
1,2,103,0.133075
2,3,98,0.126615
3,4,144,0.186047
4,5,126,0.162791
5,6,71,0.091731
6,7,103,0.133075


## 3. CRS Departure Time

0 represents AM, 1 represents PM.

In [6]:
# Probability table for CRS_DEP_TIME (encoded as 0 = AM, 1 = PM, per the note above).
crs_dep_df = parentless('CRS_DEP_TIME')
crs_dep_df

Unnamed: 0,CRS_DEP_TIME,count,probability
0,0,446,0.576227
1,1,328,0.423773


## 4. CRS Arrival Time

In [7]:
# Probability table for CRS_ARR_TIME.
# NOTE(review): presumably uses the same 0 = AM / 1 = PM encoding as CRS_DEP_TIME — confirm.
crs_arr_df = parentless('CRS_ARR_TIME')
crs_arr_df

Unnamed: 0,CRS_ARR_TIME,count,probability
0,0,292,0.377261
1,1,482,0.622739


## 5. Month

In [8]:
# NOTE(review): this repeats the earlier MONTH cell (In [4]) verbatim; it rebinds
# month_df to an identical table. Consider removing the duplicate section.
month_df = parentless('MONTH')
month_df

Unnamed: 0,MONTH,count,probability
0,1,53,0.068475
1,2,28,0.036176
2,3,36,0.046512
3,4,41,0.052972
4,5,66,0.085271
5,6,73,0.094315
6,7,97,0.125323
7,8,92,0.118863
8,9,93,0.120155
9,10,80,0.103359


## 6. Number of Scheduled Departures at SFO When Delayed Flight was to Depart

In [9]:
# Probability table for ScheduledDepartureSFO: per-value row counts divided by NUM_DELAYS.
scheduled_departure_df = parentless('ScheduledDepartureSFO')
scheduled_departure_df

Unnamed: 0,ScheduledDepartureSFO,count,probability
0,0.0,396,0.511628
1,1.0,155,0.200258
2,3.0,7,0.009044
3,5.0,1,0.001292
4,6.0,9,0.011628
5,12.0,6,0.007752
6,13.0,10,0.01292
7,20.0,8,0.010336
8,23.0,12,0.015504
9,28.0,2,0.002584


## 7. Number of Scheduled Arrivals at PHL When Delayed Flight was to Arrive

In [10]:
# Probability table for ScheduledArrivalPHL: per-value row counts divided by NUM_DELAYS.
scheduled_arrival_df = parentless('ScheduledArrivalPHL')
scheduled_arrival_df

Unnamed: 0,ScheduledArrivalPHL,count,probability
0,0.0,394,0.509044
1,2.0,7,0.009044
2,5.0,76,0.098191
3,9.0,48,0.062016
4,10.0,109,0.140827
5,11.0,10,0.01292
6,12.0,12,0.015504
7,13.0,10,0.01292
8,15.0,15,0.01938
9,16.0,11,0.014212
