In [1]:
import pandas as pd

# Load the training observations and the edge list of the network.
data_df = pd.read_csv('./train_data.csv')
edges_df = pd.read_csv('./edges.csv')

# Total number of training rows; used as the denominator when turning
# per-value counts into probabilities.
NUM_DELAYS = len(data_df)

To compute the conditional probability tables, we will observe each node in a BFS-like fashion, starting with nodes that do not have parents and then working our way through nodes whose parents we have already observed.

# Nodes without parent nodes

In [2]:
# A node counts as parentless here when it appears as an edge source
# but never as an edge target.
print(
    "These are the list of parentless nodes :",
    set(edges_df['Source']).difference(edges_df['Target']),
    ". We start by defining a function to reduce redundancy.",
)

These are the list of parentless nodes : {'SECURITY_DELAY', 'MONTH', 'DAY_OF_WEEK'} . We start by defining a function to reduce redundancy.


In [3]:
def parentless(col, data=None):
    '''
    Find probability tables of nodes which do not have parents.

    Parameters
    ----------
    col : str
        Name of the column (node) whose distribution is tabulated.
    data : pandas.DataFrame, optional
        Frame to tabulate. When omitted, the notebook-level ``data_df``
        is used, so existing ``parentless(col)`` calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-null value of ``col`` (in sorted order),
        with columns ``[col, 'count', 'probability']``. ``probability`` is
        ``count`` divided by the total number of rows in the frame.
    '''
    frame = data_df if data is None else data
    # size() counts the rows in each group directly. Borrowing another
    # column through count() would skip rows where that column is NaN and
    # would need a special case when `col` is the borrowed column itself.
    table = frame.groupby(col).size().reset_index(name='count')
    # The denominator is every row of the frame, including rows where
    # `col` itself is NaN (those rows form no group).
    table['probability'] = table['count'] / len(frame)
    return table

## 1. Month

In [4]:
# Empirical distribution of MONTH: per-value row count and its share of all training rows.
month_df = parentless('MONTH')
month_df

Unnamed: 0,MONTH,count,probability
0,1,49,0.063307
1,2,27,0.034884
2,3,41,0.052972
3,4,42,0.054264
4,5,69,0.089147
5,6,86,0.111111
6,7,93,0.120155
7,8,87,0.112403
8,9,89,0.114987
9,10,79,0.102067


## 2. Day of Week

In [5]:
# Empirical distribution of DAY_OF_WEEK: per-value row count and its share of all training rows.
day_of_week_df = parentless('DAY_OF_WEEK')
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,count,probability
0,1,128,0.165375
1,2,97,0.125323
2,3,95,0.122739
3,4,139,0.179587
4,5,128,0.165375
5,6,73,0.094315
6,7,114,0.147287


## 3. CRS Departure Time

0 represents AM, 1 represents PM.

In [6]:
# Empirical distribution of CRS_DEP_TIME: per-value row count and its share of all training rows.
# NOTE(review): CRS_DEP_TIME (like CRS_ARR_TIME, ScheduledDepartureSFO and
# ScheduledArrivalPHL below) is not in the parentless set printed by In [2];
# presumably these nodes have no outgoing edges in edges.csv, which
# set(Source) - set(Target) cannot detect -- confirm against the edge list.
crs_dep_df = parentless('CRS_DEP_TIME')
crs_dep_df

Unnamed: 0,CRS_DEP_TIME,count,probability
0,0,440,0.568475
1,1,334,0.431525


## 4. CRS Arrival Time

In [7]:
# Empirical distribution of CRS_ARR_TIME: per-value row count and its share of all training rows.
crs_arr_df = parentless('CRS_ARR_TIME')
crs_arr_df

Unnamed: 0,CRS_ARR_TIME,count,probability
0,0,288,0.372093
1,1,486,0.627907


## 5. Month

In [8]:
# NOTE(review): this cell repeats In [4] verbatim (same call, same variable,
# same output) under a second "Month" heading -- presumably a different node
# was intended here, or the cell can be dropped; confirm with the author.
month_df = parentless('MONTH')
month_df

Unnamed: 0,MONTH,count,probability
0,1,49,0.063307
1,2,27,0.034884
2,3,41,0.052972
3,4,42,0.054264
4,5,69,0.089147
5,6,86,0.111111
6,7,93,0.120155
7,8,87,0.112403
8,9,89,0.114987
9,10,79,0.102067


## 6. Number of Scheduled Departures at SFO When Delayed Flight was to Depart

Bin codes: 0 represents fewer than 10 scheduled departures, 1 represents 10–19, 2 represents 20–29, 3 represents 30–39, and 4 represents 40–49.

In [9]:
# Empirical distribution of ScheduledDepartureSFO: per-value row count and its share of all training rows.
scheduled_departure_df = parentless('ScheduledDepartureSFO')
scheduled_departure_df

Unnamed: 0,ScheduledDepartureSFO,count,probability
0,0.0,558,0.72093
1,1.0,19,0.024548
2,2.0,26,0.033592
3,3.0,75,0.096899
4,4.0,96,0.124031


## 7. Number of Scheduled Arrivals at PHL When Delayed Flight was to Arrive

Bin codes: 0 represents fewer than 10 scheduled arrivals, 1 represents 10–19, 2 represents 20–29, 3 represents 30–39, and 4 represents 40–49.

In [10]:
# Empirical distribution of ScheduledArrivalPHL: per-value row count and its share of all training rows.
scheduled_arrival_df = parentless('ScheduledArrivalPHL')
scheduled_arrival_df

Unnamed: 0,ScheduledArrivalPHL,count,probability
0,0.0,519,0.670543
1,1.0,170,0.219638
2,2.0,45,0.05814
3,3.0,9,0.011628
4,4.0,31,0.040052
