In [1]:
import pandas as pd

# Load the notebook inputs: the training observations and the graph's edge
# data (edges.csv supplies the Source/Target columns used further down).
data_df = pd.read_csv('./train_data.csv')
edges_df = pd.read_csv('./edges.csv')

# NUM_DELAYS is the number of rows in the training data; it is the
# denominator for every probability computed below.
NUM_DELAYS = len(data_df.index)

To compute the conditional probability tables, we will observe each node in a BFS-like fashion: we start with the nodes that have no parents and then work our way through the nodes whose parents we have already observed.

# Nodes without parent nodes

In [2]:
# Parentless nodes are computed as the names that occur in the edge list's
# Source column but never in its Target column (no incoming edge).
# NOTE(review): a node that appears in neither column of edges.csv cannot show
# up in this set, even if it is a column of the training data.
print("These are the list of parentless nodes :", set(edges_df['Source']) - set(edges_df['Target']),". We start by defining a function to reduce redundancy.")

These are the list of parentless nodes : {'MONTH', 'DAY_OF_WEEK', 'SECURITY_DELAY'} . We start by defining a function to reduce redundancy.


In [3]:
def parentless(col, data=None):
    '''
    Build the probability table of a node that does not have parents.

    Parameters
    ----------
    col : str
        Name of the column (node) whose distribution is tabulated.
    data : pandas.DataFrame, optional
        Frame to tabulate. When omitted, the notebook-level ``data_df`` is
        used and counts are divided by the notebook-level ``NUM_DELAYS``;
        when given, counts are divided by ``len(data)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``[col, 'count', 'probability']`` with one row per distinct
        non-null value of ``col``, sorted by that value.

    Raises
    ------
    KeyError
        If ``col`` is not a column of the frame.
    '''
    if data is None:
        data, total = data_df, NUM_DELAYS
    else:
        total = len(data)

    # size() counts the rows of each group directly. Counting the non-null
    # entries of some other column (as count()['QUARTER'] did) requires that
    # column to exist and silently undercounts wherever it is null.
    df = data.groupby(col).size().rename('count').reset_index()

    # Rows whose `col` value is null are dropped by groupby but still part of
    # the denominator, so probabilities can sum to less than 1 in that case.
    df['probability'] = df['count'] / total
    return df

## 1. Month

In [4]:
# Probability table for MONTH: one row per observed month value with its
# count and its share of NUM_DELAYS. The bare name on the last line renders
# the table as the cell output.
month_df = parentless('MONTH')
month_df

Unnamed: 0,MONTH,count,probability
0,1,53,0.068475
1,2,27,0.034884
2,3,41,0.052972
3,4,45,0.05814
4,5,60,0.077519
5,6,84,0.108527
6,7,90,0.116279
7,8,86,0.111111
8,9,96,0.124031
9,10,74,0.095607


## 2. Day of Week

In [5]:
# Probability table for DAY_OF_WEEK: one row per observed day value with its
# count and its share of NUM_DELAYS.
day_of_week_df = parentless('DAY_OF_WEEK')
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,count,probability
0,1,126,0.162791
1,2,100,0.129199
2,3,100,0.129199
3,4,134,0.173127
4,5,133,0.171835
5,6,75,0.096899
6,7,106,0.136951


## 3. Security Delay

In [6]:
# Probability table for SECURITY_DELAY: one row per observed value with its
# count and its share of NUM_DELAYS.
security_delay_df = parentless('SECURITY_DELAY')
security_delay_df

Unnamed: 0,SECURITY_DELAY,count,probability
0,0.0,773,0.998708
1,1.0,1,0.001292


## 4. CRS Departure Time

0 represents AM, 1 represents PM.

In [8]:
# Probability table for CRS_DEP_TIME (count and share of NUM_DELAYS per value).
# NOTE(review): CRS_DEP_TIME is not in the parentless set printed earlier in
# this notebook; confirm it has no incoming edge in edges.csv before treating
# this table as its full distribution.
crs_dep_df = parentless('CRS_DEP_TIME')
crs_dep_df

Unnamed: 0,CRS_DEP_TIME,count,probability
0,0,438,0.565891
1,1,336,0.434109


## 4. CRS Arrival Time

In [9]:
# Probability table for CRS_ARR_TIME (count and share of NUM_DELAYS per value).
# NOTE(review): CRS_ARR_TIME is not in the parentless set printed earlier in
# this notebook; confirm it has no incoming edge in edges.csv.
crs_arr_df = parentless('CRS_ARR_TIME')
crs_arr_df

Unnamed: 0,CRS_ARR_TIME,count,probability
0,0,299,0.386305
1,1,475,0.613695


## 5. Month

In [10]:
# NOTE(review): this cell repeats the earlier MONTH cell verbatim and rebinds
# month_df to the same table; it is a candidate for removal.
month_df = parentless('MONTH')
month_df

Unnamed: 0,MONTH,count,probability
0,1,53,0.068475
1,2,27,0.034884
2,3,41,0.052972
3,4,45,0.05814
4,5,60,0.077519
5,6,84,0.108527
6,7,90,0.116279
7,8,86,0.111111
8,9,96,0.124031
9,10,74,0.095607


## 6. Number of Scheduled Departures at SFO When Delayed Flight was to Depart

Bins: 0 represents fewer than 10 scheduled departures, 1 represents 10–19, 2 represents 20–29, 3 represents 30–39, and 4 represents 40–49.

In [11]:
# Probability table for the binned ScheduledDepartureSFO column (count and
# share of NUM_DELAYS per bin).
# NOTE(review): ScheduledDepartureSFO is not in the parentless set printed
# earlier in this notebook; confirm it has no incoming edge in edges.csv.
scheduled_departure_df = parentless('ScheduledDepartureSFO')
scheduled_departure_df

Unnamed: 0,ScheduledDepartureSFO,count,probability
0,0.0,571,0.737726
1,1.0,19,0.024548
2,2.0,23,0.029716
3,3.0,72,0.093023
4,4.0,89,0.114987


## 7. Number of Scheduled Arrivals at PHL When Delayed Flight was to Arrive

Bins: 0 represents fewer than 10 scheduled arrivals, 1 represents 10–19, 2 represents 20–29, 3 represents 30–39, and 4 represents 40–49.

In [12]:
# Probability table for the binned ScheduledArrivalPHL column (count and
# share of NUM_DELAYS per bin).
# NOTE(review): ScheduledArrivalPHL is not in the parentless set printed
# earlier in this notebook; confirm it has no incoming edge in edges.csv.
scheduled_arrival_df = parentless('ScheduledArrivalPHL')
scheduled_arrival_df

Unnamed: 0,ScheduledArrivalPHL,count,probability
0,0.0,520,0.671835
1,1.0,172,0.222222
2,2.0,45,0.05814
3,3.0,10,0.01292
4,4.0,27,0.034884
