In [1]:
import pandas as pd

# Load the training observations and the edge list of the network.
data_df = pd.read_csv('./train_data.csv')
edges_df = pd.read_csv('./edges.csv')

# Number of training rows; used as the denominator of marginal probabilities.
NUM_DELAYS = len(data_df)

# Split the nodes by whether they are the target of at least one edge.
# dict.fromkeys de-duplicates while keeping first-appearance order, so these
# lists (and anything printed from them) come out the same on every kernel
# restart; list(set(...)) ordering depends on hashing and can change per run.
target_nodes = set(edges_df.Target)
nodes_with_parents = list(dict.fromkeys(edges_df.Target))
nodes_no_parents = [src for src in dict.fromkeys(edges_df.Source) if src not in target_nodes]

# Working set of nodes that still need a probability table.
nodes_set = set(nodes_with_parents) | set(nodes_no_parents)

To compute the conditional probability tables, we will observe each node in a BFS-like fashion, starting with nodes that do not have parents and then working our way through nodes whose parents we have already observed.

In [2]:
# Map every node to the set of its parents (the sources of its incoming edges).
node_parents = {
    node: set(edges_df.loc[edges_df.Target == node, 'Source'])
    for node in nodes_with_parents
}
# Nodes that are never the target of an edge get an empty parent set.
node_parents.update({node: set() for node in nodes_no_parents})

# keep track of a visited set
visited_set = set()

# 0. Nodes Without Parents

In [3]:
# Show the nodes that have no parents; print() joins the pieces with spaces.
print(
    "These are the list of parentless nodes :",
    nodes_no_parents,
    ". We start by defining a function to reduce redundancy.",
)

These are the list of parentless nodes : ['MONTH', 'DAY_OF_WEEK', 'SECURITY_DELAY'] . We start by defining a function to reduce redundancy.


In [4]:
def parentless(col):
    '''
    Build the marginal probability table of a node that has no parents.

    Parameters
    ----------
    col : str
        Name of the column in ``data_df`` holding the node's values.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``col`` with the columns
        ``[col, 'num', 'probability']``: ``num`` is the number of rows taking
        that value and ``probability`` is ``num / NUM_DELAYS``.

    Side effects
    ------------
    Adds ``col`` to ``visited_set`` and removes it from ``nodes_set``.
    '''
    # size() counts rows per group without leaning on any other column.
    # Counting the non-null entries of a fixed column ('QUARTER') under-counts
    # when that column has gaps and fails outright when col is that column.
    df = data_df.groupby(col).size().reset_index(name='num')
    df['probability'] = df['num'] / NUM_DELAYS
    visited_set.add(col)
    # discard() keeps the calling cell re-runnable: a second call for the same
    # node must not raise just because the node was already removed.
    nodes_set.discard(col)
    return df

## 1. Month

In [5]:
# Marginal table for MONTH: row count and relative frequency of each value.
month_df = parentless('MONTH')
month_df

Unnamed: 0,MONTH,num,probability
0,1,48,0.062016
1,2,29,0.037468
2,3,43,0.055556
3,4,48,0.062016
4,5,70,0.090439
5,6,68,0.087855
6,7,94,0.121447
7,8,90,0.116279
8,9,90,0.116279
9,10,77,0.099483


## 2. Day of Week

In [6]:
# Marginal table for DAY_OF_WEEK: row count and relative frequency of each value.
day_of_week_df = parentless('DAY_OF_WEEK')
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,num,probability
0,1,126,0.162791
1,2,104,0.134367
2,3,109,0.140827
3,4,134,0.173127
4,5,117,0.151163
5,6,77,0.099483
6,7,107,0.138243


## 3. Security Delay

In [7]:
# Marginal table for SECURITY_DELAY: row count and relative frequency of each value.
security_delay_df = parentless('SECURITY_DELAY')
security_delay_df

Unnamed: 0,SECURITY_DELAY,num,probability
0,0.0,773,0.998708
1,1.0,1,0.001292


# Depth 1: Nodes with Parents

First, we find all the nodes whose parents have all been visited, i.e. the nodes whose conditional probability tables can now be computed from the tables we have already computed.

In [8]:
def available_nodes():
    '''
    List the nodes remaining in nodes_set that are ready to be visited.

    A node is ready when every one of its parents (as recorded in
    node_parents) is already in visited_set; a node with no parents is
    always ready.
    '''
    return [
        candidate
        for candidate in nodes_set
        if all(parent in visited_set for parent in node_parents[candidate])
    ]

def compute_CPT(node, parent_df=None):
    '''
    Estimate the conditional probability table P(node | parents) from data_df.

    Parameters
    ----------
    node : str
        Column of ``data_df`` to build the table for. Its parents are looked
        up in ``node_parents``; the node must have at least one parent.
    parent_df : pandas.DataFrame, optional
        Table holding the parent columns plus a ``num`` column with the number
        of rows for each parent configuration (the shape returned by
        ``parentless``). When it is omitted, or lacks any of those columns
        (for example for a node with several parents, or when a table without
        ``num`` is passed), the counts are taken directly from ``data_df``.

    Returns
    -------
    pandas.DataFrame
        One row per observed (parents, node) combination, with the parent
        columns, the node column and a ``probability`` column.

    Side effects
    ------------
    Adds ``node`` to ``visited_set`` and removes it from ``nodes_set``.
    '''
    # Materialise the parent set once so every step uses the same column order.
    parents = list(node_parents[node])

    # Numerator: number of rows per (parent values, node value) combination.
    joint = data_df.groupby(parents + [node]).size().reset_index(name='num_x')

    # Denominator: number of rows per parent-value combination.
    needed = set(parents) | {'num'}
    if parent_df is not None and needed.issubset(parent_df.columns):
        totals = parent_df[parents + ['num']].rename(columns={'num': 'num_y'})
    else:
        totals = data_df.groupby(parents).size().reset_index(name='num_y')

    df = pd.merge(joint, totals, on=parents)
    df['probability'] = df['num_x'] / df['num_y']
    df = df.drop(['num_x', 'num_y'], axis=1)

    visited_set.add(node)
    # discard() keeps the calling cell re-runnable after the node was removed.
    nodes_set.discard(node)
    return df

In [9]:
# Nodes still in nodes_set whose parents have all been visited.
available_nodes()

['ScheduledArrivalPHL', 'NAS_DELAY', 'QUARTER']

## 4. Quarter

In [10]:
# month_df supplies the per-MONTH row counts that the QUARTER counts are divided by.
quarter_df = compute_CPT('QUARTER', month_df)
quarter_df

Unnamed: 0,MONTH,QUARTER,probability
0,1,1,1.0
1,2,1,1.0
2,3,1,1.0
3,4,2,1.0
4,5,2,1.0
5,6,2,1.0
6,7,3,1.0
7,8,3,1.0
8,9,3,1.0
9,10,4,1.0


## 5. ScheduledArrivalPHL

ScheduledArrivalPHL is stored as a bin index: 0 represents a value less than 10, 1 a value of at least 10 but less than 20, 2 at least 20 but less than 30, 3 at least 30 but less than 40, and 4 at least 40 but less than 50.

In [11]:
# Look up the node's parents to choose the parent table passed to compute_CPT.
node_parents['ScheduledArrivalPHL']

{'DAY_OF_WEEK'}

In [12]:
# day_of_week_df supplies the per-DAY_OF_WEEK row counts used as the denominator.
scheduled_PHL_df = compute_CPT('ScheduledArrivalPHL',day_of_week_df)
scheduled_PHL_df

Unnamed: 0,DAY_OF_WEEK,ScheduledArrivalPHL,probability
0,1,0.0,0.666667
1,1,1.0,0.238095
2,1,2.0,0.055556
3,1,4.0,0.039683
4,2,0.0,0.740385
5,2,1.0,0.115385
6,2,2.0,0.076923
7,2,4.0,0.067308
8,3,0.0,0.688073
9,3,1.0,0.229358


## 6. NAS_DELAY

In [13]:
# Look up the node's parents to choose the parent table passed to compute_CPT.
node_parents['NAS_DELAY']

{'MONTH'}

In [14]:
# Keep the NAS_DELAY table under its own name so it does not overwrite
# scheduled_PHL_df, which holds the ScheduledArrivalPHL table.
nas_delay_df = compute_CPT('NAS_DELAY', month_df)
nas_delay_df

Unnamed: 0,MONTH,NAS_DELAY,probability
0,1,0.0,0.5625
1,1,1.0,0.4375
2,2,0.0,0.793103
3,2,1.0,0.206897
4,3,0.0,0.651163
5,3,1.0,0.348837
6,4,0.0,0.75
7,4,1.0,0.25
8,5,0.0,0.657143
9,5,1.0,0.342857


# Depth 2

## 7. CRS Arrival Time

In [15]:
# Re-check which remaining nodes have all of their parents visited.
available_nodes()

['CRS_ARR_TIME']

In [16]:
# Inspect the parents of CRS_ARR_TIME before building its table.
node_parents['CRS_ARR_TIME']

{'DAY_OF_WEEK', 'MONTH', 'QUARTER'}

# Depth 3