In [1]:
import numpy as np
import pandas as pd

# helper functions
import cpt_calculator as cpt

# Read the edge list and split the nodes into those that appear as a Target
# (have at least one parent) and those that only ever appear as a Source.
# NOTE(review): `data_df`, used by later cells, is not loaded anywhere in the
# visible part of this notebook -- confirm where the training data is read.
edges_df = pd.read_csv('./edges.csv')
nodes_with_parents = list(set(edges_df['Target']))
nodes_no_parents = list(set(edges_df['Source']) - set(edges_df['Target']))
nodes_set = set(nodes_with_parents) | set(nodes_no_parents)

To compute the conditional probability tables, we will observe each node in a BFS-like fashion, starting with nodes that do not have parents and then working our way through nodes whose parents we have already observed.

In [2]:
# Map every node to the set of its direct parents, taken from the edge list.
node_parents = {
    child: set(edges_df.Source[edges_df.Target == child])
    for child in nodes_with_parents
}
# Nodes that never appear as a Target have no parents.
node_parents.update({root: set() for root in nodes_no_parents})

# keep track of a visited set
visited_set = set()

# 0. Nodes Without Parents

In [3]:
# Report the nodes that have no parents in the edge list.
print("These are the list of parentless nodes :", nodes_no_parents,". We start by defining a function to reduce redundancy.")

These are the list of parentless nodes : ['DAY_OF_WEEK', 'MONTH'] . We start by defining a function to reduce redundancy.


## 1. Month

In [4]:
# Rebind visited_set to the returned set: available_nodes() and compute_CPT()
# read `visited_set`, so the bookkeeping must live under that name.
# NOTE(review): whether cpt.parentless_CPT mutates its arguments in place is
# not visible here; rebinding is correct in either case.
month_df, visited_set, nodes_set = cpt.parentless_CPT('MONTH', visited_set, nodes_set)
visited = visited_set  # keep the original `visited` name available
month_df

Unnamed: 0,MONTH,num,probability,log_p
0,1,50,0.064599,-2.739549
1,2,29,0.037468,-3.284276
2,3,36,0.046512,-3.068053
3,4,49,0.063307,-2.759752
4,5,64,0.082687,-2.492689
5,6,84,0.108527,-2.220755
6,7,99,0.127907,-2.056452
7,8,93,0.120155,-2.118972
8,9,92,0.118863,-2.129783
9,10,72,0.093023,-2.374906


## 2. Day of Week

In [5]:
# Rebind visited_set to the returned set: available_nodes() and compute_CPT()
# read `visited_set`, so the bookkeeping must live under that name.
# NOTE(review): whether cpt.parentless_CPT mutates its arguments in place is
# not visible here; rebinding is correct in either case.
day_of_week_df, visited_set, nodes_set = cpt.parentless_CPT('DAY_OF_WEEK', visited_set, nodes_set)
visited = visited_set  # keep the original `visited` name available
day_of_week_df

Unnamed: 0,DAY_OF_WEEK,num,probability,log_p
0,1,119,0.153747,-1.872448
1,2,101,0.130491,-2.036451
2,3,94,0.121447,-2.108277
3,4,138,0.178295,-1.724318
4,5,135,0.174419,-1.746297
5,6,73,0.094315,-2.361112
6,7,114,0.147287,-1.915373


# Depth 1: Nodes with Parents

First, we find all the nodes whose values we can compute based on those nodes whose conditional probability tables we have already computed.

In [None]:
def available_nodes():
    '''
    Return the nodes of the global ``nodes_set`` that are ready to be visited.

    A node is ready when every one of its parents (per ``node_parents``)
    is already contained in the global ``visited_set``.
    '''
    return [
        candidate
        for candidate in nodes_set
        if node_parents[candidate].issubset(visited_set)
    ]

def compute_CPT(node, parent_df):
    '''
    Build the conditional probability table of ``node`` given its parents.

    Parameters
    ----------
    node : hashable
        Column of the global ``data_df`` to build the table for; its parents
        are looked up in the global ``node_parents``.
    parent_df : pandas.DataFrame
        Table with one column per parent plus a ``num`` column holding the
        count of each parent configuration. Any other columns it carries are
        merged into the result too (an existing ``log_p`` is overwritten).

    Returns
    -------
    pandas.DataFrame
        One row per observed (parents..., node) combination with ``prob``
        (joint count divided by parent count) and ``log_p`` (natural log of
        ``prob``).

    Side effects
    ------------
    Adds ``node`` to the global ``visited_set`` and drops it from the global
    ``nodes_set``.
    '''
    # Fix the parent order once so groupby and merge use the same key list.
    parents = list(node_parents[node])

    # Joint counts of every (parents..., node) combination seen in the data.
    joint = data_df.groupby(parents + [node]).size().reset_index(name='num')

    # Both frames have a `num` column, so the merge yields num_x (joint count)
    # and num_y (parent count).
    df = pd.merge(joint, parent_df, on=parents)
    df['prob'] = df['num_x'] / df['num_y']
    df = df.drop(['num_x', 'num_y'], axis=1)
    df['log_p'] = np.log(df['prob'])

    visited_set.add(node)
    # discard (not remove) so that re-running the cell does not raise KeyError
    nodes_set.discard(node)
    return df

In [None]:
# List the nodes whose parents are all in visited_set.
available_nodes()

## 4. Quarter

In [None]:
# Conditional table for QUARTER; month_df supplies the parent counts.
# NOTE(review): assumes MONTH is QUARTER's only parent -- confirm via node_parents['QUARTER'].
quarter_df = compute_CPT('QUARTER', month_df)
quarter_df

## 5. ScheduledArrivalPHL

0 represents less than 10, 1 represents less than 20, 2 represents less than 30, 3 represents less than 40, 4 represents less than 50.

In [None]:
# Show the parent set recorded for ScheduledArrivalPHL.
node_parents['ScheduledArrivalPHL']

In [None]:
# Conditional table for ScheduledArrivalPHL; day_of_week_df supplies the parent counts.
# NOTE(review): assumes DAY_OF_WEEK is its only parent -- confirm against the cell above.
scheduled_PHL_df = compute_CPT('ScheduledArrivalPHL',day_of_week_df)
scheduled_PHL_df

## 6. NAS_DELAY

In [None]:
# Show the parent set recorded for NAS_DELAY.
node_parents['NAS_DELAY']

In [None]:
# Store the NAS_DELAY table under its own name; the original assignment reused
# scheduled_PHL_df and clobbered the ScheduledArrivalPHL table.
nas_delay_df = compute_CPT('NAS_DELAY', month_df)
nas_delay_df

# Depth 2

## 7. CRS Arrival Time

In [None]:
# List the nodes whose parents are all in visited_set at this point.
available_nodes()

In [None]:
# Show the parent set recorded for CRS_ARR_TIME.
node_parents['CRS_ARR_TIME']

In [None]:
def compute_CPT(node, parent_df):
    '''
    Build the conditional probability table of ``node`` given its parents.

    NOTE(review): this cell redefines ``compute_CPT`` and shadows the
    definition in the "Depth 1" section of this notebook; keep a single copy.

    Parameters
    ----------
    node : hashable
        Column of the global ``data_df`` to build the table for; its parents
        are looked up in the global ``node_parents``.
    parent_df : pandas.DataFrame
        Table with one column per parent plus a ``num`` column holding the
        count of each parent configuration. Any other columns it carries are
        merged into the result too (an existing ``log_p`` is overwritten).

    Returns
    -------
    pandas.DataFrame
        One row per observed (parents..., node) combination with ``prob``
        (joint count divided by parent count) and ``log_p`` (natural log of
        ``prob``).

    Side effects
    ------------
    Adds ``node`` to the global ``visited_set`` and drops it from the global
    ``nodes_set``.
    '''
    # Fix the parent order once so groupby and merge use the same key list.
    parents = list(node_parents[node])

    # Joint counts of every (parents..., node) combination seen in the data.
    joint = data_df.groupby(parents + [node]).size().reset_index(name='num')

    # Both frames have a `num` column, so the merge yields num_x (joint count)
    # and num_y (parent count).
    df = pd.merge(joint, parent_df, on=parents)
    df['prob'] = df['num_x'] / df['num_y']
    df = df.drop(['num_x', 'num_y'], axis=1)
    df['log_p'] = np.log(df['prob'])

    visited_set.add(node)
    # discard (not remove) so that re-running the cell does not raise KeyError
    nodes_set.discard(node)
    return df

In [None]:
# The DAY_OF_WEEK, MONTH and QUARTER tables collected in one list; it is not
# used by any other visible cell.
parent_df_li = [day_of_week_df, month_df, quarter_df]
# Count the rows of data_df for every (parents..., CRS_ARR_TIME) combination.
temp = data_df.groupby(list(node_parents['CRS_ARR_TIME']) + ['CRS_ARR_TIME']).size().reset_index().rename(columns={0:'num'})

In [None]:
# Preview the first ten rows of the joint counts.
temp.head(10)

In [None]:
# (rows, columns) of the slice where QUARTER, DAY_OF_WEEK and MONTH all equal 1.
data_df[(data_df.QUARTER == 1) & (data_df.DAY_OF_WEEK == 1) & (data_df.MONTH == 1)].shape

# Depth 3