# ExploringTemporalData
**Eli Simic Robertson**

In [43]:
import pandas as pd
import numpy as np
from pprint import pprint
import random
import networkx as nx 
import matplotlib.pyplot as plt 
import collections

# deprecation warnings
import warnings
import matplotlib.cbook
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)

## Generating My Own Data

In [44]:
body_constr = [0,20]
head_constr = [0,10]
max_root_time = 100000 - head_constr[1]

# BODY_CONST = 'body'
HEAD_CONST = 'head'
ROOT_CONST = 'root'
A = 'a'
always = 100
cycle_prob = 50

In [45]:
def time_interval(body_constr): return tuple(np.random.uniform(body_constr)) 

def unique_body_symbols(low=3, high=5, single=False):
    '''Creates a random unique list of symbols for the body 
    of pattern. Symbol A has been excluded from the list as it is reserved 
    for the head of the pattern.
        Args: 
            n_body_symbols (int): amount of body symbols
            
        Returns:
            random_symbols (list): random choice of symbols
    '''
    symbols = ['b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'] # possible body symbols
    n_bod_symbols = np.random.randint(low, high)
    if single:  n_bod_symbols = 1 
    rand_symbols = random.sample(symbols, k=n_bod_symbols)
    
    return rand_symbols

def make_edges(Edge, body_symbols, condition=None, connected_nodes=None):
    '''Args:
            Edge (named_tuple): Edge object with default fields.
            body_symbols (list): unique body symbols.
            condition (str/logical???): Conditional operator, if any.
            connected_nodes (list): list of currently connected nodes to 
            graph object.
        Returns:
            edges (list of named_tuples): 
            connected_nodes (list): 
            
    '''
    
    if condition == 'conjunction' or condition == 'disjunction':
        n_body_symbols = 2
    elif condition == 'negation' or condition == 'cycle':
        n_body_symbols = 1
    elif condition == None:
        n_body_symbols = len(body_symbols)
    
    edges = []
    if connected_nodes is None: connected_nodes = []
    
    for _ in range(n_body_symbols):
        sym = body_symbols.pop()
        start_t, end_t = time_interval(body_constr)
        rand_end_s = np.random.choice(connected_nodes + [ROOT_CONST])        

        if condition == 'disjunction':
            edges.append(Edge(ROOT_CONST, sym, start_t, end_t, prob=always, disjunction=True))
            connected_nodes.append(sym)
        elif condition == 'conjunction':
            edges.append(Edge(ROOT_CONST, sym, start_t, end_t, prob=always, conjunction=True))
            connected_nodes.append(sym)
        elif condition == 'negation':
            edges.append(Edge(rand_end_s, sym, start_t, end_t, prob=always, negation=True))
            connected_nodes.append(sym)
        elif condition == 'cycle':
            edges.append(Edge(rand_end_s, sym, start_t, end_t, prob=cycle_prob, cycle=True))
            connected_nodes.append(sym)
        # no conditional operator, append edge to connected nodes or root
        elif condition == None:
            edges.append(Edge(rand_end_s, sym, start_t, end_t, prob=always))
            connected_nodes.append(sym)

    return edges, connected_nodes

def body_pattern(low_body=4, high_body=6, low_prob=60, high_prob=90, disjunction=False,
                   negation=False, conjunction=False, prob=always, cycle=False):
    
    body_symbols = unique_body_symbols(low_body, high_body) # technically, body_symbols excluding root
    rand_num = np.random.uniform(low=0, high=always)
    prob = np.random.uniform(low=low_prob, high=high_prob)
    connected_nodes = ['root'] # keep track of connected nodes for end_symbol possibilities
    
    pattern = [] # pattern list of edges : graph like object
    fields = ('start_s', 'end_s', 'start_t', 'end_t', 'disjunction', 'negation', 'conjunction', 'cycle', 'prob')
    Edge = collections.namedtuple('Edge', fields)
    Edge.__new__.__defaults__ = (False,) * len(Edge._fields) # set default fields

        
    if disjunction:
        
        max_disjs = len(body_symbols) // 2
        n_disjs = np.random.randint(low=1, high=max_disjs)
        for disj in range(n_disjs): 
            edges, nodes = make_edges(Edge, body_symbols, condition='disjunction')
            pattern.append(Edge(nodes[0], nodes[1], # disjunction edge
                start_t=None, end_t=None, disjunction=True, prob=always))
            pattern.extend(edges)
            connected_nodes.extend(nodes)

    if cycle:
        edges, nodes = make_edges(Edge, body_symbols, condition='cycle') # create cycle edge
        pattern.append(Edge(nodes[0], np.random.choice(connected_nodes),
                           disjunction=True)) # create disjunction edge to stop infinite loop
        pattern.extend(edges)
        connected_nodes.extend(nodes) 
        
    elif negation:
        edges, nodes = make_edges(Edge, body_symbols, condition='negation',
                                 connected_nodes=connected_nodes)
        pattern.extend(edges)
        connected_nodes.extend(nodes) 
        
    elif conjunction:
        edges, nodes = make_edges(Edge, body_symbols, condition='conjunction')
        pattern.extend(edges)
        connected_nodes.extend(nodes)
    # remaining nodes are added to either root or connected body node
    edges, nodes = make_edges(Edge, body_symbols, condition=None,
                             connected_nodes=connected_nodes)
    pattern.extend(edges)
    connected_nodes.extend(nodes)

    return pattern

def plot_pattern(pattern): # sometimes returns None? unsure why. annoying
    G = GraphVisualization() 
    for p in pattern: G.addEdge(p.start_s, p.end_s) 
    G.visualize()

## Generating Training Set
<br>

In [52]:
def time_point(start_t, end_t): return np.random.uniform(start_t, end_t)

def generate_edge_instance(tp,edge):
    return tp - time_point(edge.start_t, edge.end_t), edge.end_s

def outgoing_edges(node, pattern):
    '''Find immediate edges for a node'''
    out_edges = []
    for edge in pattern:            
        if edge.disjunction and edge.start_t is None: continue
            
        elif edge.negation == True: continue # skip negation node        
            
        elif edge.start_s == node:            
            out_edges.append(edge)

    return out_edges

def dest_nodes(node,pattern):
    out_edges = outgoing_edges(node,patt)
    return [edge.end_s for edge in out_edges]
    
def mutual_excl_edges(pattern):
    '''Find all mutually exclusive edges in pattern'''
    return [e for e in pattern if e.start_t == None]

def have_mutual(node, excl_edges):
    '''Is a given node connected to a mutually exclusive node'''
    for edge in excl_edges:
        if node == edge.start_s or node == edge.end_s:
            return True
        
    else: False
        
def generate_neighbouring_tps(node,tp,patt):
    out_edges = outgoing_edges(node,patt)
    excl_edges = mutual_excl_edges(patt)

    # determine which edge to choose from each mutual exclusion
    skip_nodes = []
    for edge in excl_edges:
        if np.random.randint(2) == 1:
            skip_nodes.append(edge.start_s)
        else:
            skip_nodes.append(edge.end_s)

    tps = []
    for edge in out_edges:
        if edge.prob != 100 and rand_num > 50: continue
        elif edge.end_s in skip_nodes: continue
        tps.append(generate_edge_instance(tp,edge))
        
    return tps
        
        
def generate_tps(node,tp,patt):
    all_tps = []

    if node == None:
        # this is the start of recursive generation, start from root
        node = 'root'
        tp = np.random.uniform(low=0, high=max_root_time)
        all_tps.append((tp, node))
    
    neighbouring_tps = generate_neighbouring_tps(node,tp,patt)
    all_tps += neighbouring_tps
    
    for neighbouring_tp in neighbouring_tps:
        # recursive step (because calling this same method)
       all_tps += generate_tps(neighbouring_tp[1],neighbouring_tp[0],patt)
    
    return all_tps

def generate_event_pred(patt, head_prob, make_pred=False):
    '''
    For event instance, only add A if head prob is exceeded
    Input: 
    Returns: 
    '''
    
    body_inst = generate_tps(None, None, patt)
    
    pred = []
    
    for t in body_inst: # get root tp
        if t[1] == 'root': 
            root_tp = t[0]

    start_t, end_t = time_interval(head_constr)    
    head_tp = root_tp + time_point(start_t, end_t)
    consequent = tuple([head_tp, A])
    
    pred.append(consequent)
        
    if head_prob > np.random.randint(low=0, high=100):
        body_inst.append(consequent) 

    if make_pred:
        return body_inst, pred

    else:
        return body_inst

tps = generate_tps(None, None, patt)
generate_event_pred(patt, 90, make_pred=True)

([(49472.43220337719, 'root'),
  (49463.7507946147, 'i'),
  (49471.159104355036, 'h'),
  (49468.08082636653, 'g'),
  (49464.04915026853, 'f'),
  (49474.655604869564, 'a')],
 [(49474.655604869564, 'a')])

## Making Noise!

**Time Series Consists of: **<br>
- Singular Pattern replicated over time line
- Random Subsets of the Pattern (func: rand_subset)
- Individual Noisy Symbols

Insert logic to insert the negation into the instance at equal prob

In [67]:
def rand_subset(pattern): # random subset of the instance
    '''
    Returns:
        ts (list): list of lists of time point and symbol of
        random subset of the pattern definition'''
    subset_idxs = sorted([np.random.randint(low=0, high=len(pattern)) for _ in range(2)])
    sub_pattern = pattern[subset_idxs[0]: subset_idxs[1]]
    
    return sub_pattern
    
def noisy_instance(time_high=100000):
    ''' Input: 
            time_high (int): 
    
            Returns:
                ts (list): list of lists containing random time point and symbol'''
    
    sym = unique_body_symbols(single=True).pop()
    tp = time_point(start_t=0, end_t=time_high)
    inst = [[tp, sym]]
    return inst

def generate_events_preds(pattern, head_prob, n_patterns, n_subsets, n_noisy_insts):
    '''Creates both .event array and its corresponding .pred array from
    
    
    Returns:
            events (list): time series for training/testing set
            preds (list): ground truth values
    '''

    
    events, preds = [], []
    
    for _ in range(n_patterns):
        event, pred =  generate_event_pred(pattern, head_prob, make_pred=True)
        events.extend(event)
        preds.extend(pred)
        
    for _ in range(n_subsets):
        event = generate_event_pred(rand_subset(pattern), 0, make_pred=False) # need to handle head prob better
        if event != []:
            events.extend(event)
        else:
            continue
        
    for _ in range(n_noisy_insts):
        event = noisy_instance()
        events.extend(event)
        

    events.sort(key=lambda x: x[0])
    preds.sort(key=lambda x: x[0])# sort by timestamp
    
    return events, preds
    

    


In [86]:
# patt = body_pattern(low_body=4, high_body=6, low_prob=60, high_prob=90,
#              disjunction=False, negation=False, conjunction=False, prob=100, cycle=False)

n_patterns = 1000
n_subsets = 0
n_noisy_insts = 0
head_prob = 75

patt = body_pattern(low_body=3, high_body=5, disjunction=True)  
events, preds = generate_events_preds(patt, head_prob=head_prob, n_patterns=n_patterns,
                               n_subsets=n_subsets, n_noisy_insts=n_noisy_insts)

## Output to file.

In [87]:
preds[1]

(147.94967494124342, 'a')

In [88]:
patt

[Edge(start_s='d', end_s='f', start_t=None, end_t=None, disjunction=True, negation=False, conjunction=False, cycle=False, prob=100),
 Edge(start_s='root', end_s='d', start_t=0.3895241339008073, end_t=1.6861715826745503, disjunction=True, negation=False, conjunction=False, cycle=False, prob=100),
 Edge(start_s='root', end_s='f', start_t=0.8023062659393738, end_t=8.427243461176475, disjunction=True, negation=False, conjunction=False, cycle=False, prob=100),
 Edge(start_s='d', end_s='c', start_t=0.07454986966270893, end_t=3.1817514221127254, disjunction=False, negation=False, conjunction=False, cycle=False, prob=100),
 Edge(start_s='root', end_s='k', start_t=0.46142102086283976, end_t=16.288422055058167, disjunction=False, negation=False, conjunction=False, cycle=False, prob=100)]

In [89]:
# X = np.array(events)

# values = list(X[:, 1])
# names = list(X[:, 0])
# plt.scatter(names, values);
# plt.show()

In [90]:
'''Create Event File. Create related Pred file. Create log file (HTML??).
'''

file_name = f'n_patterns_{n_patterns}_n_subsets_{n_subsets}_n_noisy_insts_{n_noisy_insts}.evt'

with open(file_name, 'w') as file:
    for l in events:
        s = str(l[0]) + str(l[1]) + '\n'
        file.write(s)

In [91]:
!dir

 Volume in drive C has no label.
 Volume Serial Number is EC71-92B5

 Directory of C:\Users\admin\Documents\TemporalDataGeneration

06/03/2021  06:09 PM    <DIR>          .
06/03/2021  06:09 PM    <DIR>          ..
02/03/2021  11:58 AM    <DIR>          .ipynb_checkpoints
09/02/2021  05:04 PM            37,299 bla.event
26/02/2021  09:03 AM           186,417 ExploringTemporalData.ipynb
06/03/2021  06:09 PM            52,272 ExploringTemporalData_keith.ipynb
06/03/2021  06:09 PM            88,690 n_patterns_1000_n_subsets_0_n_noisy_insts_0.evt
06/03/2021  05:14 PM           160,815 n_patterns_1000_n_subsets_1000_n_noisy_insts_1000.evt
01/02/2021  02:02 PM               952 output_conjunction_nsym3_n1000.txt
01/02/2021  02:02 PM                24 README.md
               7 File(s)        526,469 bytes
               3 Dir(s)  105,269,399,552 bytes free
