The `verbose` flag is specified here, if enabled more information is printed showing what is being changed, etc.

In [1]:
verbose = False

Import all required libraries here.

In [2]:
from collections import namedtuple
import re

First thing, import all the data from the `.boris` file, and print out how many observations are in the file, as a sanity check that all is as expected with the input data.

In [3]:
intruder_test = 'intruder test_noid.boris'
all_accurrance = 'all accurrence observations.boris'

import json
with open(intruder_test, 'r') as f:
    data = json.load(f)
    original_observations = data['observations']
    num_events = 0
    for _,v in original_observations.items():
        num_events = num_events + len(v['events'])
    print(f'Have {len(original_observations)} observations with {num_events} events in total')

Have 327 observations with 19911 events in total


Create a class to represent the events.

In [4]:
class Event:
    __slots__ = 'timestamp', 'subject', 'type', 'recipient'

    def __init__(self, src):
        self.timestamp = src[0]
        self.subject = src[1]
        self.type = src[2]
        self.recipient = src[3]
        
        # mwr without a recipient is always with the partner
        if self.type == 'mwr' and not self.recipient:
            self.recipient = 'partner'
                
        assert self.timestamp >= 0, self
        # TODO allow empty subject for the moment
        assert any(self.subject == x for x in ['m', 'f', 'nm', 'nf', '']) or self.type == 'vstart', self
        assert any(self.type == x for x in ['app', 'fsp', 'vstart', 'mwr', 'sb', 'pre-agg']), self
        # TODO allow empty recipient
        assert any(self.recipient == x for x in ['neighbour', 'intruder', 'partner', '']), self
        
    def __format__(self, format_spec):
        return f'"{self.timestamp}, {self.subject}, {self.type}, {self.recipient}"'           

    def __str__(self, format_spec):
        return f'"{self.timestamp}, {self.subject}, {self.type}, {self.recipient}"'
    
    def __repr__(self):
        return f'"{self.timestamp}, {self.subject}, {self.type}, {self.recipient}"'           

The labels on the observations must be fixed up. "Neighbour" to "Experiment Control" and "Stranger" to "Experiment Treatment", and so on.  All labels are checked against the `pattern` regular expression that is specified here.

In [5]:
pattern = re.compile(
    '(F[0-9][A-D]) +([0-9]{2})\.([0-9]{2})\.? ([A-D]) +(experiment|habituation) (treatment|control)')

def get_label(s):
    groups = re.fullmatch(pattern, s).groups()
    return f'{groups[0]} 2017.{groups[2]}.{groups[1]} {groups[3]} {groups[4]} {groups[5]}'

def correct_label(s):
    new_k = str(s)
    new_k = new_k.replace('neighbour', 'experiment control')
    new_k = new_k.replace('stranger', 'experiment treatment')
    new_k = new_k.replace('habituation experiment', 'habituation treatment')
    new_k = new_k.replace('treatment control', 'experiment control')
    new_k = new_k.replace('habitiution control', 'habituation control')
    new_k = new_k.replace('habitiution experiment', 'habituation treatment')
    new_k = new_k.replace('habitation experiment', 'habituation treatment')
    new_k = new_k.replace('habitiution control', 'habituation control')

    # ensure that all keys now match the expected layout
    if not pattern.fullmatch(new_k):
        raise ValueError()

    new_k = get_label(new_k)    
    if new_k != s:
        if verbose:
            print(f'Change {k:>40} -> {new_k}')
    
    return new_k

Start by removing some events that are wrong/invalid.  If the vstart is missing at the start of the list of events then add it.  The events themselves are also touched up, the 'mwr' events are generally missing the recipient, and the timestamps are also normalised here such that vstart always happens at $t = 0$.  This makes the analysis further down easier.

In [6]:
observations = dict()

for k,v in original_observations.items():
    try:
        label = correct_label(k)

        events = list(map(Event, v['events']))

        if not events:
            print(f'Observation {label} has no events!')
            continue

        if events[0].type != 'vstart':
            if verbose:
                print(f'Observation {label} is missing a vstart as the very first event!')
            # for the moment add a vstart with t = 0
            events.insert(0, Event([0.0, "", "vstart", "", ""]))
    
        vstart_timestamp = events[0].timestamp
        for event in events:
            # normalise timestamps so that the vstart timestamp is always 0
            event.timestamp = event.timestamp - vstart_timestamp

        observations[label] = events

    except ValueError:
        if k.startswith('x'):
            print(f'Observation {k!r} is marked as invalid (starts with "x")')
        else:
            print(f'Observation {k!r} is invalid?!')

Observation 'xxxx' is marked as invalid (starts with "x")
Observation 'xxxx F5A 25.06. B habituation control' is marked as invalid (starts with "x")
Observation 'xxxx F5A 25.06. A habituation control' is marked as invalid (starts with "x")
Observation 'xxxx F5B 18.04. D habituation ?' is marked as invalid (starts with "x")
Observation F6A 2017.04.28 B habituation control has no events!


In [7]:
print(f'Have {len(observations)} valid observations')

Have 320 valid observations


This is where the fun stuff really starts, so initially we have some basic scaffolding code, although with the definition of two different kinds of latency, `max_latency` and `base_latency`.  We also define a `CombiningData` type; this is the combination of the observation id plus the subject.  The output data has one row for each unique `CombiningData`.  The `get_first_occurance_timestamp` searches a list of events until both the subject and recipient are matched.

In [8]:
max_latency = 1000000
base_latency = 120

class CombiningData(namedtuple('CombiningData', ['obs_id', 'subject'])):
    def get_csv(self):
        return '{}, {}, {}, {}, {}, {}'.format(*self.obs_id.split(' '), self.subject)

def get_first_occurance_timestamp(subject, recipient, events):
    for event in events:
        if event.type == 'vstart':
            continue
        if event.recipient == recipient and event.subject == subject:
           return event.timestamp
    return max_latency

The `GetLatency` class takes a list of events and determines the timestamp of the first event for each combination subject and recipient.  We are not interested in any events involving the partner, only intruder and neighbour.  If there is no event in the list for any subject and recipient pair, then insert a "maximum latency"; this is the lowest latency of all combinations for this list of events plus the "base latency", which was specified in the previous cell (`base_latency`).

In [9]:
class GetLatency:
    __slots__ = \
        'm_intruder', \
        'f_intruder', \
        'nm_intruder', \
        'nf_intruder', \
        'm_neighbour', \
        'f_neighbour', \
        'nm_neighbour', \
        'nf_neighbour'

    def __init__(self, events):
        # generate a list of the slots and get the first timestamp for each.
        tuples = [(x, y, f'{x}_{y}') for x in ['m', 'f', 'nm', 'nf'] for y in ['intruder', 'neighbour']]
        for t in tuples:
            setattr(self, t[2], get_first_occurance_timestamp(t[0], t[1], events))

        # now for all events in this observation calculate the minimal latency.
        min_latency = min(map(lambda x: getattr(self, x[2]), tuples))

        # check that every observation has a minimum latency
        assert min_latency != max_latency, events
        
        # cap all latencies, if there is no event in this observation then
        # get_first_occurance_timestamp returns a very large latency which is
        # definitely going to be much much larger than base_latency plus min_latency.
        for t in tuples:
            setattr(self, t[2], min(base_latency + min_latency, getattr(self, t[2])))

Now that we have the scaffolding setup this is a straightforward traverse of the data generating a dictionary of `CombiningData` to latencies, called `latencies` (not very original).  For each `CombiningData` there are two latencies, the intruder latency and the neighbour latency.  So for each observation we should be entering four entries into `latencies`.

In [10]:
Latency = namedtuple('Latency', ['intruder', 'neighbour'])

latencies = dict()

for label, events in observations.items():
    latency = GetLatency(events)

    if verbose:
        print(f'{label}, m, {latency.m_intruder}, {latency.m_neighbour}')
        print(f'{label}, f, {latency.f_intruder}, {latency.f_neighbour}')
        print(f'{label}, nm, {latency.nm_intruder}, {latency.nm_neighbour}')
        print(f'{label}, nf, {latency.nf_intruder}, {latency.nf_neighbour}')

    # store latencies for next step
    latencies[CombiningData(label, 'm')] = Latency(latency.m_intruder, latency.m_neighbour)
    latencies[CombiningData(label, 'f')] = Latency(latency.f_intruder, latency.f_neighbour)
    latencies[CombiningData(label, 'nm')] = Latency(latency.nm_intruder, latency.nm_neighbour)
    latencies[CombiningData(label, 'nf')] = Latency(latency.nf_intruder, latency.nf_neighbour)

This is the core of what we did in the first python script, so first up define a data structure to hold the data.  We already have the `CombiningData`, so that is the "key", and the output will have one row for each `CombiningData`.  The value type is the `BehaviourModifier`.

In [11]:
# this can't be a namedTuple, as we need to update the fields using set & get attr.
# Use slots for speed & to ensure that only the fields we want are valid.
class BehaviourModifier:
    __slots__ = \
        'app_intruder', \
        'app_neighbour', \
        'app_partner', \
        'fsp_intruder', \
        'fsp_neighbour', \
        'fsp_partner', \
        'sb_intruder', \
        'sb_neighbour', \
        'sb_partner', \
        'mwr_partner', \
        'l_intruder', \
        'l_neighbour'

    def __init__(self):
        self.app_intruder = 0
        self.app_neighbour = 0
        self.app_partner = 0
        self.fsp_intruder = 0
        self.fsp_neighbour = 0
        self.fsp_partner = 0
        self.sb_intruder = 0
        self.sb_neighbour = 0
        self.sb_partner = 0
        self.mwr_partner = 0
        self.l_intruder = None
        self.l_neighbour = None
    
    def get_csv(self):
        return f'{self.app_intruder}, {self.app_neighbour}, {self.app_partner}, ' + \
            f'{self.fsp_intruder}, {self.fsp_neighbour}, {self.fsp_partner}, ' + \
            f'{self.sb_intruder}, {self.sb_neighbour}, {self.sb_partner}, ' + \
            f'{self.mwr_partner}, {self.l_intruder}, {self.l_neighbour}'

Iterate over the observations, then over the events, and total up how many events of each type are present.

In [12]:
# key is a CombiningData, value is BehaviourModifier
combo_dict = dict()

for label in observations.keys():
    for subject in ['m', 'f', 'nm', 'nf']:
        combo_dict[CombiningData(label, subject)] = BehaviourModifier()

# iterate over the observations
for label, events in observations.items():
    # iterate over the events in the observation
    for event in events:
        if not event.subject or event.type == 'vstart':
            if event.type != 'vstart':
                print(f'Missing subject: "{combo.obs_id}" {event}, skipping...')
            continue

        combo = CombiningData(label, event.subject)
        data = combo_dict[combo]

        # now increment certain fields
        field = f'{event.type}_{event.recipient}'

        try:
            setattr(data, field, getattr(data, field) + 1)

        except AttributeError:
            # pre-agg should just be ignored in the analysis
            assert field == 'pre-agg_', field

            # ignore errors
            continue

        combo_dict[combo] = data

Missing subject: "F1B 2017.05.01 D experiment treatment" "63.464, , app, intruder", skipping...
Missing subject: "F6A 2017.05.01 B experiment control" "17.959999999999997, , app, intruder", skipping...
Missing subject: "F6A 2017.05.01 B experiment control" "78.898, , app, partner", skipping...


Inserting the latencies can be done in one place.  As both the `combo_dict` and `latencies` use the same set of keys, the lookup should never fail.

In [13]:
for combo_label, item in combo_dict.items():
    item.l_intruder = latencies[combo_label].intruder
    item.l_neighbour = latencies[combo_label].neighbour

Sanity check: there should be $4 \times len(observations)$ in combo data, as each observation has four subjects.

In [14]:
print(f'Combo dict has {len(combo_dict)} items, should have {4 * len(observations)} items')
assert len(combo_dict) == 4 * len(observations)

Combo dict has 1280 items, should have 1280 items


Finally output the data to a csv file.

In [15]:
# now output the data to a csv file for further processing
with open('output_data.csv','w') as f:
    f.write('tank_num, date, int_pos, phase, cond, subject, app_int, app_n, app_p, fsp_int, fsp_n, fsp_p, sb_int, sb_n, sb_p, mwr_p, l_int, l_n\n')
    for k, v in combo_dict.items():
        f.write(f'{k.get_csv()}, {v.get_csv()}\n')

    print(f'Wrote {f.tell()} bytes')

Wrote 123273 bytes
