In [26]:
import numpy as np
from pandas import DataFrame, read_csv
# read_csv()
import os

BLOCK_SIZE = 20
FILTER_THRESHOLD = 5

In [27]:
from collections import namedtuple, defaultdict


# Point = namedtuple('Point',['window', 'snp', 'bp'])

class Point:
    def __init__(self, snp, bp=0):
        self.snp, self.bp = snp, bp
    
    @property
    def window(self):
        return self.snp // BLOCK_SIZE

# class Specimen:
#     def __init__(self, ident, sequence)
#         ident, sequence
    
class Node:
    def __init__(self, ident, start, end):
        self.id = ident
        self.start = start #Point()
        self.end = end #Point()
        self.upstream = defaultdict(lambda: 0)  # {nothing_node:501, Node: 38,  Node: 201, Node: 3}
        self.downstream = defaultdict(lambda: 0) # {Node: 38,  Node: 201, Node: 3}
        self.specimens = set()
    
    def __len__(self):
        return len(self.specimens)
    
    def __repr__(self):
        return "N%s(%s, %s)" % (str(self.id), str(self.start.snp), str(self.end.snp))
    
    def __hash__(self):
        return hash(self.id) + hash(self.start.snp) + hash(self.end.snp)
    
#     def details(self):
        

a = Point(0)
b = Point(14)
str(Node(57, a, b))
nothing_node = Node(-1, Point(None), Point(None))
global_nodes = {0: nothing_node}


In [28]:
def read_data(file_path = "../test_data/KE_chromo10.txt"):
    """Individuals are rows, not columns"""
    loci = []
    with open(file_path) as ke:
        for line in ke.readlines():
            loci.append(tuple(int(x) for x in line.split()))
            
    
    individuals = np.array(loci).T.tolist()
    return loci, individuals
alleles, individuals = read_data()
assert len(alleles) == 32767
assert len(individuals[1]) == 32767
assert len(individuals) == 501

In [29]:
def first(iterable):
    return next(iter(iterable))

In [30]:
def signature(individual, start_locus):
    return tuple(individual[start_locus : start_locus + BLOCK_SIZE])

def get_unique_signatures(individuals, start_locus, block_size = 20):
    unique_blocks = {}
    for individual in individuals:
        sig = signature(individual, start_locus)
        if sig not in unique_blocks:
            unique_blocks[sig] = Node(len(unique_blocks), Point(start_locus // block_size, start_locus), 
                                      Point(start_locus // block_size, start_locus + BLOCK_SIZE)) #TODO: -1?
    
    return unique_blocks
unique_blocks = get_unique_signatures(individuals, 0 )
    
assert len(unique_blocks) == 4
unique_blocks
# assert unique_blocks == {(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): 0,
#  (0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): 1,
#  (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): 2,
#  (2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): 3}

{(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): N0(0, 0),
 (0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N1(0, 0),
 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(0, 0),
 (2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N3(0, 0)}

In [31]:
def get_all_signatures(alleles, individuals):
    unique_signatures = []
    for locus_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE):  # discards remainder 
        sig = get_unique_signatures(individuals, locus_start, BLOCK_SIZE)
        unique_signatures.append(sig)
    return unique_signatures
unique_signatures = get_all_signatures(alleles, individuals)

In [32]:
unique_signatures[21]

{(0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N0(21, 21),
 (0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N1(21, 21),
 (0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(21, 21),
 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N3(21, 21),
 (0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N4(21, 21),
 (0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2): N5(21, 21),
 (0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N6(21, 21),
 (0, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2): N7(21, 21)}

In [33]:
def build_individuals(individuals, unique_signatures):
    simplified_individuals = []
    for i_specimen, specimen in enumerate(individuals):
        my_simplification = []
        for w, window in enumerate(unique_signatures):  # the length of the genome
            sig = signature(specimen, w * BLOCK_SIZE)
    #         print(sig, unique_signatures[w][sig])
    #         print(i_specimen, window)
            my_simplification.append(unique_signatures[w][sig])
        simplified_individuals.append(my_simplification)
    return simplified_individuals
simplified_individuals = build_individuals(individuals, unique_signatures)
print(simplified_individuals[500][:100])
len(simplified_individuals), len(simplified_individuals[60])

[N2(0, 0), N2(1, 1), N2(2, 2), N2(3, 3), N2(4, 4), N2(5, 5), N3(6, 6), N3(7, 7), N3(8, 8), N2(9, 9), N0(10, 10), N1(11, 11), N2(12, 12), N2(13, 13), N2(14, 14), N2(15, 15), N3(16, 16), N3(17, 17), N4(18, 18), N3(19, 19), N5(20, 20), N3(21, 21), N3(22, 22), N10(23, 23), N4(24, 24), N3(25, 25), N4(26, 26), N3(27, 27), N1(28, 28), N1(29, 29), N4(30, 30), N3(31, 31), N21(32, 32), N1(33, 33), N1(34, 34), N1(35, 35), N1(36, 36), N1(37, 37), N1(38, 38), N1(39, 39), N1(40, 40), N1(41, 41), N1(42, 42), N1(43, 43), N1(44, 44), N1(45, 45), N1(46, 46), N1(47, 47), N1(48, 48), N1(49, 49), N1(50, 50), N1(51, 51), N1(52, 52), N1(53, 53), N1(54, 54), N1(55, 55), N1(56, 56), N1(57, 57), N1(58, 58), N1(59, 59), N1(60, 60), N1(61, 61), N1(62, 62), N1(63, 63), N1(64, 64), N1(65, 65), N1(66, 66), N1(67, 67), N1(68, 68), N1(69, 69), N1(70, 70), N1(71, 71), N1(72, 72), N1(73, 73), N1(74, 74), N1(75, 75), N1(76, 76), N1(77, 77), N0(78, 78), N0(79, 79), N1(80, 80), N1(81, 81), N1(82, 82), N1(83, 83), N1(84, 84

(501, 1638)

# Nodes: Populate upstream and downstream

In [34]:
# build nodes:  first 4 are the 4 starting signatures in window 0.  
# For each node list which individuals are present at that node
# List transition rates from one node to all other upstream and downstream
def populate_transitions(simplified_individuals):
    for i, indiv in enumerate(simplified_individuals):
        # look what variants are present
        for x, node in enumerate(indiv):
            node.specimens.add(i)
            if x + 1 < len(indiv):
                node.downstream[indiv[x+1]] += 1
            else:
                node.downstream[nothing_node] += 1
            if x-1 >= 0:
                node.upstream[indiv[x-1]] += 1
            else: 
                node.upstream[nothing_node] += 1
            

In [35]:
unique_signatures = get_all_signatures(alleles, individuals)
simplified_individuals = build_individuals(individuals, unique_signatures)
populate_transitions(simplified_individuals)

#### TODO: turn these into tests

In [36]:
simplified_individuals[50][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N1(1, 1): 286})

In [37]:
simplified_individuals[49][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N0(1, 1): 103})

In [38]:
simplified_individuals[500][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N2(1, 1): 82})

In [39]:
simplified_individuals[91][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N1(1, 1): 30})

In [40]:
[x.downstream.values() for x in unique_signatures[1000].values()]

[dict_values([299]), dict_values([120]), dict_values([82])]

In [41]:
[x.upstream.values() for x in unique_signatures[1000].values()]

[dict_values([102, 197]), dict_values([120]), dict_values([82])]

---------------

# Simple Merge

In [42]:
# TODO: add signature directly to node

In [43]:
from blist import blist
from copy import copy, deepcopy

In [44]:
def test_no_duplicate_nodes(global_nodes):
    unique_nodes = set()
    for node in global_nodes:
        if node in unique_nodes:
            print(node)
        else:
            unique_nodes.add(node)


In [45]:
# zoom_stack = [[]]
def simple_merge(global_nodes):
    new_layer = []  # TODO: copy old nodes to new layer conditionally
    n = 0
    while n < len(global_nodes):  # size of global_nodes changes, necessitating this weird loop
        node = global_nodes[n]
    #     print(node, type(node))
        if len(node.downstream) == 1: 
            next_node = first(node.downstream.keys())
            if len(node.specimens) == len(next_node.specimens):
                #Torsten deletes nodeA and modifies next_node
                next_node.upstream = node.upstream
                next_node.start = node.start
                #prepare to delete node by removing references
                for parent in node.upstream.keys():
                    if parent != nothing_node:
                        count = parent.downstream[node]
                        del parent.downstream[node]  # updating pointer 
                        parent.downstream[next_node] = count 
                global_nodes.remove(node)  #delete node
                # zoom_stack[0].append(merged)
                n -= 1
        n += 1
    return global_nodes        

In [46]:
def test_simple_merge():
    global_nodes = blist([node for window in unique_signatures for node in window.values()])  # think about referencing and deletion
    assert len(global_nodes) == 7180
    summary1 = simple_merge(global_nodes)
    assert len(summary1) == 3690
    return summary1
summary1 = test_simple_merge()

#### Neglect Nodes

In [47]:
def delete_node(node):
    """Changes references to this node to add to references to nothing_node"""
    for parent, count in node.upstream.items():
        parent.downstream[nothing_node] += parent.downstream[node]
        del parent.downstream[node]
    for descendant, count in node.downstream.items():
        descendant.upstream[nothing_node] += descendant.upstream[node]
        del descendant.upstream[node]
        

def neglect_nodes(all_nodes):
    nodes_to_delete = set()
#     filtered_nodes = copy(all_nodes)
#     filtered_nodes.remove(1)
#     assert len(all_nodes) != len(filtered_nodes)
    for node in all_nodes:
        if len(node.specimens) < FILTER_THRESHOLD:
            delete_node(node)  # TODO: check if this will orphan 
            nodes_to_delete.add(node)
    filtered_nodes = blist([x for x in all_nodes if x not in nodes_to_delete])
    # TODO: remove orphaned haplotypes in a node that transition to and from zero within a 10 window length
    return filtered_nodes 


def test_neglect_nodes(all_nodes):
    summary2 = neglect_nodes(all_nodes)
    assert len(summary2) == 2854
    return summary2
summary2 = test_neglect_nodes(summary1)

#### Split Groups

In [52]:
def split_one_group(prev_node, anchor, next_node):
    """ Called when up.specimens == down.specimens"""
    new = Node(777, prev_node.start, next_node.end, prev_node.upstream, next_node.downstream)  # TODO: what about case where more content is joining downstream?
    # Comment: That is actually the case we want to split up to obtain longer blocks later
    # Extension of full windows will take care of potential loss of information later
    
    if nothing_node != prev_node:
        new.specimens = anchor.specimens.intersection(prev_node.specimens)
    elif nothing_node != next_node:
        new.specimens = anchor.specimens.intersection(next_node.specimens)
    else:
        new.specimens = anchor.specimens
        for n in new.downstream.keys():
            if n != nothing_node:
                new.specimens = new.specimens.remove(n.specimens)
        for n in new.upstream.keys():
            if n != nothing_node:
                new.specimens = new.specimens.remove(n.specimens)
    
    if nothing_node == prev_node:
        new.start = anchor.start
        new.upstream = anchor.upstream
    if nothing_node is next_node:
        new.end = anchor.end
        new.downstream = anchor.downstream
    
    # Update upstream/downstream
    for n in new.upstream.keys():
        if n != nothing_node:
            new.upstream[n] = len(new.specimens.intersection(n.specimens))
            n.downstream[new] = new.upstream[n]
            n.downstream[prev_node] = n.downstream[prev_node] - n.downstream[new]
            if n.downstream[prev_node] == 0:
                del n.downstream[prev_node]

    for n in new.downstream.keys():
        if n != nothing_node:
            new.downstream[n] = len(new.specimens.intersection(n.specimens))
            n.upstream[new] = new.downstream[n]
            n.upstream[prev_node] = n.upstream[prev_node] - n.upstream[new]
            if n.upstream[prev_node] == 0:
                del n.upstream[prev_node]
                
    new.upstream[nothing_node] = len(new.specimens) - #Whatever is in all others keys together
    new.downstream[nothing_node] = len(new.specimens) - # whatever is in all others keys together
    if new.upstream[nothing_node] == 0:
        del new.upstream[nothing_node]
    
    if new.downstream[nothing_node] == 0:
        del new.downstream[nothing_node]
    
    # Update Specimens in prev_node, anchor, next_node
    if prev_node != nothing_node:
        prev_node.specimens = prev_node.specimens.remove(new.specimens)
    
    if next_node != nothing_node:
        next_node.specimens = next_node.specimens.remove(new.specimens)
    
    anchor.specimens = anchor.specimens.remove(new.specimens)
        
    ## anchor.specimens.difference_update(prev_node.specimens) REASON?
    return new

SyntaxError: invalid syntax (<ipython-input-52-3a240befdc8c>, line 44)

In [55]:
test_graph = summary2  # deepcopy(
example = test_graph[7]
# original = deepcopy(example)
print(len(example))
def test_split_one_group(prev_node, anchor, next_node):
    x = split_one_group(prev_node, anchor, next_node)
    assert x
    answer = set(int(x)-1 for x in '14  16  19  20  28  56  59  69  88 133 140 155 159 160 175 193 199 201 224 249 252 258 260 267 268 283 292 295 318 322 325 332 341 344 346 351 354 357 362 364 367 373 374 375 381 386 392 393 394 402 403 417 421 424 426 431 434 435 438 442 445 447 452 455 457 462 463 464 467 471 473 475 476 477 478 480 483 484 494 497 501'.split())
    assert x.specimens == answer, 'Specimens set does not agree with HaploBlocker' + str(x.specimens.difference(answer))
    return x

x = test_split_one_group(first(example.upstream),  example, first(example.downstream) )

81


NameError: name 'split_one_group' is not defined

In [25]:
def split_groups(all_nodes):
    """This is called crossmerge in the R code"""
    n = 0
    number_of_windows = len(first(simplified_individuals))
    while n < len(all_nodes):  # size of global_nodes changes, necessitating this weird loop
        node = all_nodes[n]
        #check if all transitition upstream match with one of my downstream nodes
        if set(node.upstream.values()) == set(node.downstream.values()):
            if node.start.snp != 0 and node.end.window != number_of_windows: #chr begin or end
                if len(node.specimens) > 0:
                    # Matchup upstream and downstream with specimen identities
                    for up in node.upstream:
                        for down in node.downstream:
                            if up.specimens == down.specimens:
                                new_node = split_one_group(up, node, down)
                    if not node.specimens:  
                        # all content has been moved
                        all_nodes.remove(node)
                        n -= 1
        n += 1
    return all_nodes
    

def test_split_groups(all_nodes):
    summary3 = split_groups(all_nodes)
    assert summary3
    return summary3
summary3 = test_split_groups(summary2)
    

### TODO: Test if split_groups() is working now

In [54]:
len(example)

NameError: name 'example' is not defined

In [27]:
print(len(example), type(example))
example

0 <class '__main__.Node'>


N3(5, 6)

In [28]:
len(example.specimens)

0

In [29]:
x.upstream.__repr__()

'defaultdict(<function Node.__init__.<locals>.<lambda> at 0x000001D9A869B730>, {})'

In [30]:
x.downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N3(8, 8): 81})

In [31]:
len(x)

81

In [32]:
first(example.upstream).downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N3(5, 6): 81, N-1(None, None): 1})

In [33]:
unique_signatures[1]

{(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2): N0(0, 1),
 (2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0): N1(1, 1),
 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(0, 1)}

In [34]:
u = first(example.upstream)
u

N2(0, 4)

In [35]:
set().difference_update({})

In [66]:
summary1[4].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N1(6, 6): 155, N2(6, 6): 158, N-1(None, None): 2})