In [25]:
import numpy as np
from pandas import DataFrame, read_csv
# read_csv()
import os

BLOCK_SIZE = 20

In [79]:
from collections import namedtuple, defaultdict


# Point = namedtuple('Point',['window', 'snp', 'bp'])

class Point:
    def __init__(self, snp, bp=0):
        self.snp, self.bp = snp, bp
    
    @property
    def window(self):
        return self.snp // BLOCK_SIZE

class Node:
    def __init__(self, id, start, end):
        self.id = id
        self.start = start #Point()
        self.end = end #Point()
        self.upstream = defaultdict(lambda: 0)  # {nothing_node:501, Node: 38,  Node: 201, Node: 3}
        self.downstream = defaultdict(lambda: 0) # {Node: 38,  Node: 201, Node: 3}
        self.specimens = []
    
    def __len__(self):
        return len(self.specimens)
    
    def __repr__(self):
        return "N%i(%i, %i)" % (self.id, self.start.snp, self.end.snp)

a = Point(0)
b = Point(14)
str(Node(57, a, b))
nothing_node = Node(-1, Point(None), Point(None))
global_nodes = {0: nothing_node}


In [59]:
def read_data(file_path = "../test_data/KE_chromo10.txt"):
    """Individuals are rows, not columns"""
    loci = []
    with open(file_path) as ke:
        for line in ke.readlines():
            loci.append(tuple(int(x) for x in line.split()))
            
    
    individuals = np.array(loci).T.tolist()
    return loci, individuals
alleles, individuals = read_data()
assert len(alleles) == 32767
assert len(individuals[1]) == 32767
assert len(individuals) == 501

In [80]:
def signature(individual, start_locus):
    return tuple(individual[start_locus : start_locus + BLOCK_SIZE])

def get_unique_signatures(individuals, start_locus, block_size = 20):
    unique_blocks = {}
    for individual in individuals:
        sig = signature(individual, start_locus)
        if sig not in unique_blocks:
            unique_blocks[sig] = Node(len(unique_blocks), Point(start_locus // block_size, start_locus), 
                                      Point(start_locus // block_size, start_locus + BLOCK_SIZE)) #TODO: -1?
    
    return unique_blocks
unique_blocks = get_unique_signatures(individuals, 0 )
    
assert len(unique_blocks) == 4
unique_blocks
# assert unique_blocks == {(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): 0,
#  (0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): 1,
#  (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): 2,
#  (2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): 3}

{(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): N0(0, 0),
 (0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N1(0, 0),
 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(0, 0),
 (2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N3(0, 0)}

In [81]:
def get_all_signatures(alleles, individuals):
    unique_signatures = []
    for locus_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE):  # discards remainder 
        sig = get_unique_signatures(individuals, locus_start, BLOCK_SIZE)
        unique_signatures.append(sig)
    return unique_signatures
unique_signatures = get_all_signatures(alleles, individuals)

In [82]:
unique_signatures[21]

{(0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N0(21, 21),
 (0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2): N1(21, 21),
 (0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(21, 21),
 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N3(21, 21),
 (0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N4(21, 21),
 (0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2): N5(21, 21),
 (0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N6(21, 21),
 (0, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2): N7(21, 21)}

In [83]:
simplified_individuals = []
for i_specimen, specimen in enumerate(individuals):
    my_simplification = []
    for w, window in enumerate(unique_signatures):  # the length of the genome
        sig = signature(specimen, w * BLOCK_SIZE)
#         print(sig, unique_signatures[w][sig])
#         print(i_specimen, window)
        my_simplification.append(unique_signatures[w][sig])
    simplified_individuals.append(my_simplification)
print(simplified_individuals[500][:100])
len(simplified_individuals), len(simplified_individuals[60])

[N2(0, 0), N2(1, 1), N2(2, 2), N2(3, 3), N2(4, 4), N2(5, 5), N3(6, 6), N3(7, 7), N3(8, 8), N2(9, 9), N0(10, 10), N1(11, 11), N2(12, 12), N2(13, 13), N2(14, 14), N2(15, 15), N3(16, 16), N3(17, 17), N4(18, 18), N3(19, 19), N5(20, 20), N3(21, 21), N3(22, 22), N10(23, 23), N4(24, 24), N3(25, 25), N4(26, 26), N3(27, 27), N1(28, 28), N1(29, 29), N4(30, 30), N3(31, 31), N21(32, 32), N1(33, 33), N1(34, 34), N1(35, 35), N1(36, 36), N1(37, 37), N1(38, 38), N1(39, 39), N1(40, 40), N1(41, 41), N1(42, 42), N1(43, 43), N1(44, 44), N1(45, 45), N1(46, 46), N1(47, 47), N1(48, 48), N1(49, 49), N1(50, 50), N1(51, 51), N1(52, 52), N1(53, 53), N1(54, 54), N1(55, 55), N1(56, 56), N1(57, 57), N1(58, 58), N1(59, 59), N1(60, 60), N1(61, 61), N1(62, 62), N1(63, 63), N1(64, 64), N1(65, 65), N1(66, 66), N1(67, 67), N1(68, 68), N1(69, 69), N1(70, 70), N1(71, 71), N1(72, 72), N1(73, 73), N1(74, 74), N1(75, 75), N1(76, 76), N1(77, 77), N0(78, 78), N0(79, 79), N1(80, 80), N1(81, 81), N1(82, 82), N1(83, 83), N1(84, 84

(501, 1638)

# Nodes: Populate upstream and downstream

In [84]:
# build nodes:  first 4 are the 4 starting signatures in window 0.  
# For each node list which individuals are present at that node
# List transition rates from one node to all other upstream and downstream

for i, indiv in enumerate(simplified_individuals):
    # look what variants are present
    for x, node in enumerate(indiv):
        if x + 1 < len(indiv):
            node.downstream[indiv[x+1]] += 1
        else:
            node.downstream[nothing_node] += 1
        if x-1 >= 0:
            node.upstream[indiv[x-1]] += 1
        else: 
            node.upstream[nothing_node] += 1

#### TODO: turn these into tests

In [86]:
simplified_individuals[50][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N1(1, 1): 286})

In [87]:
simplified_individuals[49][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N0(1, 1): 103})

In [88]:
simplified_individuals[500][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N2(1, 1): 82})

In [91]:
simplified_individuals[91][0].downstream

defaultdict(<function __main__.Node.__init__.<locals>.<lambda>()>,
            {N1(1, 1): 30})

In [96]:
[x.downstream.values() for x in unique_signatures[1000].values()]

[dict_values([299]), dict_values([120]), dict_values([82])]

In [97]:
[x.upstream.values() for x in unique_signatures[1000].values()]

[dict_values([102, 197]), dict_values([120]), dict_values([82])]

---------------

In [42]:
unique_signatures[0]

{(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): 0,
 (0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): 1,
 (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): 2,
 (2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): 3}