In [1]:
import argparse
from ast import arg
import networkx as nx
import scipy.sparse as sp
import numpy as np
from Bio.Seq import Seq
from Bio import SeqIO, Align
#import MultiAlignment_func_python
import math 
import copy

def writeSequence(f, sequence):
    """Write ``sequence`` to ``f`` wrapped at 100 characters per line.

    Every chunk (including a shorter final one) is followed by a newline.
    An empty sequence writes nothing.

    f        -- object exposing ``write``.
    sequence -- sliceable value that can be concatenated with a newline
                string (e.g. ``str``).
    """
    width = 100
    total = len(sequence)
    # Step through the sequence one fixed-width window at a time; the last
    # window is simply shorter when ``total`` is not a multiple of ``width``.
    for offset in range(0, total, width):
        f.write(sequence[offset:offset + width] + '\n')

def ParseRecord(record):
    """Build oriented read names for the two reads of one overlap record.

    ``record`` is a sequence of fields: the first two are the read IDs and
    the fields from index 4 onward must be exactly eight integers
    (orientation, start, end, length for read A, then the same for read B).
    Only the two orientation flags are used here: 0 maps to the ``_F``
    suffix and any other value to ``_R``.

    Returns the tuple ``(A_name, B_name)``.
    """
    id_a, id_b = [str(field) for field in record[0:2]]
    # Unpacking all eight numeric fields keeps the same validation as a full
    # parse: a wrong field count or a non-integer field raises ValueError.
    (orient_a, _start_a, _end_a, _len_a,
     orient_b, _start_b, _end_b, _len_b) = [int(field) for field in record[4:]]
    suffix_a = '_F' if orient_a == 0 else '_R'
    suffix_b = '_F' if orient_b == 0 else '_R'
    return (id_a + suffix_a, id_b + suffix_b)


In [2]:

def setupRegressionModelFromMHAP(reads, edges=None):
    """Build the sparse regression system relating read start coordinates.

    Every overlap stored for a pair of reads in ``reads`` contributes one
    equation ``coord[B] - coord[A] = offset``, where A and B are the first
    and second read of the overlap key ``"A-B"`` and ``offset`` is computed
    by ``_overlap_offset``.

    Parameters
    ----------
    reads : list of str
        Oriented read names; position ``k`` in this list is column ``k`` of
        the design matrix.
    edges : dict, optional
        Maps ``"A_name-B_name"`` to a list of overlap dictionaries with the
        keys ``A_Orientation``, ``A_start``, ``A_end``, ``A_length`` and the
        matching ``B_*`` keys. Defaults to the notebook-level ``edge_dict``
        so existing one-argument calls keep working.

    Returns
    -------
    (design, response)
        ``design`` is a CSR matrix with one row per overlap. Each row stores
        the column of read B (value 1) first and the column of read A
        (value -1) second. ``response`` is an ``np.matrix`` column vector of
        the offsets, in the same row order.
    """
    if edges is None:
        # Backward-compatible default: fall back to the notebook global.
        edges = edge_dict

    indptr = [0]
    indices = []
    data = []
    response = []

    def add_rows(key, plus_col, minus_col):
        # One design row and one response value per overlap, always appended
        # together so the two can never get out of step.
        for overlap in edges.get(key, ()):
            indices.extend((plus_col, minus_col))
            data.extend((1, -1))
            indptr.append(indptr[-1] + 2)
            response.append(_overlap_offset(overlap))

    size = len(reads)
    for i in range(size - 1):
        A = reads[i]
        for j in range(i + 1, size):
            B = reads[j]
            # Overlaps recorded as A-B: read B (column j) gets +1.
            add_rows(A + "-" + B, j, i)
            # Overlaps recorded as B-A: the roles (and signs) are swapped.
            add_rows(B + "-" + A, i, j)

    design = sp.csr_matrix((data, indices, indptr), shape=(len(response), size))
    response = np.matrix(response).T

    return(design, response)


def _overlap_offset(overlap):
    """Return the start offset of read A minus that of read B for one overlap.

    For orientation 0 the offset of a read is its ``start``; for any other
    orientation value it is ``length - end``, i.e. the start measured on the
    opposite strand.
    """
    if overlap['A_Orientation'] == 0:
        a_offset = overlap['A_start']
    else:
        a_offset = overlap['A_length'] - overlap['A_end']
    if overlap['B_Orientation'] == 0:
        b_offset = overlap['B_start']
    else:
        b_offset = overlap['B_length'] - overlap['B_end']
    return int(a_offset - b_offset)

def deleteRowsCsr(mat, indices):
    """Return ``mat`` with the rows listed in ``indices`` removed.

    ``mat`` is indexed with a boolean row mask, so the input matrix itself
    is not modified. ``indices`` may be any iterable of row positions.
    """
    rows_to_drop = list(indices)
    keep_row = np.full(mat.shape[0], True, dtype=bool)
    keep_row[rows_to_drop] = False
    return mat[keep_row]

#####Huber M-estimate of reads coordinates 
def IRLS(X,Y,reads,thr1=4,thr2=20):
    """Estimate read coordinates with a Huber-weighted least-squares fit.

    Starting from the ordinary least-squares solution of ``X * beta ~= Y``,
    rows whose absolute residual exceeds ``thr1`` are down-weighted by
    ``thr1 / residual`` and the weighted system is re-solved until the
    largest change of any estimate is below 1 (or 1000 rounds have run).
    Rows whose residual then exceeds ``thr2`` are reported as outliers and
    removed, the remaining rows are refitted without weights, and the reads
    are split into the connected components formed by the remaining rows.

    Note: another definition of ``IRLS`` appears later in this notebook and
    replaces this one when that cell is executed afterwards.

    Parameters
    ----------
    X : scipy.sparse CSR matrix
        Design matrix. Row ``i`` is expected to store exactly two column
        indices, at ``X.indices[2*i]`` and ``X.indices[2*i+1]`` (the layout
        built by ``setupRegressionModelFromMHAP``).
    Y : column vector (rows x 1)
        Response value for every row of ``X``.
    reads : list
        Read names; ``reads[k]`` belongs to column ``k`` of ``X``.
    thr1 : number
        Residual threshold of the Huber weight function.
    thr2 : number
        Residual threshold above which a row is treated as an outlier.

    Returns
    -------
    (estimates_list, reads_list, outlier_overlap)
        ``estimates_list[c]`` holds the rounded integer estimates and
        ``reads_list[c]`` the read names of connected component ``c``.
        ``outlier_overlap`` is a list of ``(read, read, residual)`` tuples,
        with the reads in the order their columns are stored in the row.
        Reads left without any non-outlier row appear in no component.
    """
    if Y.shape[0]==0:
        # Same 3-tuple shape as the normal return so callers can always
        # unpack three values.
        return([], [], [])

    def solve(design, target, weights=None):
        # Solve the (optionally weighted) normal equations X'WX beta = X'Wy.
        design_t = design.T
        if weights is not None:
            design_t = design_t.dot(sp.diags(weights))
        normal = design_t.dot(design)
        rhs = design_t.dot(sp.csr_matrix(target)).todense()
        solution, _exit_code = sp.linalg.lgmres(normal, rhs, atol=1e-05)
        return solution

    def abs_residual(design, target, solution):
        # |X beta - y| as a flat array with one entry per row of X.
        fitted = design.dot(sp.csr_matrix(solution).T)
        return abs((fitted - sp.csr_matrix(target)).todense()).T.getA()[0]

    estimate = solve(X, Y)  # ordinary least-squares estimate
    residual = abs_residual(X, Y, estimate)
    max_residual = max(residual)
    print("Initial max_residual:", round(max_residual,5),"Regression Size:",Y.shape[0])
    # Threshold of Huber's weight function, one entry per row.
    threshold = np.ones(len(residual))*thr1
    old_estimate = estimate
    n = 0

    while n<1000:
        index = np.where(residual>threshold)[0]
        reweight = np.ones(len(residual))
        # Down-weight alignments with residuals greater than thr1.
        reweight[index] = threshold[index]/residual[index]
        estimate = solve(X, Y, reweight)  # weighted least-squares estimate
        residual = abs_residual(X, Y, estimate)
        max_residual = max(residual)
        diff = max(abs(estimate-old_estimate))
        if diff<1:  # convergence condition of estimates
            break
        old_estimate = estimate
        n += 1
    print("IRLS Stop at the %d rounds with max diff %d, max residual %f"%(n,diff,round(max_residual,5)))

    # Report alignments with residuals greater than thr2 ...
    outlier_overlap = []
    for i in range(len(residual)):
        if residual[i]>thr2:
            n1 = X.indices[2*i]
            n2 = X.indices[2*i+1]
            outlier_overlap.append((reads[n1],reads[n2],residual[i]))

    # ... then remove them and refit on the remaining rows without weights.
    index = np.where(residual>thr2)[0]
    X = deleteRowsCsr(X,index)
    Y = np.delete(Y,index,0)
    estimate = solve(X, Y)

    # Link the two reads of every remaining alignment.
    G = nx.Graph()
    for i in range(X.shape[0]):
        n1 = X.indices[2*i]
        n2 = X.indices[2*i+1]
        G.add_edge(n1,n2)

    # Divide reads into separate connected components.
    reads_list = []
    estimates_list = []
    for c in nx.connected_components(G):
        sub_index = list(G.subgraph(c).nodes)
        sub_estimates = [estimate[i] for i in sub_index]
        sub_reads = [reads[i] for i in sub_index]
        estimates_list.append(list(map(int,np.round(sub_estimates))))
        reads_list.append(sub_reads)

    return(estimates_list, reads_list, outlier_overlap)


In [3]:

#####Least-squares estimate of reads coordinates (the Huber reweighting loop is disabled in this definition)
def IRLS(X,Y,reads,thr1=4,thr2=20):
    """Estimate read coordinates by least squares and split off outliers.

    Solves the normal equations of ``X * beta ~= Y`` once (ordinary least
    squares). Rows whose absolute residual exceeds ``thr2`` are reported as
    outliers; the remaining rows define a graph whose connected components
    partition the reads. The Huber reweighting iterations are disabled in
    this definition, so ``thr1`` is accepted only to keep the signature
    unchanged and has no effect on the result.

    Parameters
    ----------
    X : scipy.sparse CSR matrix
        Design matrix. Row ``i`` is expected to store exactly two column
        indices, at ``X.indices[2*i]`` and ``X.indices[2*i+1]`` (the layout
        built by ``setupRegressionModelFromMHAP``).
    Y : column vector (rows x 1)
        Response value for every row of ``X``.
    reads : list
        Read names; ``reads[k]`` belongs to column ``k`` of ``X``.
    thr1 : number
        Unused (see above).
    thr2 : number
        Residual threshold above which a row is treated as an outlier.

    Returns
    -------
    (estimates_list, reads_list, outlier_overlap)
        ``estimates_list[c]`` holds the rounded integer estimates and
        ``reads_list[c]`` the read names of connected component ``c``.
        ``outlier_overlap`` is a list of ``(read, read, residual)`` tuples,
        with the reads in the order their columns are stored in the row.
        Reads whose rows are all outliers appear in no component.
    """
    if Y.shape[0]==0:
        # Same 3-tuple shape as the normal return so callers can always
        # unpack three values.
        return([], [], [])

    t=X.T
    A=t.dot(X)
    y=sp.csr_matrix(Y)
    b=t.dot(y).todense()
    estimate, exitCode = sp.linalg.lgmres(A, b, atol=1e-05) # ordinary least-squares estimate
    # Absolute residual of every row as a flat array.
    residual=abs((X.dot(sp.csr_matrix(estimate).T)-y).todense()).T.getA()[0]
    max_residual=max(residual)
    print("Initial max_residual:", round(max_residual,5),"Regression Size:",Y.shape[0])

    # Rows with residuals greater than thr2 are reported as outliers; every
    # other row links its two reads in the component graph.
    G = nx.Graph()
    outlier_overlap=[]
    for i in range(len(residual)):
        n1=X.indices[2*i]
        n2=X.indices[2*i+1]
        if residual[i]>thr2:
            outlier_overlap.append((reads[n1],reads[n2],residual[i]))
        else:
            G.add_edge(n1,n2)

    # Divide reads into separate connected components.
    reads_list=[]
    estimates_list=[]
    for c in nx.connected_components(G):
        sub_index=list(G.subgraph(c).nodes)
        sub_estimates=[estimate[i] for i in sub_index]
        sub_reads=[reads[i] for i in sub_index]
        estimates_list.append(list(map(int,np.round(sub_estimates))))
        reads_list.append(sub_reads)

    return(estimates_list, reads_list, outlier_overlap)


In [4]:
def importLongReads(filename):
    """Load a FASTA file and index every read in both orientations.

    Records are numbered from 1 in file order. Record ``k`` is stored under
    ``"k_F"`` (upper-cased sequence) and ``"k_R"`` (upper-cased reverse
    complement).

    Returns ``(ReadSeq, ReadInd)``: the name -> sequence dictionary and the
    list of names in insertion order (``_F`` before ``_R`` for each record).
    """
    ReadSeq = {}
    ReadInd = []
    for sort_id, seq_record in enumerate(SeqIO.parse(filename, 'fasta'), start=1):
        forward_name = str(sort_id) + "_F"
        reverse_name = str(sort_id) + "_R"
        oriented = (
            (forward_name, str(seq_record.seq).upper()),
            (reverse_name, str(Seq(seq_record.seq).reverse_complement()).upper()),
        )
        for name, bases in oriented:
            ReadSeq[name] = bases
            ReadInd.append(name)
    return(ReadSeq,ReadInd)

In [None]:
# Command used for the all-vs-all self-alignment with MHAP: https://mhap.readthedocs.io/en/latest/index.html#minhash-alignment-process-mhap-a-probabilistic-sequence-overlap-algorithm
# The output file name is case-sensitive and must match the file this notebook reads (MHAP_HIFI_2k.out).
#java -Xmx128g -server -jar /home/limengtian/pkgs/mhap-2.1.1.jar -s ./2k-reads/SRR_HIFI_2k.fasta > ./2k-reads/MHAP_HIFI_2k.out 2> MHAP.log

In [5]:
# Load the reads (both orientations) and the MHAP overlaps between them.
ReadSeq,ReadInd = importLongReads('./2k-reads/SRR_HIFI_2k.fasta')

# Local pairwise aligner configuration (not used within this cell).
aligner = Align.PairwiseAligner(mode = 'local', match_score=1, mismatch_score=-2, open_gap_score=-2, extend_gap_score=-1)

# An overlap is kept only if, at each end of the alignment, at least one of
# the two reads has fewer than HangingOut unaligned bases.
HangingOut = 50
G = nx.Graph()      # reads as nodes, accepted overlaps as edges
edge_dict = {}      # "A_name-B_name" -> list of accepted overlap records
with open("./2k-reads/MHAP_HIFI_2k.out",'r') as fin:
    for line in fin:
        record = line.strip().split()
        if not record:
            # Tolerate blank lines instead of failing on the unpacking below.
            continue
        (A_ID,B_ID) = list(map(str,record[0:2]))
        (A_Orientation, A_start, A_end, A_length,B_Orientation, B_start, B_end, B_length) = list(map(int,record[4:]))
        A_name,B_name = ParseRecord(record)
        overlap={'A_name':A_name,'B_name':B_name,'A_Orientation':A_Orientation,'A_start':A_start,'A_end':A_end,'A_length':A_length,'B_Orientation':B_Orientation,'B_start':B_start,'B_end':B_end,'B_length':B_length}
        key = A_name+"-"+B_name
        # Unaligned overhang before and after the alignment; for opposite
        # orientations read B's coordinates are measured from its other end.
        if A_Orientation==B_Orientation:
            head_overhang = min(A_start,B_start)
            tail_overhang = min(A_length-A_end,B_length-B_end)
        else:
            head_overhang = min(A_start,B_length-B_end)
            tail_overhang = min(A_length-A_end,B_start)
        if head_overhang<HangingOut and tail_overhang<HangingOut:
            G.add_edge(A_name,B_name)
            edge_dict.setdefault(key, []).append(overlap)
# The ``with`` block has already closed the file; no explicit close needed.

In [6]:
# Fit read coordinates separately for every connected component of the
# overlap graph G and collect the per-component results.
consensus_list = []  # not filled in this cell
contig_num = 1       # not used in this cell

Reads_List_1 = []     # one entry per component: IRLS reads_list
Estimate_List_1 = []  # one entry per component: IRLS estimates_list
Outliers_1 = []       # one entry per component: IRLS outlier tuples
for subG in nx.connected_components(G):
    sub_reads = list(G.subgraph(subG).nodes())
    print("SubG.size",len(sub_reads))
    # Column k of the design matrix corresponds to sub_reads[k].
    design, response = setupRegressionModelFromMHAP(sub_reads)
    # Uses whichever IRLS definition was executed last in the kernel.
    estimates_list, reads_list, outliers=IRLS(design, response, sub_reads)            
    Estimate_List_1.append(estimates_list)
    Reads_List_1.append(reads_list)
    Outliers_1.append(outliers)


SubG.size 3313
Initial max_residual: 55100.26038 Regression Size: 11147
SubG.size 20
Initial max_residual: 18.70056 Regression Size: 47
SubG.size 98
Initial max_residual: 85.29575 Regression Size: 359
SubG.size 69
Initial max_residual: 29.66952 Regression Size: 246
SubG.size 50
Initial max_residual: 26.92271 Regression Size: 170
SubG.size 2
Initial max_residual: 0.0 Regression Size: 1
SubG.size 8
Initial max_residual: 2.33333 Regression Size: 9
SubG.size 18
Initial max_residual: 20.30935 Regression Size: 32
SubG.size 12
Initial max_residual: 8.488 Regression Size: 23
SubG.size 5
Initial max_residual: 6.6 Regression Size: 9
SubG.size 38
Initial max_residual: 12.96058 Regression Size: 87
SubG.size 2
Initial max_residual: 0.0 Regression Size: 1
SubG.size 6
Initial max_residual: 11.675 Regression Size: 9


In [7]:
# Number of outlier overlaps reported for each connected component.
print([len(o) for o in Outliers_1])
# Inspect component k of the overlap graph.
k=0
estimates_list = Estimate_List_1[k]
reads_list = Reads_List_1[k]
outliers = Outliers_1[k]
for i in range(len(estimates_list)):
    estimates = estimates_list[i]
    reads=reads_list[i]
    starts=np.array(estimates)
    # Print the reads ordered by their estimated coordinate.
    ind=starts.argsort()
    for r_i in ind:
        print(reads[r_i],estimates[r_i])
    # Only the first sub-component is printed.
    break

[8037, 0, 14, 3, 7, 0, 0, 1, 0, 0, 0, 0, 0]
640_F -32489
869_R -25785
1808_F -25742
2000_F -21080
1655_R -20243
1378_R -18026
1044_F -17041
202_R -16950
921_F -15756
498_R -14835
1403_R -10641
736_R -5083
1887_F -3351
1063_R -2689
1072_R -1778
612_R 2720
95_R 6268
127_R 8439
1565_R 10017


In [8]:
# List every outlier tuple (read, read, residual) of the first component.
for o in Outliers_1[0]:
    print(o)

('893_F', '1977_F', 1543.3509900507051)
('351_R', '1977_F', 1768.0610537367465)
('389_R', '1977_F', 1649.609899147581)
('292_R', '1977_F', 1768.4124909595266)
('640_F', '1977_F', 1792.887709205912)
('1283_F', '1977_F', 5435.441009609625)
('893_F', '1283_F', 3891.09001955892)
('209_R', '893_F', 29079.99724843043)
('432_R', '893_F', 9293.407698965333)
('266_F', '893_F', 9950.000873583398)
('489_R', '893_F', 3017.625468619539)
('662_F', '893_F', 10506.396473263208)
('351_R', '640_F', 26.826655469165416)
('351_R', '1390_F', 336.6786263545291)
('351_R', '357_F', 318.92378742485744)
('351_R', '1342_F', 322.78965716830135)
('351_R', '743_F', 241.38891670884914)
('351_R', '1808_F', 43.2906528041749)
('351_R', '1484_F', 105.77618757162054)
('351_R', '1715_F', 343.63080061871733)
('351_R', '2000_F', 28.865519730697997)
('389_R', '640_F', 151.27781005833094)
('389_R', '1390_F', 461.12978094369464)
('389_R', '1808_F', 172.74180739334042)
('389_R', '1484_F', 234.22734216078607)
('389_R', '1715_F', 

In [67]:
# NOTE(review): `Outliers` is not assigned anywhere in the visible notebook
# (only `Outliers_1` is); this cell presumably relies on state from an
# earlier, since-removed run -- confirm before re-running.
print([len(o) for o in Outliers])

[889, 1, 12, 4, 14, 0, 0, 2, 0, 0, 1, 0, 0]


In [71]:
# NOTE(review): `Estimate_List`, `Reads_List` and `Outliers` are not assigned
# anywhere in the visible notebook (only the `_1` variants are) -- confirm
# where they come from before re-running.
# Inspect component k.
k=0
estimates_list = Estimate_List[k]
reads_list = Reads_List[k]
outliers = Outliers[k]
for i in range(len(estimates_list)):
    estimates = estimates_list[i]
    reads=reads_list[i]
    starts=np.array(estimates)
    # Print the reads ordered by their estimated coordinate.
    ind=starts.argsort()
    for r_i in ind:
        print(reads[r_i],estimates[r_i])
    # Only the first sub-component is printed.
    break

1164_R -38869
319_R -36597
1567_F -28518
458_F -27472
555_F -27318
1271_R -26963
893_F -26675
1283_F -25486
1370_R -25266
1266_F -23070
1448_F -20843
266_F -20293
743_F -19648
88_R -19129
1529_R -18977
1715_F -18696
432_R -16959
1977_F -16362
888_R -14610
1390_F -13999
662_F -13787
1342_F -12461
286_R -12091
1074_R -11391
357_F -11181
1926_F -11122
398_R -11057
74_F -10164
71_R -8312
1221_F -7587
1520_F -6706
605_R -5807
351_R -5449
292_R -4423
389_R -3309
640_F -2684
1499_R -1324
569_R -442
1075_R 657
1484_F 1900
361_F 3388
869_R 3980
1808_F 4038
764_R 4315
469_F 4422
1825_F 7374
2000_F 8710
1655_R 9543
866_R 9844
430_R 9862
1378_R 11738
1044_F 12726
202_R 12822
921_F 14012
498_R 14931
996_F 16935
1403_R 19140
1437_F 19250
1179_R 20591
1974_F 21379
1936_R 22051
736_R 24687
1738_F 25415
1887_F 26421
1063_R 27083
1072_R 27994
1670_R 32352
612_R 32492
95_R 36040
1180_R 36210
127_R 38211
1565_R 39789


In [62]:
# Dump the outlier tuples (read, read, residual) currently bound to `outliers`.
print(outliers)

[('209_R', '893_F', 62496.79050033321), ('489_R', '893_F', 50213.52985679412), ('432_R', '1283_F', 232.73187824949855), ('489_R', '1283_F', 50212.96259543807), ('1030_R', '1283_F', 50223.24925867253), ('652_R', '750_F', 109871.56761654917), ('750_F', '865_F', 109873.23757778635), ('205_F', '1918_F', 39113.89156332144), ('205_F', '865_F', 109876.36682618939), ('143_R', '282_F', 191.55475978193135), ('282_F', '1699_F', 173.90815374589147), ('182_F', '1082_F', 190.00963876190508), ('910_R', '1822_F', 43.31425670629869), ('514_R', '1112_F', 107036.93794413083), ('514_R', '999_F', 204.4818073203951), ('893_R', '1536_F', 47815.24908579425), ('171_F', '1536_F', 22.5254856477477), ('439_R', '1536_F', 3899.651865720276), ('317_F', '1536_F', 60164.97223921094), ('893_R', '1030_F', 61621.44437053288), ('171_F', '235_F', 22.760357621416915), ('171_F', '279_F', 23.718358490536048), ('594_R', '1131_F', 86519.26274847989), ('193_R', '1131_F', 86432.25844933209), ('307_R', '1131_F', 86512.2691864187),

In [64]:
# Show only the outliers whose residual (third tuple field) exceeds 10000.
for o in outliers:
    if o[2]>10000:
        print(o)

('209_R', '893_F', 62496.79050033321)
('489_R', '893_F', 50213.52985679412)
('489_R', '1283_F', 50212.96259543807)
('1030_R', '1283_F', 50223.24925867253)
('652_R', '750_F', 109871.56761654917)
('750_F', '865_F', 109873.23757778635)
('205_F', '1918_F', 39113.89156332144)
('205_F', '865_F', 109876.36682618939)
('514_R', '1112_F', 107036.93794413083)
('893_R', '1536_F', 47815.24908579425)
('317_F', '1536_F', 60164.97223921094)
('893_R', '1030_F', 61621.44437053288)
('594_R', '1131_F', 86519.26274847989)
('193_R', '1131_F', 86432.25844933209)
('307_R', '1131_F', 86512.2691864187)
('439_R', '1939_F', 43914.68197161444)
('317_F', '1939_F', 12349.63840187622)
('241_F', '807_F', 26696.24662328972)
('241_F', '1557_F', 26704.265583575623)
('635_R', '648_F', 57149.661499375856)
('395_R', '1578_F', 44053.22736640957)
('395_R', '971_F', 44068.779881594994)
('288_R', '431_F', 35954.3375722975)
('807_R', '1301_F', 25822.94892396499)
('706_R', '948_F', 21370.874847324507)
('632_F', '1814_F', 26346.51

In [None]:

# Walk over every component and sub-component. The loop only rebinds
# `estimates` and `reads`; nothing is done with them inside the loop.
# NOTE(review): `Estimate_List` / `Reads_List` are not assigned anywhere in
# the visible notebook -- confirm where they come from.
for k in range(len(Estimate_List)):
    estimates_list = Estimate_List[k]
    reads_list = Reads_List[k]
    for i in range(len(estimates_list)):
        estimates = estimates_list[i]
        reads=reads_list[i]

In [58]:
# Alternative hard-coded input (disabled):
#reads = ['272_R', '913_F', '198_R', '495_F', '465_R', '407_F', '682_F', '1955_F', '940_F', '678_R', '137_F', '1451_R', '1651_R', '1057_R', '1767_R', '93_F', '47_F', '11_R', '246_F', '420_F', '1175_R', '142_F', '1235_F'] 
#estimates = [-26415, -21060, -15896, -14614, -29513, -16680, -17984, 10157, -1612, 1276, 7732, 2666, -1176, 24782, 22335, -2363, 14738, 20051, 22453, 14204, 4968, 17695, -15743]
# Hard-coded read names and their estimated coordinates (parallel lists).
reads = ['140_F', '43_R', '1666_F', '824_R', '1563_F', '413_R', '1651_F', '1451_F', '1235_R', '1175_F', '137_R', '678_F', '940_R', '47_R', '420_R', '142_R', '93_R', '1873_F', '1767_F', '1057_F', '528_F', '788_R', '11_F', '246_R']
estimates = [-17469, -31424, -33231, -22782, -24149, -22108, 24049, 17992, 35356, 17411, 12891, 20178, 24281, 6458, 4716, 4796, 23707, -7500, 697, -5028, -13600, -15760, 1944, -1425]
# NOTE(review): `generateConsensus_1` and `consensus_dict` are not defined
# anywhere in the visible notebook, and the meaning of the last argument (3)
# cannot be determined from here -- confirm against their definitions.
consensus_l = generateConsensus_1(reads,estimates,consensus_dict,3)

Sorted reads: ['1666_F', '43_R', '1563_F', '413_R', '824_R', '788_R', '140_F', '528_F', '1873_F', '1057_F', '246_R', '1767_F', '11_F', '142_R', '47_R', '420_R', '137_R', '1175_F', '1451_F', '678_F', '1651_F', '940_R', '93_R', '1235_R']
[13817, 16901, 22126, 23884, 24331, 33327, 33544, 34106, 39149, 44223, 46545, 46667, 48972, 51314, 54264, 54803, 61284, 64104, 66340, 67724, 70172, 70623, 71378, 84727]
Start Read: 1666_F 13817 13817
Read: 43_R 16901 15094 total: 1
Current consensus length: 13817 1
Align read to query_backup: 11232 [1686, 13816, 1, 11925]
diff: 3168 estimate diff: 3084
Align read to consensus: 11092 [1, 11975, 153, 11925]
Align using aligner: (((0, 163), (164, 181), (181, 201), (202, 206), (207, 226), (228, 259), (260, 411), (412, 467), (468, 529), (530, 586), (587, 595), (596, 605), (606, 652), (653, 698), (699, 709), (710, 736), (737, 743), (744, 768), (769, 806), (807, 819), (820, 858), (859, 885), (886, 902), (903, 907), (908, 922), (923, 959), (960, 969), (970, 1018

Align read to query_backup: 10869 [2050, 13044, 1, 10986]
diff: 1775 estimate diff: 1758
Align read to consensus: 10707 [1, 11042, 3, 10986]
Align using aligner: (((0, 30), (31, 60), (61, 82), (84, 133), (134, 146), (147, 178), (179, 195), (195, 216), (217, 325), (326, 431), (432, 451), (452, 479), (480, 548), (549, 579), (580, 617), (618, 735), (736, 926), (927, 970), (970, 991), (992, 1002), (1003, 1150), (1151, 1167), (1168, 1198), (1199, 1287), (1288, 1318), (1319, 1341), (1342, 1421), (1422, 1496), (1498, 1619), (1620, 1653), (1654, 1716), (1717, 1740), (1741, 1770), (1771, 1845), (1846, 1919), (1920, 1990), (1991, 2001), (2002, 2089), (2090, 2113), (2114, 2263), (2264, 2289), (2290, 2305), (2306, 2506), (2507, 2535), (2536, 2552), (2553, 2635), (2636, 2700), (2701, 3013), (3014, 3081), (3081, 3082), (3083, 3112), (3113, 3160), (3161, 3183), (3184, 3390), (3391, 3456), (3457, 3460), (3460, 3542), (3542, 3545), (3546, 3564), (3565, 3569), (3569, 4137), (4137, 4194), (4195, 4794), (

Align read to query_backup: 9119 [6667, 15856, 4509, 13696]
Read: 1873_F 39149 13418 total: 1
Current consensus length: 15856 1
Align read to query_backup: 7538 [8268, 15856, 1, 7592]
diff: 5826 estimate diff: 5822
Align read to consensus: 7538 [54, 7642, 1, 7592]
Align using aligner: (((0, 473), (473, 1173), (1174, 1359), (1360, 2659), (2659, 2847), (2848, 2951), (2951, 4832), (4833, 4934), (4934, 5156), (5156, 5533), (5534, 5770), (5770, 5926), (5927, 6435), (6435, 6952), (6952, 7285), (7285, 7589)), ((0, 473), (474, 1174), (1174, 1359), (1359, 2658), (2659, 2847), (2847, 2950), (2951, 4832), (4832, 4933), (4934, 5156), (5157, 5534), (5534, 5770), (5771, 5927), (5927, 6435), (6436, 6953), (6954, 7287), (7288, 7592)))
Read: 1057_F 44223 16020 total: 2
Current consensus length: 21691 2
Align read to query_backup: 10917 [2470, 13415, 1, 10944]
diff: 5073 estimate diff: 5074
Align read to consensus: 10885 [54, 11003, 1, 10944]
Align using aligner: (((0, 188), (188, 189), (190, 378), (379

Align read to query_backup: 8413 [8200, 16856, 1, 8683]
diff: 6479 estimate diff: 6481
Align read to consensus: 8368 [68, 8753, 6, 8683]
Align using aligner: (((0, 200), (201, 289), (289, 307), (308, 364), (364, 488), (488, 575), (575, 578), (578, 664), (665, 745), (746, 891), (892, 932), (939, 942), (944, 949), (951, 1003), (1004, 1095), (1095, 1236), (1237, 1245), (1245, 1375), (1375, 1481), (1482, 1604), (1605, 1969), (1970, 1992), (1993, 2025), (2025, 2066), (2067, 2505), (2506, 2573), (2574, 2592), (2593, 2649), (2649, 2780), (2780, 3017), (3018, 3274), (3274, 3395), (3395, 3457), (3457, 3701), (3701, 3735), (3735, 3787), (3789, 3835), (3835, 3927), (3928, 3935), (3936, 3940), (3940, 3993), (3994, 4007), (4007, 4177), (4177, 4249), (4249, 4296), (4297, 4305), (4306, 4311), (4311, 4390), (4391, 4444), (4445, 4522), (4523, 4559), (4560, 4623), (4624, 4757), (4757, 4765), (4765, 4798), (4798, 4801), (4801, 4828), (4828, 4877), (4877, 4973), (4973, 5095), (5096, 5293), (5294, 5469), (

Current consensus length: 49180 12
Align read to query_backup: 12774 [2193, 15117, 1, 12926]
diff: 1389 estimate diff: 1384
Align read to consensus: 12299 [1, 13012, 69, 12926]
Align using aligner: (((0, 65), (66, 101), (102, 154), (155, 181), (182, 259), (260, 420), (421, 487), (488, 529), (530, 564), (565, 727), (728, 813), (814, 844), (845, 885), (886, 1036), (1037, 1048), (1049, 1145), (1146, 1198), (1200, 1204), (1205, 1336), (1337, 1352), (1353, 1458), (1459, 1473), (1474, 1524), (1525, 1560), (1561, 1623), (1624, 1729), (1731, 1744), (1745, 1760), (1760, 1984), (1985, 2044), (2045, 2150), (2151, 2188), (2189, 2199), (2200, 2208), (2209, 2225), (2225, 2267), (2268, 2293), (2294, 2416), (2417, 2465), (2466, 2467), (2468, 2663), (2664, 2667), (2668, 2804), (2805, 2928), (2929, 3003), (3004, 3037), (3038, 3041), (3042, 3126), (3128, 3144), (3145, 3169), (3170, 3173), (3174, 3193), (3193, 3207), (3208, 3286), (3287, 3471), (3472, 3518), (3519, 3547), (3548, 3550), (3551, 3571), (3571

Current consensus length: 53043 14
Align read to query_backup: 12626 [223, 12892, 1, 12680]
diff: 431 estimate diff: 451
Align read to consensus: 12221 [1, 12740, 37, 12680]
Align using aligner: (((0, 81), (82, 90), (91, 130), (131, 163), (164, 216), (217, 265), (266, 395), (396, 554), (555, 595), (596, 635), (636, 654), (655, 715), (716, 906), (907, 958), (959, 989), (991, 1005), (1006, 1046), (1047, 1091), (1092, 1111), (1112, 1117), (1117, 1118), (1119, 1131), (1131, 1160), (1161, 1186), (1187, 1256), (1256, 1280), (1281, 1379), (1380, 1456), (1457, 1559), (1560, 1565), (1566, 1570), (1571, 1612), (1613, 1632), (1633, 1636), (1637, 1641), (1641, 1699), (1700, 1707), (1708, 1725), (1726, 1779), (1780, 1819), (1820, 1840), (1841, 2003), (2004, 2133), (2134, 2147), (2148, 2159), (2160, 2182), (2183, 2226), (2227, 2229), (2230, 2240), (2241, 2244), (2244, 2256), (2257, 2338), (2339, 2407), (2408, 2503), (2504, 2518), (2519, 2554), (2555, 2576), (2577, 2582), (2583, 2587), (2587, 2595), 

Current consensus length: 54309 16
Align read to query_backup: 2715 [11664, 14440, 1, 2777]
diff: 13363 estimate diff: 13349
Align read to consensus: 2727 [92, 2871, 1, 2777]
Align using aligner: (((0, 52), (52, 131), (131, 169), (169, 423), (424, 642), (643, 1019), (1019, 1020), (1021, 1142), (1143, 1214), (1215, 1519), (1520, 1659), (1659, 1737), (1738, 2060), (2060, 2367), (2368, 2441), (2442, 2780)), ((0, 52), (53, 132), (133, 171), (172, 426), (426, 644), (644, 1020), (1021, 1022), (1022, 1143), (1143, 1214), (1214, 1518), (1518, 1657), (1658, 1736), (1736, 2058), (2059, 2366), (2366, 2439), (2439, 2777)))
Final read count: 17 contig length: 67296
Start Read: 528_F 34106 14475
Final read count: 1 contig length: 14475


In [56]:
# Write a text layout of the first sub-component: one read per line,
# indented by its estimated coordinate relative to the smallest one,
# followed by the read name.
estimates=estimates_list[0]
Reads=reads_list[0]
#estimates = [101,0]
#Reads = ['51843_F','49731_R']
with open('./layout_292.txt','w') as fout:
    # `end` holds the values from `estimates` as an array; they are used
    # here as the left offset of each read.
    end=np.array(estimates)
    ind=end.argsort()
    for i in ind:
        print(Reads[i],end[i])
        # Left-pad so the smallest estimate starts at column 0.
        fout.write(" "*(end[i]-min(end))+ReadSeq[Reads[i]]+" "+Reads[i]+'\n')

1164_R -38810
319_R -36540
1567_F -28459
458_F -27414
555_F -27260
1271_R -26905
893_F -26613
1283_F -25426
1370_R -25207
1266_F -23012
1448_F -20787
266_F -20227
743_F -19587
88_R -19069
1529_R -18921
1715_F -18637
432_R -16892
1977_F -16303
888_R -14553
1390_F -13940
662_F -13720
1342_F -12404
286_R -12025
1074_R -11335
357_F -11123
1926_F -11054
398_R -10996
74_F -10104
71_R -8244
1221_F -7516
1520_F -6639
605_R -5750
351_R -5391
292_R -4365
389_R -3252
640_F -2627
1499_R -1257
569_R -376
1075_R 725
1484_F 1959
361_F 3464
869_R 4039
1808_F 4097
764_R 4394
469_F 4499
1825_F 7459
2000_F 8768
1655_R 9601
866_R 9936
430_R 9951
1378_R 11796
1044_F 12785
202_R 12881
921_F 14070
498_R 14990
996_F 17027
1403_R 19198
1437_F 19342
1179_R 20684
1974_F 21475
1936_R 22147
736_R 24745
1738_F 25513
1887_F 26479
1063_R 27140
1072_R 28051
1670_R 32449
612_R 32549
95_R 36097
1180_R 36313
127_R 38268
1565_R 39846


In [53]:
# Peek at the first ten (key, overlap list) entries of edge_dict.
print(list(edge_dict.items())[:10])

[('1977_F-893_F', [{'A_name': '1977_F', 'B_name': '893_F', 'A_Orientation': 0, 'A_start': 7, 'A_end': 4058, 'A_length': 15868, 'B_Orientation': 0, 'B_start': 10318, 'B_end': 14359, 'B_length': 14370}]), ('1977_F-351_R', [{'A_name': '1977_F', 'B_name': '351_R', 'A_Orientation': 0, 'A_start': 10935, 'A_end': 15857, 'A_length': 15868, 'B_Orientation': 1, 'B_start': 9849, 'B_end': 14774, 'B_length': 14793}]), ('1977_F-389_R', [{'A_name': '1977_F', 'B_name': '389_R', 'A_Orientation': 0, 'A_start': 13049, 'A_end': 15857, 'A_length': 15868, 'B_Orientation': 1, 'B_start': 11336, 'B_end': 14138, 'B_length': 14139}]), ('1977_F-292_R', [{'A_name': '1977_F', 'B_name': '292_R', 'A_Orientation': 0, 'A_start': 11939, 'A_end': 15857, 'A_length': 15868, 'B_Orientation': 1, 'B_start': 9364, 'B_end': 13284, 'B_length': 13285}]), ('1977_F-640_F', [{'A_name': '1977_F', 'B_name': '640_F', 'A_Orientation': 0, 'A_start': 13676, 'A_end': 15857, 'A_length': 15868, 'B_Orientation': 0, 'B_start': 0, 'B_end': 2178

In [98]:
# Summary of the current layout: total span covered by the reads, then the
# maximum, mean and median read length.
starts = np.array(estimates)
# End coordinate of each read = estimated start + sequence length.
ends=np.array([estimates[i]+len(ReadSeq[Reads[i]]) for i in range(len(estimates))])
lengths = np.array([len(ReadSeq[Reads[i]]) for i in range(len(estimates))])
print(max(ends)-min(starts),max(lengths),np.mean(lengths),np.median(lengths))
#print(edge_dict['51843_F-49731_R'])

59831 33220 13437.800711743772 13286.0
