In [1]:
# Run everything from the project root so the relative 'data/...' paths used
# further down resolve. Raises KeyError if PROJECT_ROOT is not set.
import os
os.chdir(os.environ['PROJECT_ROOT'])

# Reload imported modules before each cell execution, so edits to the project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# with open('uk.txt', 'r') as f:
#     lines = []
#     for line in f:
#         lines.append(line.strip().split('__')[-1].split('.')[0])
# ' && '.join([f'(./dav.sh etl -ws Mon -etl v10-no-deoverlap -s {line})' for line in lines])

# Samples

In [2]:
def read_fq(file):
    """Return the sequence line of every record in a FASTQ-style text file.

    The file is treated as consecutive 4-line records. Only the second line
    of each record (0-based line indices 1, 5, 9, ...) is kept, with leading
    and trailing whitespace stripped.
    """
    with open(file, 'r') as handle:
        return [
            raw_line.strip()
            for line_no, raw_line in enumerate(handle)
            if line_no % 4 == 1
        ]

def get_mc_read(path, min_length=200, top_n=30):
    """Return the most frequent usable reads found in a FASTQ file.

    Parameters
    ----------
    path : str
        File handed to ``read_fq``.
    min_length : int, optional
        A read is kept only if it is strictly longer than this (default 200).
    top_n : int, optional
        Maximum number of distinct reads returned (default 30).

    Returns
    -------
    list of (str, int) or None
        ``(read, count)`` pairs ordered from most to least frequent, or
        ``None`` when no read passes the filters.
    """
    # Imported here so the function does not rely on an import that is only
    # executed in a later notebook cell.
    from collections import Counter

    # Drop short reads and reads containing an 'N' character.
    usable_reads = [
        read for read in read_fq(path)
        if len(read) > min_length and 'N' not in read
    ]
    if not usable_reads:
        return None
    # most_common already yields a list of (read, count) tuples.
    return Counter(usable_reads).most_common(top_n)

In [3]:
import glob
from collections import defaultdict, Counter

# Index the read files both ways:
#   samples_loci_dict[sample_id][locus] -> path
#   loci_samples_dict[locus][sample_id] -> path
# Layout walked here: data/str_data/loci_reads/<sample_id>/<locus>.<anything>
samples_loci_dict = defaultdict(dict)
loci_samples_dict = defaultdict(dict)
for sample_path in glob.glob('data/str_data/loci_reads/*'):
    # The sample id is the last path component of the sample directory.
    sample_id = sample_path.split('/')[-1]
    for read_path in glob.glob(f'{sample_path}/*'):
        # The locus name is the file name up to its first '.'.
        locus = read_path.split('/')[-1].split('.')[0]
        # get_mc_read is used purely as a filter: files without any usable
        # read are left out of both indexes. The path, not the reads, is stored.
        r = get_mc_read(read_path)
        if r:
            samples_loci_dict[sample_id][locus] = read_path
            loci_samples_dict[locus][sample_id] = read_path

# STR markers

In [4]:
import pandas as pd
import numpy as np
import tqdm
from str_finder.repeat_pattern import GreedyRepeatPattern

# Base pairing for the four upper-case bases: A<->T and G<->C.
COMPLEMENTARY_DICT = dict(zip('ATGC', 'TACG'))


def get_complementary_read(x):
    """Return the reverse complement of ``x``.

    The sequence is walked from its last character to its first and every
    character is replaced through ``COMPLEMENTARY_DICT``. A character that is
    not a key of that mapping raises ``KeyError``.
    """
    return ''.join(COMPLEMENTARY_DICT[base] for base in reversed(x))

def retrieve_alleles(x, max_ratio=3):
    """Pick the one or two best-supported alleles from a support table.

    Parameters
    ----------
    x : mapping
        Maps an allele value to its supporting read count.
    max_ratio : number, optional
        The runner-up is reported only while the top count is less than
        ``max_ratio`` times the runner-up count (default 3).

    Returns
    -------
    list of (allele, count)
        Zero, one or two pairs ordered by decreasing count. Ties keep the
        iteration order of ``x``.
    """
    ranked = sorted(x.items(), key=lambda item: -item[1])[:2]
    if len(ranked) < 2:
        return ranked
    (_, top_support), (_, second_support) = ranked
    # Same test as `top / second < max_ratio`, written as a product so a
    # zero runner-up count cannot raise ZeroDivisionError.
    if top_support < max_ratio * second_support:
        return ranked
    return ranked[:1]

# Reference sheet of STR markers. The columns read later in this notebook are
# LocusName, Allele1Pattern and RepeatStructure.
str_markers_ref = pd.read_excel('data/str_data/STR patterns.xlsx')

In [7]:
# Keep only the reference rows whose Allele1Pattern is filled in.
valid_markers = str_markers_ref[str_markers_ref.Allele1Pattern.notna()]

In [8]:
valid_markers.shape

(19, 11)

In [17]:
import tqdm.notebook  # explicit submodule import; replaces the deprecated `tqdm.tqdm_notebook`


def _count_alleles(rp, reads):
    """Tally read support per allele value for one locus.

    Every read is matched with ``rp.match``. When element 1 of the result is
    0 the reverse complement of the read is tried instead; a read whose
    second attempt also gives 0 is ignored.

    Parameters
    ----------
    rp : object exposing ``match(read)``
        The result is indexed at positions 1 and 2.
        # presumably [1] is the repeat count and [2] the bracket annotation;
        # confirm against GreedyRepeatPattern.match
    reads : iterable of (str, int)
        ``(read, count)`` pairs as returned by ``get_mc_read``.

    Returns
    -------
    (alleles, annotations)
        ``alleles`` maps ``matches[1]`` to the summed read count;
        ``annotations`` maps it to the last ``matches[2]`` seen for that value.
    """
    alleles = defaultdict(int)
    annotations = dict()
    for read, count in reads:
        matches = rp.match(read)
        if matches[1] == 0:
            # Nothing found as sequenced: retry on the reverse complement.
            matches = rp.match(get_complementary_read(read))
        if matches[1] != 0:
            alleles[matches[1]] += count
            annotations[matches[1]] = matches[2]
    return alleles, annotations


# Loop-invariant lookup: first Allele1Pattern listed for each locus. Replaces
# a `.tolist()` membership scan and a `.query()` per sample and locus.
locus_patterns = (
    valid_markers
    .drop_duplicates('LocusName')
    .set_index('LocusName')['Allele1Pattern']
    .to_dict()
)

# One single-row DataFrame per sample; the next cell concatenates them.
results = []
for s in tqdm.notebook.tqdm(samples_loci_dict):
    # Collect the row as a plain dict and build the frame once at the end,
    # instead of inserting columns one at a time into a DataFrame.
    record = {'sample_id': s}
    for locus, path in samples_loci_dict[s].items():
        if locus not in locus_patterns:
            continue
        # Read
        r = get_mc_read(path)
        if r is None:
            for allele_id in range(1, 3):
                record[f'{locus}_{allele_id}'] = 0
                # Message corrected: get_mc_read keeps reads longer than 200, not 100.
                record[f'{locus}_{allele_id}_annotation'] = 'No available reads (with length > 200)'
            continue
        # Repeat pattern
        locus_rp = locus_patterns[locus]
        if isinstance(locus_rp, str):
            rp = GreedyRepeatPattern(locus_rp)
            alleles, annotations = _count_alleles(rp, r)
            called = retrieve_alleles(alleles)
            # At most two alleles are reported, numbered 1 and 2 by decreasing support.
            for allele_id, (allele, support) in enumerate(called, start=1):
                record[f'{locus}_{allele_id}'] = allele
                record[f'{locus}_{allele_id}_annotation'] = annotations[allele]
                record[f'{locus}_{allele_id}_support (# reads)'] = support
        else:
            for allele_id in range(1, 3):
                record[f'{locus}_{allele_id}'] = 0
                record[f'{locus}_{allele_id}_annotation'] = 'No available patterns'
    results.append(pd.DataFrame([record]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [18]:
# Stack the per-sample single-row frames into one table with a fresh 0..n-1 index.
# NOTE: this rebinds `results` from a list of frames to a DataFrame, so the
# cell cannot be re-run without first re-running the cell that builds the list.
results = pd.concat(results, axis=0, ignore_index=True)

In [22]:
# Select the allele-value columns by dropping every column whose name contains
# 'id', 'annotation' or 'support', then total each one across samples.
allele_value_columns = [
    column for column in results.columns
    if not any(tag in column for tag in ('id', 'annotation', 'support'))
]
results[allele_value_columns].sum(axis=0).sort_index()

D12ATA63_1       726.0
D12ATA63_2       662.0
D14S1434_1       741.0
D14S1434_2       555.0
D1S1677_1        640.0
D1S1677_2        454.0
D2S1776_1        501.0
D2S1776_2        388.0
D3S4529_1        415.0
D3S4529_2        362.0
D5S2800_1        859.0
D5S2800_2        502.0
D6S474_1         644.0
D6S474_2         568.0
DXS10075_1       571.0
DXS10075_2        51.0
DXS10079_1       940.0
DXS10079_2        61.0
DXS10101_1      1381.0
DXS10101_2        88.0
DXS10134_1       623.0
DXS10146_1       793.0
DXS10146_2        68.0
DXS10147_1       318.0
DXS10147_2       293.0
DXS10148_1       654.0
DXS10148_2        64.0
DXS7133_1        472.0
DXS7133_2         27.0
DXS7424_1        760.0
DXS7424_2         68.0
DXS8377_1       1512.0
DXS8377_2       1120.0
DXS9895_1        708.0
DXS9895_2         48.0
GATA172D05_1     463.0
GATA172D05_2      32.0
dtype: float64

In [25]:
# Keep 'sample_id' first and order all remaining columns alphabetically, which
# places the value / annotation / support columns of each allele side by side.
# Assumes 'sample_id' is the first column: columns[1:] drops position 0 whatever it holds.
results = results[['sample_id'] + sorted(results.columns[1:])]

In [26]:
results

Unnamed: 0,sample_id,D12ATA63_1,D12ATA63_1_annotation,D12ATA63_1_support (# reads),D12ATA63_2,D12ATA63_2_annotation,D12ATA63_2_support (# reads),D14S1434_1,D14S1434_1_annotation,D14S1434_1_support (# reads),...,DXS9895_1_support (# reads),DXS9895_2,DXS9895_2_annotation,DXS9895_2_support (# reads),GATA172D05_1,GATA172D05_1_annotation,GATA172D05_1_support (# reads),GATA172D05_2,GATA172D05_2_annotation,GATA172D05_2_support (# reads)
0,S1,14,[TGT]4 TA [TTA]9,166,17.0,[TGT]5 TA [TTA]11,136.0,15,[GGAT]3 AG [ATAG]8 [ACAG]3,406,...,439,14.0,[AGAT]8 AA [GATA]3 [GAAT]2,372.0,10,[CTAT]10,357,,,
1,S10,14,[TGT]4 TA [TTA]9,150,18.0,[TGT]4 TA [TTA]13,111.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,693,...,356,,,,10,[CTAT]10,107,,,
2,S11,13,[TGT]4 TA [TTA]8,246,,,,14,[GGAT]3 AG [ATAG]7 [ACAG]3,308,...,268,,,,8,[CTAT]8,142,,,
3,S12,14,[TGT]4 TA [TTA]9,126,20.0,[TGT]4 TA [TTA]15,104.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,351,...,348,,,,8,[CTAT]8,85,,,
4,S13,16,[TGT]5 TA [TTA]10,193,18.0,[TGT]4 TA [TTA]13,113.0,13,[GGAT]3 AG [ATAG]6 [ACAG]3,488,...,371,,,,6,[CTAT]6,129,,,
5,S14,14,[TGT]4 TA [TTA]9,146,18.0,[TGT]4 TA [TTA]13,102.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,454,...,359,,,,11,[CTAT]11,135,,,
6,S15,14,[TGT]4 TA [TTA]9,98,18.0,[TGT]4 TA [TTA]13,76.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,257,...,215,,,,10,[CTAT]10,109,,,
7,S16,16,[TGT]5 TA [TTA]10,94,15.0,[TGT]5 TA [TTA]9,92.0,14,[GGAT]3 AG [ATAG]7 [ACAG]3,260,...,219,,,,12,[CTAT]12,67,,,
8,S17,14,[TGT]4 TA [TTA]9,111,18.0,[TGT]4 TA [TTA]13,93.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,302,...,254,,,,11,[CTAT]11,111,,,
9,S18,16,[TGT]5 TA [TTA]10,138,18.0,[TGT]4 TA [TTA]13,95.0,17,[GGAT]3 AG [ATAG]10 [ACAG]3,328,...,246,,,,12,[CTAT]12,108,,,


In [27]:
# Write the allele table to an Excel workbook, without the row index.
results.to_excel('data/str_data/annotation_results_v3.xlsx', index=False)

In [28]:
results

Unnamed: 0,sample_id,D12ATA63_1,D12ATA63_1_annotation,D12ATA63_1_support (# reads),D12ATA63_2,D12ATA63_2_annotation,D12ATA63_2_support (# reads),D14S1434_1,D14S1434_1_annotation,D14S1434_1_support (# reads),...,DXS9895_1_support (# reads),DXS9895_2,DXS9895_2_annotation,DXS9895_2_support (# reads),GATA172D05_1,GATA172D05_1_annotation,GATA172D05_1_support (# reads),GATA172D05_2,GATA172D05_2_annotation,GATA172D05_2_support (# reads)
0,S1,14,[TGT]4 TA [TTA]9,166,17.0,[TGT]5 TA [TTA]11,136.0,15,[GGAT]3 AG [ATAG]8 [ACAG]3,406,...,439,14.0,[AGAT]8 AA [GATA]3 [GAAT]2,372.0,10,[CTAT]10,357,,,
1,S10,14,[TGT]4 TA [TTA]9,150,18.0,[TGT]4 TA [TTA]13,111.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,693,...,356,,,,10,[CTAT]10,107,,,
2,S11,13,[TGT]4 TA [TTA]8,246,,,,14,[GGAT]3 AG [ATAG]7 [ACAG]3,308,...,268,,,,8,[CTAT]8,142,,,
3,S12,14,[TGT]4 TA [TTA]9,126,20.0,[TGT]4 TA [TTA]15,104.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,351,...,348,,,,8,[CTAT]8,85,,,
4,S13,16,[TGT]5 TA [TTA]10,193,18.0,[TGT]4 TA [TTA]13,113.0,13,[GGAT]3 AG [ATAG]6 [ACAG]3,488,...,371,,,,6,[CTAT]6,129,,,
5,S14,14,[TGT]4 TA [TTA]9,146,18.0,[TGT]4 TA [TTA]13,102.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,454,...,359,,,,11,[CTAT]11,135,,,
6,S15,14,[TGT]4 TA [TTA]9,98,18.0,[TGT]4 TA [TTA]13,76.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,257,...,215,,,,10,[CTAT]10,109,,,
7,S16,16,[TGT]5 TA [TTA]10,94,15.0,[TGT]5 TA [TTA]9,92.0,14,[GGAT]3 AG [ATAG]7 [ACAG]3,260,...,219,,,,12,[CTAT]12,67,,,
8,S17,14,[TGT]4 TA [TTA]9,111,18.0,[TGT]4 TA [TTA]13,93.0,16,[GGAT]3 AG [ATAG]9 [ACAG]3,302,...,254,,,,11,[CTAT]11,111,,,
9,S18,16,[TGT]5 TA [TTA]10,138,18.0,[TGT]4 TA [TTA]13,95.0,17,[GGAT]3 AG [ATAG]10 [ACAG]3,328,...,246,,,,12,[CTAT]12,108,,,


# Auto pattern generation

In [43]:
# NOTE(review): wildcard imports hide where names come from and can silently
# shadow earlier definitions. GreedyRepeatPattern is already imported
# explicitly above; CasualPatternsGenerator (used further down) presumably
# comes from str_finder.auto_pg. Confirm and switch to explicit imports.
from str_finder.repeat_pattern import *
from str_finder.auto_pg import *

In [87]:
import tqdm.notebook  # explicit submodule import; replaces the deprecated `tqdm.tqdm_notebook`

# Collect the most common reads of every sample for a single locus.
# `rr` ends up with one entry per sample, each a list of (read, count) pairs.
l = 'DXS10134'
rr = []
for s, path in tqdm.notebook.tqdm(loci_samples_dict[l].items()):
    r = get_mc_read(path)
    rr.append(r)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [99]:
import tqdm.notebook  # explicit submodule import; replaces the deprecated `tqdm.tqdm_notebook`

# Diagnostic dump: for every locus with a pattern, print each sample's
# allele -> read-support table and the alleles retrieve_alleles selects.
# The loop variables (l, pattern, rp, s, path, r, alleles) are deliberately
# left at module level because later cells read them.
for l in valid_markers.LocusName.tolist():
    print('\n\n\n\n', l)
    pattern = valid_markers.query(f'LocusName == "{l}"')['Allele1Pattern'].values[0]
    rp = GreedyRepeatPattern(pattern)

    # `.get` instead of indexing: loci_samples_dict is a defaultdict, so
    # `loci_samples_dict[l]` would insert an empty entry for a locus with no reads.
    for s, path in tqdm.notebook.tqdm(loci_samples_dict.get(l, {}).items()):
        r = get_mc_read(path)

        alleles = defaultdict(int)
        # get_mc_read returns None when no read is usable; treat that as "no reads".
        for x, c in (r or []):
            matches = rp.match(x)
            if matches[1] == 0:
                # Nothing found as sequenced: retry on the reverse complement.
                matches = rp.match(get_complementary_read(x))
            if matches[1] != 0:
                alleles[matches[1]] += c
        print(s, alleles)
        print(retrieve_alleles(alleles))





 D1S1677


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {13: 197, 15: 158, 14: 20, 12: 12})
[(13, 197), (15, 158)]
S10 defaultdict(<class 'int'>, {14: 157, 12: 228, 11: 11, 13: 6})
[(12, 228), (14, 157)]
S11 defaultdict(<class 'int'>, {13: 207, 14: 179, 12: 13})
[(13, 207), (14, 179)]
S12 defaultdict(<class 'int'>, {15: 267, 14: 31})
[(15, 267)]
S13 defaultdict(<class 'int'>, {13: 278, 14: 192, 12: 23})
[(13, 278), (14, 192)]
S14 defaultdict(<class 'int'>, {14: 168, 12: 227, 13: 27, 11: 16})
[(12, 227), (14, 168)]
S15 defaultdict(<class 'int'>, {14: 169, 15: 89, 13: 17})
[(14, 169), (15, 89)]
S16 defaultdict(<class 'int'>, {14: 101, 15: 69, 13: 3})
[(14, 101), (15, 69)]
S17 defaultdict(<class 'int'>, {14: 138, 15: 109, 13: 7})
[(14, 138), (15, 109)]
S18 defaultdict(<class 'int'>, {13: 162, 15: 112, 12: 10, 14: 4})
[(13, 162), (15, 112)]
S19 defaultdict(<class 'int'>, {13: 168, 15: 127, 12: 12, 14: 4})
[(13, 168), (15, 127)]
S2 defaultdict(<class 'int'>, {13: 122, 16: 70, 12: 10})
[(13, 122), (16, 70)]
S20 defau

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {14: 166, 17: 136, 13: 8, 16: 5})
[(14, 166), (17, 136)]
S10 defaultdict(<class 'int'>, {14: 150, 18: 111, 17: 13, 13: 4})
[(14, 150), (18, 111)]
S11 defaultdict(<class 'int'>, {13: 246, 12: 19})
[(13, 246)]
S12 defaultdict(<class 'int'>, {14: 126, 20: 104, 19: 9, 21: 4})
[(14, 126), (20, 104)]
S13 defaultdict(<class 'int'>, {16: 193, 18: 113, 15: 7, 17: 7})
[(16, 193), (18, 113)]
S14 defaultdict(<class 'int'>, {14: 146, 18: 102, 13: 10, 17: 2})
[(14, 146), (18, 102)]
S15 defaultdict(<class 'int'>, {14: 98, 18: 76, 13: 5, 17: 2})
[(14, 98), (18, 76)]
S16 defaultdict(<class 'int'>, {15: 92, 16: 94, 14: 8})
[(16, 94), (15, 92)]
S17 defaultdict(<class 'int'>, {14: 111, 18: 93, 13: 5, 17: 3})
[(14, 111), (18, 93)]
S18 defaultdict(<class 'int'>, {16: 138, 18: 95, 15: 17, 17: 3})
[(16, 138), (18, 95)]
S19 defaultdict(<class 'int'>, {14: 132, 18: 74, 13: 6, 17: 5})
[(14, 132), (18, 74)]
S2 defaultdict(<class 'int'>, {19: 87, 18: 92, 17: 2})
[(18, 92), (19, 87)]
S

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {15: 406, 16: 292, 14: 31})
[(15, 406), (16, 292)]
S10 defaultdict(<class 'int'>, {16: 693, 15: 63})
[(16, 693)]
S11 defaultdict(<class 'int'>, {14: 308, 16: 223, 15: 21, 13: 15})
[(14, 308), (16, 223)]
S12 defaultdict(<class 'int'>, {16: 351, 17: 266, 15: 28})
[(16, 351), (17, 266)]
S13 defaultdict(<class 'int'>, {13: 488, 14: 431, 12: 15})
[(13, 488), (14, 431)]
S14 defaultdict(<class 'int'>, {16: 454, 17: 360, 15: 48})
[(16, 454), (17, 360)]
S15 defaultdict(<class 'int'>, {16: 257, 17: 201, 15: 15, 14: 2})
[(16, 257), (17, 201)]
S16 defaultdict(<class 'int'>, {14: 260, 17: 191, 16: 15, 13: 12})
[(14, 260), (17, 191)]
S17 defaultdict(<class 'int'>, {16: 302, 17: 198, 15: 24})
[(16, 302), (17, 198)]
S18 defaultdict(<class 'int'>, {15: 320, 17: 328, 16: 25, 14: 15})
[(17, 328), (15, 320)]
S19 defaultdict(<class 'int'>, {17: 560, 16: 64})
[(17, 560)]
S2 defaultdict(<class 'int'>, {16: 255, 17: 251, 15: 23})
[(16, 255), (17, 251)]
S20 defaultdict(<class 'int

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {8: 538, 11: 475, 10: 21, 7: 17, 4: 2, 12: 2})
[(8, 538), (11, 475)]
S10 defaultdict(<class 'int'>, {11: 370, 12: 330, 10: 20, 13: 2})
[(11, 370), (12, 330)]
S11 defaultdict(<class 'int'>, {10: 819, 9: 23})
[(10, 819)]
S12 defaultdict(<class 'int'>, {8: 461, 11: 389, 10: 30, 7: 11, 12: 2})
[(8, 461), (11, 389)]
S13 defaultdict(<class 'int'>, {10: 478, 11: 422, 9: 27, 6: 2})
[(10, 478), (11, 422)]
S14 defaultdict(<class 'int'>, {10: 501, 12: 446, 11: 31, 9: 32})
[(10, 501), (12, 446)]
S15 defaultdict(<class 'int'>, {8: 375, 11: 297, 10: 16, 6: 2})
[(8, 375), (11, 297)]
S16 defaultdict(<class 'int'>, {12: 477, 11: 30, 13: 5, 10: 2})
[(12, 477)]
S17 defaultdict(<class 'int'>, {11: 608, 10: 44, 12: 4, 9: 2})
[(11, 608)]
S18 defaultdict(<class 'int'>, {11: 410, 12: 386, 10: 32})
[(11, 410), (12, 386)]
S19 defaultdict(<class 'int'>, {9: 552, 8: 18, 10: 5, 6: 2})
[(9, 552)]
S2 defaultdict(<class 'int'>, {10: 393, 13: 318, 12: 32, 9: 22, 8: 2})
[(10, 393), (13, 31

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {10: 806, 9: 39, 8: 6, 6: 6})
[(10, 806)]
S10 defaultdict(<class 'int'>, {8: 386, 11: 347, 10: 22, 7: 7})
[(8, 386), (11, 347)]
S11 defaultdict(<class 'int'>, {10: 522, 9: 27, 8: 4, 5: 2})
[(10, 522)]
S12 defaultdict(<class 'int'>, {8: 341, 9: 304, 7: 7, 6: 4, 10: 4})
[(8, 341), (9, 304)]
S13 defaultdict(<class 'int'>, {10: 413, 9: 404, 8: 6, 5: 2, 6: 2})
[(10, 413), (9, 404)]
S14 defaultdict(<class 'int'>, {8: 411, 11: 403, 10: 16, 7: 16})
[(8, 411), (11, 403)]
S15 defaultdict(<class 'int'>, {10: 274, 11: 226, 9: 12, 7: 7})
[(10, 274), (11, 226)]
S16 defaultdict(<class 'int'>, {8: 196, 10: 168, 7: 14, 9: 8})
[(8, 196), (10, 168)]
S17 defaultdict(<class 'int'>, {7: 286, 11: 257, 10: 13, 8: 2, 6: 2, 9: 2})
[(7, 286), (11, 257)]
S18 defaultdict(<class 'int'>, {8: 336, 10: 340, 7: 24, 9: 14, 11: 2})
[(10, 340), (8, 336)]
S19 defaultdict(<class 'int'>, {10: 413, 9: 25, 5: 5, 8: 5})
[(10, 413)]
S2 defaultdict(<class 'int'>, {8: 330, 9: 238, 7: 8, 10: 2, 12: 2})

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {18: 900, 17: 35})
[(18, 900)]
S10 defaultdict(<class 'int'>, {15: 450, 18: 473, 17: 19, 14: 14})
[(18, 473), (15, 450)]
S11 defaultdict(<class 'int'>, {15: 890, 14: 22})
[(15, 890)]
S12 defaultdict(<class 'int'>, {18: 377, 19: 369, 17: 14})
[(18, 377), (19, 369)]
S13 defaultdict(<class 'int'>, {19: 446, 18: 20, 15: 2})
[(19, 446)]
S14 defaultdict(<class 'int'>, {18: 908, 17: 47})
[(18, 908)]
S15 defaultdict(<class 'int'>, {15: 348, 18: 331, 17: 18, 14: 4})
[(15, 348), (18, 331)]
S16 defaultdict(<class 'int'>, {18: 298, 24: 171, 17: 11, 23: 4})
[(18, 298), (24, 171)]
S17 defaultdict(<class 'int'>, {18: 781, 17: 33})
[(18, 781)]
S18 defaultdict(<class 'int'>, {18: 427, 17: 18, 15: 9})
[(18, 427)]
S19 defaultdict(<class 'int'>, {15: 391, 18: 299, 17: 16, 14: 9})
[(15, 391), (18, 299)]
S2 defaultdict(<class 'int'>, {19: 306, 18: 291, 17: 16})
[(19, 306), (18, 291)]
S20 defaultdict(<class 'int'>, {18: 645, 17: 48})
[(18, 645)]
S21 defaultdict(<class 'int'>, {1

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {15: 385, 12: 379, 14: 21, 11: 16, 16: 4})
[(15, 385), (12, 379)]
S10 defaultdict(<class 'int'>, {12: 336, 15: 339, 14: 13, 11: 6})
[(15, 339), (12, 336)]
S11 defaultdict(<class 'int'>, {14: 255, 16: 251, 13: 16, 15: 9, 17: 6, 11: 2})
[(14, 255), (16, 251)]
S12 defaultdict(<class 'int'>, {14: 299, 15: 294, 13: 13})
[(14, 299), (15, 294)]
S13 defaultdict(<class 'int'>, {12: 465, 13: 449, 11: 11})
[(12, 465), (13, 449)]
S14 defaultdict(<class 'int'>, {12: 458, 16: 389, 15: 23, 11: 9, 17: 2})
[(12, 458), (16, 389)]
S15 defaultdict(<class 'int'>, {14: 234, 13: 229, 12: 14, 6: 2})
[(14, 234), (13, 229)]
S16 defaultdict(<class 'int'>, {15: 196, 16: 176, 14: 6, 17: 2, 8: 2})
[(15, 196), (16, 176)]
S17 defaultdict(<class 'int'>, {12: 220, 14: 242, 11: 7, 13: 6})
[(14, 242), (12, 220)]
S18 defaultdict(<class 'int'>, {12: 352, 13: 331, 11: 15})
[(12, 352), (13, 331)]
S19 defaultdict(<class 'int'>, {15: 216, 13: 220, 14: 22, 12: 4})
[(13, 220), (15, 216)]
S2 defaultd

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {12: 477, 13: 389, 11: 43, 14: 2})
[(12, 477), (13, 389)]
S10 defaultdict(<class 'int'>, {12: 485, 11: 34})
[(12, 485)]
S11 defaultdict(<class 'int'>, {13: 387, 12: 41, 14: 7})
[(13, 387)]
S12 defaultdict(<class 'int'>, {12: 362, 11: 26, 9: 5, 2: 1})
[(12, 362)]
S13 defaultdict(<class 'int'>, {11: 573, 10: 37, 9: 7, 8: 2, 6: 4, 12: 2})
[(11, 573)]
S14 defaultdict(<class 'int'>, {13: 489, 12: 41, 14: 10, 6: 2, 7: 2, 9: 2})
[(13, 489)]
S15 defaultdict(<class 'int'>, {12: 230, 11: 31, 10: 1})
[(12, 230)]
S16 defaultdict(<class 'int'>, {12: 282, 11: 24, 8: 1, 10: 1, 5: 1})
[(12, 282)]
S17 defaultdict(<class 'int'>, {9: 439, 8: 27, 5: 4})
[(9, 439)]
S18 defaultdict(<class 'int'>, {12: 446, 11: 40, 10: 4, 9: 5})
[(12, 446)]
S19 defaultdict(<class 'int'>, {12: 326, 11: 29, 7: 2, 13: 4, 18: 1, 8: 1})
[(12, 326)]
S2 defaultdict(<class 'int'>, {12: 344, 11: 22})
[(12, 344)]
S20 defaultdict(<class 'int'>, {9: 414, 8: 23, 6: 4, 4: 2, 10: 4})
[(9, 414)]
S21 defaultdict

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {18: 231, 21: 199, 20: 24, 17: 20, 16: 2})
[(18, 231), (21, 199)]
S10 defaultdict(<class 'int'>, {22: 204, 21: 28, 6: 1, 18: 1})
[(22, 204)]
S11 defaultdict(<class 'int'>, {19: 162, 18: 23, 12: 1})
[(19, 162)]
S12 defaultdict(<class 'int'>, {21: 152, 20: 18, 17: 2, 7: 2, 19: 1})
[(21, 152)]
S13 defaultdict(<class 'int'>, {21: 195, 20: 33, 18: 2, 14: 1, 16: 1})
[(21, 195)]
S14 defaultdict(<class 'int'>, {19: 255, 18: 25})
[(19, 255)]
S15 defaultdict(<class 'int'>, {20: 153, 19: 19, 12: 2, 16: 1})
[(20, 153)]
S16 defaultdict(<class 'int'>, {20: 114, 19: 11, 18: 3, 6: 1, 16: 2})
[(20, 114)]
S17 defaultdict(<class 'int'>, {20: 189, 19: 19, 17: 2, 7: 3, 8: 1, 16: 1})
[(20, 189)]
S18 defaultdict(<class 'int'>, {19: 226, 18: 15, 12: 2, 6: 2, 17: 2, 10: 1})
[(19, 226)]
S19 defaultdict(<class 'int'>, {20: 130, 19: 26, 18: 2, 16: 1, 9: 1})
[(20, 130)]
S2 defaultdict(<class 'int'>, {19: 145, 18: 10, 13: 3, 6: 2, 17: 3, 9: 1})
[(19, 145)]
S20 defaultdict(<class 'int'>

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {30: 328, 29: 18, 31: 3, 18: 2, 25: 2})
[(30, 328)]
S10 defaultdict(<class 'int'>, {29: 161, 28: 13, 24: 2})
[(29, 161)]
S11 defaultdict(<class 'int'>, {29: 144, 28: 8, 16: 2, 23: 1})
[(29, 144)]
S12 defaultdict(<class 'int'>, {30: 122, 29: 2, 18: 1, 28: 1, 25: 1, 24: 1})
[(30, 122)]
S13 defaultdict(<class 'int'>, {28: 208, 27: 5, 22: 1, 29: 1, 20: 1})
[(28, 208)]
S14 defaultdict(<class 'int'>, {28: 225, 27: 14})
[(28, 225)]
S15 defaultdict(<class 'int'>, {30: 98, 29: 8, 23: 1, 31: 1, 20: 1, 24: 1})
[(30, 98)]
S16 defaultdict(<class 'int'>, {27: 112, 26: 7, 18: 2, 16: 2, 25: 1, 23: 1})
[(27, 112)]
S17 defaultdict(<class 'int'>, {25: 203, 24: 7, 18: 2, 21: 1})
[(25, 203)]
S18 defaultdict(<class 'int'>, {29: 148, 28: 10, 21: 2, 23: 2, 16: 1})
[(29, 148)]
S19 defaultdict(<class 'int'>, {28: 172, 27: 6, 25: 1})
[(28, 172)]
S2 defaultdict(<class 'int'>, {30: 94, 29: 9, 28: 2, 24: 1, 18: 1, 21: 1})
[(30, 94)]
S20 defaultdict(<class 'int'>, {27: 183, 26: 11, 15: 

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {13: 2451, 12: 37, 11: 12})
[(13, 2451)]
S10 defaultdict(<class 'int'>, {13: 1030, 12: 15, 11: 2})
[(13, 1030)]
S11 defaultdict(<class 'int'>, {13: 1011, 11: 7, 12: 3})
[(13, 1011)]
S12 defaultdict(<class 'int'>, {13: 613, 12: 10, 11: 4})
[(13, 613)]
S13 defaultdict(<class 'int'>, {13: 991, 12: 6, 11: 2})
[(13, 991)]
S14 defaultdict(<class 'int'>, {13: 1046, 12: 2})
[(13, 1046)]
S15 defaultdict(<class 'int'>, {13: 707})
[(13, 707)]
S16 defaultdict(<class 'int'>, {13: 554, 12: 7})
[(13, 554)]
S17 defaultdict(<class 'int'>, {13: 1053, 12: 7, 11: 6})
[(13, 1053)]
S18 defaultdict(<class 'int'>, {13: 753, 11: 11, 12: 10, 14: 2})
[(13, 753)]
S19 defaultdict(<class 'int'>, {13: 875, 14: 2, 12: 2, 11: 2})
[(13, 875)]
S2 defaultdict(<class 'int'>, {12: 116, 11: 4, 10: 1})
[(12, 116)]
S20 defaultdict(<class 'int'>, {13: 973})
[(13, 973)]
S21 defaultdict(<class 'int'>, {13: 621, 11: 3})
[(13, 621)]
S22 defaultdict(<class 'int'>, {13: 649, 12: 4, 11: 2})
[(13, 649)]
S

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {16: 62, 15: 27, 14: 1, 8: 1})
[(16, 62), (15, 27)]
S10 defaultdict(<class 'int'>, {17: 72, 16: 8, 8: 2, 6: 1, 11: 1, 9: 1})
[(17, 72)]
S11 defaultdict(<class 'int'>, {17: 36, 16: 6, 14: 2, 12: 1, 11: 1, 5: 1, 10: 1})
[(17, 36)]
S12 defaultdict(<class 'int'>, {15: 37, 14: 3, 12: 1, 5: 1, 10: 1, 13: 1})
[(15, 37)]
S13 defaultdict(<class 'int'>, {14: 104, 8: 2, 5: 1})
[(14, 104)]
S14 defaultdict(<class 'int'>, {18: 64, 17: 9, 11: 1, 8: 1})
[(18, 64)]
S15 defaultdict(<class 'int'>, {19: 64, 18: 5, 6: 1, 14: 1})
[(19, 64)]
S16 defaultdict(<class 'int'>, {15: 46, 14: 3, 8: 1, 7: 1})
[(15, 46)]
S17 defaultdict(<class 'int'>, {18: 62, 17: 6, 11: 1, 9: 2, 12: 1, 7: 1})
[(18, 62)]
S18 defaultdict(<class 'int'>, {16: 49, 15: 6, 5: 1, 4: 1, 7: 1, 13: 1})
[(16, 49)]
S19 defaultdict(<class 'int'>, {17: 57, 16: 7, 5: 1, 12: 1})
[(17, 57)]
S2 defaultdict(<class 'int'>, {17: 21, 10: 2, 7: 3, 18: 2, 16: 4, 9: 2, 15: 1, 14: 2, 5: 1})
[(17, 21)]
S20 defaultdict(<class 'int'>

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {9: 581, 6: 795, 7: 677, 8: 25})
[(6, 795), (7, 677)]
S10 defaultdict(<class 'int'>, {9: 484, 7: 512, 8: 23, 6: 14})
[(7, 512), (9, 484)]
S11 defaultdict(<class 'int'>, {9: 443, 7: 501, 8: 18, 6: 18})
[(7, 501), (9, 443)]
S12 defaultdict(<class 'int'>, {7: 445, 9: 202, 6: 14, 8: 10, 4: 1})
[(7, 445), (9, 202)]
S13 defaultdict(<class 'int'>, {6: 632, 8: 426, 5: 11, 4: 2})
[(6, 632), (8, 426)]
S14 defaultdict(<class 'int'>, {6: 597, 8: 401, 5: 15, 7: 3})
[(6, 597), (8, 401)]
S15 defaultdict(<class 'int'>, {7: 334, 9: 205, 8: 15, 6: 14})
[(7, 334), (9, 205)]
S16 defaultdict(<class 'int'>, {8: 312, 6: 392, 5: 2, 3: 1, 4: 1})
[(6, 392), (8, 312)]
S17 defaultdict(<class 'int'>, {8: 452, 6: 467, 5: 20})
[(6, 467), (8, 452)]
S18 defaultdict(<class 'int'>, {7: 468, 6: 61, 4: 2, 2: 4})
[(7, 468)]
S19 defaultdict(<class 'int'>, {9: 335, 7: 412, 8: 15, 6: 7, 3: 4})
[(7, 412), (9, 335)]
S2 defaultdict(<class 'int'>, {6: 494, 8: 334, 7: 3, 5: 10})
[(6, 494), (8, 334)]
S

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {15: 100, 18: 83, 17: 6})
[(15, 100), (18, 83)]
S10 defaultdict(<class 'int'>, {14: 62})
[(14, 62)]
S11 defaultdict(<class 'int'>, {})
[]
S12 defaultdict(<class 'int'>, {21: 15})
[(21, 15)]
S13 defaultdict(<class 'int'>, {19: 25})
[(19, 25)]
S14 defaultdict(<class 'int'>, {16: 52, 12: 2, 15: 6})
[(16, 52)]
S15 defaultdict(<class 'int'>, {16: 34})
[(16, 34)]
S16 defaultdict(<class 'int'>, {15: 30, 14: 3})
[(15, 30)]
S17 defaultdict(<class 'int'>, {16: 42})
[(16, 42)]
S18 defaultdict(<class 'int'>, {17: 24, 16: 5})
[(17, 24)]
S19 defaultdict(<class 'int'>, {14: 41, 13: 2})
[(14, 41)]
S2 defaultdict(<class 'int'>, {16: 42})
[(16, 42)]
S20 defaultdict(<class 'int'>, {12: 61, 11: 7})
[(12, 61)]
S21 defaultdict(<class 'int'>, {15: 60})
[(15, 60)]
S22 defaultdict(<class 'int'>, {12: 69, 11: 6})
[(12, 69)]
S23 defaultdict(<class 'int'>, {17: 29, 16: 3})
[(17, 29)]
S24 defaultdict(<class 'int'>, {11: 23})
[(11, 23)]
S25 defaultdict(<class 'int'>, {12: 92, 17: 75, 1

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {11: 621, 9: 317, 10: 45, 7: 7, 8: 11})
[(11, 621), (9, 317)]
S10 defaultdict(<class 'int'>, {10: 400, 9: 21, 8: 4, 5: 4, 7: 2})
[(10, 400)]
S11 defaultdict(<class 'int'>, {9: 370, 8: 10, 10: 4, 7: 2})
[(9, 370)]
S12 defaultdict(<class 'int'>, {9: 345, 8: 19, 7: 2, 6: 4, 2: 1})
[(9, 345)]
S13 defaultdict(<class 'int'>, {9: 319, 8: 22, 5: 7, 7: 4, 10: 5})
[(9, 319)]
S14 defaultdict(<class 'int'>, {9: 463, 8: 45})
[(9, 463)]
S15 defaultdict(<class 'int'>, {9: 318, 8: 22, 6: 4, 7: 4, 5: 1})
[(9, 318)]
S16 defaultdict(<class 'int'>, {11: 343, 10: 36, 7: 4})
[(11, 343)]
S17 defaultdict(<class 'int'>, {11: 443, 10: 43, 7: 4, 12: 2})
[(11, 443)]
S18 defaultdict(<class 'int'>, {9: 373, 8: 22, 7: 2, 4: 1})
[(9, 373)]
S19 defaultdict(<class 'int'>, {10: 288, 9: 20, 7: 3, 11: 1})
[(10, 288)]
S2 defaultdict(<class 'int'>, {10: 311, 9: 23, 8: 3, 7: 2})
[(10, 311)]
S20 defaultdict(<class 'int'>, {10: 606, 9: 29, 11: 4, 7: 4, 5: 2})
[(10, 606)]
S21 defaultdict(<class 'in

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {14: 429, 17: 238, 13: 88, 16: 54, 15: 26, 18: 17, 12: 12})
[(14, 429), (17, 238)]
S10 defaultdict(<class 'int'>, {17: 268, 16: 56, 18: 20, 15: 11})
[(17, 268)]
S11 defaultdict(<class 'int'>, {17: 233, 16: 47, 18: 12, 15: 14, 14: 2, 19: 3})
[(17, 233)]
S12 defaultdict(<class 'int'>, {17: 215, 16: 25, 18: 15, 15: 8, 10: 2, 12: 2})
[(17, 215)]
S13 defaultdict(<class 'int'>, {16: 271, 15: 57, 17: 26, 14: 9})
[(16, 271)]
S14 defaultdict(<class 'int'>, {15: 384, 14: 56, 16: 19, 13: 7})
[(15, 384)]
S15 defaultdict(<class 'int'>, {16: 175, 15: 46, 17: 20, 11: 2})
[(16, 175)]
S16 defaultdict(<class 'int'>, {15: 134, 14: 21, 16: 7, 13: 4, 12: 1})
[(15, 134)]
S17 defaultdict(<class 'int'>, {17: 213, 16: 56, 18: 20, 15: 7, 14: 4})
[(17, 213)]
S18 defaultdict(<class 'int'>, {16: 243, 15: 44, 17: 25, 18: 2, 14: 2, 9: 1})
[(16, 243)]
S19 defaultdict(<class 'int'>, {10: 310, 9: 21, 11: 9, 8: 2})
[(10, 310)]
S2 defaultdict(<class 'int'>, {16: 245, 15: 46, 17: 16, 13: 1, 1

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {30: 70, 37: 31, 36: 18, 29: 20, 28: 11, 35: 8, 15: 2, 25: 1})
[(30, 70), (37, 31)]
S10 defaultdict(<class 'int'>, {36: 41, 35: 22, 34: 7, 37: 2, 8: 1, 7: 1, 38: 1, 32: 2, 10: 1, 31: 1, 5: 1, 29: 1})
[(36, 41), (35, 22)]
S11 defaultdict(<class 'int'>, {30: 44, 29: 15, 28: 8, 25: 2, 27: 3, 26: 2, 6: 1, 5: 1})
[(30, 44), (29, 15)]
S12 defaultdict(<class 'int'>, {33: 49, 32: 33, 34: 2, 31: 4, 30: 2, 29: 3, 10: 2, 5: 1, 6: 1})
[(33, 49), (32, 33)]
S13 defaultdict(<class 'int'>, {33: 45, 32: 10, 35: 4, 31: 7, 34: 3, 13: 1, 14: 1, 8: 1, 29: 1, 17: 1, 12: 1, 18: 1})
[(33, 45)]
S14 defaultdict(<class 'int'>, {28: 112, 27: 34, 26: 13, 25: 5, 24: 1, 29: 1})
[(28, 112)]
S15 defaultdict(<class 'int'>, {31: 37, 30: 14, 29: 10, 32: 2, 27: 2, 28: 5, 7: 1, 26: 1, 12: 1})
[(31, 37), (30, 14)]
S16 defaultdict(<class 'int'>, {29: 65, 28: 16, 27: 9, 26: 3, 24: 1, 8: 1, 5: 1, 15: 1})
[(29, 65)]
S17 defaultdict(<class 'int'>, {28: 102, 27: 27, 26: 12, 29: 2, 17: 1})
[(28, 102)]

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {16: 439, 14: 372, 18: 17, 15: 37, 13: 20, 12: 3})
[(16, 439), (14, 372)]
S10 defaultdict(<class 'int'>, {16: 356, 18: 13, 15: 22, 12: 1})
[(16, 356)]
S11 defaultdict(<class 'int'>, {15: 268, 16: 25, 14: 25, 17: 20})
[(15, 268)]
S12 defaultdict(<class 'int'>, {13: 348, 15: 34, 12: 11, 14: 2})
[(13, 348)]
S13 defaultdict(<class 'int'>, {15: 371, 16: 32, 17: 27, 14: 19, 13: 4, 9: 2, 10: 2})
[(15, 371)]
S14 defaultdict(<class 'int'>, {14: 359, 16: 36, 15: 8, 13: 14, 12: 2})
[(14, 359)]
S15 defaultdict(<class 'int'>, {14: 215, 13: 27, 16: 10, 15: 6})
[(14, 215)]
S16 defaultdict(<class 'int'>, {14: 219, 13: 25, 16: 11})
[(14, 219)]
S17 defaultdict(<class 'int'>, {14: 254, 16: 25, 13: 13, 15: 5})
[(14, 254)]
S18 defaultdict(<class 'int'>, {14: 246, 16: 24, 13: 16, 15: 4})
[(14, 246)]
S19 defaultdict(<class 'int'>, {14: 259, 13: 20, 16: 14, 15: 6})
[(14, 259)]
S2 defaultdict(<class 'int'>, {15: 273, 17: 20, 16: 23, 14: 16})
[(15, 273)]
S20 defaultdict(<class 'int

HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

S1 defaultdict(<class 'int'>, {10: 357, 9: 15, 6: 4, 7: 1, 8: 2})
[(10, 357)]
S10 defaultdict(<class 'int'>, {10: 107, 9: 9, 3: 1, 8: 1, 7: 1})
[(10, 107)]
S11 defaultdict(<class 'int'>, {8: 142, 7: 6, 5: 1, 4: 1})
[(8, 142)]
S12 defaultdict(<class 'int'>, {8: 85, 7: 8, 2: 1, 4: 1})
[(8, 85)]
S13 defaultdict(<class 'int'>, {6: 129, 5: 2, 3: 1})
[(6, 129)]
S14 defaultdict(<class 'int'>, {11: 135, 10: 8, 12: 1, 9: 3, 7: 1, 8: 1})
[(11, 135)]
S15 defaultdict(<class 'int'>, {10: 109, 9: 6, 8: 2, 7: 1, 11: 1})
[(10, 109)]
S16 defaultdict(<class 'int'>, {12: 67, 13: 2, 6: 1, 10: 2, 11: 2})
[(12, 67)]
S17 defaultdict(<class 'int'>, {11: 111, 6: 2, 10: 7, 7: 1, 8: 1, 12: 1})
[(11, 111)]
S18 defaultdict(<class 'int'>, {12: 108, 11: 16, 10: 1, 8: 1})
[(12, 108)]
S19 defaultdict(<class 'int'>, {8: 111, 7: 6, 6: 2, 4: 1})
[(8, 111)]
S2 defaultdict(<class 'int'>, {11: 125, 10: 4, 7: 2})
[(11, 125)]
S20 defaultdict(<class 'int'>, {8: 114, 7: 4, 5: 2, 6: 1})
[(8, 114)]
S21 defaultdict(<class 'int'>, 

In [78]:
alleles

defaultdict(int, {14: 2451, 13: 37, 12: 12})

In [54]:
# Try a hand-written pattern against the reads left in `r` by an earlier cell
# and rank the match results by element 1, largest first.
pattern = '[TCT]n tcc [TCT]n [TCCTCT]n [TCT]n'
rp = GreedyRepeatPattern(pattern)
# `r` holds (read, count) pairs as returned by get_mc_read, so only the read
# sequence is passed to `match`, not the whole tuple.
result = sorted(
    (rp.match(read) for read, _count in r),
    key=lambda match: match[1],
    reverse=True,
)

In [55]:
result

[([<str_finder.repeat_pattern.Match at 0x118fd0be0>,
   <str_finder.repeat_pattern.Match at 0x118fd0c18>,
   <str_finder.repeat_pattern.Match at 0x118fd0d68>,
   <str_finder.repeat_pattern.Match at 0x118fd0ac8>,
   <str_finder.repeat_pattern.Match at 0x118fd0e10>,
   <str_finder.repeat_pattern.Match at 0x118fd0b38>,
   <str_finder.repeat_pattern.Match at 0x118fd0ba8>,
   <str_finder.repeat_pattern.Match at 0x118fd0b00>,
   <str_finder.repeat_pattern.Match at 0x118fd0d30>,
   <str_finder.repeat_pattern.Match at 0x118fd0da0>,
   <str_finder.repeat_pattern.Match at 0x118fd0eb8>,
   <str_finder.repeat_pattern.Match at 0x118fd0a58>,
   <str_finder.repeat_pattern.Match at 0x116c42048>,
   <str_finder.repeat_pattern.Match at 0x116c42080>,
   <str_finder.repeat_pattern.Match at 0x116c420b8>,
   <str_finder.repeat_pattern.Match at 0x116c420f0>,
   <str_finder.repeat_pattern.Match at 0x116c42128>,
   <str_finder.repeat_pattern.Match at 0x116c42160>,
   <str_finder.repeat_pattern.Match at 0x116c4

In [20]:
# Importing the submodule explicitly (this also binds `tqdm`) so that
# `tqdm.notebook.tqdm` is available without first going through the
# deprecated `tqdm.tqdm_notebook` wrapper.
import tqdm.notebook

# For each locus, count in how many samples each distinct pattern string
# (the ' | '-joined output of CasualPatternsGenerator) was produced.
common_patterns_stats = defaultdict(Counter)
for l in tqdm.notebook.tqdm(loci_samples_dict):
    # This cell analyses a single locus only; every other locus is skipped.
    if l != 'GATA172D05':
        continue
    locus_rp = str_markers_ref.query(f'LocusName == "{l}"').RepeatStructure.values[0]
    print('Locus: ', l, locus_rp)
    for s, path in tqdm.notebook.tqdm(loci_samples_dict[l].items()):
        r = get_mc_read(path)
        # get_mc_read returns None when no read passes its filter; skip such
        # samples instead of failing when iterating over None.
        if not r:
            continue
        # NOTE(review): if get_mc_read yields (read, count) tuples, as in its
        # definition at the top of the notebook, `'N' not in x` is a
        # tuple-membership test rather than a substring test - confirm the
        # intended filter and what CasualPatternsGenerator expects as input.
        # `leave=False` removes each per-sample bar once it completes so the
        # cell output is not flooded with one finished widget per sample.
        patterns = list({
            ' | '.join(CasualPatternsGenerator(x).patterns)
            for x in tqdm.notebook.tqdm(r, leave=False) if 'N' not in x
        })
        # Each distinct pattern string counts once per sample.
        common_patterns_stats[l].update(patterns)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))

Locus:  GATA172D05 nan


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))






**Manual annotation**

In [56]:
# Hand-written repeat structure used by the manual annotation in the next cell;
# it is the same string as in the earlier exploratory cell.
# NOTE(review): presumably `[MOTIF]n` marks a repeated motif and the lowercase
# `tcc` a fixed interrupting sequence - confirm against GreedyRepeatPattern.
pattern = '[TCT]n tcc [TCT]n [TCCTCT]n [TCT]n'

In [57]:
# Importing the submodule explicitly (this also binds `tqdm`) so that
# `tqdm.notebook.tqdm` can replace the deprecated `tqdm.tqdm_notebook`.
import tqdm.notebook

# Locus whose samples are matched against the manually written `pattern`.
l = 'DXS8377'

# The matcher depends only on `pattern`, so it is built once and reused for
# every read of every sample.
rp = GreedyRepeatPattern(pattern)

# pattern -> list with one entry per sample: the best (largest) second element
# of the match results over that sample's reads, kept only when positive.
allele_dict = defaultdict(list)
for s, path in tqdm.notebook.tqdm(loci_samples_dict[l].items()):
    r = get_mc_read(path)
    # Skip samples with no usable reads (None or an empty list).
    if not r:
        continue

    # `max` returns the first maximal item, i.e. the same element that a
    # stable descending sort would place first.
    result = max((rp.match(rr) for rr in r), key=lambda match: match[1])
    if result[1] > 0:
        allele_dict[pattern].append(result[1])

# One summary row per pattern: (pattern, number of samples, sum, maximum),
# ordered by the sum and then by fewer whitespace-separated pattern parts.
print(sorted(
    [
        (pat, len(occ), sum(occ), max(occ))
        for pat, occ in allele_dict.items()
    ],
    key=lambda row: (row[2], -len(row[0].split())), reverse=True
))

# for allele_str, _ in common_patterns_stats[l].most_common(4):
#     print('\n\n\n')
#     allele_dict = defaultdict(list)
#     for s, path in tqdm.tqdm_notebook(loci_samples_dict[l].items()):
#         r = get_mc_read(path)
#         if r is None:
#             continue

#         for pattern in allele_str.split(' | '):
#             rp = GreedyRepeatPattern(pattern)
#             result = sorted([rp.match(rr) for rr in r], key=lambda x: x[1], reverse=True)[0]
#             if result[1] > 0:
#                 allele_dict[pattern].append(result[1])
#     print(sorted([
#         (pattern, len(occ), sum(occ), max(occ))
#         for pattern, occ in allele_dict.items()],
#         key=lambda x: (x[2], -len(x[0].split())), reverse=True
#     ))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))


[('[TCT]n tcc [TCT]n [TCCTCT]n [TCT]n', 47, 1952, 47)]
