In [1]:
import os
# Work from the project root so the relative 'data/...' paths used by later
# cells resolve. Raises KeyError when PROJECT_ROOT is not set.
os.chdir(os.environ['PROJECT_ROOT'])

# Reload edited modules automatically before each cell runs (mode 2: all modules).
%load_ext autoreload
%autoreload 2

In [2]:
from str_finder.utils import inspect_storage_dir

# Scan the read storage directory with the project helper (its scan rules are
# not shown in this notebook).
# samples_loci_dict is iterated later as sample -> {locus: path}.
# loci_samples_dict is presumably the inverse (locus -> samples) — TODO confirm.
samples_loci_dict, loci_samples_dict = inspect_storage_dir('data/str_data/loci_reads')

# STR markers

In [4]:
import pandas as pd

# Reference sheet describing the STR loci and their repeat patterns.
str_markers_ref = pd.read_excel('data/str_data/STR_patterns.xlsx')

# Keep only the loci whose Allele1Pattern cell is filled in.
has_allele1_pattern = str_markers_ref['Allele1Pattern'].notna()
valid_markers = str_markers_ref[has_allele1_pattern]
valid_markers.head()

Unnamed: 0,Code,Chr,LocusStart,LocusEnd,LocusName,LocusLength,RepeatStructure,IsSTR,Allele1Pattern,Allele2Pattern,Unnamed: 10
1,chr1:163589926-163590185 D1S1677,chr1,163589926,163590185,D1S1677,259,[TTCC]n,True,[TTCC]n,[AAGG]n,
7,chr12:107928490-107928728 D12ATA63,chr12,107928490,107928728,D12ATA63,238,[TTG]n [TTA]n,True,[TTG]n [TTA]n,[TAA]n CA [ACA]n,"[('[TGT]n TA [TTA]n', 48, 843, 21)]"
11,chr14:94841954-94842205 D14S1434,chr14,94841954,94842205,D14S1434,251,[CTGT]n [CTAT]n,True,[CTGT]n [CTAT]n,[GGAT]n AG [ATAG]n [ACAG]n,"[('[CTGT]n [CTAT]n CC [ATCC]n', 48, 793, 17)]"
35,chr2:168788793-168789036 D2S1776,chr2,168788793,168789036,D2S1776,243,[AGAT]n,True,[AGAT]n,[ATCT]n,"[('[AGAT]n [GATA]n', 48, 669, 16)]"
46,chr3:85803384-85803631 D3S4529,chr3,85803384,85803631,D3S4529,247,[GATA]n AATA [GATA]n,True,[GATA]n,[CTAT]n TTA [TCTA]n [TCTG]n,


# Annotation

In [5]:
import tqdm.notebook
# NOTE(review): the star import is left in place in case other cells rely on it.
# This cell needs get_most_common_reads_from_path, check_repeat_pattern and
# retrieve_true_alleles, which presumably come from here — prefer explicit imports.
from str_finder.utils import *
from str_finder.repeat_pattern import GreedyRepeatPattern


def _fill_missing_alleles(row, locus, reason):
    """Record a placeholder call (allele 0 plus `reason`) in both allele slots of `locus`.

    `row` is the single-row DataFrame of the current sample; it is modified in place.
    """
    for allele_id in range(1, 3):
        row[f'{locus}_{allele_id}'] = 0
        row[f'{locus}_{allele_id}_annotation'] = reason


# Loop-invariant lookup: locus name -> Allele1Pattern. Built once so the loop
# neither rebuilds the locus list nor re-queries the frame for every
# (sample, locus) pair. drop_duplicates keeps the first row per locus, which is
# the row the previous `.query(...).values[0]` lookup selected.
allele1_pattern_by_locus = (
    valid_markers
    .drop_duplicates(subset='LocusName')
    .set_index('LocusName')['Allele1Pattern']
    .to_dict()
)

# One single-row DataFrame per sample; they are concatenated in a later cell.
results = []
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for sample_id in tqdm.notebook.tqdm(samples_loci_dict):
    sample_row = pd.DataFrame({'sample_id': [sample_id]})
    for locus, path in samples_loci_dict[sample_id].items():
        # Loci without a reference pattern are not annotated at all.
        if locus not in allele1_pattern_by_locus:
            continue

        reads = get_most_common_reads_from_path(path)
        if reads is None:
            _fill_missing_alleles(sample_row, locus, 'No available reads (with length > 100)')
            continue

        # Repeat pattern
        locus_rp = allele1_pattern_by_locus[locus]
        if not isinstance(locus_rp, str):
            # Non-string cell in the reference sheet: nothing to match against.
            _fill_missing_alleles(sample_row, locus, 'No available patterns')
            continue

        rp = GreedyRepeatPattern(locus_rp)
        # Named allele_annotations so it does not share a name with the
        # `annotations` DataFrame built in a later cell.
        alleles, allele_annotations = check_repeat_pattern(rp, reads)

        # Each item unpacks to (allele, number of supporting reads); slots are 1-based.
        for slot, (allele, support) in enumerate(retrieve_true_alleles(alleles), start=1):
            sample_row[f'{locus}_{slot}'] = allele
            sample_row[f'{locus}_{slot}_annotation'] = allele_annotations[allele]
            sample_row[f'{locus}_{slot}_support (# reads)'] = support
    results.append(sample_row)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [8]:
# Stack the per-sample single-row frames into one table, one row per sample.
annotations = pd.concat(results, axis=0).reset_index(drop=True)

# Allele-call columns are whatever remains once every column whose name
# contains 'id', 'annotation' or 'support' is excluded.
allele_columns = [
    c for c in annotations.columns
    if not ('id' in c or 'annotation' in c or 'support' in c)
]

# Sanity check: every allele column must sum to a positive value, i.e. no
# allele column consists solely of the 0 placeholder.
allele_totals = annotations[allele_columns].sum(axis=0).sort_index()
assert (allele_totals > 0).all()

# Keep sample_id first and order the remaining columns alphabetically.
annotations = annotations[['sample_id', *sorted(annotations.columns[1:])]]

In [14]:
# Write the annotation table to an Excel file; index=False leaves the
# DataFrame's row index out of the sheet.
annotations.to_excel('data/str_data/annotation_results_v4.xlsx', index=False)