In [1]:
import os
# Make the project root the working directory so the relative `data/...`
# paths used by the cells below resolve. Raises KeyError if the
# PROJECT_ROOT environment variable is not set.
os.chdir(os.environ['PROJECT_ROOT'])

# Load IPython's autoreload extension and use mode 2 so imported modules are
# reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

# Reference (Forensic v5)

In [2]:
import pandas as pd

# Open the reference workbook once; the cells below parse individual sheets
# from this handle.
ref = pd.ExcelFile('data/Forensic_STR_Sequence_Structure_Guide_v5.xlsx')
# Show the sheet names available in the workbook.
ref.sheet_names

['S1A. Common Use A-STRs',
 'S1B. Common Use XY-STRs',
 'S1C. Additional A-STRs',
 'S1D. Additional XY-STRs',
 'Sequence strings for all STRs',
 'Change log']

**A-STRs**

In [3]:
from collections import defaultdict

def retrieve_markers_info(df, meta=''):
    """Collect marker name, repeat pattern and position from a parsed sheet.

    A row counts as a marker row when the value in the first column is
    greater than zero (missing values are treated as zero). For every marker
    row the marker name is read from the fifth column, the repeat pattern
    from the fourth column, and the position from the second column of the
    row immediately below it.

    All lookups are positional, so the result does not depend on the column
    labels or on the row index of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed sheet with at least five columns. It is not modified.
    meta : str, optional
        Label stored under 'metadata' for every collected marker.

    Returns
    -------
    collections.defaultdict
        Equal-length lists under the keys 'marker', 'pattern', 'position'
        and 'metadata'. When a marker row is the last row of ``df`` there is
        no following row, and its position is recorded as ``None``.
    """
    markers = defaultdict(list)
    # Fill a copy of the first column instead of writing back into the
    # caller's frame.
    flags = df.iloc[:, 0].fillna(0)
    n_rows = len(df)
    for pos, flag in enumerate(flags):
        if flag > 0:
            markers['marker'].append(df.iat[pos, 4])
            markers['pattern'].append(df.iat[pos, 3])
            # The position lives on the row below the marker row; guard the
            # boundary so a trailing marker row does not raise.
            next_pos = pos + 1
            markers['position'].append(
                df.iat[next_pos, 1] if next_pos < n_rows else None
            )
            markers['metadata'].append(meta)
    return markers

In [4]:
# Parse the common autosomal STR sheet and extract its marker rows.
# NOTE(review): this sheet skips 3 leading rows while the other reference
# sheets skip 2 — confirm against the workbook layout.
a_strs_df = ref.parse(sheet_name='S1A. Common Use A-STRs', skiprows=3)
a_str_markers = retrieve_markers_info(df=a_strs_df, meta='autosomal')

**XY-STRs**

In [5]:
# Parse the common X/Y STR sheet and extract its marker rows.
xy_strs_df = ref.parse(sheet_name='S1B. Common Use XY-STRs', skiprows=2)
xy_str_markers = retrieve_markers_info(df=xy_strs_df, meta='xy')

**A-STRs (additional)**

In [6]:
# Parse the additional autosomal STR sheet and extract its marker rows.
a_strs_df_add = ref.parse(sheet_name='S1C. Additional A-STRs', skiprows=2)
a_str_markers_add = retrieve_markers_info(df=a_strs_df_add, meta='autosomal')

**XY-STRs (additional)**

In [7]:
# Parse the additional X/Y STR sheet and extract its marker rows.
xy_strs_df_add = ref.parse(sheet_name='S1D. Additional XY-STRs', skiprows=2)
xy_str_markers_add = retrieve_markers_info(df=xy_strs_df_add, meta='xy')

In [18]:
# Turn each per-sheet marker dict into a frame and stack them under a fresh
# 0..n-1 row index.
marker_tables = [
    pd.DataFrame(table)
    for table in (a_str_markers, xy_str_markers, a_str_markers_add, xy_str_markers_add)
]
markers_df = pd.concat(marker_tables).reset_index(drop=True)

# Fix the column order used by the exported files.
markers_df = markers_df[['position', 'marker', 'pattern', 'metadata']]

# Replace every run of whitespace in a marker name with a single underscore.
markers_df['marker'] = markers_df['marker'].map(lambda name: '_'.join(name.split()))

In [23]:
# Full reference table, one row per marker, without the row index.
markers_df.to_csv('data/str_data/ref_str_data.csv', index=False)

# Compact companion file: one "<position> <marker>" line per marker,
# space-separated and without a header row.
with open('data/str_data/ref_str_data_small.csv', 'w') as small_file:
    small_file.writelines(
        f'{position} {marker}\n'
        for position, marker in zip(markers_df['position'], markers_df['marker'])
    )

In [3]:
from str_finder.utils import inspect_storage_dir

# Inspect the per-locus read storage directory. The annotation loop below
# uses samples_loci_dict as sample -> {locus: path}.
# NOTE(review): loci_samples_dict is presumably the inverse mapping
# (locus -> samples) — confirm in str_finder.utils.
samples_loci_dict, loci_samples_dict = inspect_storage_dir('data/str_data/loci_reads')

# STR markers

In [4]:
import pandas as pd

# Load the STR pattern reference table.
str_markers_ref = pd.read_excel('data/str_data/STR_patterns.xlsx')

# Keep only loci for which an Allele1Pattern is defined.
has_allele1_pattern = str_markers_ref['Allele1Pattern'].notna()
valid_markers = str_markers_ref[has_allele1_pattern]
valid_markers.head()

Unnamed: 0,Code,Chr,LocusStart,LocusEnd,LocusName,LocusLength,RepeatStructure,IsSTR,Allele1Pattern,Allele2Pattern,Unnamed: 10
1,chr1:163589926-163590185 D1S1677,chr1,163589926,163590185,D1S1677,259,[TTCC]n,True,[TTCC]n,[AAGG]n,
7,chr12:107928490-107928728 D12ATA63,chr12,107928490,107928728,D12ATA63,238,[TTG]n [TTA]n,True,[TTG]n [TTA]n,[TAA]n CA [ACA]n,"[('[TGT]n TA [TTA]n', 48, 843, 21)]"
11,chr14:94841954-94842205 D14S1434,chr14,94841954,94842205,D14S1434,251,[CTGT]n [CTAT]n,True,[CTGT]n [CTAT]n,[GGAT]n AG [ATAG]n [ACAG]n,"[('[CTGT]n [CTAT]n CC [ATCC]n', 48, 793, 17)]"
35,chr2:168788793-168789036 D2S1776,chr2,168788793,168789036,D2S1776,243,[AGAT]n,True,[AGAT]n,[ATCT]n,"[('[AGAT]n [GATA]n', 48, 669, 16)]"
46,chr3:85803384-85803631 D3S4529,chr3,85803384,85803631,D3S4529,247,[GATA]n AATA [GATA]n,True,[GATA]n,[CTAT]n TTA [TCTA]n [TCTG]n,


# Annotation

In [6]:
import tqdm
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its documented
# replacement and lives in this submodule.
import tqdm.notebook
from str_finder.utils import *
from str_finder.repeat_pattern import GreedyRepeatPattern

# Build the locus -> Allele1Pattern lookup once instead of re-querying the
# frame for every locus of every sample. `drop_duplicates` keeps the first
# row per locus, matching the first-match semantics of a filtered lookup.
locus_patterns = (
    valid_markers
    .drop_duplicates(subset='LocusName')
    .set_index('LocusName')['Allele1Pattern']
    .to_dict()
)

# One single-row DataFrame per sample; they are concatenated in a later cell.
results = []
for sample_id in tqdm.notebook.tqdm(samples_loci_dict):
    # Collect the sample's columns in a plain dict and build the one-row frame
    # once at the end, rather than inserting columns into a DataFrame one by one.
    record = {'sample_id': sample_id}
    for locus, path in samples_loci_dict[sample_id].items():
        # Skip loci that have no usable reference pattern.
        if locus not in locus_patterns:
            continue

        reads = get_most_common_reads_from_path(path)
        if reads is None:
            for allele_id in range(1, 3):
                record[f'{locus}_{allele_id}'] = 0
                record[f'{locus}_{allele_id}_annotation'] = 'No available reads (with length > 100)'
            continue

        # Repeat pattern
        locus_rp = locus_patterns[locus]
        if isinstance(locus_rp, str):
            rp = GreedyRepeatPattern(locus_rp)

            alleles, allele_annotations = check_repeat_pattern(rp, reads)
            # Each entry of `result` is an (allele, supporting read count) pair.
            result = retrieve_true_alleles(alleles)
            print(locus, result)

            for allele_id, (allele, support) in enumerate(result, start=1):
                record[f'{locus}_{allele_id}'] = allele
                record[f'{locus}_{allele_id}_annotation'] = allele_annotations[allele]
                record[f'{locus}_{allele_id}_support (# reads)'] = support
        else:
            # A non-string pattern cannot be turned into a repeat pattern.
            for allele_id in range(1, 3):
                record[f'{locus}_{allele_id}'] = 0
                record[f'{locus}_{allele_id}_annotation'] = 'No available patterns'
    results.append(pd.DataFrame([record]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))

D12ATA63 [(13, 166), (16, 136)]
D14S1434 [(12, 409), (13, 292)]
D1S1677 [(13, 197), (15, 158)]
D2S1776 [(8, 538), (11, 475)]
D3S4529 [(9, 806)]
D5S2800 [(17, 900)]
D6S474 [(15, 385), (12, 379)]
DXS10075 [(12, 477), (13, 389)]
DXS10079 [(18, 231), (21, 199)]
DXS10101 [(30, 328)]
DXS10134 [(13, 2451)]
DXS10146 [(16, 62), (15, 27)]
DXS10147 [(6, 795), (7, 677)]
DXS10148 [(15, 100), (18, 83)]
DXS7133 [(11, 621), (9, 317)]
DXS7424 [(14, 429), (17, 238)]
DXS8377 [(30, 70), (37, 31)]
DXS9895 [(16, 439), (14, 372)]
GATA172D05 [(10, 357)]
D12ATA63 [(13, 150), (17, 111)]
D14S1434 [(13, 693)]
D1S1677 [(12, 228), (14, 157)]
D2S1776 [(11, 370), (12, 330)]
D3S4529 [(7, 388), (10, 347)]
D5S2800 [(17, 473), (14, 450)]
D6S474 [(15, 339), (12, 336)]
DXS10075 [(12, 485)]
DXS10079 [(22, 204)]
DXS10101 [(29, 161)]
DXS10134 [(13, 1030)]
DXS10146 [(17, 72)]
DXS10147 [(7, 512), (9, 484)]
DXS10148 [(14, 62)]
DXS7133 [(10, 400)]
DXS7424 [(17, 268)]
DXS8377 [(36, 41), (35, 22)]
DXS9895 [(16, 356)]
GATA172D05 [(1

DXS10148 [(12, 92), (17, 75)]
DXS7133 [(11, 1510)]
DXS7424 [(16, 371), (17, 312)]
DXS8377 [(33, 76), (34, 71)]
DXS9895 [(14, 827)]
GATA172D05 [(10, 183), (8, 176)]
D12ATA63 [(17, 190)]
D14S1434 [(13, 270), (14, 262)]
D1S1677 [(14, 273)]
D2S1776 [(12, 551)]
D3S4529 [(5, 320), (9, 272)]
D5S2800 [(14, 400), (23, 244)]
D6S474 [(13, 281), (12, 280)]
DXS10075 [(11, 388)]
DXS10079 [(20, 166)]
DXS10101 [(29, 155)]
DXS10134 [(13, 849)]
DXS10146 [(17, 66)]
DXS10147 [(6, 455), (8, 382)]
DXS10148 [(12, 57)]
DXS7133 [(11, 501)]
DXS7424 [(15, 217)]
DXS8377 [(26, 105)]
DXS9895 [(16, 226)]
GATA172D05 [(10, 92)]
D12ATA63 [(18, 191)]
D14S1434 [(10, 342), (14, 284)]
D1S1677 [(13, 321)]
D2S1776 [(11, 905)]
D3S4529 [(7, 371), (10, 279)]
D5S2800 [(17, 712)]
D6S474 [(13, 731)]
DXS10075 [(10, 424)]
DXS10079 [(18, 204)]
DXS10101 [(33, 103)]
DXS10134 [(13, 949)]
DXS10146 [(17, 61)]
DXS10147 [(7, 526)]
DXS10148 [(12, 62)]
DXS7133 [(12, 533)]
DXS7424 [(17, 266)]
DXS8377 [(35, 34)]
DXS9895 [(15, 309)]
GATA172D05 [

DXS10146 [(18, 36)]
DXS10147 [(6, 624), (8, 428)]
DXS10148 [(8, 122)]
DXS7133 [(10, 751)]
DXS7424 [(15, 350)]
DXS8377 [(34, 47), (33, 17)]
DXS9895 [(14, 405)]
GATA172D05 [(11, 91)]
D12ATA63 [(13, 213)]
D14S1434 [(14, 296), (12, 287)]
D1S1677 [(14, 151), (15, 89)]
D2S1776 [(10, 366), (11, 282)]
D3S4529 [(8, 246), (9, 230)]
D5S2800 [(17, 666)]
D6S474 [(12, 293), (15, 250)]
DXS10075 [(12, 397)]
DXS10079 [(19, 195)]
DXS10101 [(28, 160)]
DXS10134 [(13, 881)]
DXS10146 [(14, 31)]
DXS10147 [(6, 415), (8, 297)]
DXS10148 [(8, 97)]
DXS7133 [(9, 290)]
DXS7424 [(13, 246)]
DXS8377 [(35, 30), (34, 11)]
DXS9895 [(14, 249)]
GATA172D05 [(10, 85)]
D12ATA63 [(13, 167), (15, 158)]
D14S1434 [(13, 482), (14, 412)]
D1S1677 [(12, 261), (14, 206)]
D2S1776 [(11, 485), (12, 394)]
D3S4529 [(6, 386), (8, 296)]
D5S2800 [(17, 1087)]
D6S474 [(13, 372), (14, 364)]
DXS10075 [(12, 566)]
DXS10079 [(19, 235)]
DXS10101 [(27, 222)]
DXS10134 [(13, 919)]
DXS10146 [(18, 80)]
DXS10147 [(7, 599), (9, 461)]
DXS10148 [(16, 66)]
DXS

In [8]:
# Stack the per-sample rows into one table with a fresh 0..n-1 index.
annotations = pd.concat(results, axis=0, ignore_index=True)

# Allele-call columns are the ones that are neither the sample id nor an
# annotation / support column.
allele_columns = [
    name
    for name in annotations.columns
    if not ('id' in name or 'annotation' in name or 'support' in name)
]

# Sanity check: every allele-call column must have a positive total.
allele_totals = annotations[allele_columns].sum(axis=0).sort_index()
assert (allele_totals > 0).all()

# Keep sample_id first and order the remaining columns alphabetically.
annotations = annotations[['sample_id'] + sorted(annotations.columns[1:])]

In [14]:
# Write the annotation table to an Excel workbook, omitting the row index.
annotations.to_excel('data/str_data/annotation_results_v4.xlsx', index=False)