In [1]:
import os
# Switch the working directory to the project root so that the relative
# 'data/...' paths used throughout the notebook are resolved from there.
# A KeyError is raised here if the PROJECT_ROOT environment variable is unset.
os.chdir(os.environ['PROJECT_ROOT'])

# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Reference (Forensic v5)

In [2]:
import pandas as pd
# Show up to 400 characters per cell so long values are not truncated in the
# rendered tables. The fully qualified option name is used because pandas
# matches abbreviated names by pattern, which can break if a similarly named
# option is added later.
pd.set_option('display.max_colwidth', 400)

# Open the reference workbook once; individual sheets are parsed from this
# handle in the cells below.
ref = pd.ExcelFile('data/Forensic_STR_Sequence_Structure_Guide_v5.xlsx')
# Display the names of the sheets available in the workbook.
ref.sheet_names

['S1A. Common Use A-STRs',
 'S1B. Common Use XY-STRs',
 'S1C. Additional A-STRs',
 'S1D. Additional XY-STRs',
 'Sequence strings for all STRs',
 'Change log']

**A-STRs**

In [3]:
from collections import defaultdict

def retrieve_markers_info(df, meta=''):
    """Collect marker, pattern and position entries from a parsed reference sheet.

    A row counts as a marker row when the value in its first column is greater
    than zero (missing values are treated as zero). For every marker row the
    marker is taken from the fifth column, the pattern from the fourth column,
    and the position from the second column of the row directly below it.

    Args:
        df: DataFrame parsed from one sheet of the reference workbook. The
            frame is only read, never modified.
        meta: Value appended to the 'metadata' list for every marker row.

    Returns:
        A ``defaultdict(list)`` holding equally long lists under the keys
        'marker', 'pattern', 'position' and 'metadata'. The keys are only
        created once at least one marker row has been found.

    Raises:
        IndexError: If the last row of ``df`` is a marker row, because there
            is no following row to read the position from.
    """
    markers = defaultdict(list)
    # Fill missing flags on a separate Series so the caller's frame is left
    # untouched.
    flags = df.iloc[:, 0].fillna(0)
    # Everything is addressed by position, so the lookup of the "next row"
    # does not depend on the frame having a default 0..n-1 index.
    for pos, flag in enumerate(flags):
        if flag > 0:
            markers['marker'].append(df.iloc[pos, 4])
            markers['pattern'].append(df.iloc[pos, 3])
            markers['position'].append(df.iloc[pos + 1, 1])
            markers['metadata'].append(meta)
    return markers

In [4]:
# Parse the common-use autosomal STR sheet, skipping its first 3 rows, and
# collect its markers tagged as 'autosomal'.
# NOTE(review): the other three sheets are parsed with skiprows=2 — confirm
# that 3 is intended for this sheet.
a_strs_df = ref.parse('S1A. Common Use A-STRs', skiprows=3)
a_str_markers = retrieve_markers_info(a_strs_df, 'autosomal')

**XY-STRs**

In [5]:
# Parse the common-use X/Y STR sheet, skipping its first 2 rows, and collect
# its markers tagged as 'xy'.
xy_strs_df = ref.parse('S1B. Common Use XY-STRs', skiprows=2)
xy_str_markers = retrieve_markers_info(xy_strs_df, 'xy')

**A-STRs (additional)**

In [6]:
# Parse the additional autosomal STR sheet, skipping its first 2 rows, and
# collect its markers tagged as 'autosomal'.
a_strs_df_add = ref.parse('S1C. Additional A-STRs', skiprows=2)
a_str_markers_add = retrieve_markers_info(a_strs_df_add, 'autosomal')

**XY-STRs (additional)**

In [7]:
# Parse the additional X/Y STR sheet, skipping its first 2 rows, and collect
# its markers tagged as 'xy'.
xy_strs_df_add = ref.parse('S1D. Additional XY-STRs', skiprows=2)
xy_str_markers_add = retrieve_markers_info(xy_strs_df_add, 'xy')

In [8]:
# Turn each per-sheet marker dictionary into a frame, then stack the four
# frames and renumber the rows from zero.
marker_tables = [
    pd.DataFrame(table)
    for table in (a_str_markers, xy_str_markers, a_str_markers_add, xy_str_markers_add)
]
markers_df = pd.concat(marker_tables).reset_index(drop=True)

# Fix the column order used by the exported files.
column_order = ['position', 'marker', 'pattern', 'metadata']
markers_df = markers_df[column_order]

# Join the whitespace-separated parts of every marker name with underscores.
markers_df['marker'] = markers_df['marker'].apply(lambda name: '_'.join(name.split()))

In [9]:
# Full reference table, without the row index.
markers_df.to_csv('data/str_data/ref_str_data.csv', index=False)

# Reduced listing with one "position marker" line per entry. Entries are left
# out when the marker name contains 'GRCH37', when the position contains '+',
# or when the marker name is 30 characters or longer.
with open('data/str_data/ref_str_data_small.csv', 'w') as small_file:
    for row_label, entry in markers_df.iterrows():
        if 'GRCH37' in entry.marker or '+' in entry.position or len(entry.marker) >= 30:
            continue
        small_file.write(f'{entry.position} {entry.marker}\n')

# STR markers

In [10]:
import pandas as pd

# Reload the reference table written by the export cell above; this frame is
# what the annotation step looks markers and patterns up in.
str_markers_ref = pd.read_csv('data/str_data/ref_str_data.csv')
# Display the loaded table.
str_markers_ref

Unnamed: 0,position,marker,pattern,metadata
0,chr1:163589926-163590185,D1S1677,[TTCC]n *,autosomal
1,chr1:230769516-230769783,D1S1656,CCTA [TCTA]n TCA [TCTA]n,autosomal
2,chr2:1489553-1489784,TPOX,[AATG]n,autosomal
3,chr2:68011847-68012094,D2S441,[TCTA]n TCA [TCTA]n,autosomal
4,chr2:168788793-168789036,D2S1776,[AGAT]n,autosomal
...,...,...,...,...
131,chrX:134520365-134520719,DXS10101,[AAAG]n gaaagaag [GAAA]n a [GAAA]n aaga [AAAG]n aaaaagaa [AAAG]n AA,xy
132,chrX:150403692-150404185,DXS10146_(GRCH38),[AAAG]n a [AAAG]n N10 [GGAA]n N10 [GGGA]n ggaaggga [GGAA]n a [GGAA]n,xy
133,chrX:150403692-150404185,DXS10146_(GRCH37),[TTCC]n t [TTCC]n t TTCC N12 TTCC TCCC N10 [TTCC]n tttctt [CTTT]n CTTC [CTTT]n t [CTTT]n,xy
134,chrX:150481726-150482131,DXS10134_(GRCH37),[GAAA]n gaga [GAAA]n aa GAAA gaga [GAAA]n N22 GAAA gtaa [GAAA]n aaa [GAAA]n aaa [GAAA]n,xy


# Annotation

## Inspect actual reads

In [11]:
from str_finder.utils import inspect_storage_dir

# Scan the per-locus read storage directory. The two results are used below as
# nested mappings: sl[sample][locus] -> path and ls[locus][sample].
# NOTE(review): the second argument is a literal backslash — presumably a path
# separator, which would make this call Windows-specific; confirm against
# inspect_storage_dir.
sl, ls = inspect_storage_dir('data/str_data/loci_reads', '\\')

**Only markers with at least 10 samples will be annotated**

In [12]:
# One row per locus, listing the samples available for it as a
# comma-separated string.
locus_overview = pd.DataFrame({
    'locus': [locus_name for locus_name in ls.keys()],
    'samples': [', '.join(per_sample.keys()) for per_sample in ls.values()],
})
display(locus_overview)

Unnamed: 0,locus,samples
0,D12ATA63,"S1, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S2, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S3, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S4, S40, S41, S42, S43, S44, S45, S46, S47, S48, S5, S6, S7, S8, S9"
1,D14S1434,"S1, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S2, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S3, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S4, S40, S41, S42, S43, S44, S45, S46, S47, S48, S5, S6, S7, S8, S9"
2,D14S608,S1
3,D1S1677,"S1, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S2, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S3, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S4, S40, S41, S42, S43, S44, S45, S46, S47, S48, S5, S6, S7, S8, S9"
4,D2S1360,S1
5,D2S1776,"S1, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S2, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S3, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S4, S40, S41, S42, S43, S44, S45, S46, S47, S48, S5, S6, S7, S8, S9"
6,D3S4529,"S1, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S2, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S3, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S4, S40, S41, S42, S43, S44, S45, S46, S47, S48, S5, S6, S7, S8, S9"
7,D4S2366,"S1, S17"
8,D5S2800,"S1, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S2, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S3, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S4, S40, S41, S42, S43, S44, S45, S46, S47, S48, S5, S6, S7, S8, S9"
9,D6S474,"S1, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S2, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S3, S30, S31, S32, S33, S34, S35, S36, S37, S38, S39, S4, S40, S41, S42, S43, S44, S45, S46, S47, S48, S5, S6, S7, S8, S9"


## Annotate

In [13]:
import tqdm 
from str_finder.utils import *
from str_finder.repeat_pattern import GreedyRepeatPattern

def get_pattern(str_markers_ref, locus):
    """Return the reference repeat pattern stored for ``locus``.

    Args:
        str_markers_ref: DataFrame with at least the columns 'marker' and
            'pattern'.
        locus: Marker name to look up; the first matching row is used.

    Returns:
        The pattern string. When it ends with '*', its last
        whitespace-separated token is dropped and the remaining tokens are
        joined with single spaces. A value that is not a string (for example
        a missing pattern read back as NaN) is returned unchanged, so the
        caller can detect it with ``isinstance(..., str)``.

    Raises:
        IndexError: If no row of ``str_markers_ref`` has ``marker == locus``.
    """
    # A boolean mask avoids building a query string from the locus name.
    matches = str_markers_ref.loc[str_markers_ref['marker'] == locus, 'pattern']
    locus_rp = matches.values[0]
    # Only strings can carry the trailing '*' marker; indexing a NaN (float)
    # or an empty string here used to raise before the caller could react.
    if isinstance(locus_rp, str) and locus_rp.endswith('*'):
        locus_rp = ' '.join(locus_rp.split()[:-1])
    return locus_rp


# tqdm.tqdm_notebook is deprecated; use the notebook progress bar class directly.
from tqdm.notebook import tqdm as notebook_tqdm

# A locus is annotated only when at least this many samples have reads for it.
MIN_SAMPLES_PER_LOCUS = 10

# Build the marker lookup once instead of converting the column to a list for
# every (sample, locus) pair.
known_markers = set(str_markers_ref.marker)

# One single-row DataFrame per sample; they are concatenated in the next cell.
results = []
for sample_id in notebook_tqdm(sl):
    # Collect all values for the sample first and build its row once at the end.
    record = {'sample_id': sample_id}
    for locus, path in sl[sample_id].items():
        # Known locus
        if locus not in known_markers:
            continue
        # Locus with enough samples
        if len(ls[locus]) < MIN_SAMPLES_PER_LOCUS:
            continue

        # Retrieve the most common read (to speed up the process and remove outlier - reads with errors)
        reads = get_most_common_reads_from_path(path)

        if reads is None:
            for allele_id in range(1, 3):
                record[f'{locus}_{allele_id}'] = 0
                record[f'{locus}_{allele_id}_annotation'] = 'No available reads (with length > 100)'
            continue

        # Repeat pattern
        locus_rp = get_pattern(str_markers_ref, locus)

        if isinstance(locus_rp, str):
            rp = GreedyRepeatPattern(locus_rp)

            # Named allele_annotations so it is not confused with the
            # `annotations` frame assembled after this loop.
            alleles, allele_annotations = check_repeat_pattern(rp, reads)
            result = retrieve_true_alleles(alleles)

            for allele_id, (allele, support) in enumerate(result, start=1):
                record[f'{locus}_{allele_id}'] = allele
                record[f'{locus}_{allele_id}_annotation'] = allele_annotations[allele]
                record[f'{locus}_{allele_id}_support (# reads)'] = support
        else:
            # No usable reference pattern for this locus.
            for allele_id in range(1, 3):
                record[f'{locus}_{allele_id}'] = 0
                record[f'{locus}_{allele_id}_annotation'] = 'No available patterns'
    results.append(pd.DataFrame([record]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [14]:
# Stack the per-sample rows into one table with a fresh 0..n-1 index.
annotations = pd.concat(results, axis=0).reset_index(drop=True)

# Sanity check: every allele value column (names containing neither 'id',
# 'annotation' nor 'support') must sum to more than zero across all samples.
allele_columns = [
    column
    for column in annotations.columns
    if not ('id' in column or 'annotation' in column or 'support' in column)
]
allele_totals = annotations[allele_columns].sum(axis=0).sort_index()
assert (allele_totals > 0).all()

# Keep the first column name 'sample_id' in front and sort the remaining
# columns by name.
sorted_columns = sorted(annotations.columns[1:])
annotations = annotations[['sample_id'] + sorted_columns]

In [15]:
!pip install openpyxl



In [16]:
# Write the annotation table to an Excel workbook, without the row index.
annotations.to_excel('data/str_data/annotation_results_v5.xlsx', index=False)