# Laragen validation

Based on the reviewer comments, we need to validate the sequencing calls from Laragen and combine them with the data from our Nanopore sequencing.

The analysis proceeds in three steps:

1. Read in the Laragen sequencing and "flag" low quality sequences
2. Compare the calling performance of Laragen vs Nanopore
3. Compare the two nanopore runs (using different barcodes and run on different days)

In [61]:
from Bio import SeqIO
import pandas as pd
import os

# Parse every Sanger trace (.ab1) found in data_dir into three parallel lists:
# the source filename, the base-called sequence, and the string summary of the
# record (stored as free-text "labels").
labels = []
full_seq = []
filenames = []
data_dir = '../Data/LevSeq-review-sanger/'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the CSV written below reproducible between runs.
files = sorted(os.listdir(data_dir))
for f in files:
    if '.ab1' in f:
        # The context manager closes each trace file as soon as it is parsed,
        # so file handles do not accumulate across the loop.
        with open(f'{data_dir}{f}', "rb") as handle:
            for record in SeqIO.parse(handle, "abi"):
                # The base calls are in record.seq
                note_str = str(record)
                full_seq.append(str(record.seq))
                labels.append(note_str)
                filenames.append(f)

df = pd.DataFrame({'filename': filenames, 'seq': full_seq, 'labels': labels})
df.to_csv('../Data/Laragen_Validation/ab1_processed.csv', index=False)

# Work on a copy ordered by the numeric id at the start of each filename
# (the token before the first '_' is parsed as an integer).
laragen_seqs = df.copy()
laragen_seqs['id'] = [int(num.split('_')[0]) for num in laragen_seqs['filename'].values]
laragen_seqs = laragen_seqs.sort_values('id')


## There are two barcodes that we test to show that LevSeq gets the same result irrespective of barcoding

In [62]:
# Select the LevSeq (Nanopore) variant calls for one barcode and attach the
# Laragen Sanger results to them column by column.
barcode_label = '400-1-25'
df = pd.read_csv('../Data/Laragen_Validation/variants.csv')
# Take an explicit copy of the filtered rows: the column assignments below
# then write to an independent frame instead of a slice of the full table
# (avoids pandas' SettingWithCopyWarning and its view-vs-copy ambiguity).
df = df[df['name'] == barcode_label].copy()
# Add in the aligned variants.
# NOTE(review): `.values` assigns by position, so this assumes the filtered
# rows are in the same order (and number) as laragen_seqs sorted by id —
# confirm against the plate layout.
df['LaragenNumber'] = laragen_seqs['filename'].values
df['LaragenInfo'] = laragen_seqs['labels'].values
df['LaragenSeq'] = laragen_seqs['seq'].values

## Only consider high quality bases from Laragen

Here we aim to drop low quality bases so that they don't confuse the sequence comparison

In [53]:
#print(df['labels'].values[0])

## Align all sequences to find the variants


In [63]:
# Use the 'refseq' value of the first row of df as the reference sequence.
# NOTE(review): presumably every row for this barcode shares the same refseq — confirm.
ref_seq = df['refseq'].values[0]

In [64]:
# Align to the reference
import os
from Bio import SeqIO
from Bio.PDB.Polypeptide import aa1

# Write a FASTA file holding the reference (record "ref") followed by one
# record per well containing that well's Laragen sequence; this file is the
# input for the multiple sequence alignment.
# NOTE(review): the two counters are not used by any cell visible here; they
# are kept in case a later cell relies on them.
count_correct, count_wrong = 0, 0
laragen_variants = []
# Create the output directory up front so a fresh checkout does not fail with
# FileNotFoundError when opening the FASTA file.
os.makedirs('seqs', exist_ok=True)
# Plain write mode is enough: the file is only written, never read back here.
with open(f'seqs/{barcode_label}.fa', 'w') as fout:
    fout.write(f'>ref\n{ref_seq}\n')
    for well, laragen in zip(df['Well'], df['LaragenSeq']):
        # Strip any gap characters so the aligner receives raw sequence
        fout.write(f'>{well}\n{laragen.replace("-", "")}\n')

In [65]:
# Align the reference and all Laragen sequences with Clustal Omega.
# The command is passed as an argument list (no shell interpolation of the
# barcode label), and check=True raises CalledProcessError on a non-zero exit
# status so a failed alignment cannot go unnoticed and leave a stale or
# missing MSA file for the next cell to read.
import subprocess

subprocess.run(
    [
        'clustal-omega',
        '--force',  # overwrite an existing output file
        '-i', f'seqs/{barcode_label}.fa',
        '-o', f'seqs/{barcode_label}_msa.fa',
    ],
    check=True,
)

## Read in the aligned sequence and calculate the variants w.r.t. the refseq

In [66]:
def _variants_vs_reference(aligned_ref, aligned_query):
    """Summarise how one aligned sequence differs from the aligned reference.

    Both arguments are rows of the same alignment ('-' marks a gap).
    Substitutions are reported as ``<ref><pos><new>`` and deletions as
    ``<ref><pos>DEL``, where ``pos`` is the 1-based position in the ungapped
    reference. Insertions relative to the reference are ignored. The calls are
    joined with '_'; '#PARENT#' is returned when there are none.
    """
    calls = []
    ref_position = 1
    for column, query_nc in enumerate(aligned_query):
        ref_nc = aligned_ref[column]
        if ref_nc == '-':
            # Gap in the reference: an insert (ignored for now), and the
            # ungapped reference position does not advance.
            continue
        if query_nc == '-':
            calls.append(f'{ref_nc}{ref_position}DEL')
        elif query_nc != ref_nc:
            calls.append(f'{ref_nc}{ref_position}{query_nc}')
        ref_position += 1
    return '_'.join(calls) if calls else '#PARENT#'


# Load the aligned sequences back in; the first record is the reference
seqs = [str(record.seq) for record in SeqIO.parse(f'seqs/{barcode_label}_msa.fa', "fasta")]
ref_seq = seqs[0]
# One variant string per remaining (Laragen) record, in alignment order
laragen_variants = [_variants_vs_reference(ref_seq, aligned) for aligned in seqs[1:]]


In [67]:
# Attach the Laragen variant calls and export the side-by-side table.
df['LaragenVariants'] = laragen_variants
# Now look at which ones agree and which disagree.
# Every column except 'LaragenInfo' is written to the comparison CSV.
exported_columns = [column for column in df.columns if column != 'LaragenInfo']
comparison_table = df[exported_columns]
comparison_table.to_csv(f'LaragenVsNanopore_{barcode_label}.csv', index=False)