In [1]:
import dt4dds.analysis.alignment
import dt4dds.analysis.fileio
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as pg
import scipy.stats
import Bio.Seq
import gzip

# Seed the generator so the random reference sequences and reads generated in this
# notebook are reproducible across kernel restarts; change SEED to draw a different set.
SEED = 42
rng = np.random.default_rng(SEED)

### generate random reference sequences and random sequencing reads

In [8]:
# Alphabet and dimensions of the synthetic data set
bases = ['A', 'C', 'G', 'T']
n_ref_seqs = 12000      # number of random reference sequences written to the FASTA file
n_read_seqs = 100000    # number of random read pairs written to the FASTQ files
seq_length = 102        # bases per generated sequence


def _random_sequence():
    """Return one string of `seq_length` letters drawn with replacement from `bases`."""
    return ''.join(rng.choice(bases, size=seq_length, replace=True))


def _fastq_record(i, sequence):
    """Format one four-line FASTQ record (id, sequence, '+', all-'F' quality string)."""
    fields = (
        f"@Read{str(i).zfill(9)}",
        str(sequence),
        "+",
        "F"*len(sequence),
        '',  # trailing empty field -> the record ends with a newline
    )
    return '\n'.join(fields)


# Reference file: one header line and one sequence line per record, followed by a blank line
with open('../../data/SeqSimilarity/design_files.fasta', 'w') as fasta_out:
    for i in range(n_ref_seqs):
        fasta_out.write(f'>{str(i).zfill(6)}\n' + _random_sequence() + '\n\n')


# Reads are drawn independently of the references written above
seqlist = [_random_sequence() for _ in range(n_read_seqs)]

# R1 holds the sequences as generated
with gzip.open('../../data/SeqSimilarity/R1.fq.gz', 'wt') as r1_out:
    r1_out.writelines(
        _fastq_record(i, sequence) for i, sequence in enumerate(seqlist)
    )

# R2 holds the reverse complement of the same sequences, under the same read ids
with gzip.open('../../data/SeqSimilarity/R2.fq.gz', 'wt') as r2_out:
    r2_out.writelines(
        _fastq_record(i, str(Bio.Seq.Seq(sequence).reverse_complement()))
        for i, sequence in enumerate(seqlist)
    )

In [9]:
def get_similarity(reference_file, fw_read_file, rv_read_file, n_reads):
    """Align paired reads against a reference set and tabulate the mapping similarities.

    Parameters
    ----------
    reference_file
        Reference file handed to ``dt4dds.analysis.fileio.read_reference``.
    fw_read_file, rv_read_file
        Forward and reverse read files handed together to
        ``dt4dds.analysis.fileio.paired_read_generator``.
    n_reads
        Second argument of ``dt4dds.analysis.fileio.initial_sampler``
        (presumably the number of read pairs to sample -- confirm against dt4dds).

    Returns
    -------
    pandas.DataFrame
        Columns ``sim`` and ``read``, filled from the ``similarity`` and ``read_number``
        attributes of every item in every mapping returned by the aligner.
    """
    fileio = dt4dds.analysis.fileio

    # Forward references are de-duplicated first; the reverse set is derived from them
    refs_forward = fileio.ensure_no_duplicate_references(fileio.read_reference(reference_file))
    refs_reverse = fileio.reverse_complement_references(refs_forward)

    read_pairs = fileio.paired_read_generator(fw_read_file, rv_read_file)
    sampled_reads = fileio.initial_sampler(read_pairs, n_reads)

    mappings = dt4dds.analysis.alignment.align_paired_reads_caller(
        sampled_reads,
        refs_forward,
        refs_reverse,
        read_window=60,
        score_cutoff=0,
    )

    # Flatten the nested mappings into (similarity, read_number) rows
    rows = []
    for mapping in mappings:
        for m in mapping:
            rows.append((m.similarity, m.read_number))
    return pd.DataFrame(rows, columns=['sim', 'read'])

In [11]:
# Similarity table for the files under ../../data/SeqSimilarity, labelled 'random'
reference_file = '../../data/SeqSimilarity/design_files.fasta'
fw_read_file = '../../data/SeqSimilarity/R1.fq.gz'
rv_read_file = '../../data/SeqSimilarity/R2.fq.gz'
n_reads = 10000

df1 = get_similarity(
    reference_file=reference_file,
    fw_read_file=fw_read_file,
    rv_read_file=rv_read_file,
    n_reads=n_reads,
).assign(group='random')

In [12]:
# Similarity table for the files under ../../data/Aging/0a_Genscript_GCall, labelled 'exp'
# (presumably an experimental sequencing run -- the data itself is not shown here)
reference_file = '../../data/Aging/0a_Genscript_GCall/design_files.fasta'
fw_read_file = '../../data/Aging/0a_Genscript_GCall/R1.fq.gz'
rv_read_file = '../../data/Aging/0a_Genscript_GCall/R2.fq.gz'
n_reads = 10000

df2 = get_similarity(
    reference_file=reference_file,
    fw_read_file=fw_read_file,
    rv_read_file=rv_read_file,
    n_reads=n_reads,
).assign(group='exp')

In [26]:
# Stack the 'random' and 'exp' tables into one frame. reset_index keeps each table's
# previous row labels as an `index` column (drop=False) and renumbers the rows 0..n-1.
df = pd.concat(objs=[df1, df2]).reset_index(drop=False)

In [4]:
# Grouped histogram of the similarity scores, one colour per `group`
fig = px.histogram(df, x='sim', color='group', barmode='group', histnorm='probability')

# Fraction of 'exp' rows in each similarity band.
# NOTE(review): all comparisons are strict, so a score of exactly 0.7 or 0.85 is
# counted in neither band -- confirm this is intended.
exp_sim = df.loc[df.group == 'exp', 'sim']
n_exp = len(exp_sim)
val_09 = int(((exp_sim < 0.85) & (exp_sim > 0.7)).sum()) / n_exp
val_07 = int((exp_sim < 0.7).sum()) / n_exp

# Percentage labels, left-anchored at the given x positions
annotation_style = dict(
    showarrow=False,
    yshift=0,
    xanchor="left",
    font_color="#222222",
    font_family="Inter",
    font_size=28/3,
)
for x_pos, fraction in ((0.765, val_09), (0.615, val_07)):
    fig.add_annotation(x=x_pos, y=0.05, text=f"{fraction*100:0.1f}%", **annotation_style)

# Dashed vertical lines at the two band thresholds
for threshold in (0.85, 0.7):
    fig.add_vline(x=threshold, line_dash='dash', line_width=2)

fig.update_traces(marker_line_width=0, selector=dict(type='histogram'))
fig.update_layout(
    template="simple_white",
    height=330,
    width=330,
    showlegend=False,
    margin=dict(l=0, r=20, t=20, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)

# Both axes share tick formatting and fonts; only title and range differ
axis_style = dict(
    tickformat="0.0%",
    dtick=0.1,
    minor_ticks="outside",
    minor_dtick=0.05,
    title_font_family="Inter",
    title_font_size=28/3,
    tickfont_size=28/3,
)
fig.update_xaxes(title='Similarity score', range=[0.4, 1.01], **axis_style)
fig.update_yaxes(title='Probability', range=[0, 0.3], **axis_style)

fig.show()
fig.write_image("similarity_comparison.svg")