## Rustyread analysis

## Compare read generation

Run `snakemake --use-conda -j {number of jobs} {your options} data/compare/rustyread.paf data/compare/badread.paf data/real_reads.paf` before running the next cells.

### Read length

In [None]:
import pandas
import altair

altair.data_transformers.disable_max_rows()

def get_length(path):
    """Yield the length of every sequence line of a FASTQ file.

    The file is read as strict four-line records, so the sequence is taken
    to be the second line of each group of four lines.

    Parameters
    ----------
    path : str
        Path of the FASTQ file to read.

    Yields
    ------
    int
        Number of characters on each sequence line, line terminator excluded.
    """
    with open(path) as fh:
        for line_count, line in enumerate(fh, start=1):
            if line_count % 4 == 2:
                # Strip only an actual line terminator: blindly slicing off
                # the last character would drop a real base when the line has
                # no trailing newline (e.g. a truncated file).
                yield len(line.rstrip("\n"))
            
        
# Build one (origin, read length) row per read, in the order
# real -> rustyread -> badread.
data = [
    (origin, read_length)
    for origin, fastq_path in (
        ('real', "data/real_reads.fastq"),
        ('rustyread', "data/compare/rustyread.fastq"),
        ('badread', "data/compare/badread.fastq"),
    )
    for read_length in get_length(fastq_path)
]

df = pandas.DataFrame(data, columns=["origin", "length"])

# Total number of bases over the three read sets, then per-origin statistics.
print(df["length"].sum())

print(df.groupby("origin").describe())

# Overlaid (non-stacked) length histograms, one translucent area per origin.
length_axis = altair.X(
    "length",
    bin=altair.Bin(maxbins=600),
    scale=altair.Scale(domain=[0, 64_000]),
)
count_axis = altair.Y("count()", stack=False)
length_histogram = altair.Chart(df).mark_area(
    interpolate='step-after', line=True, opacity=0.3
)
length_histogram.encode(
    x=length_axis,
    y=count_axis,
    color=altair.Color("origin"),
).interactive()

### Read identity

In [None]:
import pandas
import altair

altair.data_transformers.disable_max_rows()

def get_identity(path):
    """Yield an identity estimate for every line of a tab-separated file.

    For each line, the edit distance is read from the first ``NM:i:`` field
    and divided by the integer found in the second column (presumably the
    query length of a PAF record -- confirm against the producer).

    Parameters
    ----------
    path : str
        Path of the alignment file to read.

    Yields
    ------
    int or float
        ``0`` when the edit distance exceeds the length, otherwise
        ``1 - edit_distance / length``.
    """
    with open(path) as alignments:
        for record in alignments:
            fields = record.split("\t")
            # Keep the value part of every "NM:i:<value>" field; only the
            # first one is used.
            nm_values = [
                field.split(":")[2]
                for field in fields
                if field.startswith("NM:i:")
            ]
            edit_distance = int(nm_values[0])
            length = int(fields[1])
            if edit_distance > length:
                # Clamp instead of reporting a negative identity.
                yield 0
            else:
                yield 1 - edit_distance / length

# Build one (origin, identity) row per alignment line, in the order
# real -> rustyread -> badread.
data = [
    (origin, identity)
    for origin, paf_path in (
        ('real', "data/real_reads.paf"),
        ('rustyread', "data/compare/rustyread.paf"),
        ('badread', "data/compare/badread.paf"),
    )
    for identity in get_identity(paf_path)
]

df = pandas.DataFrame(data, columns=["origin", "identity"])

print(df.groupby('origin').describe())

# Overlaid (non-stacked) identity histograms, one translucent area per origin.
identity_axis = altair.X("identity", bin=altair.Bin(maxbins=200))
count_axis = altair.Y("count()", stack=False)
identity_histogram = altair.Chart(df).mark_area(
    interpolate='step-after', line=True, opacity=0.3
)
identity_histogram.encode(
    x=identity_axis,
    y=count_axis,
    color='origin',
).interactive()

### Quality

In [None]:
import pandas
import altair
from collections import Counter

altair.data_transformers.disable_max_rows()

def get_qual(path):
    """Yield the numeric quality of every base of a FASTQ file.

    The file is read as strict four-line records, so the quality string is
    taken to be the fourth line of each group of four lines. Each character
    is converted with ``ord(char) - 33`` (the Phred+33 offset).

    Parameters
    ----------
    path : str
        Path of the FASTQ file to read.

    Yields
    ------
    int
        One quality value per character of every quality line.
    """
    with open(path) as fh:
        for line_count, line in enumerate(fh, start=1):
            if line_count % 4 == 0:
                # The quality line is the last line of a record, so it is the
                # one that may lack a newline at end of file. Strip only the
                # terminator so the last quality value is never dropped.
                for q in line.rstrip("\n"):
                    yield ord(q) - 33
    
# Count how often each quality value occurs in every read set and store the
# result as (origin, quality value, occurrences) rows.
# NOTE(review): quality 0 is kept for the real reads but dropped for both
# simulated sets -- confirm this asymmetry is intended.
data = []
for origin, fastq_path, keep_zero in (
    ("real", "data/real_reads.fastq", True),
    ("rustyread", "data/compare/rustyread.fastq", False),
    ("badread", "data/compare/badread.fastq", False),
):
    tmp = Counter(get_qual(fastq_path))
    data += [
        (origin, quality, tmp[quality])
        for quality in sorted(tmp)
        if keep_zero or quality > 0
    ]

df = pandas.DataFrame(data, columns=["origin", "x", "y"])

print(df.groupby('origin').describe())

# Overlaid (non-stacked) quality distributions, one translucent area per origin.
quality_axis = altair.X("x", bin=altair.Bin(maxbins=200))
occurrence_axis = altair.Y("y", stack=False)
quality_distribution = altair.Chart(df).mark_area(
    interpolate='step-after', line=True, opacity=0.3
)
quality_distribution.encode(
    x=quality_axis,
    y=occurrence_axis,
    color='origin',
).interactive()

## Effect of parameters

Run `snakemake --use-conda -j 1 {your options} -R ms_all` before running the next cells.

In [None]:
import os
import re
import altair
import pandas

# Wall time as a function of the number of threads.
#
# Every "thread_<n>.tsv" file in benchmarks/pe/ is read as a tab-separated
# table with one header line; the first column of each following line is taken
# as one wall-time measurement for <n> threads.
data = list()
# Raw string: "\d" in a normal string literal is an invalid escape sequence,
# and the dot before "tsv" must be matched literally.
thread_file = re.compile(r"thread_(\d+)\.tsv")
with os.scandir("benchmarks/pe/") as it:
    for entry in it:
        if entry.is_file() and entry.name.startswith("thread_"):
            threads = int(thread_file.search(entry.name).group(1))
            with open(entry.path) as fh:
                next(fh)  # skip the header line
                for line in fh:
                    wall_time = float(line.split("\t")[0])
                    data.append((threads, wall_time))

data.sort()
raw_df = pandas.DataFrame(data, columns=["nb_threads", "wall_time"])

# One row per thread count with the mean and standard deviation of its
# measurements; groupby keeps the keys sorted and aligned with the statistics.
df = raw_df.groupby("nb_threads")["wall_time"].agg(["mean", "std"]).reset_index()

# Ideal scaling: the mean time at the smallest thread count, scaled by the
# ratio of thread counts. This equals mean[0] / n when the smallest count is 1
# and stays correct when it is not.
baseline_threads = df["nb_threads"].iloc[0]
baseline_time = df["mean"].iloc[0]
df["linear"] = [baseline_time * baseline_threads / x for x in df["nb_threads"]]

# ymin / ymax delimit one standard deviation around the mean.
base = altair.Chart(df).transform_calculate(
    ymin="datum.mean-datum.std",
    ymax="datum.mean+datum.std"
)

point = base.mark_circle().encode(
    x=altair.X('nb_threads', scale=altair.Scale(domain=[0, 9])),
    y='mean',
)

line = base.mark_line().encode(
    x='nb_threads',
    y='linear'
)

errorbars = base.mark_errorbar().encode(
    x='nb_threads',
    y='ymin:Q',
    y2='ymax:Q'
)

point + errorbars + line

In [None]:
import os
import re
import altair
import pandas

# CPU time as a function of the number of threads.
#
# Every "thread_<n>.tsv" file in benchmarks/pe/ is read as a tab-separated
# table with one header line; the tenth column of each following line is
# stored as "cpu_time" (presumably the cpu_time column of a snakemake
# benchmark file -- confirm against the file header).
data = list()
# Raw string: "\d" in a normal string literal is an invalid escape sequence,
# and the dot before "tsv" must be matched literally.
thread_file = re.compile(r"thread_(\d+)\.tsv")
with os.scandir("benchmarks/pe/") as it:
    for entry in it:
        if entry.is_file() and entry.name.startswith("thread_"):
            threads = int(thread_file.search(entry.name).group(1))
            with open(entry.path) as fh:
                next(fh)  # skip the header line
                for line in fh:
                    cpu_time = float(line.split("\t")[9])
                    data.append((threads, cpu_time))
data.sort()
raw_df = pandas.DataFrame(data, columns=["nb_threads", "cpu_time"])

# One row per thread count with the mean and standard deviation of its
# measurements; groupby keeps the keys sorted and aligned with the statistics.
df = raw_df.groupby("nb_threads")["cpu_time"].agg(["mean", "std"]).reset_index()

# Kept for parity with the wall-time cell; this column is not plotted here.
baseline_threads = df["nb_threads"].iloc[0]
baseline_time = df["mean"].iloc[0]
df["linear"] = [baseline_time * baseline_threads / x for x in df["nb_threads"]]

# ymin / ymax delimit one standard deviation around the mean.
base = altair.Chart(df).transform_calculate(
    ymin="datum.mean-datum.std",
    ymax="datum.mean+datum.std"
)

point = base.mark_circle().encode(
    x=altair.X('nb_threads', scale=altair.Scale(domain=[0, 9])),
    y='mean',
)

errorbars = base.mark_errorbar().encode(
    x='nb_threads',
    y='ymin:Q',
    y2='ymax:Q'
)

point + errorbars

In [None]:
import os
import re
import altair
import pandas

# Wall time as a function of the "quantity" parameter.
#
# Every "quantity_<n>.tsv" file in benchmarks/pe/ is read as a tab-separated
# table with one header line; the first column of each following line is taken
# as one wall-time measurement for quantity <n>.
data = list()
# Raw string: "\d" in a normal string literal is an invalid escape sequence,
# and the dot before "tsv" must be matched literally.
quantity_file = re.compile(r"quantity_(\d+)\.tsv")
with os.scandir("benchmarks/pe/") as it:
    for entry in it:
        if entry.is_file() and entry.name.startswith("quantity_"):
            quantity = int(quantity_file.search(entry.name).group(1))
            with open(entry.path) as fh:
                next(fh)  # skip the header line
                for line in fh:
                    wall_time = float(line.split("\t")[0])
                    data.append((quantity, wall_time))

raw_df = pandas.DataFrame(data, columns=["quantity", "wall_time"])

# Take the keys and the statistics from the same groupby so every mean/std
# stays attached to its own quantity. Building the key column from an
# unordered set() could pair a quantity with another quantity's statistics,
# because groupby results are sorted by key while set order is arbitrary.
df = raw_df.groupby("quantity")["wall_time"].agg(["mean", "std"]).reset_index()

# ymin / ymax delimit one standard deviation around the mean.
base = altair.Chart(df).transform_calculate(
    ymin="datum.mean-datum.std",
    ymax="datum.mean+datum.std"
)

point = base.mark_circle().encode(
    x=altair.X('quantity', scale=altair.Scale(domain=[9, 51])),
    y='mean',
)

errorbars = base.mark_errorbar().encode(
    x='quantity',
    y='ymin:Q',
    y2='ymax:Q'
)

point + errorbars

In [None]:
import os
import re
import altair
import pandas

# Wall time as a function of the "identity" parameter.
#
# Every "identity_<n>.tsv" file in benchmarks/pe/ is read as a tab-separated
# table with one header line; the first column of each following line is taken
# as one wall-time measurement for identity <n>.
data = list()
# Raw string: "\d" in a normal string literal is an invalid escape sequence,
# and the dot before "tsv" must be matched literally.
identity_file = re.compile(r"identity_(\d+)\.tsv")
with os.scandir("benchmarks/pe/") as it:
    for entry in it:
        if entry.is_file() and entry.name.startswith("identity_"):
            identity = int(identity_file.search(entry.name).group(1))
            with open(entry.path) as fh:
                next(fh)  # skip the header line
                for line in fh:
                    wall_time = float(line.split("\t")[0])
                    data.append((identity, wall_time))

raw_df = pandas.DataFrame(data, columns=["identity", "wall_time"])

# Take the keys and the statistics from the same groupby so every mean/std
# stays attached to its own identity. Building the key column from an
# unordered set() could pair an identity with another identity's statistics,
# because groupby results are sorted by key while set order is arbitrary.
df = raw_df.groupby("identity")["wall_time"].agg(["mean", "std"]).reset_index()

# ymin / ymax delimit one standard deviation around the mean.
base = altair.Chart(df).transform_calculate(
    ymin="datum.mean-datum.std",
    ymax="datum.mean+datum.std"
)

point = base.mark_circle().encode(
    x=altair.X('identity', scale=altair.Scale(domain=[85, 100])),
    y='mean',
)

errorbars = base.mark_errorbar().encode(
    x='identity',
    y='ymin:Q',
    y2='ymax:Q'
)

point + errorbars

In [None]:
# Some setup
%load_ext autoreload

%autoreload 2

import altair
altair.data_transformers.disable_max_rows()

def custom_theme():
    """Return the theme configuration registered for this notebook's charts.

    The configuration sets a default view of 1000 x 750 and an empty
    ``range`` section. A new dictionary is built on every call.
    """
    # Optional dark-theme overrides, currently disabled:
    #   "background": "#333",
    #   "title": {"color": "#fff"},
    #   "style": {"guide-label": {"fill": "#fff"}, "guide-title": {"fill": "#fff"}},
    #   "axis": {"domainColor": "#fff", "gridColor": "#888", "tickColor": "#fff"},
    view_size = {'height': 750, 'width': 1000}

    # Optional colour scheme, currently disabled:
    #   'category': {'scheme': 'bluepurple'}
    colour_ranges = {}

    return {'config': {'view': view_size, 'range': colour_ranges}}

# Make the notebook theme known to Altair under the name 'custom_theme'.
altair.themes.register('custom_theme', custom_theme)

# enable the newly registered theme
altair.themes.enable('custom_theme')

# Widen the notebook container to 90% of the page so the large charts fit.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))