# Introduction

Things were getting confusing because one notebook was generating several different things.

Let's simplify so I can cache and compare more results.

This notebook aims to collect more information about the raw fastq read lengths.

In [1]:
from Bio import SeqIO
from matplotlib import pyplot
import numpy
from pathlib import Path
import pandas
import pysam
from tqdm import tqdm
from xopen import xopen

In [2]:
# Tab-separated experiment table; read_table uses "\t" as its default separator.
experiments = pandas.read_table("cached_experiments.tsv")

In [3]:
# "target" value of the first row of the experiment table.
target = experiments["target"].iloc[0]
target

'cache/ENCSR293MOX_1_1_ENCFF318SKH_reads.fastq.gz'

In [4]:
def score_fastq_reads(filename):
    """Summarize the read-length distribution of one FASTQ file.

    The file is opened in text mode with ``xopen`` and parsed record by
    record with ``SeqIO.parse(..., "fastq")``. The length of every record is
    collected and summarized with ``pandas.Series.describe``.

    Parameters
    ----------
    filename
        FASTQ file to read; passed unchanged to ``xopen``.

    Returns
    -------
    dict
        Keys ``count``, ``mean``, ``median``, ``std``, ``min``, ``25%``,
        ``50%``, ``75%`` and ``max``. For a file with no records ``count``
        is 0 and the remaining statistics are NaN.
    """
    with xopen(filename, "rt") as instream:
        lengths = [len(record) for record in SeqIO.parse(instream, "fastq")]

    # Pin an integer dtype: without it an empty list can produce an
    # object-dtype Series, whose describe() has no "mean"/"std"/quantile
    # entries and would make the lookups below raise KeyError.
    read_len = pandas.Series(lengths, dtype="int64")
    summary = read_len.describe()
    return {
        "count": summary["count"],
        "mean": summary["mean"],
        # Series.median() yields NaN for an empty Series.
        "median": read_len.median(),
        "std": summary["std"],
        "min": summary["min"],
        "25%": summary["25%"],
        "50%": summary["50%"],
        "75%": summary["75%"],
        "max": summary["max"],
    }

# Try the scorer on the first experiment's file, using the path already
# selected into `target` instead of repeating it as a literal.
score_fastq_reads(target)


{'count': 690170.0,
 'mean': 1205.5905791326775,
 'median': 912.0,
 'std': 868.0045786701045,
 'min': 300.0,
 '25%': 577.0,
 '50%': 912.0,
 '75%': 1578.0,
 'max': 20450.0}

In [5]:
# Score every "reads" row of the experiment table. Filtering before the loop
# makes the progress bar total (and its rate/ETA) count only files that are
# actually parsed, not rows that would be skipped.
fastq_lengths = {}
reads_rows = experiments[experiments["output_type"] == "reads"]
for _, row in tqdm(reads_rows.iterrows(), total=reads_rows.shape[0]):
    name = "{}_{}".format(row["experiment"], row["tech_rep"])
    # NOTE(review): rows sharing experiment and tech_rep map to the same key,
    # so a later row replaces an earlier one — confirm that is intended.
    fastq_lengths[name] = score_fastq_reads(row["target"])


100%|██████████| 276/276 [5:42:05<00:00, 74.37s/it]   


In [8]:
# Rebinds fastq_lengths from the dict of per-file summaries to a DataFrame:
# each dict key becomes a column and each statistic becomes a row.
fastq_lengths = pandas.DataFrame(fastq_lengths)

In [10]:
# Show the table as CSV text, transposed so each column of fastq_lengths
# becomes one output row.
print(fastq_lengths.T.to_csv())

,count,mean,median,std,min,25%,50%,75%,max
ENCSR293MOX_1_1,690170.0,1205.5905791326775,912.0,868.0045786701045,300.0,577.0,912.0,1578.0,20450.0
ENCSR081NRO_1_1,856966.0,1386.1037229014921,1240.0,935.0327891445207,300.0,729.0,1240.0,1743.0,20435.0
ENCSR257JBF_1_1,722117.0,1468.4182687846983,1404.0,928.9868700408534,300.0,821.0,1404.0,1772.0,19854.0
ENCSR257JBF_1_2,775345.0,1539.4406399731731,1431.0,987.833986721948,300.0,854.0,1431.0,1854.0,20390.0
ENCSR838WFC_1_1,2137168.0,1606.0831726845995,1567.0,718.6829338704822,50.0,1044.0,1567.0,2082.0,12755.0
ENCSR838WFC_2_1,2538701.0,1512.6869599846536,1457.0,676.1319888867405,50.0,956.0,1457.0,1964.0,6456.0
ENCSR902GAF_3_1,880468.0,1488.3266524166693,1361.0,888.6247582358219,300.0,801.0,1361.0,1935.0,20739.0
ENCSR902GAF_1_1,1118967.0,1312.6724738084322,1181.0,797.2221000969963,300.0,730.0,1181.0,1714.0,20607.0
ENCSR902GAF_2_1,1121461.0,1531.2538893461297,1391.0,924.8811009276827,300.0,833.0,1391.0,1976.0,20568.0
ENCSR181CES_1_1,1761843.0,1856.

In [12]:
# Write the same transposed table to the relative path "fastq_lengths.csv".
fastq_lengths.T.to_csv("fastq_lengths.csv")