# Preprocess Serum Data
Here we preprocess the serum data. We also compare two methods for assigning reads to oncRNA loci: requiring a minimum overlap fraction ("frac") and accepting any overlap ("no frac").

In [1]:
import pandas as pd
import numpy as np
import os
import pysam
import re
import json as js
from scipy import stats
from joblib import  Parallel, delayed
from multiprocessing import cpu_count
import seaborn as sns
import random
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# PRAD Serum

In [2]:
# Collect the entries of data/prad_serum/ whose file names end in ".srt.dd.bam".
cell_lines = list(
    filter(
        lambda entry: entry.name.endswith(".srt.dd.bam"),
        os.scandir("data/prad_serum/"),
    )
)
# Show how many BAM files were found.
len(cell_lines)

72

In [5]:
%%bash
# Convert every data/prad_serum/*.srt.dd.bam file to BED with `bedtools bamtobed`.
# Both the echoed commands and any tool output are captured in the log file.
mkdir -p log  # the loop's redirect below fails if log/ does not exist
for f in data/prad_serum/*.srt.dd.bam; do
    base=$(basename "$f")
    out=${base/.bam/.bed}
    # Record the exact command in the log. (The original echoed "$bedtools",
    # an unset variable, so the program name was missing from the log.)
    echo "bedtools bamtobed -i $f > data/prad_serum/$out"
    bedtools bamtobed -i "$f" > "data/prad_serum/$out"
done &> log/prad_cell_lines_bamtobed.out

In [6]:
# Keep only BED lines on canonical chromosomes (chr<digits>, chrX, chrY).
# The pattern is anchored on the tab that terminates the chromosome column, so
# names such as "chr1_KI270706v1_random" are rejected. The previous pattern
# "chr[\d+,X,Y]" was a character class: it only tested the first character
# after "chr" (also accepting "+" and ","), and its non-raw "\d" is an invalid
# string escape.
CANONICAL_CHROM = re.compile(r"chr(?:\d+|X|Y)\t")

# Materialise (and sort) the listing before looping: the loop writes new
# files into the same directory that is being scanned.
for f in sorted(os.scandir("data/prad_serum"), key=lambda entry: entry.name):
    if f.name.endswith(".srt.dd.bed"):
        # Sample name is everything before the first "." in the file name.
        name = f.name.split(".")[0]
        outfile = f"data/prad_serum/{name}.filter.bed"
        with open(outfile, "wt") as out, open(f, "rt") as file:
            for line in file:
                # Also drop any line containing the literal text "None".
                if CANONICAL_CHROM.match(line) and "None" not in line:
                    out.write(line)

## Test Frac vs no Frac
Here we test the difference between using a minimum overlap fraction to assign oncRNA calls and our previous method of simply taking the locus with the largest number of overlapping bases. This may help reduce spurious calls (primarily RNAs that had only a 1-5 bp overlap with our oncRNA locus annotations).

## No Fraction Version
This version keeps any intersection between a read and an oncRNA locus, no matter how small the overlap is.

In [17]:
%%bash
# No-fraction version for PRAD serum: intersect each filtered BED file with the
# oncRNA annotation. -s requires same-strand overlaps; -wo writes the original
# A and B entries followed by the number of overlapping base pairs.
mkdir -p no_frac/prad_serum  # the output redirect fails if the directory is missing
for f in data/prad_serum/*.filter.bed; do
    out=$(basename "$f")
    out=${out/.filter.bed/.oncRNA.bed}
    # Echo the command so the cell output records what was run.
    echo "intersectBed -s -wo -a $f  -b data/pancancer_filter_oncRNAs.bed > no_frac/prad_serum/$out"
    # Quote expansions so paths containing spaces or glob characters are safe.
    intersectBed -s -wo -a "$f" -b data/pancancer_filter_oncRNAs.bed > "no_frac/prad_serum/$out"
done

intersectBed -s -wo -a data/prad_serum/12557-06-T1_S14.filter.bed  -b data/pancancer_filter_oncRNAs.bed > no_frac/prad_serum/12557-06-T1_S14.oncRNA.bed
intersectBed -s -wo -a data/prad_serum/12557-06-T4_S15.filter.bed  -b data/pancancer_filter_oncRNAs.bed > no_frac/prad_serum/12557-06-T4_S15.oncRNA.bed
intersectBed -s -wo -a data/prad_serum/12557-11-T1_S16.filter.bed  -b data/pancancer_filter_oncRNAs.bed > no_frac/prad_serum/12557-11-T1_S16.oncRNA.bed
intersectBed -s -wo -a data/prad_serum/12557-11-T4_S17.filter.bed  -b data/pancancer_filter_oncRNAs.bed > no_frac/prad_serum/12557-11-T4_S17.oncRNA.bed
intersectBed -s -wo -a data/prad_serum/12557-13-T1_S18.filter.bed  -b data/pancancer_filter_oncRNAs.bed > no_frac/prad_serum/12557-13-T1_S18.oncRNA.bed
intersectBed -s -wo -a data/prad_serum/12557-13-T4_S19.filter.bed  -b data/pancancer_filter_oncRNAs.bed > no_frac/prad_serum/12557-13-T4_S19.oncRNA.bed
intersectBed -s -wo -a data/prad_serum/12557-17-T1_S20.filter.bed  -b data/pancancer_fil

## Fraction Version


In [18]:
%%bash
# Fraction version for PRAD serum: same intersection as the no-fraction run,
# but -f 0.9 requires the overlap to cover at least 90% of the A (read) entry.
mkdir -p log frac/prad_serum  # redirects below fail if these directories are missing
for f in data/prad_serum/*.filter.bed; do
    out=$(basename "$f")
    out=${out/.filter.bed/.oncRNA.bed}
    # Echo the command so the log records what was run.
    echo "intersectBed -s -wo -f 0.9 -a $f  -b data/pancancer_filter_oncRNAs.bed > frac/prad_serum/$out"
    # Quote expansions so paths containing spaces or glob characters are safe.
    intersectBed -s -wo -f 0.9 -a "$f" -b data/pancancer_filter_oncRNAs.bed > "frac/prad_serum/$out"
done &> log/frac_prad_intersect.out

The `-f` option (the minimum overlap required, as a fraction of the read; here 0.9) is akin to the ratio heuristic used in the COMPSRA paper to assign reads to smRNA annotations. https://www.nature.com/articles/s41598-020-61495-0

# PDAC Serum

In [19]:
# Collect the entries of data/pdac_serum/ whose file names end in ".srt.dd.bed".
cell_lines = list(
    filter(
        lambda entry: entry.name.endswith(".srt.dd.bed"),
        os.scandir("data/pdac_serum/"),
    )
)
# Show how many BED files were found.
len(cell_lines)

9

In [20]:
# Keep only BED lines on canonical chromosomes (chr<digits>, chrX, chrY).
# The pattern is anchored on the tab that terminates the chromosome column, so
# names such as "chr1_KI270706v1_random" are rejected. The previous pattern
# "chr[\d+,X,Y]" was a character class: it only tested the first character
# after "chr" (also accepting "+" and ","), and its non-raw "\d" is an invalid
# string escape.
CANONICAL_CHROM = re.compile(r"chr(?:\d+|X|Y)\t")

# Materialise (and sort) the listing before looping: the loop writes new
# files into the same directory that is being scanned.
for f in sorted(os.scandir("data/pdac_serum"), key=lambda entry: entry.name):
    if f.name.endswith(".srt.dd.bed"):
        # Sample name is everything before the first "." in the file name.
        name = f.name.split(".")[0]
        outfile = f"data/pdac_serum/{name}.filter.bed"
        with open(outfile, "wt") as out, open(f, "rt") as file:
            for line in file:
                # Also drop any line containing the literal text "None".
                if CANONICAL_CHROM.match(line) and "None" not in line:
                    out.write(line)

In [23]:
%%bash
# No-fraction version for PDAC serum: intersect each filtered BED file with the
# oncRNA annotation. -s requires same-strand overlaps; -wo writes the original
# A and B entries followed by the number of overlapping base pairs.
mkdir -p log no_frac/pdac_serum  # redirects below fail if these directories are missing
for f in data/pdac_serum/*.filter.bed; do
    out=$(basename "$f")
    out=${out/.filter.bed/.oncRNA.bed}
    # Echo the command so the log records what was run.
    echo "intersectBed -s -wo -a $f -b data/pancancer_filter_oncRNAs.bed > no_frac/pdac_serum/$out"
    # Quote expansions so paths containing spaces or glob characters are safe.
    intersectBed -s -wo -a "$f" -b data/pancancer_filter_oncRNAs.bed > "no_frac/pdac_serum/$out"
done &> log/no_frac_pdac_intersect.out

In [24]:
%%bash
# Fraction version for PDAC serum: same intersection as the no-fraction run,
# but -f 0.9 requires the overlap to cover at least 90% of the A (read) entry.
mkdir -p log frac/pdac_serum  # redirects below fail if these directories are missing
for f in data/pdac_serum/*.filter.bed; do
    out=$(basename "$f")
    out=${out/.filter.bed/.oncRNA.bed}
    # Echo the command so the log records what was run.
    echo "intersectBed -s -wo -f 0.9 -a $f -b data/pancancer_filter_oncRNAs.bed > frac/pdac_serum/$out"
    # Quote expansions so paths containing spaces or glob characters are safe.
    intersectBed -s -wo -f 0.9 -a "$f" -b data/pancancer_filter_oncRNAs.bed > "frac/pdac_serum/$out"
done &> log/frac_pdac_intersect.out

# BRCA

In [31]:
%%bash
# No-fraction version for BRCA serum: intersect each filtered BED file with the
# oncRNA annotation. -s requires same-strand overlaps; -wo writes the original
# A and B entries followed by the number of overlapping base pairs.
# The input directory can be overridden through ISPY_BED_DIR; the default is
# the location this notebook was originally run against.
bed_dir="${ISPY_BED_DIR:-/rumi/shams/jwang/ISPY/data/bedfiles}"
mkdir -p log no_frac/brca_serum  # redirects below fail if these directories are missing
for f in "$bed_dir"/*.filter.bed; do
    out=$(basename "$f")
    out=${out/.filter.bed/.oncRNA.bed}
    # Echo the command so the log records what was run.
    echo "intersectBed -s -wo -a $f -b data/pancancer_filter_oncRNAs.bed > no_frac/brca_serum/$out"
    # Quote expansions so paths containing spaces or glob characters are safe.
    intersectBed -s -wo -a "$f" -b data/pancancer_filter_oncRNAs.bed > "no_frac/brca_serum/$out"
done &> log/no_frac_brca_intersect.out

In [32]:
%%bash
# Fraction version for BRCA serum: same intersection as the no-fraction run,
# but -f 0.9 requires the overlap to cover at least 90% of the A (read) entry.
# The input directory can be overridden through ISPY_BED_DIR; the default is
# the location this notebook was originally run against.
bed_dir="${ISPY_BED_DIR:-/rumi/shams/jwang/ISPY/data/bedfiles}"
mkdir -p log frac/brca_serum  # redirects below fail if these directories are missing
for f in "$bed_dir"/*.filter.bed; do
    out=$(basename "$f")
    out=${out/.filter.bed/.oncRNA.bed}
    # Echo the command so the log records what was run.
    echo "intersectBed -s -wo -f 0.9 -a $f -b data/pancancer_filter_oncRNAs.bed > frac/brca_serum/$out"
    # Quote expansions so paths containing spaces or glob characters are safe.
    intersectBed -s -wo -f 0.9 -a "$f" -b data/pancancer_filter_oncRNAs.bed > "frac/brca_serum/$out"
done &> log/frac_brca_intersect.out

# Create Count Matrix

In [56]:
# Collect the entries under "frac" whose names end in "serum".
frac_directories = list(
    filter(lambda entry: entry.name.endswith("serum"), os.scandir("frac"))
)
# Show how many were found.
len(frac_directories)

3

In [57]:
# Build per-sample oncRNA locus counts from the intersect outputs under "frac".
#   frac_sample_loci[sample][locus] -> number of reads assigned to that locus
#   frac_empty                      -> samples whose intersect output file is empty
frac_sample_loci = {}
frac_empty = []
# os.scandir yields entries in arbitrary order; sort by name so the resulting
# dictionaries (and anything serialised from them) are reproducible across runs.
for cancer in sorted(frac_directories, key=lambda entry: entry.name):
    cell_lines = sorted(
        (f for f in os.scandir(cancer) if f.name.endswith(".oncRNA.bed")),
        key=lambda entry: entry.name,
    )
    for f in cell_lines:
        # Derive the sample label from the file name (text before the first ".").
        sample = f.name.split(".")[0]
        if "CRC" in sample or "H" in sample:
            # NOTE(review): presumably colorectal / healthy-control samples that
            # are excluded from this analysis - confirm against the sample sheet.
            continue
        elif "T" in sample:
            splits = sample.split("_")
            sample = f"PRAD_{splits[0]}"
        elif "PNC" in sample:
            splits = sample.split("_")
            sample = splits[0]
        else:
            sample = f"ISPY_{sample}"

        # pd.read_csv raises on an empty file, so record empty outputs and move on.
        if os.path.getsize(f) == 0:
            frac_empty.append(sample)
            continue
        oncRNA_bed = pd.read_csv(f, header=None, sep="\t")
        # Column positions assume `intersectBed -wo` on two 6-column BED files - TODO confirm.
        loci_features = oncRNA_bed[9] #Locus annotation as a feature.
        loci_bp_overlaps = oncRNA_bed[12] #Number of bp overlaps between read and locus annotation.
        read_ids = oncRNA_bed[3] #Query ID from original bamfile for each read.

        # Assign each read to the single locus with the largest bp overlap.
        # Iterating the columns with zip avoids a per-row .iloc lookup, which
        # is very slow on large tables; the strict ">" keeps the first hit on ties.
        read_id_locus_map = {}
        for _id, locus, num_bp_match in zip(read_ids, loci_features, loci_bp_overlaps):
            best = read_id_locus_map.get(_id)
            if best is None or num_bp_match > best["bp"]:
                read_id_locus_map[_id] = {"locus": locus, "bp": num_bp_match}

        assert len(read_id_locus_map) == len(read_ids.unique()) #Ensures we do not overcount reads

        # Count how many reads were assigned to each locus.
        locus_counts = {}
        for hit in read_id_locus_map.values():
            locus_counts[hit["locus"]] = locus_counts.get(hit["locus"], 0) + 1
        frac_sample_loci[sample] = locus_counts
                

In [58]:
# Collect the entries under "no_frac" whose names end in "serum".
no_frac_directories = list(
    filter(lambda entry: entry.name.endswith("serum"), os.scandir("no_frac"))
)
# Show how many were found.
len(no_frac_directories)

3

In [59]:
# Build per-sample oncRNA locus counts from the intersect outputs under "no_frac".
#   sample_loci[sample][locus] -> number of reads assigned to that locus
#   empty                      -> samples whose intersect output file is empty
sample_loci = {}
empty = []
# os.scandir yields entries in arbitrary order; sort by name so the resulting
# dictionaries (and anything serialised from them) are reproducible across runs.
for cancer in sorted(no_frac_directories, key=lambda entry: entry.name):
    cell_lines = sorted(
        (f for f in os.scandir(cancer) if f.name.endswith(".oncRNA.bed")),
        key=lambda entry: entry.name,
    )
    for f in cell_lines:
        # Derive the sample label from the file name (text before the first ".").
        sample = f.name.split(".")[0]
        if "CRC" in sample or "H" in sample:
            # NOTE(review): presumably colorectal / healthy-control samples that
            # are excluded from this analysis - confirm against the sample sheet.
            continue
        elif "T" in sample:
            splits = sample.split("_")
            sample = f"PRAD_{splits[0]}"
        elif "PNC" in sample:
            splits = sample.split("_")
            sample = splits[0]
        else:
            sample = f"ISPY_{sample}"

        # pd.read_csv raises on an empty file, so record empty outputs and move on.
        if os.path.getsize(f) == 0:
            empty.append(sample)
            continue

        oncRNA_bed = pd.read_csv(f, header=None, sep="\t")
        # Column positions assume `intersectBed -wo` on two 6-column BED files - TODO confirm.
        loci_features = oncRNA_bed[9] #Locus annotation as a feature.
        loci_bp_overlaps = oncRNA_bed[12] #Number of bp overlaps between read and locus annotation.
        read_ids = oncRNA_bed[3] #Query ID from original bamfile for each read.

        # Assign each read to the single locus with the largest bp overlap.
        # Iterating the columns with zip avoids a per-row .iloc lookup, which
        # is very slow on large tables; the strict ">" keeps the first hit on ties.
        read_id_locus_map = {}
        for _id, locus, num_bp_match in zip(read_ids, loci_features, loci_bp_overlaps):
            best = read_id_locus_map.get(_id)
            if best is None or num_bp_match > best["bp"]:
                read_id_locus_map[_id] = {"locus": locus, "bp": num_bp_match}

        assert len(read_id_locus_map) == len(read_ids.unique()) #Ensures we do not overcount reads

        # Count how many reads were assigned to each locus.
        locus_counts = {}
        for hit in read_id_locus_map.values():
            locus_counts[hit["locus"]] = locus_counts.get(hit["locus"], 0) + 1
        sample_loci[sample] = locus_counts
                

In [60]:
# Number of samples with an empty intersect output: (no-fraction run, fraction run).
tuple(len(names) for names in (empty, frac_empty))

(1, 1)

In [61]:
# Names of the samples with an empty intersect output: (no-fraction run, fraction run).
(empty, frac_empty)

(['ISPY_S90'], ['ISPY_S90'])

In [62]:
# Persist the fraction-based per-sample locus counts as JSON.
os.makedirs("data/counts", exist_ok=True)  # open() fails if the directory is missing
# The with-statement closes the file on exit, so no explicit close() is needed.
with open("data/counts/frac_sample_loci.json", "w") as f:
    js.dump(frac_sample_loci, f)

In [63]:
# Persist the no-fraction per-sample locus counts as JSON.
os.makedirs("data/counts", exist_ok=True)  # open() fails if the directory is missing
# The with-statement closes the file on exit, so no explicit close() is needed.
with open("data/counts/sample_loci.json", "w") as f:
    js.dump(sample_loci, f)

# Done
Finished preprocessing