# Cancer Cell Lines Enriched Loci: DE Subtype Analysis
The goal of this notebook is to identify smRNA loci that are enriched in cell lines of a specific cancer subtype but not in normal cell lines, using a DESeq2-based differential expression analysis. Each cancer subtype is compared against the normal cell lines in a separate DESeq2 run.

In [1]:
import pandas as pd
import json as js
import warnings
import subprocess
warnings.filterwarnings("ignore")
import rpy2.ipython
%load_ext rpy2.ipython

In [2]:
%%R 
library(DESeq2)
library(EnhancedVolcano)

R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int

## Load Counts

In [3]:
# Load the per-sample locus counts. The JSON is indexed below as
# sample_loci[sample_name][locus] -> count.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/counts/cell_lines_loci_counts.json', 'r') as f:
    sample_loci = js.load(f)

In [4]:
# Filter out LM2_2_cell_S37 from consideration.
# pop() with a default keeps this cell idempotent: re-running it after the
# sample has already been removed is a no-op instead of a KeyError.
sample_loci.pop("LM2_2_cell_S37", None)

## DESeq: Subtype vs Normal

In [5]:
# Names of all samples that remain in the counts dictionary, in load order.
cell_lines = [*sample_loci]

In [6]:
# Cell line name (the part of a sample name before the first "_") -> subtype
# label used as the DESeq condition. "Normal" marks the reference group.
subtype_map = {
    "MDA453": "HER2",
    "SKBR-3": "HER2",
    "HUMEC": "Normal",
    "ZR75-1": "Luminal",
    "HCC38": "TNBC",
    "CN34": "TNBC",
    "MCF7": "Luminal",
    "MDA231": "TNBC",
    "LM2": "Met",
    "LM1a": "Met",
}

In [7]:
# Full sample name -> short column label: the text after the last underscore
# (e.g. "LM2_2_cell_S37" -> "S37").
col_name_map = {full_name: full_name.rsplit("_", 1)[-1] for full_name in cell_lines}

In [8]:
# Run one DESeq comparison per cancer subtype against the normal (HUMEC)
# samples. For every subtype this cell:
#   1. builds the sample table (coldata) for the subtype + normal samples,
#   2. builds the raw count matrix (loci x samples) for those samples,
#   3. writes both to results/DESeq_subtype/ as CSV,
#   4. runs scripts/DESeq_subtype.R on them.
subtypes = set(subtype_map.values())
normal_cell_lines = [s for s in cell_lines if "HUMEC" in s]

# Iterate in sorted order: a set of strings has no stable iteration order
# across kernel sessions, which would make the run order non-reproducible.
for sub in sorted(s for s in subtypes if s != "Normal"):
    subtype_cell_lines = [s for s in cell_lines if subtype_map[s.split("_")[0]] == sub]
    deseq_cell_lines = subtype_cell_lines + normal_cell_lines

    # Reduce each sample name to just its sample number.
    sample_names = [col_name_map[name] for name in deseq_cell_lines]
    if len(set(sample_names)) != len(sample_names):
        # Colliding short names would make matrix columns / coldata rows ambiguous.
        raise ValueError(f"Sample numbers are not unique for {sub} vs Normal: {sample_names}")

    condition = ["Normal" if "HUMEC" in name else sub for name in deseq_cell_lines]
    coldata = pd.DataFrame({"condition": condition,
                            "type": ["single_read"] * len(condition),
                            "full_name": deseq_cell_lines})
    coldata.index = sample_names
    coldata = coldata.sort_index()

    # Hoisted once: iterating the pandas Series again for every locus is slow.
    ordered_samples = coldata["full_name"].tolist()

    # Union of loci seen in any of the samples taking part in this comparison.
    # NOTE(review): the input counts are presumably already filtered against
    # healthy exRNA - confirm upstream; no such filtering happens here.
    unique_loci = set()
    for cell_line in ordered_samples:
        unique_loci.update(sample_loci[cell_line])

    # Raw counts per locus and sample; a locus absent from a sample counts as 0.
    # Loci are sorted so the row order of the written matrix is reproducible.
    loci_counts = {
        locus: {cell_line: sample_loci[cell_line].get(locus, 0) for cell_line in ordered_samples}
        for locus in sorted(unique_loci)
    }

    # Rows = loci, columns = short sample names, sorted the same way as the
    # coldata index so that columns and coldata rows line up.
    cm = pd.DataFrame(loci_counts).T
    cm = cm.rename(columns=col_name_map)
    cm = cm.sort_index(axis=1)
    cm_out = f"results/DESeq_subtype/{sub}vNormal_cm.csv"
    cm.to_csv(cm_out)

    coldata_out = f"results/DESeq_subtype/{sub}vNormal_coldata.csv"
    coldata.to_csv(coldata_out)

    # Pass an argument list (no shell) and fail loudly: with the exit status
    # ignored, a failed R run would let later cells read stale result files.
    cmd = ["Rscript", "scripts/DESeq_subtype.R",
           "-m", cm_out, "-d", coldata_out,
           "-c1", sub, "-c2", "Normal",
           "-o", "results/DESeq_subtype/"]
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)

Rscript scripts/DESeq_subtype.R -m results/DESeq_subtype/LuminalvNormal_cm.csv -d results/DESeq_subtype/LuminalvNormal_coldata.csv -c1 Luminal -c2 Normal -o results/DESeq_subtype/
Rscript scripts/DESeq_subtype.R -m results/DESeq_subtype/MetvNormal_cm.csv -d results/DESeq_subtype/MetvNormal_coldata.csv -c1 Met -c2 Normal -o results/DESeq_subtype/
Rscript scripts/DESeq_subtype.R -m results/DESeq_subtype/HER2vNormal_cm.csv -d results/DESeq_subtype/HER2vNormal_coldata.csv -c1 HER2 -c2 Normal -o results/DESeq_subtype/
Rscript scripts/DESeq_subtype.R -m results/DESeq_subtype/TNBCvNormal_cm.csv -d results/DESeq_subtype/TNBCvNormal_coldata.csv -c1 TNBC -c2 Normal -o results/DESeq_subtype/


## Extract Significant Loci

In [9]:
# Every locus observed in at least one HUMEC (normal) sample.
humec_loci = set().union(
    *(loci for sample, loci in sample_loci.items() if "HUMEC" in sample)
)
len(humec_loci)

408011

In [10]:
# For each cancer subtype, read the DESeq results table, keep the loci that
# pass the enrichment thresholds and were never seen in a HUMEC sample, and
# write them as a 6-column BED file (chrom, start, end, name, score, strand).
PVALUE_MAX = 0.1   # keep rows with pvalue <= this value
LOG2FC_MIN = 1     # keep rows with log2FoldChange >= this value (enriched in the subtype)

# Sorted for a reproducible processing order (set order varies between kernels).
for sub in sorted(s for s in subtypes if s != "Normal"):
    results = pd.read_csv(f"results/DESeq_subtype/{sub}vNormal_dds_res.csv")

    # Select for enriched loci.
    # NOTE(review): this filters on the raw `pvalue` column, not a
    # multiple-testing-adjusted one - confirm this is intended.
    sig_results = results[(results["pvalue"] <= PVALUE_MAX)
                          & (results["log2FoldChange"] >= LOG2FC_MIN)]

    # Drop every locus that is found in any HUMEC sample.
    humec_fil_sig_loci = set(sig_results["row"]) - humec_loci

    with open(f"results/DESeq_subtype/{sub}_sig_loci_deseq.bed", "wt") as out:
        # Locus ids are parsed as "<chrom>:<start>-<end>:...:<strand>".
        # Lexicographic sort: not genomic order, but the file is identical
        # from run to run instead of following arbitrary set order.
        for locus in sorted(humec_fil_sig_loci):
            fields = locus.split(":")
            start, end = fields[1].split("-")[:2]
            out.write(f"{fields[0]}\t{start}\t{end}\t{locus}\t.\t{fields[-1]}\n")

# Done