# ETV4AAA-induced mouse prostate malignancy profiled with 10x Chromium

In [1]:
import rpy2.rinterface_lib.callbacks
import logging
# Raise the threshold of rpy2's callback logger to ERROR so that R warning
# messages are not echoed into the notebook.
# Note: comment out the next line to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
# Load the rpy2 IPython extension (needed for the %%R cells used below).
%load_ext rpy2.ipython

In [14]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import GProfiler
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import anndata2ri
import doubletdetection as dd

# Session-wide scanpy configuration.
# n_jobs is hard-coded to 12 here; the scCB2 cell further down also asks for 12 cores.
sc.settings.n_jobs = 12
# Apply scanpy's figure parameters with all arguments left at their defaults.
sc.set_figure_params()

## Preprocessing

### Read the cellranger output

In [2]:
# Naming scheme of the raw run directories:
#   <file_base><sample><exp><sample id>, e.g. ./ETV4/raw/TY_2weeks_1
file_base = './ETV4/raw/'
sample_strings = ['TY', 'EWT', 'E4A', 'E4Ap53']
exp_strings = ['_2weeks_', '_4months_']
sample_id_strings = ['1', '2', '3']

# File-name suffixes and the cell-cycle gene list; defined here but not
# referenced in this cell.
data_file_end = 'matrix.mtx.gz'
barcode_file_end = 'barcodes.tsv.gz'
gene_file_end = 'features.tsv.gz'
cc_genes_file = './regev_lab_cell_cycle_genes.txt'

# Enumerate every sample x experiment x sample-id combination, with the
# sample label varying slowest and the sample id varying fastest.
data_file = [
    file_base + sample + exp_string + sample_id
    for sample in sample_strings
    for exp_string in exp_strings
    for sample_id in sample_id_strings
]
# The barcode and gene lists contain the same directory paths; they are kept
# as separate list objects so that changing one does not affect the others.
barcode_file = list(data_file)
gene_file = list(data_file)

In [3]:
# Display the assembled list of raw-data directories as the cell output.
data_file

['./ETV4/raw/TY_2weeks_1',
 './ETV4/raw/TY_2weeks_2',
 './ETV4/raw/TY_2weeks_3',
 './ETV4/raw/TY_4months_1',
 './ETV4/raw/TY_4months_2',
 './ETV4/raw/TY_4months_3',
 './ETV4/raw/EWT_2weeks_1',
 './ETV4/raw/EWT_2weeks_2',
 './ETV4/raw/EWT_2weeks_3',
 './ETV4/raw/EWT_4months_1',
 './ETV4/raw/EWT_4months_2',
 './ETV4/raw/EWT_4months_3',
 './ETV4/raw/E4A_2weeks_1',
 './ETV4/raw/E4A_2weeks_2',
 './ETV4/raw/E4A_2weeks_3',
 './ETV4/raw/E4A_4months_1',
 './ETV4/raw/E4A_4months_2',
 './ETV4/raw/E4A_4months_3',
 './ETV4/raw/E4Ap53_2weeks_1',
 './ETV4/raw/E4Ap53_2weeks_2',
 './ETV4/raw/E4Ap53_2weeks_3',
 './ETV4/raw/E4Ap53_4months_1',
 './ETV4/raw/E4Ap53_4months_2',
 './ETV4/raw/E4Ap53_4months_3']

### Filter the raw data using scCB2

In [4]:
%%R -i data_file
# Preview the output directory that corresponds to each raw input directory.
for (raw_dir in data_file) {
    filtered_dir <- paste0("./ETV4/filtered/", basename(raw_dir))
    print(filtered_dir)
}

[1] "./ETV4/filtered/TY_2weeks_1"
[1] "./ETV4/filtered/TY_2weeks_2"
[1] "./ETV4/filtered/TY_2weeks_3"
[1] "./ETV4/filtered/TY_4months_1"
[1] "./ETV4/filtered/TY_4months_2"
[1] "./ETV4/filtered/TY_4months_3"
[1] "./ETV4/filtered/EWT_2weeks_1"
[1] "./ETV4/filtered/EWT_2weeks_2"
[1] "./ETV4/filtered/EWT_2weeks_3"
[1] "./ETV4/filtered/EWT_4months_1"
[1] "./ETV4/filtered/EWT_4months_2"
[1] "./ETV4/filtered/EWT_4months_3"
[1] "./ETV4/filtered/E4A_2weeks_1"
[1] "./ETV4/filtered/E4A_2weeks_2"
[1] "./ETV4/filtered/E4A_2weeks_3"
[1] "./ETV4/filtered/E4A_4months_1"
[1] "./ETV4/filtered/E4A_4months_2"
[1] "./ETV4/filtered/E4A_4months_3"
[1] "./ETV4/filtered/E4Ap53_2weeks_1"
[1] "./ETV4/filtered/E4Ap53_2weeks_2"
[1] "./ETV4/filtered/E4Ap53_2weeks_3"
[1] "./ETV4/filtered/E4Ap53_4months_1"
[1] "./ETV4/filtered/E4Ap53_4months_2"
[1] "./ETV4/filtered/E4Ap53_4months_3"


In [5]:
%%R -i data_file
library(scCB2)
library(SummarizedExperiment)
# Run scCB2 on every raw 10x directory (three separate files within one
# directory) with FDR controlled at the default 1%, and write the result in
# 10x version-3 layout to ./ETV4/filtered/<directory name>.
out_root <- "./ETV4/filtered/"
# Make sure the output root exists up front rather than relying on
# write10xCounts to create missing parent directories.
dir.create(out_root, recursive = TRUE, showWarnings = FALSE)
for (i in data_file) {
    print(i)
    out_dir <- paste0(out_root, basename(i))
    # write10xCounts() stops when its target path already exists (its
    # `overwrite` argument defaults to FALSE), which used to abort any re-run
    # of this cell at the first finished sample. Skip finished samples instead
    # so the ~2 min per-sample step can be resumed after an interruption.
    # Delete a sample's output directory to force it to be recomputed.
    if (file.exists(out_dir)) {
        print(paste0("Skipping (output already exists): ", out_dir))
        next
    }
    tmp =  QuickCB2(dir = i, FDR_threshold = 0.01, lower = 100, Ncores = 12)
    DropletUtils::write10xCounts(out_dir, tmp, version = "3")
}
# If raw data is in HDF5 format and 
# you'd like a Seurat object under default FDR threshold:
# RealCell_S <-  QuickCB2(h5file = "/path/to/raw/data/HDF5", AsSeurat = TRUE)

[1] "./ETV4/raw/TY_2weeks_1"
Time difference of 2.052453 mins
[1] "./ETV4/raw/TY_2weeks_2"
Time difference of 1.982656 mins
[1] "./ETV4/raw/TY_2weeks_3"
Time difference of 1.811328 mins
[1] "./ETV4/raw/TY_4months_1"
Time difference of 1.830371 mins
[1] "./ETV4/raw/TY_4months_2"
Time difference of 2.286967 mins
[1] "./ETV4/raw/TY_4months_3"
Time difference of 2.253415 mins
[1] "./ETV4/raw/EWT_2weeks_1"
Time difference of 2.275727 mins
[1] "./ETV4/raw/EWT_2weeks_2"
Time difference of 2.043041 mins
[1] "./ETV4/raw/EWT_2weeks_3"
Time difference of 1.533912 mins
[1] "./ETV4/raw/EWT_4months_1"
Time difference of 2.547668 mins
[1] "./ETV4/raw/EWT_4months_2"
Time difference of 2.379622 mins
[1] "./ETV4/raw/EWT_4months_3"
Time difference of 2.137279 mins
[1] "./ETV4/raw/E4A_2weeks_1"
Time difference of 1.793202 mins
[1] "./ETV4/raw/E4A_2weeks_2"
Time difference of 2.240405 mins
[1] "./ETV4/raw/E4A_2weeks_3"
Time difference of 2.739024 mins
[1] "./ETV4/raw/E4A_4months_1"
Time difference of 2.395

In [44]:
%%R -i data_file
# Exploratory check of how an output path can be derived from the first raw
# directory; only the value of the last expression appears in the cell output.
first_dir <- data_file[[1]]
basename(first_dir)
paste0(first_dir, "/../filtered/", basename(first_dir))

[1] "./ETV4/raw/TY_2weeks_1/../filtered/TY_2weeks_1"
