# ETV4AAA-induced mouse prostate malignancy profiled with 10x Chromium

Once the relevant libraries and modules are installed on the system, they must be loaded. 
The scanpy API gives access to all tools available via the scanpy platform. 
Numpy and scipy are scientific computing libraries that are used by scanpy, and 
pandas is used to load in the data. 
Pandas is also the basis of the data structures used in scanpy (the AnnData module). 
Matplotlib and seaborn are used for plotting.

R libraries that are used in this tutorial include:
- scran for preprocessing
- MAST for differential expression analysis


In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import GProfiler
import rpy2.rinterface_lib.callbacks
import logging
from rpy2.robjects import pandas2ri
import anndata2ri
import doubletdetection as dd
from gseapy.plot import barplot, dotplot
import os, sys
import gseapy as gp
from anndata import AnnData

import scipy.stats
import diffxpy.api as de
import anndata
from batchglm.api.models.tf1.glm_nb import Simulator

# Session setup: R <-> Python conversion, plotting defaults, logging and notebook magics.

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# Enable anndata2ri's automatic conversion for AnnData objects passed to/from R cells
anndata2ri.activate()


# NOTE(review): sc.settings.set_figure_params() two lines below may reset
# figure.figsize to scanpy's default — confirm the (8, 8) size actually takes
# effect, or pass figsize= to set_figure_params instead.
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 0   # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=200, dpi_save=300, facecolor='white')
# Print the versions of the loaded packages so the run can be reproduced
sc.logging.print_versions()
sc.logging.print_header()

# Ignore R warning messages
# Note: comment the next line out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
# IPython magic: makes the %%R cell magic available
%load_ext rpy2.ipython

# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline

# Set after set_figure_params() so this font size is the one in effect
plt.rcParams.update({'font.size': 22})

import matplotlib
# Font type 42 (TrueType) for PDF/PS output — presumably so text stays editable
# in exported vector figures; confirm against the matplotlib rcParams docs.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

-----
anndata     0.7.5
scanpy      1.6.1
sinfo       0.3.1
-----
Crypto                              3.10.1
OpenSSL                             20.0.1
PIL                                 8.1.0
absl                                NA
anndata                             0.7.5
anndata2ri                          1.0.6
anyio                               NA
appdirs                             1.4.4
astunparse                          1.6.3
attr                                21.2.0
babel                               2.9.0
backcall                            0.2.0
batchglm                            v0.7.4
bcrypt                              3.2.0
bioservices                         1.7.11
boto3                               1.17.107
botocore                            1.20.107
brotli                              NA
bs4                                 4.9.3
cairo                               1.20.0
certifi                             2021.05.30
cffi                                1.14.4
c

In [2]:
%%R
# Environment-specific setup, kept for reference only (everything here is commented out):
# points R at a user library path and disables staged installs.
#.libPaths("/home/dan/R/x86_64-pc-linux-gnu-library/4.03")
#Sys.setenv(R_INSTALL_STAGED = FALSE)

# The next three lines are shell commands, not R — presumably system packages
# needed to build the R libraries below; confirm before running them.
#sudo apt-get install aptitude

#sudo apt-get install libcurl4-openssl-dev

#sudo apt-get install libxml2-dev

# Load all the R libraries we will be using in the notebook
library(scran)
library(RColorBrewer)
library(clusterExperiment)
library(ggplot2)
library(plyr)
library(MAST)
# Currently disabled:
#library(scCB2)
#library(DropletUtils)
library(Seurat)
library(NMF)
library(BiocFileCache)

In [None]:
# Data-loading configuration for the first data set.

# Genotypes, time points and replicate ids; together they form the directory
# name <file_base><sample><exp_string><sample_id>/ of each 10x run.
sample_strings = ["TY", "EWT", "E4A", "E4Ap53"]
exp_strings = ["_2weeks_", "_4months_"]
sample_id_strings = ["1", "2", "3"]

# Root directory and the three files expected inside every run directory.
file_base = "./data/"
data_file_end = "matrix.mtx.gz"
barcode_file_end = "barcodes.tsv.gz"
gene_file_end = "features.tsv.gz"

# NOTE(review): a later configuration cell sets cc_genes_file to a different
# file name — confirm which cell-cycle gene list is the intended one.
cc_genes_file = "./Macosko_cell_cycle_genes.txt"

In [None]:
# First data set: build the three input file names.
#
# The first sample / time point / replicate are selected by index rather than
# with pop(0): popping mutates the configuration lists, so re-running this
# cell would silently pick a different data set each time and eventually
# raise IndexError once a list is empty.
sample = sample_strings[0]
sample_id = sample_id_strings[0]
exp_string = exp_strings[0]

# Run directory is <file_base><sample><exp_string><sample_id>/; the file names
# inside it come from the *_file_end constants of the configuration cell.
sample_dir = file_base + sample + exp_string + sample_id + '/'
data_file = sample_dir + data_file_end
barcode_file = sample_dir + barcode_file_end
gene_file = sample_dir + gene_file_end

In [None]:
# Read the count matrix with scanpy (cache=True keeps a cached copy for later
# runs) and transpose it, so that rows line up with the barcodes that are
# attached to adata.obs further down and columns with the genes in adata.var.
adata = sc.read(data_file, cache=True).transpose()

In [None]:
# rpy2 does not support sparse matrices, so none of the integrated R commands
# would work on them; convert the counts to a dense array up front.
adata.X = adata.X.toarray()

# Barcode and feature tables are tab-separated and have no header row, so
# their columns are labelled 0, 1, ... until they are renamed.
barcodes = pd.read_table(barcode_file, header=None)
genes = pd.read_table(gene_file, header=None)


In [None]:
# Preview the raw feature table; it was read without a header, so the columns are still labelled 0, 1, ...
genes.head()

In [None]:
# Annotate cells: name the single barcode column, turn it into the index and
# use the resulting frame as the per-cell annotation table.
barcodes = barcodes.rename(columns={0: 'barcode'}).set_index('barcode')
adata.obs = barcodes

In [None]:
# Check the cell table right after attaching the barcodes: indexed by barcode, metadata columns not added yet
adata.obs

In [None]:
# Every cell of this first data set carries the same metadata.
# exp_string has the form '_<age>_', so the age label is the middle field.
age_label = exp_string.split("_")[1]
first_set_labels = {
    'sample': sample,
    'age': age_label,
    'batch': sample_id,
    'genotype_age': sample + '_' + age_label,
}
for column, label in first_set_labels.items():
    adata.obs[column] = [label] * adata.n_obs

In [None]:
# Check that the sample, age, batch and genotype_age columns were added
adata.obs

In [None]:
# Full raw feature table, before its columns are renamed in the next cell
genes

In [None]:
# Annotate genes: name the first two feature columns, index the table by gene
# symbol and use it as the per-gene annotation table.
genes = genes.rename(columns={0: 'gene_id', 1: 'gene_symbol'}).set_index('gene_symbol')
adata.var = genes

In [None]:
# The feature table is now indexed by gene symbol
genes.index

In [None]:
# The index is the gene symbol, which may repeat; make the variable names unique before going on
adata.var_names_make_unique()

In [None]:
# Columns left in the gene table after gene_symbol became the index (gene_id plus any unnamed extra columns from the file)
adata.var.columns

In [None]:
# Final look at the cell annotations of the first data set
adata.obs

In [None]:
# Gene ids of the first data set, indexed by the (uniquified) gene symbol
adata.var['gene_id']

In [None]:
# Set up data loading for all data sets.

# Data files
sample_strings = ['TY', 'EWT', 'E4A', 'E4Ap53']
file_base = './data/'
exp_strings = ['_2weeks_', '_4months_']
sample_id_strings = ['1', '2', '3']
data_file_end = 'matrix.mtx.gz'
barcode_file_end = 'barcodes.tsv.gz'
gene_file_end = 'features.tsv.gz'
# NOTE(review): the first configuration cell uses a different cell-cycle gene
# file name — confirm which one is intended.
cc_genes_file = './regev_lab_cell_cycle_genes.txt'

# Build the matrix / barcode / feature paths for every
# genotype x time point x replicate combination, in that nesting order.
# Each run lives in <file_base><sample><exp_string><sample_id>/.
data_file = []
barcode_file = []
gene_file = []
# `i` is deliberately kept as the outer loop index: the inspection cells that
# follow read data_file[i].
for i, sample in enumerate(sample_strings):
    for exp_string in exp_strings:
        for sample_id in sample_id_strings:
            sample_dir = file_base + sample + exp_string + sample_id + '/'
            data_file.append(sample_dir + data_file_end)
            barcode_file.append(sample_dir + barcode_file_end)
            gene_file.append(sample_dir + gene_file_end)

In [None]:
# Number of data sets to load: one path per sample x time point x replicate combination
len(data_file)

In [None]:
# Sanity check of the path parsing used by the loading loop: first field of the run directory name (the sample).
# NOTE: `i` is whatever value the previous loop left behind, not a deliberately chosen index.
[data_file[i].split("/")[2].split("_")[0]]

In [None]:
# Same check for the second field of the run directory name (the age); `i` is left over from the previous loop
[data_file[i].split("/")[2].split("_")[1]]

In [None]:
# Same check for the third field of the run directory name (the replicate, stored as 'batch'); `i` is left over from the previous loop
[data_file[i].split("/")[2].split("_")[2]]

In [None]:
# Same check for the combined '<sample>_<age>' label stored as 'genotype_age'; `i` is left over from the previous loop
[data_file[i].split("/")[2].split("_")[0]+'_'+data_file[i].split("/")[2].split("_")[1]]

In [None]:
# Load every remaining data set and append it to `adata`.
# Entry 0 of the path lists is skipped: it is the run that the earlier cells
# already loaded into `adata`.
for i, (counts_path, barcodes_path, features_path) in enumerate(
        zip(data_file[1:], barcode_file[1:], gene_file[1:]), start=1):
    # Paths look like ./data/<sample>_<age>_<replicate>/<file>, so the third
    # path component carries the three metadata fields.
    name_fields = counts_path.split("/")[2].split("_")
    genotype, age, replicate = name_fields[0], name_fields[1], name_fields[2]

    # Load the counts (transposed and densified, as for the first data set)
    adata_tmp = sc.read(counts_path, cache=True).transpose()
    adata_tmp.X = adata_tmp.X.toarray()

    barcodes_tmp = pd.read_csv(barcodes_path, header=None, sep='\t')
    genes_tmp = pd.read_csv(features_path, header=None, sep='\t')

    # Annotate cells: barcode index plus the metadata parsed from the path
    barcodes_tmp.rename(columns={0: 'barcode'}, inplace=True)
    barcodes_tmp.set_index('barcode', inplace=True)
    adata_tmp.obs = barcodes_tmp
    adata_tmp.obs['sample'] = [genotype] * adata_tmp.n_obs
    adata_tmp.obs['age'] = [age] * adata_tmp.n_obs
    adata_tmp.obs['batch'] = [replicate] * adata_tmp.n_obs
    adata_tmp.obs['genotype_age'] = [genotype + '_' + age] * adata_tmp.n_obs

    # Annotate genes: index by gene symbol and make the symbols unique
    genes_tmp.rename(columns={0: 'gene_id', 1: 'gene_symbol'}, inplace=True)
    genes_tmp.set_index('gene_symbol', inplace=True)
    adata_tmp.var = genes_tmp
    adata_tmp.var_names_make_unique()

    # Concatenate to the main adata object
    adata = adata.concatenate(adata_tmp, batch_key='sample_id')
    # If concatenation produced per-batch 'gene_id-0' / 'gene_id-1' columns,
    # collapse them back into one 'gene_id' column (taken from the new data set).
    if 'gene_id-1' in adata.var.columns:
        adata.var['gene_id'] = adata.var['gene_id-1']
        adata.var.drop(columns=['gene_id-1', 'gene_id-0'], inplace=True)
    # The temporary batch column is not needed; sample/age/batch already identify the run
    adata.obs.drop(columns=['sample_id'], inplace=True)
    # Keep only the text before the first '-' of each cell name, then
    # re-uniquify, because the same barcode can occur in several runs.
    adata.obs_names = [cell_name.partition("-")[0] for cell_name in adata.obs_names]
    adata.obs_names_make_unique()

In [None]:
# Cell annotations of the merged data set.
# Recorded from an earlier run: 64003 rows × 3 columns.
# NOTE(review): the loading code adds four columns (sample, age, batch, genotype_age) — re-check this figure.
adata.obs

In [None]:
# Cell names of the merged data set (the index of adata.obs); use the output to check how many cells there are
adata.obs_names

In [None]:
# Gene annotations of the merged data set.
# Recorded from an earlier run: 31060 rows × 2 columns.
adata.var

In [None]:
# Gene names of the merged data set, before any prefix is stripped
adata.var_names

In [None]:
# Gene ids of the merged data set.
# Recorded from an earlier run: Length: 27998.
# NOTE(review): this differs from the 31060 rows recorded for adata.var — confirm which figure is current.
adata.var['gene_id']

In [None]:
# Gene ids once more, as the "before" state for the prefix stripping in the next cell
adata.var['gene_id']

In [None]:
# Keep only the text after the last underscore in every gene name and gene id
# (names without an underscore are left unchanged).
# NOTE(review): presumably this removes a genome prefix; a gene symbol that
# itself contains '_' would be truncated as well — confirm this is intended.
adata.var_names = [name.rpartition("_")[2] for name in adata.var_names]
adata.var['gene_id'] = [gene_id.rpartition("_")[2] for gene_id in adata.var['gene_id']]

In [None]:
# Gene ids after stripping everything up to the last underscore
adata.var['gene_id']

In [None]:
# Gene names after stripping everything up to the last underscore
adata.var_names

In [None]:
# Gene annotations after the clean-up.
# Recorded from an earlier run: 31060 rows × 2 columns.
adata.var

In [None]:
# Cell annotations after the clean-up.
# Recorded from an earlier run: 64003 rows × 3 columns.
# NOTE(review): the loading code adds four columns (sample, age, batch, genotype_age) — re-check this figure.
adata.obs

In [None]:
# Per-cell sample label (first field of the run directory name)
adata.obs['sample']

In [None]:
# Per-cell age label (second field of the run directory name)
adata.obs['age']

In [None]:
# Per-cell replicate id (third field of the run directory name)
adata.obs['batch']

In [None]:
# Summarise the annotations: number of cells per sample, per age and per
# batch, with a blank line between the three tables.
for position, column in enumerate(('sample', 'age', 'batch')):
    if position:
        print('')
    print(adata.obs[column].value_counts())

In [None]:
# Check the total size of the merged data set: (number of cells, number of genes)
adata.shape

In [None]:
import pickle

# Checkpoint: serialise the merged AnnData object to disk so that the loading
# steps above do not have to be repeated in a later session.
with open('adata.pkl', 'wb') as pickle_out:
    pickle.dump(adata, pickle_out)

In [None]:
import pickle

# Restore the checkpoint written by the previous cell.
# NOTE: unpickling executes code stored in the file — only load an
# 'adata.pkl' that this notebook produced itself.
with open('adata.pkl', 'rb') as pickle_in:
    adata = pickle.load(pickle_in)