In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import scanpy as sc
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked
from tqdm import tqdm

sc.settings.verbosity = 4
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()
sc.set_figure_params(frameon=False, color_map='magma_r')



  import pandas.util.testing as tm


scanpy==1.5.1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.2 pandas==1.0.5 scikit-learn==0.22 statsmodels==0.10.2 python-igraph==0.7.1 louvain==0.6.1 leidenalg==0.7.0


## Data folder

In [2]:
splicing_folder = '/home/olga/data_sm/splicing/rawdata/rijk_zscore__2020-09-15'
! ls -lha $splicing_folder

total 30G
drwxr-xr-x 2 olga czb 4.0K Sep 15 11:53 .
drwxr-xr-x 4 olga czb 4.0K Sep 15 11:53 ..
-rw-r--r-- 1 olga czb 829M Sep 15 11:29 COVID_pilot2_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 3.5G Sep 15 11:30 COVID_pilot3_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 3.0G Sep 15 11:37 lemur_Antoine_4_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 316M Sep 15 11:37 lemur_Bernard_4_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 333M Sep 15 11:37 lemur_Martine_4_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 1.6G Sep 15 11:38 lemur_Stumpy_4_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 1.8G Sep 15 11:36 TS_10x_redo_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 3.1G Sep 15 11:31 TSP1_SS2_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb 5.5G Sep 15 11:33 TSP2_10x_rerun_3prime_sym_S_0.1_z_0.0_b_5.pq
-rw-r--r-- 1 olga czb  11G Sep 15 11:35 TSP2_SS2_sym_S_0.1_z_0.0_b_5.pq


# Read in data

In [3]:
parquet = f"{splicing_folder}/TS_10x_redo_sym_S_0.1_z_0.0_b_5.pq"
df = pd.read_parquet(parquet)
print(df.shape)
df.head()

Unnamed: 0,inc_emp.p,tissue,gene_count_per_cell_filt,juncPosR1A,geneR1A_uniq,juncPosR1B,numReads,cell,channel,splice_ann,...,cov_ann,z_ann,cov_dom_ch,z_dom_ch,cov_unann,z_unann,cov_dom_unch,z_dom_unch,ontology,ontology_gene
0,True,Pancreas,13.0,86968605.0,ADIRF,86970200.0,5.0,TSP1_exopancreas2_3_S6_L00_ACGTAGTCACTTGGCG,TSP1_exopancreas2_3_S6_L00,True,...,-0.02279,-0.267072,-0.024372,-0.215724,-0.000632,0.050399,,,PancreasEpithelialpancreatic acinar cell,PancreasEpithelialpancreatic acinar cellADIRF
1,True,,10.0,86968605.0,ADIRF,86970200.0,6.0,TSP1_exopancreas2_3_S6_L00_TTCGGTCGTATAATGG,TSP1_exopancreas2_3_S6_L00,True,...,-0.02279,-0.264596,-0.024372,-0.263014,-0.000632,,,,,
2,True,Pancreas,9.0,86968605.0,ADIRF,86970200.0,5.0,TSP1_exopancreas2_3_S6_L00_CTGCTCAGTCCAGTTA,TSP1_exopancreas2_3_S6_L00,True,...,-0.02279,-0.247094,-0.024372,-0.245513,-0.000632,,,,PancreasEpithelialpancreatic acinar cell,PancreasEpithelialpancreatic acinar cellADIRF
3,True,Pancreas,6.0,86968605.0,ADIRF,86970200.0,2.0,TSP1_exopancreas2_3_S6_L00_TCATGCCCATGACCCG,TSP1_exopancreas2_3_S6_L00,True,...,-0.02279,-0.186327,-0.024372,-0.184745,-0.000632,,,,PancreasEpithelialpancreatic ductal cell,PancreasEpithelialpancreatic ductal cellADIRF
4,True,Pancreas,7.0,86968605.0,ADIRF,86970200.0,3.0,TSP1_exopancreas2_3_S6_L00_GGTGGCTGTGCCTACG,TSP1_exopancreas2_3_S6_L00,True,...,-0.02279,-0.208286,-0.024372,-0.206705,-0.000632,,,,PancreasEpithelialpancreatic acinar cell,PancreasEpithelialpancreatic acinar cellADIRF


In [11]:
df.columns

Index(['inc_emp.p', 'tissue', 'gene_count_per_cell_filt', 'juncPosR1A',
       'geneR1A_uniq', 'juncPosR1B', 'numReads', 'cell', 'channel',
       'splice_ann', 'compartment', 'free_annotation', 'missing_domains',
       'domain_insertions', 'refName_newR1', 'gene_frac_filt', 'geneR1B_uniq',
       'sign', 'cell_gene', 'posA_group', 'posB_group', 'rank_acc', 'rank_don',
       'max_rank_acc', 'max_rank_don', 'num_missing_A', 'num_inserted_A',
       'domain_changed_A', 'num_missing_B', 'num_inserted_B',
       'domain_changed_B', 'noncon_count', 'max_rank', 'sum_reads_group',
       'read_x_acc', 'num', 'rank_mean', 'sq_diff', 'don_num', 'don_sigma',
       'S_ijk_A', 'S_ijk_A_unpinned', 'n_sijk', 'n_s', 'n_gene', 'sijkA_mean',
       'sd_num', 'sijkA_var', 'n_g', 'nSijkA', 'mult', 'z_A', 'cell_gene_junc',
       'x_sijk', 'denom_sq', 'temp', 'temp_mag', 'idxmax_z', 'junc_max_A',
       'max_don_z_A', 'num_ann', 'num_dom_ch', 'z_A_ann', 'z_A_dom_ch',
       'num_unann', 'num_dom_unch',

# function for multiprocessing splicing

In [16]:
%%file splicing_utils.py

import itertools
from functools import partial
import multiprocessing
import tempfile
import time

import numpy as np
import pandas as pd
from sourmash.logging import notify


def process_splicing(filename, gene='geneR1A_uniq', cell='cell', tissue=None):
    splicing_df = pd.read_parquet(filename)
    
    # Drop duplicate cell ids and gene names
    splicing_df_no_dups = splicing_df.drop_duplicates([cell, gene])
    
    if tissue is not None:
        splicing_df_no_dups = splicing_df_no_dups.query('tissue == @tissue')
        
    
    # Don't use rows with empty gene names -- these are unannotated genes
    splicing_df_no_dups = splicing_df_no_dups.query(f'{gene} != ""')
    print(splicing_df_no_dups.shape)
    splicing_df_no_dups.head()
    
    splicing2d = splicing_df_no_dups.pivot(index=cell, columns=gene, values='z')
    return splicing2d


def my_nan_euclidean_metric(row_i, row_j):
    #     assert row_i.shape == row_j.shape

    i_missing = np.isnan(row_i)
    j_missing = np.isnan(row_j)

    shared = (~i_missing) & (~j_missing)
    n_shared = shared.sum()
    if n_shared == 0:
        return 0
    weight = row_i.shape[0] / n_shared

    i_shared = row_i[shared]
    j_shared = row_j[shared]

    sum_of_squares = np.sum(np.square(i_shared - j_shared))
    distance = np.sqrt(weight * sum_of_squares)
    return distance


def to_memmap(array):
    """Write a memory mapped array
    Create a memory-map to an array stored in a binary file on disk.
    Memory-mapped files are used for accessing small segments of
    large files on disk, without reading the entire file into memory.
    :param np.array array to memory map
    :return: np.array large_memmap memory mapped array
    :return: str filename name of the file that memory mapped array is written to
    """
    import numpy as np

    filename = tempfile.NamedTemporaryFile(
        prefix="array", suffix=".mmap", delete=False
    ).name
    shape = array.shape
    f = np.memmap(filename, mode="w+", shape=shape, dtype=array.dtype)
    f[:] = array[:]
    del f
    large_memmap = np.memmap(filename, dtype=array.dtype, shape=shape)
    return large_memmap, filename


def distance_args_unpack(args):
    """Helper function to unpack the arguments. Written to use in pool.imap
    as it can only be given one argument."""
    row_i, row_j = args
    return my_nan_euclidean_metric(row_i, row_j)


def get_distances_at_index(index, matrix):
    """Returns similarities of all the combinations of signature at index in
    the siglist with the rest of the indices starting at index + 1. Doesn't
    redundantly calculate signatures with all the other indices prior to
    index - 1

    :param int index: generate masks from this image
    :param boolean ignore_abundance
        If the sketches are not abundance weighted, or ignore_abundance=True,
        compute Jaccard similarity.

        If the sketches are abundance weighted, calculate the angular
        similarity.
    :param boolean downsample by max_hash if True
    :param siglist list of signatures
    :return: list of similarities for the combinations of signature at index
        with rest of the signatures from index+1
    """
    startt = time.time()
    sig_iterator = itertools.product([matrix[index, :]], matrix[(index + 1) :, :])
    func = partial(distance_args_unpack,)
    similarity_list = list(map(func, sig_iterator))
    notify(
        "comparison for index {} done in {:.5f} seconds",
        index,
        time.time() - startt,
        end="\r",
    )
    return similarity_list


def distances_parallel(matrix, n_jobs):
    """Compare all combinations of signatures and return a matrix
    of similarities. Processes combinations parallely on number of processes
    given by n_jobs

    :param list siglist: list of signatures to compare
    :param boolean ignore_abundance
        If the sketches are not abundance weighted, or ignore_abundance=True,
        compute Jaccard similarity.

        If the sketches are abundance weighted, calculate the angular
        similarity.
    :param boolean downsample by max_hash if True
    :param int n_jobs number of processes to run the similarity calculations on
    :return: np.array similarity matrix
    """

    # Starting time - calculate time to keep track in case of lengthy siglist
    start_initial = time.time()

    # Create a memory map of the siglist using numpy to avoid memory burden
    # while accessing small parts in it
    matrix, _ = to_memmap(np.array(matrix))
    notify("Created memmapped siglist")

    # Check that length of combinations can result in a square similarity matrix
    length_matrix = len(matrix)
    shape = (length_matrix, length_matrix)

    # Initialize with ones in the diagonal as the similarity of a signature with
    # itself is one
    distances = np.zeros(shape, dtype=np.float64)
    memmap_distances, filename = to_memmap(distances)
    notify("Initialized memmapped similarities matrix")

    # Initialize the function using func.partial with the common arguments like
    # siglist, ignore_abundance, downsample, for computing all the signatures
    # The only changing parameter that will be mapped from the pool is the index
    func = partial(get_distances_at_index, matrix=matrix,)
    notify("Created similarity func")

    # Initialize multiprocess.pool
    pool = multiprocessing.Pool(processes=n_jobs)

    # Calculate chunk size, by default pool.imap chunk size is 1
    chunksize, extra = divmod(length_matrix, n_jobs)
    if extra:
        chunksize += 1
    notify("Calculated chunk size for multiprocessing")

    # This will not generate the results yet, since pool.imap returns a generator
    result = pool.imap(func, range(length_matrix), chunksize=chunksize)
    notify("Initialized multiprocessing pool.imap")

    # Enumerate and calculate similarities at each of the indices
    # and set the results at the appropriate combination coordinate
    # locations inside the similarity matrix
    for index, l in enumerate(result):
        startt = time.time()
        col_idx = index + 1
        for idx_condensed, item in enumerate(l):
            i = index
            j = col_idx + idx_condensed
            memmap_distances[i, j] = memmap_distances[j, i] = item
        notify(
            "Setting similarities matrix for index {} done in {:.5f} seconds",
            index,
            time.time() - startt,
            end="\r",
        )
    notify("Setting similarities completed")

    pool.close()
    pool.join()

    notify(
        "Time taken to compare all pairs parallely is {:.5f} seconds ",
        time.time() - start_initial,
    )
    return np.memmap(filename, dtype=np.float64, shape=shape)


Overwriting splicing_utils.py


## Function to process splicing

In [17]:
import splicing_utils

In [13]:
df.tissue.value_counts()

Bladder     3505271
Lung        3443061
Pancreas    2156269
Blood       1832072
Muscle      1009895
Name: tissue, dtype: int64

In [14]:
%%time

splicing_df = splicing_utils.process_splicing(parquet, tissue="Muscle")
print(splicing_df.shape)
splicing_df.head()

(305008, 93)
(2193, 1728)
CPU times: user 3min 6s, sys: 4min 9s, total: 7min 15s
Wall time: 3min 27s


geneR1A_uniq,A2M,AAK1,AAMDC,AARSD1,ABCA6,ABCA9,ABI3BP,AC005329.7,AC009950.1,AC018816.3,...,ZNF106,ZNF331,ZNF33A,ZNF385D,ZNF706,ZNF808,ZNHIT1,ZRANB2,bP-2189O9.2,unknown
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSP1_muscle_1_S13_L00_AAACGAACACTGTTCC,,,,,,,,,,,...,,,,,,,,,,
TSP1_muscle_1_S13_L00_AAAGAACCAACGTATC,,,,,,,,,,,...,,,,,,,,,,
TSP1_muscle_1_S13_L00_AAAGAACTCGTTCTGC,,,,,,,,,,,...,,,,,,,,,,
TSP1_muscle_1_S13_L00_AAAGGGCCAAGACTGG,,,,,,,,,,,...,,,,,,,,,,
TSP1_muscle_1_S13_L00_AAAGTCCCAGCAATTC,,,,,,,,,,,...,,,,,,,,,,


In [18]:
%%time

splicing_dists = splicing_utils.distances_parallel(splicing_df, n_jobs=8)

[KCreated memmapped siglist
[KInitialized memmapped similarities matrix
[KCreated similarity func
[KCalculated chunk size for multiprocessing
[KInitialized multiprocessing pool.imap
[KSetting similarities completedr index 2192 done in 0.00000 secondstting similarities matrix for index 70 done in 0.00184 secondsSetting similarities matrix for index 149 done in 0.00157 secondsSetting similarities matrix for index 270 done in 0.00120 secondsSetting similarities matrix for index 442 done in 0.00140 secondsSetting similarities matrix for index 705 done in 0.00089 secondsSetting similarities matrix for index 770 done in 0.00106 secondsSetting similarities matrix for index 1514 done in 0.00063 secondsSetting similarities matrix for index 1669 done in 0.00026 seconds
[KTime taken to compare all pairs parallely is 136.96023 seconds 


In [20]:
splicing_dists.shape

(2193, 2193)

In [22]:
splicing_dists_df = pd.DataFrame(splicing_dists, index=splicing_df.index, columns=splicing_df.index)
print(splicing_dists_df.shape)
splicing_dists_df.head()

(2193, 2193)


cell,TSP1_muscle_1_S13_L00_AAACGAACACTGTTCC,TSP1_muscle_1_S13_L00_AAAGAACCAACGTATC,TSP1_muscle_1_S13_L00_AAAGAACTCGTTCTGC,TSP1_muscle_1_S13_L00_AAAGGGCCAAGACTGG,TSP1_muscle_1_S13_L00_AAAGTCCCAGCAATTC,TSP1_muscle_1_S13_L00_AAATGGAAGCAGATAT,TSP1_muscle_1_S13_L00_AACACACAGTGATTCC,TSP1_muscle_1_S13_L00_AACACACCAGCACACC,TSP1_muscle_1_S13_L00_AACAGGGGTTTACTGG,TSP1_muscle_1_S13_L00_AACCAACAGACTCTAC,...,TSP1_muscle_3_S15_L00_TTTCCTCAGCCTGACC,TSP1_muscle_3_S15_L00_TTTCCTCAGGCCTTCG,TSP1_muscle_3_S15_L00_TTTCCTCGTGAAGCGT,TSP1_muscle_3_S15_L00_TTTCGATAGGTTAAAC,TSP1_muscle_3_S15_L00_TTTCGATCAAGCTGTT,TSP1_muscle_3_S15_L00_TTTGACTAGGGTGAAA,TSP1_muscle_3_S15_L00_TTTGACTCACTGCGAC,TSP1_muscle_3_S15_L00_TTTGGTTCATTGCCGG,TSP1_muscle_3_S15_L00_TTTGGTTTCGGCTTCT,TSP1_muscle_3_S15_L00_TTTGTTGCAACGCATT
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSP1_muscle_1_S13_L00_AAACGAACACTGTTCC,0.0,64.905423,78.590702,63.036527,100.899792,60.759464,124.069012,69.370627,59.147103,79.4212,...,71.318296,71.785115,76.858856,72.017216,80.879281,68.717007,73.554703,84.507145,71.374711,75.937644
TSP1_muscle_1_S13_L00_AAAGAACCAACGTATC,64.905423,0.0,70.962987,56.879149,96.694871,44.079041,98.472187,57.675664,46.678387,67.671647,...,75.414336,73.041602,74.167948,75.943463,68.686309,79.371609,65.985261,74.310812,73.479676,69.427236
TSP1_muscle_1_S13_L00_AAAGAACTCGTTCTGC,78.590702,70.962987,0.0,67.213372,64.517785,56.034204,131.404868,56.14144,62.82632,70.150058,...,101.111568,70.173379,88.143214,89.769384,77.457689,75.745095,65.253979,94.615487,77.696639,70.428732
TSP1_muscle_1_S13_L00_AAAGGGCCAAGACTGG,63.036527,56.879149,67.213372,0.0,62.290726,60.614921,110.431863,61.916032,59.007915,63.353294,...,87.488601,76.89742,82.938572,74.62065,68.804253,72.599116,66.700645,89.982978,76.313683,72.560491
TSP1_muscle_1_S13_L00_AAAGTCCCAGCAATTC,100.899792,96.694871,64.517785,62.290726,0.0,96.768701,103.861524,101.962892,67.174546,133.838731,...,107.649525,134.104692,129.018571,92.036977,123.750396,115.518367,113.03217,111.360558,130.981381,118.118669


In [None]:
2+2

In [None]:
splicing_df

## Read h5ad

In [None]:
h5ad = '/home/olga/googledrive/.shared/Data_Analysis/Combined_Dataset/compartments_with_splicing/rijk_JO_sym-Combined-all.h5ad'
adata = sc.read(h5ad)
adata

# Do BBKNN and UMAP

In [None]:
import bbknn

In [None]:
bbknn.bbknn_pca_matrix(splicing_dists_df,  metric='precomputed')