### Projection ('PCA densities') of (scGen) simulated LD-SPL cells 

### on the combined 10 donors (base normalised) dataset reference 

---
##### hpb29

Date: 2021-02-09

In [1]:
%matplotlib widget

import warnings
warnings.filterwarnings('ignore')

import os, sys, json, operator, getpass
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import scanpy as sc

import matplotlib.pyplot as plt
from ipywidgets import widgets

from scipy.spatial import distance
from collections import Counter

from scipy import stats

In [2]:
# Identify who is running the notebook and where their home directory lives.
user = getpass.getuser()
home = str(Path.home())

# Per-user project root for this sequencing run.
project_parts = (home, 'datafloor/users', user, '2020/SLX19841/')
basedir = os.path.join(*project_parts)

# Directory scanpy uses for its h5ad files
# (presumably bare keys passed to sc.read / sc.write resolve here — confirm).
sc.settings.writedir = os.path.join(basedir, 'analysis/h5ad/')

In [3]:
# Scanpy runtime configuration for this notebook.
sc.settings.verbosity = 3             # chatty logging (timings / hints show up in cell output)
sc.settings.file_format_figs = 'svg'  # format used if a figure is ever saved
# Do not write figures to disk automatically. The switch for this is
# `autosave`; the former `savefigs` name is not a scanpy setting, so assigning
# to it only created an unused attribute.
sc.settings.autosave = False
sc.set_figure_params(dpi=150)

---

Read reference dataset

In [4]:
# Load the reference AnnData by key (no extension given; presumably resolved
# against sc.settings.writedir configured above — confirm).
data = sc.read('COMBO10_NO_SPL3_Seurat3_lognorm')

Read predicted dataset (to project on reference)

In [5]:
# Load the predicted LD-SPL cells that will be projected onto the reference
# (same key-based lookup as the reference read; location not visible here).
prediction_100e = sc.read('COMBO10_SPL_lognorm_LD_prediction_hvg')

##### Transfer annotations from dod SPL cells to predicted LD SPL cells (the predicted cells are assumed to be in the same order as the reference SPL cells)

In [6]:
# Select the measured SPL cells of the reference and hand a copy of their
# annotation table to the predicted cells. The assignment is wholesale, so the
# predicted cells are assumed to be row-aligned with the reference SPL cells
# — TODO confirm.
is_spl_cell = data.obs.organ == 'SPL'
spl_annotations = data[is_spl_cell, :].obs.copy()
prediction_100e.obs = spl_annotations

##### Suffix the cell indexes with '.p' to distinguish predicted cells from their source SPL cells

In [8]:
# Append '.p' to every cell name so predicted cells never collide with the
# measured SPL cells they were derived from.
# NOTE: not idempotent — re-running this cell appends another '.p'.
suffixed_index = prediction_100e.obs.index + '.p'
prediction_100e.obs.index = suffixed_index

##### annotate condition (measured vs predicted)

In [9]:
# Tag every predicted cell so it can be told apart from reference cells later.
prediction_100e.obs['condition'] = 'pred_SPL'

### 'PCA density' projection

---

In [10]:
# Counterpart of the 'pred_SPL' tag: mark every cell of the reference dataset.
data.obs['condition'] = 'reference'

##### Concatenate the base-normalised reference dataset with the LD-SPL prediction dataset

In [11]:
%%time
# Stack reference cells followed by predicted cells into one object.
# The distance-matrix slicing further down indexes rows as
# [reference..., predicted...], so it relies on this argument order being kept
# — TODO confirm against AnnData.concatenate.
concat_data = data.concatenate(prediction_100e)

CPU times: user 7.61 s, sys: 1.32 s, total: 8.94 s
Wall time: 8.94 s


In [12]:
%%time
# Scale the concatenated matrix in place (return value is not used).
# As the log below shows, zero-centering densifies sparse input, which can be
# memory hungry.
sc.pp.scale(concat_data)

... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
CPU times: user 7.61 s, sys: 1.29 s, total: 8.9 s
Wall time: 8.9 s


In [13]:
%%time
# Joint PCA of reference + predicted cells using the ARPACK solver; the result
# is read back from concat_data.obsm['X_pca'] in the next cell. The log reports
# n_comps=50.
sc.pp.pca(concat_data, svd_solver='arpack')

computing PCA
    with n_comps=50
    finished (0:01:32)
CPU times: user 1min 31s, sys: 1.58 s, total: 1min 32s
Wall time: 1min 32s


In [14]:
%%time
# PCA coordinates of every cell in the concatenated object.
X_pca = concat_data.obsm['X_pca']

# Dense, square, all-vs-all Euclidean distance matrix in PCA space.
# NOTE(review): this builds an (n_cells x n_cells) float matrix although the
# only use visible in this section is the predicted-rows x reference-columns
# block; computing just that block (e.g. distance.cdist on the two row ranges)
# would need a small fraction of the time and memory. Left unchanged here
# because the slicing cell below indexes into the full square `D`.
D = distance.squareform(distance.pdist(X_pca, metric='euclidean'))

CPU times: user 7min 29s, sys: 1min 14s, total: 8min 43s
Wall time: 8min 43s


---

### Projection densities

In [48]:
# Distances from each predicted cell (rows) to each reference cell (columns).
# In the concatenated object the reference cells come first, followed by the
# predicted cells, so the wanted block of the square matrix `D` is
# rows [n_ref, n_ref + n_pred) and columns [0, n_ref).
n_ref = len(data.obs_names)
n_pred = len(prediction_100e.obs_names)

# Guard against a stale `D` left over from an earlier / out-of-order run.
assert D.shape == (n_ref + n_pred, n_ref + n_pred), (
    "distance matrix does not match reference + prediction cell counts"
)

distances = {}
distances['pred'] = D[n_ref:n_ref + n_pred, :n_ref]
distances['pred'].shape

(22068, 117200)

In [49]:
# Per-sample containers ('pred' is the only sample handled in this section).
counts = {}

# Filled by the neighbour loop, keyed by predicted cell name:
#   top_ones  -> cluster label of the single nearest reference cell
#   most_freq -> most frequent cluster label among the k nearest reference cells
top_ones = {}
most_freq = {}

# initialize reference counts matrix with zeros (one slot per reference cell)
counts['pred'] = np.zeros(len(data.obs_names))

# 'proportionate' number of neighbors considered: roughly 100000 neighbour
# votes in total, shared out over the predicted cells. Clamped to at least 1 so
# that a prediction set larger than 100000 cells does not yield k == 0 (which
# would leave every neighbourhood empty and break the loop below).
k = max(1, 100000 // len(prediction_100e.obs_names))

In [45]:
# Display the number of nearest reference cells considered per predicted cell.
k

4

In [50]:
%%time
# Label transfer and projection density.
#
# For every predicted cell, look at its k nearest reference cells in PCA space:
#   * most_freq[cell] - most frequent 'leiden.1.2' label among those k cells
#   * top_ones[cell]  - 'leiden.1.2' label of the single closest reference cell
#   * counts['pred']  - per reference cell, how many predicted cells have it
#                       among their k nearest (the 'projection density')

# Cluster labels as a plain array so they can be looked up by position; the
# neighbour indices below are positions, not index labels.
ref_labels = data.obs['leiden.1.2'].to_numpy()
pred_names = prediction_100e.obs_names
pred_to_ref = distances['pred']

# Start from a clean tally so re-running this cell does not double count.
counts['pred'] = np.zeros(len(ref_labels))

for row, cell_name in enumerate(pred_names):
    dists = pred_to_ref[row]
    # positions of the k smallest distances (unordered among themselves)
    ind = np.argpartition(dists, k)[:k]

    # Most frequent cluster among the neighbours. Ties go to the smallest
    # label in sort order, which keeps the result independent of the arbitrary
    # order returned by argpartition.
    tally = Counter(ref_labels[ind])
    most_freq[cell_name] = max(sorted(tally), key=tally.get)

    # Cluster of the single closest reference cell.
    closest = ind[np.argmin(dists[ind])]
    top_ones[cell_name] = ref_labels[closest]

    # One vote for each neighbour; `ind` holds distinct positions, so a
    # fancy-indexed increment counts each of them exactly once.
    counts['pred'][ind] += 1

data.obs['pred_counts'] = counts['pred']

CPU times: user 20.6 s, sys: 2.59 s, total: 23.1 s
Wall time: 23.1 s


In [52]:
# Attach the transferred labels to the predicted cells. Both dictionaries are
# keyed by predicted cell name, so the Series align on the obs index:
#   'top'  - label of the nearest reference cell
#   'freq' - most frequent label among the nearest reference cells
for _column, _label_by_cell in (('top', top_ones), ('freq', most_freq)):
    prediction_100e.obs[_column] = pd.Series(_label_by_cell)

In [54]:
%%time
# Persist the predicted cells, now carrying the 'condition', 'top' and 'freq'
# columns (the log shows them being stored as categoricals).
sc.write('COMBO10_SPL_lognorm_LD_prediction_hvg_LABEL_TRANSFERED', prediction_100e)

... storing 'condition' as categorical
... storing 'top' as categorical
... storing 'freq' as categorical


CPU times: user 23.5 s, sys: 554 ms, total: 24.1 s
Wall time: 24.6 s


In [56]:
%%time
# Persist the reference dataset, which now also holds the 'condition' tag and
# the per-cell 'pred_counts' projection density added above.
sc.write('COMBO10_NO_SPL3_Seurat3_lognorm_with_density_projections', data)

CPU times: user 2min 25s, sys: 3.95 s, total: 2min 29s
Wall time: 2min 37s
