## Fixes:
- Sort by absolute value
- Package it as a module installable via pip / conda
- Add a differential expression feature
- Vectorize the calculations with matrix operations instead of for loops

In [1]:
import sys

import numpy as np
import pandas as pd

from umap_explorer import UMAPexplorer


In [2]:
%load_ext py5
%gui osx

In [6]:
# Fetch the example pbmc3k files into ../data. The steps below are order-dependent
# shell commands and are re-executed (including the downloads) on every run.
print('DOWNLOADING AND EXTRACTING EXAMPLE DATA')
# -p: create the directory if needed and do not fail when it already exists.
! mkdir -p ../data
# UMAP table; -O writes it to a fixed path, overwriting any previous copy.
! wget https://storage.googleapis.com/sabeti-public/dkotliar/scnavigator/pbmc3k/data/pbmc3k_umap.tsv -O ../data/pbmc3k_umap.tsv
# Expression table; it is served gzip-compressed, so it is saved as .tsv.gz first.
! wget https://storage.googleapis.com/sabeti-public/dkotliar/scnavigator/pbmc3k/data/pbmc3k_expression_filtered_normalized.tsv.gz -O ../data/pbmc3k_expression_filtered_normalized.tsv.gz
# -d decompresses (the .gz is replaced by the plain .tsv); -f overwrites a .tsv
# left behind by an earlier run instead of prompting.
! gzip -df ../data/pbmc3k_expression_filtered_normalized.tsv.gz
# List the directory so the resulting files are visible in the cell output.
! ls ../data

DOWNLOADING AND EXTRACTING EXAMPLE DATA
--2021-02-21 12:45:10--  https://storage.googleapis.com/sabeti-public/dkotliar/scnavigator/pbmc3k/data/pbmc3k_umap.tsv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.10.112, 172.217.3.112, 172.217.6.240, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.10.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 143969 (141K) [text/tab-separated-values]
Saving to: ‘../data/pbmc3k_umap.tsv’


2021-02-21 12:45:11 (2.87 MB/s) - ‘../data/pbmc3k_umap.tsv’ saved [143969/143969]

--2021-02-21 12:45:11--  https://storage.googleapis.com/sabeti-public/dkotliar/scnavigator/pbmc3k/data/pbmc3k_expression_filtered_normalized.tsv.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.10.112, 172.217.3.112, 172.217.6.240, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.10.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 43477

In [3]:
print("LOADING UMAP DATA...")

# Promote the 'index' column to the row index by name. The previous version
# copied it into the index and then dropped "whatever column is first", which
# would silently discard a coordinate column (and keep a stray 'index' column)
# if the column order in the file ever changed.
_umap = pd.read_csv('../data/pbmc3k_umap.tsv', sep='\t').set_index('index')
_umap.head()

LOADING UMAP DATA...


Unnamed: 0_level_0,UMAP_1,UMAP_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACATACAACCAC-1,3.638991,3.122167
AAACATTGAGCTAC-1,-0.490064,10.553852
AAACATTGATCAGC-1,0.391078,4.035561
AAACCGTGCTTCCG-1,-6.747729,-0.600776
AAACCGTGTATGCG-1,1.887045,-1.526383


In [4]:
print("LOADING GENE EXPRESSION DATA...")

# Read the tab-separated expression table and use its 'index' column as the row
# index, leaving one column per gene.
_expr = (
    pd.read_csv('../data/pbmc3k_expression_filtered_normalized.tsv', sep='\t')
    .set_index('index')
)
_expr.head()

LOADING GENE EXPRESSION DATA...


Unnamed: 0_level_0,TNFRSF4,CPSF3L,ATAD3C,C1orf86,RER1,TNFRSF25,TNFRSF9,CTNNBIP1,SRM,UBIAD1,...,DSCR3,BRWD1,BACE2,SIK1,C21orf33,ICOSLG,SUMO3,SLC19A1,S100B,PRMT2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACAACCAC-1,-0.17147,-0.280812,-0.046677,-0.475169,-0.544024,4.928496,-0.038028,-0.280573,-0.341788,-0.195361,...,-0.22657,-0.236269,-0.102943,-0.222116,-0.312401,-0.121678,-0.521229,-0.098269,-0.209095,-0.531203
AAACATTGAGCTAC-1,-0.214582,-0.372653,-0.054804,-0.683391,0.633951,-0.334837,-0.045589,-0.498264,-0.541914,-0.209017,...,-0.317531,2.568868,0.007155,-0.445372,1.629285,-0.058662,-0.857163,-0.266844,-0.313146,-0.596654
AAACATTGATCAGC-1,-0.376888,-0.295085,-0.057527,-0.520972,1.332648,-0.309362,-0.103108,-0.272526,-0.500799,-0.220228,...,-0.302938,-0.239801,-0.071774,-0.297857,-0.41092,-0.070431,-0.59072,-0.158656,-0.170876,1.379
AAACCGTGCTTCCG-1,-0.285241,-0.281735,-0.052227,-0.484929,1.57268,-0.271825,-0.074552,-0.258875,-0.416752,-0.208471,...,-0.262978,-0.231807,-0.093818,-0.24777,2.552079,-0.097402,1.631684,-0.119462,-0.17912,-0.505669
AAACCGTGTATGCG-1,-0.256484,-0.220394,-0.0468,-0.345859,-0.333409,-0.208122,-0.069514,5.80644,-0.283112,-0.199355,...,-0.202237,-0.176765,-0.16735,-0.098665,-0.275836,-0.139482,-0.310095,-0.006877,-0.109614,-0.461946


In [5]:
# Build the explorer from the UMAP table and the expression table, then start
# the sketch. The recorded run logged a cell selection, a correlation
# calculation, a gene selection and a data export before exiting.
# NOTE(review): run_sketch() presumably opens an interactive window and the
# selections come from user input, so these results would not be reproducible
# from code alone — confirm against umap_explorer.
test = UMAPexplorer(_umap, _expr)
test.run_sketch()

Selected 38 cells
Calculating correlations...
Done
Selected gene 120
Min/max expression level for gene S100A6 -1.691986918449402 1.7633997201919556
EXPORTING DATA...
BYE


In [7]:
# Cells chosen during the sketch run (the run log reported 38), displayed as a
# table with an `index` column of cell identifiers and a `proj` column.
# NOTE(review): the meaning of `proj` is not visible here — confirm in umap_explorer.
test.selected_cells

Unnamed: 0,index,proj
0,AAGATTACCGCCTT-1,0.449388
1,AAGCCATGAACTGC-1,0.642522
2,AATGCGTGGACGGA-1,0.350067
3,AATTACGAATTCCT-1,0.405885
4,ACGAGGGACAGGAG-1,0.73738
5,ACGTCGCTCTATTC-1,0.096618
6,ACGTGATGCCATGA-1,0.501144
7,ACTTAAGATTACTC-1,0.335405
8,AGCACTGATGCTTT-1,0.570746
9,ATACCACTCTAAGC-1,0.399311


In [8]:
# Name of the gene picked during the sketch run (the run log reported it as gene 120).
test.selected_gene_name

'S100A6'