# Import

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import episcanpy.api as epi
import hdf5storage

import warnings
warnings.filterwarnings("ignore")

# Prepare data

### Cell-by-peak matrix

In [2]:
# Load the MATLAB file holding the count matrix and list the keys of the
# returned mapping to find the entry that contains the data.
sc_mat = hdf5storage.loadmat('./example/Splenocyte_sc_mat.mat')
print(sc_mat.keys())

dict_keys(['__header__', '__version__', '__globals__', 'sc_mat'])


In [3]:
# Pull the 'sc_mat' entry out of the loaded mapping and transpose it so the
# first axis matches the cell labels and the second axis matches the peaks.
# NOTE: this rebinds `sc_mat` from the loaded mapping to the array, so the
# cell can only be run once per load.
sc_mat = np.transpose(sc_mat['sc_mat'])
print(np.shape(sc_mat))

(3166, 77453)


In [4]:
# Sanity check: show the first five columns of the last five rows.
print(sc_mat[-5:, :5])

[[0 0 0 0 0]
 [0 0 0 0 0]
 [4 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]


### Cell type labels

In [5]:
# Load the MATLAB file holding the cell type labels and list the keys of the
# returned mapping to find the entry that contains them.
sc_label = hdf5storage.loadmat('./example/Splenocyte_sc_label.mat')
print(sc_label.keys())

dict_keys(['__header__', '__version__', '__globals__', 'sc_label'])


In [6]:
# Take the 'sc_label' entry and flatten it to one dimension (one entry per cell).
# NOTE: this rebinds `sc_label` from the loaded mapping to the array, so the
# cell can only be run once per load.
sc_label = sc_label['sc_label'].ravel()
print(np.shape(sc_label))

(3166,)


In [7]:
# Inspect the last five raw label entries as loaded from the file.
print(sc_label[-5:])

[array(['Naive_CD8_T'], dtype='<U11') array(['Naive_CD8_T'], dtype='<U11')
 array(['Follicular_B'], dtype='<U12')
 array(['Follicular_B'], dtype='<U12')
 array(['Naive_CD4_T'], dtype='<U11')]


In [8]:
# Each raw entry is a one-element string array (see the previous output);
# unwrap it into a plain string. `np.asarray(s).item()` is used instead of
# `s[0]` because it also accepts an already-unwrapped string: re-running this
# cell is then a no-op, whereas `s[0]` would silently truncate every label to
# its first character. `.item()` additionally raises if an entry does not
# hold exactly one element, instead of quietly dropping the rest.
sc_label = np.array([np.asarray(s).item() for s in sc_label])
print(sc_label[-5:])

['Naive_CD8_T' 'Naive_CD8_T' 'Follicular_B' 'Follicular_B' 'Naive_CD4_T']


In [9]:
# Number of cells per cell type, most frequent first.
# The top-level `pd.value_counts` is deprecated since pandas 2.1 (the warning
# is hidden by the global warnings filter), so use the Series method instead.
print(pd.Series(sc_label).value_counts())

Follicular_B            1358
Naive_CD4_T              454
Naive_CD8_T              331
Marginal_Zone_B          254
Transitional_B           223
Memory_CD8_T             116
Regulatory_T              89
Granulocyte               85
CD27-_Natural_Killer      75
CD27+_Natural_Killer      65
Dendritic_cell            61
Macrophage                55
dtype: int64


### Peaks

In [10]:
# Read the peak identifiers (headerless text file) and flatten them into a
# one-dimensional array.
sc_peak = (
    pd.read_csv('./example/Splenocyte_sc_peak.txt', header=None)
    .to_numpy()
    .ravel()
)
print(np.shape(sc_peak))

(77453,)


In [11]:
# Inspect the last five peak identifiers.
print(sc_peak[-5:])

['chrY_90808282_90808985' 'chrY_90828737_90829320'
 'chrY_90833326_90833589' 'chrY_90833764_90833977'
 'chrY_90836319_90836703']


### AnnData object

In [12]:
# Wrap the count matrix in an AnnData container, then annotate both axes:
# `var` receives the peak identifiers and `obs` the per-cell type labels.
adata = sc.AnnData(X=sc_mat)
adata.var['peak'] = sc_peak
adata.obs['cell_type'] = sc_label

# Check and save h5ad

In [13]:
# Summary of the assembled object: dimensions plus the obs/var columns.
print(adata)

AnnData object with n_obs × n_vars = 3166 × 77453
    obs: 'cell_type'
    var: 'peak'


In [14]:
# Per-cell annotations (the 'cell_type' column assigned earlier).
print(adata.obs)

            cell_type
0         Naive_CD8_T
1      Transitional_B
2         Naive_CD8_T
3     Marginal_Zone_B
4         Naive_CD4_T
...               ...
3161      Naive_CD8_T
3162      Naive_CD8_T
3163     Follicular_B
3164     Follicular_B
3165      Naive_CD4_T

[3166 rows x 1 columns]


In [15]:
# Per-peak annotations (the 'peak' column assigned earlier).
print(adata.var)

                           peak
0          GL456210.1_3966_5250
1        GL456210.1_16353_16873
2        GL456210.1_17702_18003
3        GL456210.1_75630_76290
4      GL456210.1_118218_118800
...                         ...
77448    chrY_90808282_90808985
77449    chrY_90828737_90829320
77450    chrY_90833326_90833589
77451    chrY_90833764_90833977
77452    chrY_90836319_90836703

[77453 rows x 1 columns]


In [16]:
# Persist the annotated object to disk in h5ad format.
adata.write_h5ad('./example/Splenocyte.h5ad')