In [1]:
import os
import numpy as np
import pandas as pd
import scipy
import anndata
import scanpy as sc
import seaborn as sns
import scipy.sparse as sp
import matplotlib.pyplot as plt

# NOTES TO SELF
You can skip to "Load Result X" cell if that step has already been completed

# load metadata
**Note: will have to move some files around to use the commands given in F1L**

In [2]:
cwd = os.getcwd()
meta = pd.read_csv(cwd+'/data/Metadata.txt', sep='\t',)
meta.drop([0], axis=0, inplace=True)
meta.rename(columns={'NAME':'CellID', 'Cell_line':'CellLine', 'Pool_ID':'Pool', 'Cancer_type':'Indication'}, inplace=True)
meta

  meta = pd.read_csv(cwd+'/data/Metadata.txt', sep='\t',)


Unnamed: 0,CellID,CellLine,Pool,Indication,Genes_expressed,Discrete_cluster_minpts5_eps1.8,Discrete_cluster_minpts5_eps1.5,Discrete_cluster_minpts5_eps1.2,CNA_subclone,SkinPig_score,...,EMTII_score,EMTIII_score,IFNResp_score,p53Sen_score,EpiSen_score,StressResp_score,ProtMatu_score,ProtDegra_score,G1/S_score,G2/M_score
1,AAACCTGAGACATAAC-1-18,NCIH2126_LUNG,18,Lung Cancer,4318,,,,,0.166,...,-0.935,-0.935,0.13,0.619,1.869,-0.004,0.805,0.896,0.424,-1.125
2,AACGTTGTCACCCGAG-1-18,NCIH2126_LUNG,18,Lung Cancer,5200,,,,,-0.213,...,-1.027,-1.027,0.066,1.049,1.267,0.252,1.299,1.61,0.624,-0.048
3,AACTGGTAGACACGAC-1-18,NCIH2126_LUNG,18,Lung Cancer,4004,,,,,-0.101,...,-0.677,-0.677,0.304,0.822,2.401,0.141,0.451,1.225,-0.795,0.064
4,AACTGGTAGGGCTTGA-1-18,NCIH2126_LUNG,18,Lung Cancer,4295,,,,,-0.014,...,-0.735,-0.735,0.094,0.834,2.282,0.15,0.267,0.892,-0.238,1.118
5,AACTGGTAGTACTTGC-1-18,NCIH2126_LUNG,18,Lung Cancer,4842,,,,,0.006,...,-0.821,-0.821,0.034,0.96,1.4,-0.012,-0.276,-0.428,0.267,0.791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53509,c4722,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,3343,,,,,0.018,...,-0.505,-0.505,1.657,1.583,3.85,0.539,0.473,0.544,-1.079,-1.349
53510,c4724,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,6977,,,,,-0.098,...,-0.876,-0.876,0.669,1.086,3.046,0.799,0.49,1.319,-0.37,0.057
53511,c4731,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,6638,,,,,-0.112,...,-0.112,-0.112,0.61,0.693,2.289,0.65,0.729,1.143,-0.508,0.501
53512,c4735,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,4052,,,,,-0.244,...,1.981,1.981,0.523,-0.309,0.267,0.822,1.049,0.777,0.296,-0.936


In [3]:
meta.dtypes

CellID                             object
CellLine                           object
Pool                               object
Indication                         object
Genes_expressed                    object
Discrete_cluster_minpts5_eps1.8    object
Discrete_cluster_minpts5_eps1.5    object
Discrete_cluster_minpts5_eps1.2    object
CNA_subclone                       object
SkinPig_score                      object
EMTI_score                         object
EMTII_score                        object
EMTIII_score                       object
IFNResp_score                      object
p53Sen_score                       object
EpiSen_score                       object
StressResp_score                   object
ProtMatu_score                     object
ProtDegra_score                    object
G1/S_score                         object
G2/M_score                         object
dtype: object

In [4]:
# # I THOUGHT THAT "Discrete_cluster_minpts5_eps1.8" were numeric values, but turns out
# # they are strings. i leave this here as a reminder

# display(meta.loc[:,"Discrete_cluster_minpts5_eps1.8"])
# np.array(meta.loc[:,"Discrete_cluster_minpts5_eps1.8"],dtype=np.float32)
# # print("number NaN = ",sum(np.isnan()))

# # plt.plot(meta.loc[:,"Discrete_cluster_minpts5_eps1.8"],bins=10, edgecolor='k', alpha=0.7)

In [5]:
# Iterate through the columns and change their type
for col in meta.columns:
    lowercase_name = col.lower() # get name (case insensitive)
    if "score" in lowercase_name:  # Check if 'score' is in the column name 
        meta[col] = meta[col].astype(np.float32)
    elif "expressed" in lowercase_name:
        meta[col] = meta[col].astype(np.int32) 
    else:
        meta[col] = meta[col].astype(str)

print(meta.dtypes)

CellID                              object
CellLine                            object
Pool                                object
Indication                          object
Genes_expressed                      int32
Discrete_cluster_minpts5_eps1.8     object
Discrete_cluster_minpts5_eps1.5     object
Discrete_cluster_minpts5_eps1.2     object
CNA_subclone                        object
SkinPig_score                      float32
EMTI_score                         float32
EMTII_score                        float32
EMTIII_score                       float32
IFNResp_score                      float32
p53Sen_score                       float32
EpiSen_score                       float32
StressResp_score                   float32
ProtMatu_score                     float32
ProtDegra_score                    float32
G1/S_score                         float32
G2/M_score                         float32
dtype: object


# load cell ids
because large amount of data, we only do the first 10 rows to see what we're working with

In [6]:
# the first three rows are CellID, CellLine, and Pool
# only need the first row, CellID
example = pd.read_csv(cwd+'/data/UMIcount_data.txt', nrows=10, sep='\t', header=None)
display(example)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56973,56974,56975,56976,56977,56978,56979,56980,56981,56982
0,,AAACCTGAGACATAAC-1-18,AAACCTGCACAACGCC-1-18,AAACCTGCAGACAAGC-1-18,AAACCTGCAGCTCGAC-1-18,AAACCTGCATGGATGG-1-18,AAACCTGGTACGAAAT-1-18,AAACGGGAGATACACA-1-18,AAACGGGCAGGTGCCT-1-18,AAACGGGGTTTAGGAA-1-18,...,c4781,c4784,c4785,c4786,c4787,c4788,c4789,c4793,c4800,c4812
1,Cell_line,NCIH2126_LUNG,SW579_THYROID,C32_SKIN,SW579_THYROID,NCIH446_LUNG,HEC251_ENDOMETRIUM,MFE319_ENDOMETRIUM,SKNAS_AUTONOMIC_GANGLIA,NCIH2452_PLEURA,...,SCC25_UPPER_AERODIGESTIVE_TRACT,SCC25_UPPER_AERODIGESTIVE_TRACT,SCC25_UPPER_AERODIGESTIVE_TRACT,SCC25_UPPER_AERODIGESTIVE_TRACT,93VU_UPPER_AERODIGESTIVE_TRACT,JHU029_UPPER_AERODIGESTIVE_TRACT,SCC9_UPPER_AERODIGESTIVE_TRACT,JHU029_UPPER_AERODIGESTIVE_TRACT,SCC9_UPPER_AERODIGESTIVE_TRACT,SCC9_UPPER_AERODIGESTIVE_TRACT
2,Pool_ID,18,18,18,18,18,18,18,18,18,...,custom,custom,custom,custom,custom,custom,custom,custom,custom,custom
3,A1BG,0,1,1,1,0,2,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,A1CF,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,A2M,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,A2M-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,A2ML1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,A2ML1-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
counts_cellid = pd.read_csv(cwd+'/data/UMIcount_data.txt', nrows=1, sep='\t', header=None) # get counts_cellid dataframe, 1x56983
counts_cellid = counts_cellid.transpose() # transpose counts_cellid dataframe to 56983x1

print(counts_cellid.iloc[0])

# drop the NaN with label 0, see print out from line above and also the previous cell. 
# counts_cellid dataframe will be 56982x1
counts_cellid.drop([0], inplace=True) 
counts_cellid

0    NaN
Name: 0, dtype: object


Unnamed: 0,0
1,AAACCTGAGACATAAC-1-18
2,AAACCTGCACAACGCC-1-18
3,AAACCTGCAGACAAGC-1-18
4,AAACCTGCAGCTCGAC-1-18
5,AAACCTGCATGGATGG-1-18
...,...
56978,c4788
56979,c4789
56980,c4793
56981,c4800


# load full count data
NOTE: if the file 'raw_adata_56982by30314.h5ad' exists, skip the next two subheadings and go straight to loading that adata object

In [8]:
if os.path.exists('data/raw_adata_56982by30314.h5ad'):
    print('Already created this object, don\'t need to recreate it!')

Already created this object, don't need to recreate it!


## get counts matrix in sparse array format

In [9]:
example.shape[1] # check we can get the exact number of cell IDs as above without hardcoding

56983

In [77]:
%%time

# APPROACH 1: DIDN'T WORK, BREAKS LAPTOP
# # we skip first 3 rows based on what we saw above when we printed the first 10 rows, since
# # first 3 rows are CellID, CellLine, and pool
# counts = pd.read_csv(cwd+'/data/UMIcount_data.txt', sep='\t', skiprows=3, header=None, index_col=0)
# counts = counts.transpose()
# counts

# APPROACH 2: DIDN'T WORK, NEED TO SKIP THINGS
# anndata.io.read_csv(filename, delimiter=',', first_column_names=None, dtype='float32')
# anndata.read_csv(cwd+'/data/UMIcount_data.txt', delimiter='\t') # NOT ENOUGH OPTIONS TO SKIP ROWS OR COLS

# APPROACH 3: STARTED 8:27pm-8:29pm
counts = np.loadtxt(
    'data/UMIcount_data.txt', 
    delimiter='\t', 
    skiprows=3,
    usecols=range(1,example.shape[1]) # skip first col, this is gene IDs
    )

CPU times: user 2min 31s, sys: 13.5 s, total: 2min 45s
Wall time: 2min 48s


In [78]:
%%time
counts = counts.T

CPU times: user 12 μs, sys: 1e+03 ns, total: 13 μs
Wall time: 14.8 μs


In [12]:
counts.shape

(56982, 30314)

In [79]:
counts

array([[0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 2., 0.]])

In [None]:
%%time
counts[mask]

In [13]:
%%time
counts = sp.csr_matrix(counts)

CPU times: user 2min 6s, sys: 4.13 s, total: 2min 10s
Wall time: 2min 10s


## get gene ids

In [14]:
%%time

# the first column is the gene IDs after skipping first 3 rows
gene_ids = pd.read_csv(cwd+'/data/UMIcount_data.txt', sep='\t', 
                       skiprows=3, header=None, usecols=[0])

CPU times: user 35 s, sys: 1.19 s, total: 36.2 s
Wall time: 36.3 s


In [15]:
gene_ids.shape

(30314, 1)

In [16]:
gene_ids[0]

0                  A1BG
1              A1BG-AS1
2                  A1CF
3                   A2M
4               A2M-AS1
              ...      
30309           SPATA13
30310           TBC1D26
30311           TIMM10B
30312            TMBIM4
30313    TMEM256-PLSCR3
Name: 0, Length: 30314, dtype: object

## get gene_ids and counts matrix into AnnData, and save

In [17]:
%%time
adata = anndata.AnnData(X=counts)
print('good 1')
adata.var_names = gene_ids[0]
print('good 2')
adata.write('data/raw_adata_56982by30314.h5ad')

good 1
good 2
CPU times: user 1.13 s, sys: 4.21 s, total: 5.35 s
Wall time: 6.27 s


### Load result cell 1
Raw adata, still has extra cells which don't have CellIDs in metadata (QC sequences?)

In [74]:
%%time
adata = anndata.read_h5ad('data/raw_adata_56982by30314.h5ad')

CPU times: user 780 ms, sys: 2.4 s, total: 3.18 s
Wall time: 10.8 s


## Drop CellIDs (obs) that are NOT in meta['CellID'] column
SECTION COMPLETED, SKIP TO "LOAD RESULT CELL 2"

In [19]:
display(counts_cellid)

Unnamed: 0,0
1,AAACCTGAGACATAAC-1-18
2,AAACCTGCACAACGCC-1-18
3,AAACCTGCAGACAAGC-1-18
4,AAACCTGCAGCTCGAC-1-18
5,AAACCTGCATGGATGG-1-18
...,...
56978,c4788
56979,c4789
56980,c4793
56981,c4800


In [20]:
adata

AnnData object with n_obs × n_vars = 56982 × 30314

In [21]:
counts_cellid[0]

1        AAACCTGAGACATAAC-1-18
2        AAACCTGCACAACGCC-1-18
3        AAACCTGCAGACAAGC-1-18
4        AAACCTGCAGCTCGAC-1-18
5        AAACCTGCATGGATGG-1-18
                 ...          
56978                    c4788
56979                    c4789
56980                    c4793
56981                    c4800
56982                    c4812
Name: 0, Length: 56982, dtype: object

In [22]:
adata.obs_names = counts_cellid[0]

In [23]:
adata

AnnData object with n_obs × n_vars = 56982 × 30314

In [24]:
mask = adata.obs_names.isin(meta['CellID']) # create mask
print(len(mask))
mask

56982


array([ True,  True,  True, ...,  True,  True,  True])

In [25]:
sum(mask) # sum counts number of "True" in boolean array, matches original notebook value!

53513

In [26]:
sum(~mask)

3469

In [27]:
adata.X[~mask]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7091798 stored elements and shape (3469, 30314)>

In [28]:
meta.shape

(53513, 21)

In [29]:
mask.shape

(56982,)

In [30]:
print(adata.X[mask].shape,len(gene_ids))

(53513, 30314) 30314


In [31]:
adata.obs.index

Index(['AAACCTGAGACATAAC-1-18', 'AAACCTGCACAACGCC-1-18',
       'AAACCTGCAGACAAGC-1-18', 'AAACCTGCAGCTCGAC-1-18',
       'AAACCTGCATGGATGG-1-18', 'AAACCTGGTACGAAAT-1-18',
       'AAACGGGAGATACACA-1-18', 'AAACGGGCAGGTGCCT-1-18',
       'AAACGGGGTTTAGGAA-1-18', 'AAACGGGTCTTGCAAG-1-18',
       ...
       'c4781', 'c4784', 'c4785', 'c4786', 'c4787', 'c4788', 'c4789', 'c4793',
       'c4800', 'c4812'],
      dtype='object', length=56982)

In [75]:
adata.X[mask]

AttributeError: 'csr_matrix' object has no attribute 'index'

In [72]:
%%time

adata = anndata.read_h5ad('data/raw_adata_56982by30314.h5ad')

# Instatiate the AnnData object with all the valid cell id's found in meta data
adata = anndata.AnnData(X = adata.X[mask],
                        var=gene_ids,
                        oidx = meta.index)

CPU times: user 617 ms, sys: 791 ms, total: 1.41 s
Wall time: 1.26 s




### Need to re-index meta to match the masked indexing then assign to anndata for obs

In [73]:
adata.obs_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '53503', '53504', '53505', '53506', '53507', '53508', '53509', '53510',
       '53511', '53512'],
      dtype='object', length=53513)

In [57]:
# check if the masking is working properly 
# (in all likelihood, some of the indices will be omitted such that
# the ranging is not consecutive numbers but will skip from i to i+y instead
# of i to i+1 for every index i)
count_true = 0
count_num_greater_than = 0
for i in range(0,len(adata.obs_names)):
    if str(i) == adata.obs_names[i]:
        count_true += 1

    if i == len(adata.obs_names)-1:
        break
    if int(adata.obs_names[i]) < int(adata.obs_names[i+1]):
        count_num_greater_than += 1
    else:
        print(adata.obs_names[i], adata.obs_names[i+1])

In [69]:
print('for adata.obs_names')
print(f'are indices consecutive? {count_true} == {len(adata.obs_names)}, {count_true == len(adata.obs_names)}')
print(f'is i < i+1 for each index? {count_num_greater_than}, {count_num_greater_than} == {len(adata.obs_names)-1}, {count_num_greater_than == len(adata.obs_names)-1}')
# if evalautes to false then we know masking caused skipping of indices

for adata.obs_names
are indices consecutive? 53513 == 53513, True
is i < i+1 for each index? 53512, 53512 == 53512, True


In [68]:
# check if the masking is working properly 
# (in all likelihood, some of the indices will be omitted such that
# the ranging is not consecutive numbers but will skip from i to i+y instead
# of i to i+1 for every index i)
count_true = 0
count_num_greater_than = 0
for i in range(0,len(meta.index)):
    if i == meta.index[i]-1: # offset by one since we deleted row 0?
        count_true += 1

    if i == len(meta.index)-1:
        break
    if int(meta.index[i]) < int(meta.index[i+1]):
        count_num_greater_than += 1
    else:
        print(meta.index[i], meta.index[i+1])

print('for meta.index')
print(f'are indices consecutive? {count_true} == {len(meta.index)}, {count_true == len(meta.index)}')
print(f'is i < i+1 for each index? {count_num_greater_than}, {count_num_greater_than} == {len(meta.index)-1}, {count_num_greater_than == len(meta.index)-1}')
# if evalautes to false then we know masking caused skipping of indices

for meta.index
are indices consecutive? 53513 == 53513, True
is i < i+1 for each index? 53512, 53512 == 53512, True


In [63]:
print(meta.index,len(meta.index))
print(adata.obs_names,len(adata.obs_names))

RangeIndex(start=1, stop=53514, step=1) 53513
Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '53503', '53504', '53505', '53506', '53507', '53508', '53509', '53510',
       '53511', '53512'],
      dtype='object', length=53513) 53513


In [33]:
%%time
meta = meta.reindex(index=adata.obs_names[mask])

IndexError: boolean index did not match indexed array along dimension 0; dimension is 53513 but corresponding boolean dimension is 56982

In [26]:
adata

AnnData object with n_obs × n_vars = 53513 × 30314
    obs: 'CellLine', 'Pool', 'Indication', 'Genes_expressed', 'Discrete_cluster_minpts5_eps1.8', 'Discrete_cluster_minpts5_eps1.5', 'Discrete_cluster_minpts5_eps1.2', 'CNA_subclone', 'SkinPig_score', 'EMTI_score', 'EMTII_score', 'EMTIII_score', 'IFNResp_score', 'p53Sen_score', 'EpiSen_score', 'StressResp_score', 'ProtMatu_score', 'ProtDegra_score', 'G1/S_score', 'G2/M_score'
    var: 0

In [27]:
display(adata.obs,adata.var)

Unnamed: 0,CellLine,Pool,Indication,Genes_expressed,Discrete_cluster_minpts5_eps1.8,Discrete_cluster_minpts5_eps1.5,Discrete_cluster_minpts5_eps1.2,CNA_subclone,SkinPig_score,EMTI_score,EMTII_score,EMTIII_score,IFNResp_score,p53Sen_score,EpiSen_score,StressResp_score,ProtMatu_score,ProtDegra_score,G1/S_score,G2/M_score
1,NCIH2126_LUNG,18,Lung Cancer,4318,,,,,0.166,-0.045,-0.935,-0.935,0.130,0.619,1.869,-0.004,0.805,0.896,0.424,-1.125
2,NCIH2126_LUNG,18,Lung Cancer,5200,,,,,-0.213,0.035,-1.027,-1.027,0.066,1.049,1.267,0.252,1.299,1.610,0.624,-0.048
3,NCIH2126_LUNG,18,Lung Cancer,4004,,,,,-0.101,-0.183,-0.677,-0.677,0.304,0.822,2.401,0.141,0.451,1.225,-0.795,0.064
4,NCIH2126_LUNG,18,Lung Cancer,4295,,,,,-0.014,-0.093,-0.735,-0.735,0.094,0.834,2.282,0.150,0.267,0.892,-0.238,1.118
5,NCIH2126_LUNG,18,Lung Cancer,4842,,,,,0.006,-0.055,-0.821,-0.821,0.034,0.960,1.400,-0.012,-0.276,-0.428,0.267,0.791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53509,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,3343,,,,,0.018,-0.149,-0.505,-0.505,1.657,1.583,3.850,0.539,0.473,0.544,-1.079,-1.349
53510,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,6977,,,,,-0.098,-0.197,-0.876,-0.876,0.669,1.086,3.046,0.799,0.490,1.319,-0.370,0.057
53511,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,6638,,,,,-0.112,-0.107,-0.112,-0.112,0.610,0.693,2.289,0.650,0.729,1.143,-0.508,0.501
53512,JHU006_UPPER_AERODIGESTIVE_TRACT,custom,Head and Neck Cancer,4052,,,,,-0.244,0.442,1.981,1.981,0.523,-0.309,0.267,0.822,1.049,0.777,0.296,-0.936


Unnamed: 0,0
0,A1BG
1,A1BG-AS1
2,A1CF
3,A2M
4,A2M-AS1
...,...
30309,SPATA13
30310,TBC1D26
30311,TIMM10B
30312,TMBIM4


In [28]:
# rename the vars column from "0" to "GeneID"
adata.var.rename(columns={0:'GeneID'}, inplace=True)
display(adata.var)

# adata.obs.rename(columns={0:'CellID'}, inplace=True)
# display(adata.obs,adata.var)

Unnamed: 0,GeneID
0,A1BG
1,A1BG-AS1
2,A1CF
3,A2M
4,A2M-AS1
...,...
30309,SPATA13
30310,TBC1D26
30311,TIMM10B
30312,TMBIM4


In [29]:
%%time 
# save the result
adata.write('data/filtered_adata_53513by30314.h5ad')

CPU times: user 928 ms, sys: 3.37 s, total: 4.3 s
Wall time: 4.33 s


### Load result cell 2
Filtered so only cells from the metadata are in the anndata object

In [8]:
%%time
# load the result
adata = anndata.read_h5ad('data/filtered_adata_53513by30314.h5ad')

CPU times: user 562 ms, sys: 2.54 s, total: 3.1 s
Wall time: 13.8 s


In [9]:
print(adata.obs_names.equals(meta.index))  # Should return True

False


# verify agrees with original notebook
check with [original notebook](https://github.com/deanslee/FigureOneLab/blob/main/kinker/240701_kinker_anndata.ipynb) that the adata object looks correct

In [31]:
adata

AnnData object with n_obs × n_vars = 53513 × 30314
    obs: 'CellLine', 'Pool', 'Indication', 'Genes_expressed', 'Discrete_cluster_minpts5_eps1.8', 'Discrete_cluster_minpts5_eps1.5', 'Discrete_cluster_minpts5_eps1.2', 'CNA_subclone', 'SkinPig_score', 'EMTI_score', 'EMTII_score', 'EMTIII_score', 'IFNResp_score', 'p53Sen_score', 'EpiSen_score', 'StressResp_score', 'ProtMatu_score', 'ProtDegra_score', 'G1/S_score', 'G2/M_score'
    var: 'GeneID'

In [32]:
adata.X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 201078579 stored elements and shape (53513, 30314)>

In [33]:
201078579/(53513*30314) # shows we are saving lots of space by not storing zeroes; we are only storing 12% of the matrix!

0.12395477531693727

In [34]:
%%time
sc.pp.filter_genes(adata, min_cells=10)
sc.pp.filter_cells(adata, min_genes=200)
adata

CPU times: user 6.1 s, sys: 4.65 s, total: 10.7 s
Wall time: 10.9 s


AnnData object with n_obs × n_vars = 53513 × 23081
    obs: 'CellLine', 'Pool', 'Indication', 'Genes_expressed', 'Discrete_cluster_minpts5_eps1.8', 'Discrete_cluster_minpts5_eps1.5', 'Discrete_cluster_minpts5_eps1.2', 'CNA_subclone', 'SkinPig_score', 'EMTI_score', 'EMTII_score', 'EMTIII_score', 'IFNResp_score', 'p53Sen_score', 'EpiSen_score', 'StressResp_score', 'ProtMatu_score', 'ProtDegra_score', 'G1/S_score', 'G2/M_score', 'n_genes'
    var: 'GeneID', 'n_cells'

In [35]:
adata.write(cwd+'/outs/250102_kinker_anndata.h5ad')