In [28]:
import scgen
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import warnings
# Ignore every warning category from here on (presumably to keep cell output
# uncluttered). NOTE(review): this also hides deprecation and numerical
# warnings raised by the libraries imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [29]:
# Load the train_kang dataset from the local test-data folder. `backup_url`
# gives scanpy a download location for the same file (see the scanpy.read
# documentation for when it is used).
anndata = sc.read(
    "./tests/data/train_kang.h5ad",
    backup_url='https://drive.google.com/uc?id=1r87vhoLLq6PXAYdmyyd89zG90eJOFYLk',
)

In [30]:
# Partition the dataset by annotated cell type: each category label of
# `obs.cell_type` maps to the subset of `anndata` whose cells carry that label.
cell_types = anndata.obs.cell_type.cat.categories.values

anndata_by_cell_type = {}
for cell_type in cell_types:
    # Boolean mask over observations selecting the current cell type.
    belongs_to_type = anndata.obs.cell_type == cell_type
    anndata_by_cell_type[cell_type] = anndata[belongs_to_type]

In [31]:
# Display the mapping to inspect the size (n_obs × n_vars) of each cell-type subset.
anndata_by_cell_type

{'CD4T': View of AnnData object with n_obs × n_vars = 5564 × 6998
     obs: 'condition', 'n_counts', 'n_genes', 'mt_frac', 'cell_type'
     var: 'gene_symbol', 'n_cells'
     uns: 'cell_type_colors', 'condition_colors', 'neighbors'
     obsm: 'X_pca', 'X_tsne', 'X_umap'
     obsp: 'distances', 'connectivities',
 'CD14+Mono': View of AnnData object with n_obs × n_vars = 2561 × 6998
     obs: 'condition', 'n_counts', 'n_genes', 'mt_frac', 'cell_type'
     var: 'gene_symbol', 'n_cells'
     uns: 'cell_type_colors', 'condition_colors', 'neighbors'
     obsm: 'X_pca', 'X_tsne', 'X_umap'
     obsp: 'distances', 'connectivities',
 'B': View of AnnData object with n_obs × n_vars = 1811 × 6998
     obs: 'condition', 'n_counts', 'n_genes', 'mt_frac', 'cell_type'
     var: 'gene_symbol', 'n_cells'
     uns: 'cell_type_colors', 'condition_colors', 'neighbors'
     obsm: 'X_pca', 'X_tsne', 'X_umap'
     obsp: 'distances', 'connectivities',
 'CD8T': View of AnnData object with n_obs × n_vars = 1115 

In [32]:
# Convert the CD4T subset into a cells × genes table and summarise each gene.
cd4t_adata = anndata_by_cell_type['CD4T']
cd4t_df = cd4t_adata.to_df()
cd4t_df.describe()

index,AL627309.1,RP11-206L10.9,LINC00115,NOC2L,KLHL17,HES4,ISG15,TNFRSF18,TNFRSF4,SDF4,...,C21orf67,FAM207A,ADARB1,POFUT2,COL18A1,SLC19A1,COL6A2,FTCD,DIP2A,S100B
count,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,...,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0,5564.0
mean,0.00029,0.000295,0.004717,0.081097,0.000599,0.024535,1.396227,0.040431,0.09137,0.064878,...,0.000406,0.043993,0.00407,0.009166,0.009337,0.004598,0.001317,0.000346,0.020588,0.021828
std,0.015311,0.015755,0.063308,0.254489,0.022557,0.154765,1.192688,0.189054,0.297386,0.230153,...,0.022359,0.187557,0.057041,0.087452,0.087252,0.064468,0.036036,0.018251,0.131128,0.152459
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.626387,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,2.427323,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.823251,0.953,1.159003,2.176398,0.927066,2.23116,5.175874,1.898126,2.472356,2.879868,...,1.468089,1.580605,1.017806,1.538911,1.213284,1.51467,1.494599,0.996147,1.607041,2.438123


In [None]:
# Report the share of missing values for every gene (column) of the CD4T table.
# Null counts are computed once for the whole frame instead of twice per column
# inside a Python-level loop over all genes.
cd4t_null_counts = cd4t_df.isnull().sum()
cd4t_genes_with_nulls = cd4t_null_counts[cd4t_null_counts > 0]

for column, n_missing in cd4t_genes_with_nulls.items():
    print(column, ': {:.2%}'.format(n_missing / cd4t_df.shape[0]))

if cd4t_genes_with_nulls.empty:
    # State the clean result explicitly: an empty output cannot be told apart
    # from a cell that was never executed.
    print('CD4T: no missing values in any of the {} genes'.format(cd4t_df.shape[1]))

Conclusion: There are no missing gene values in the CD4T expression table.

In [46]:
# Convert the FCGR3A+Mono subset into a cells × genes table and summarise each gene.
fmono_adata = anndata_by_cell_type['FCGR3A+Mono']
fmono_df = fmono_adata.to_df()
fmono_df.describe()

index,AL627309.1,RP11-206L10.9,LINC00115,NOC2L,KLHL17,HES4,ISG15,TNFRSF18,TNFRSF4,SDF4,...,C21orf67,FAM207A,ADARB1,POFUT2,COL18A1,SLC19A1,COL6A2,FTCD,DIP2A,S100B
count,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,...,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0,3601.0
mean,7e-05,0.000315,0.00299,0.020351,0.000382,0.32136,3.097102,0.009016,0.009736,0.045875,...,0.000179,0.010504,0.001876,0.003571,0.001204,0.000915,0.000244,0.0,0.006154,0.000507
std,0.004207,0.013388,0.039265,0.10065,0.017022,0.457502,1.812274,0.073907,0.077094,0.15364,...,0.007659,0.073966,0.029295,0.04381,0.027396,0.023065,0.010348,0.0,0.055408,0.013735
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.952466,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,4.007938,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.596578,4.427419,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.252454,0.592408,0.762286,1.077075,0.907621,2.279583,5.445781,1.123406,1.151281,1.23881,...,0.360421,1.051624,0.643981,0.820134,0.939582,0.713534,0.456322,0.0,0.882176,0.426932


In [47]:
# Report the share of missing values for every gene (column) of the
# FCGR3A+Mono table. Null counts are computed once for the whole frame instead
# of twice per column inside a Python-level loop over all genes.
fmono_null_counts = fmono_df.isnull().sum()
fmono_genes_with_nulls = fmono_null_counts[fmono_null_counts > 0]

for column, n_missing in fmono_genes_with_nulls.items():
    print(column, ': {:.2%}'.format(n_missing / fmono_df.shape[0]))

if fmono_genes_with_nulls.empty:
    # State the clean result explicitly: an empty output cannot be told apart
    # from a cell that was never executed.
    print('FCGR3A+Mono: no missing values in any of the {} genes'.format(fmono_df.shape[1]))

Conclusion: Same here — there are no missing gene values in the FCGR3A+Mono expression table either.