In [1]:
import pandas as pd
import scanpy as sc
import random
from copy import deepcopy

## Initialize: read in data, create demo annotation

In [2]:
# Load the demo dataset from a path relative to this notebook's working directory.
adata_unique = sc.read('../data/pbmc3k.h5ad')
# De-duplicate observation names so the obs index can serve as a unique cell key below.
adata_unique.obs_names_make_unique()

In [3]:
def generate_demo_annotation(adata):
    """Build a random per-cell label Series for demonstration purposes.

    One label is drawn with ``random.choice`` for every entry of
    ``adata.obs.index``; the empty string is one of the candidates and stands
    in for an unlabeled cell. The first ten labels are printed as a preview.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` DataFrame whose index identifies the cells.

    Returns
    -------
    pandas.Series
        Series named ``'mylabel'`` sharing ``adata.obs.index``, in the same order.
    """
    cell_index = adata.obs.index
    label_pool = ['lion', 'tiger', 'bear', 'oh my', '']

    # Draw exactly one label per cell, in index order.
    picks = []
    for _ in range(len(cell_index)):
        picks.append(random.choice(label_pool))

    labels = pd.Series(picks, index=cell_index, name='mylabel')

    print(labels.head(10))  # preview of the generated annotation
    # Sanity check: the annotation is ordered exactly like the cells it labels.
    assert list(labels.index) == list(cell_index)
    return labels

# Random demo labels for the unique-index dataset (a Series named 'mylabel').
demo_unique_annotation = generate_demo_annotation(adata_unique)

index
AAACATACAACCAC-1     bear
AAACATTGAGCTAC-1         
AAACATTGATCAGC-1    oh my
AAACCGTGCTTCCG-1         
AAACCGTGTATGCG-1     bear
AAACGCACTGGTAC-1    tiger
AAACGCTGACCAGT-1     bear
AAACGCTGGTTCTT-1    oh my
AAACGCTGTAGCCA-1     bear
AAACGCTGTTTCTG-1     bear
Name: mylabel, dtype: object


## Sanity check annotation write and read

In [4]:
def write_annotation(series, path):
    """Write an annotation Series to a CSV file preceded by a provenance header.

    The file starts with two ``#``-prefixed comment lines (the ``$DATE``,
    ``$TIME``, ``$VERSION`` and ``$NAME`` tokens are written literally as
    placeholders), followed by the Series as CSV with an ``index`` column.
    Missing values are written as empty fields. After writing, the first ten
    lines of the file are printed as a sanity check.

    Parameters
    ----------
    series : pandas.Series
        Annotation values indexed by cell name.
    path : str
        Destination file path; an existing file is overwritten.
    """
    provenance_comment = '# Annotations generated on $DATE:$TIME using cellxgene version $VERSION\n# Input data file was $NAME.h5ad, which was last modified on $DATE:$TIME\n'

    # The context manager guarantees the handle is flushed and closed before the
    # file is read back below (and even if to_csv raises).
    with open(path, 'w') as ofile:
        ofile.write(provenance_comment)
        series.to_csv(ofile, sep=',', na_rep='', header=True, index=True, index_label='index')

    # Sanity check the output: print the first 10 lines, like `head`, but in pure
    # Python so it is portable and the path is never passed through a shell.
    with open(path) as ifile:
        for line_number, line in enumerate(ifile):
            if line_number >= 10:
                break
            print(line, end='')
    
# Round trip, step 1: write the demo annotation to a CSV in the current working directory.
write_annotation(demo_unique_annotation, 'demo_unique_annotation.csv')

# Annotations generated on $DATE:$TIME using cellxgene version $VERSION
# Input data file was $NAME.h5ad, which was last modified on $DATE:$TIME
index,mylabel
AAACATACAACCAC-1,bear
AAACATTGAGCTAC-1,
AAACATTGATCAGC-1,oh my
AAACCGTGCTTCCG-1,
AAACCGTGTATGCG-1,bear
AAACGCACTGGTAC-1,tiger
AAACGCTGACCAGT-1,bear


In [5]:
def read_annotation(path):
    """Load an annotation CSV into a DataFrame of categorical columns.

    The first CSV column becomes the index and the data columns are parsed
    with the ``category`` dtype. ``#`` is the comment character, so the
    provenance header is skipped. Empty fields come back as NaN. The first
    rows of the result are printed as a preview.

    NOTE(review): pandas treats everything after a ``#`` as a comment, so a
    label or cell name containing ``#`` would presumably be truncated — confirm
    whether such values can occur.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One column per annotation, indexed by the first CSV column.
    """
    csv_options = {
        'index_col': 0,
        'dtype': 'category',
        'comment': '#',
    }
    loaded = pd.read_csv(path, **csv_options)

    preview = loaded.head()
    print(preview)
    return loaded
    
# Round trip, step 2: read the CSV back.
# NOTE: this rebinds demo_unique_annotation from the generated Series to the re-read DataFrame.
demo_unique_annotation = read_annotation('demo_unique_annotation.csv')

                 mylabel
index                   
AAACATACAACCAC-1    bear
AAACATTGAGCTAC-1     NaN
AAACATTGATCAGC-1   oh my
AAACCGTGCTTCCG-1     NaN
AAACCGTGTATGCG-1    bear


## Test drive attaching new column to existing metadata

In [14]:
def attach_annotation(df, adata):
    """Attach every column of an annotation DataFrame to ``adata.obs``.

    Each column of ``df`` is assigned to ``adata.obs`` under the same name,
    overwriting an existing column of that name. ``adata`` is modified in
    place and the first rows of the updated ``adata.obs`` are printed.

    No check is made that ``df`` and ``adata.obs`` share the same index order
    or length: the assignment relies on pandas aligning the column on index
    labels. With a non-unique index whose order differs from ``adata.obs``,
    that alignment raises ``ValueError`` (cannot reindex).

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation columns indexed by cell name.
    adata
        Object exposing an ``obs`` DataFrame to receive the columns.

    Raises
    ------
    ValueError
        If pandas cannot align a column on a non-unique index.
    """
    # DataFrame.items() replaces iteritems(), which was deprecated in pandas 1.5
    # and removed in 2.0. The same result can also be achieved with df.join().
    for label, anno in df.items():
        adata.obs[label] = anno
    print(adata.obs.head())

# Attach the re-read annotation to the unique-index dataset (modifies adata_unique.obs in place).
attach_annotation(demo_unique_annotation, adata_unique)

                  n_genes_by_counts  log1p_n_genes_by_counts  total_counts  \
index                                                                        
AAACATACAACCAC-1                781                 6.661855        2421.0   
AAACATTGAGCTAC-1               1352                 7.210080        4903.0   
AAACATTGATCAGC-1               1131                 7.031741        3149.0   
AAACCGTGCTTCCG-1                960                 6.867974        2639.0   
AAACCGTGTATGCG-1                522                 6.259581         981.0   

                  log1p_total_counts  pct_counts_in_top_50_genes  \
index                                                              
AAACATACAACCAC-1            7.792349                   47.748864   
AAACATTGAGCTAC-1            8.497807                   45.502753   
AAACATTGATCAGC-1            8.055158                   41.314703   
AAACCGTGCTTCCG-1            7.878534                   39.029936   
AAACCGTGTATGCG-1            6.889591         

## Rinse, repeat with a nonunique index

In [7]:
# Reload the same data file and keep its first ten observation names as the only allowed names.
nonunique_adata = sc.read('../data/pbmc3k.h5ad')
name_choices = nonunique_adata.obs_names[:10]

# Give every cell a name drawn at random from those ten, so names necessarily repeat.
# The draw is unseeded, so the exact assignment differs between runs.
nonunique_adata.obs_names = [random.choice(name_choices) for s in nonunique_adata.obs_names] ## make the index nonunique
# Confirm the index really contains duplicates before continuing.
assert(len(pd.unique(nonunique_adata.obs_names)) != len(nonunique_adata.obs_names))

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [11]:
# Repeat the full generate -> write -> read -> attach round trip on the non-unique index.
# The annotation keeps the same row order as nonunique_adata.obs throughout this cell.
print('generated this annotation:')
nonunique_annotation = generate_demo_annotation(nonunique_adata)
print('\n\nwriting out annotation:')
write_annotation(nonunique_annotation, 'nonunique_demo.csv')
print('\n\nread annotation back in as:')
# NOTE: rebinds nonunique_annotation from the generated Series to the re-read DataFrame.
nonunique_annotation = read_annotation('nonunique_demo.csv')
print('\n\nattaching re-read annotation to adata:')
attach_annotation(nonunique_annotation, nonunique_adata)

generated this annotation:
AAACCGTGCTTCCG-1         
AAACGCTGTAGCCA-1    oh my
AAACGCTGGTTCTT-1    oh my
AAACGCTGGTTCTT-1     bear
AAACGCTGTAGCCA-1     bear
AAACATTGAGCTAC-1     bear
AAACGCTGACCAGT-1     bear
AAACGCTGGTTCTT-1     bear
AAACATACAACCAC-1    tiger
AAACCGTGTATGCG-1    oh my
Name: mylabel, dtype: object


writing out annotation:
# Annotations generated on $DATE:$TIME using cellxgene version $VERSION
# Input data file was $NAME.h5ad, which was last modified on $DATE:$TIME
index,mylabel
AAACCGTGCTTCCG-1,
AAACGCTGTAGCCA-1,oh my
AAACGCTGGTTCTT-1,oh my
AAACGCTGGTTCTT-1,bear
AAACGCTGTAGCCA-1,bear
AAACATTGAGCTAC-1,bear
AAACGCTGACCAGT-1,bear


read annotation back in as:
                 mylabel
index                   
AAACCGTGCTTCCG-1     NaN
AAACGCTGTAGCCA-1   oh my
AAACGCTGGTTCTT-1   oh my
AAACGCTGGTTCTT-1    bear
AAACGCTGTAGCCA-1    bear


attaching re-read annotation to adata:
                  n_genes_by_counts  log1p_n_genes_by_counts  total_counts  \
AAACCGTGCTTCCG-1       

## What happens if a non-unique index gets shuffled? (simulating a mismatched CSV + h5ad)

In [15]:
# Sorting by label reorders the rows, so the duplicated index no longer lines up
# positionally with nonunique_adata.obs.
shuffled_nonunique = nonunique_annotation.sort_values(by='mylabel')

# The failure is the point of this cell: pandas cannot align on a non-unique,
# reordered index. Catch and display the error so the notebook still runs to
# completion under Restart & Run All.
try:
    attach_annotation(shuffled_nonunique, nonunique_adata)
except ValueError as err:
    print('ValueError:', err)

ValueError: cannot reindex with a non-unique indexer