
Author: Erno Hänninen

Created: 2023-01-22

Title: AnnotateHerbData.ipynb

Description:
 - Filters the raw timepoint data from the Herb paper and annotates it with cell types
 
Procedure:
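 - Load the integrated Herb reference data and rebuild an object from its raw slot
 - Load the merged raw timepoint data and keep only the cells that occur in the integrated data
 - Copy the cell type labels (CurrentMeta) from the integrated data to the raw data (Cell_types)
 - Store the raw counts, normalize and log-transform the data, and write it to Processed_herb_adata.h5ad
 - Identify highly variable genes, scale, run PCA, neighbors and UMAP, and plot the result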

List of non-standard modules:
 - scanpy


In [None]:
import scanpy as sc

## Load and explore the integrated Herb data

The integrated Herb data is produced in this script: https://github.com/brianherb/HumanHypothalamusDev/blob/f87a2788cef62b55f6c66ba343634673ba30042a/Fig1_HumanEmbryonic.R 

In [183]:
# Read the integrated reference data (Fig. 1 of the Herb paper) from an .h5ad file
# NOTE(review): `integrated_data` is not assigned in any visible cell of this notebook;
# it has to hold the path to the integrated .h5ad file before this cell runs - confirm where it is set.
adata_ref = sc.read_h5ad(integrated_data)

In [None]:
# Display the summary of the integrated reference object (notebook output of the bare expression)
adata_ref

In [None]:
# Draw one UMAP of the integrated reference per annotation:
# first colored by cell type ("CurrentMeta"), then by timepoint ("Timepoint")
for obs_key in ("CurrentMeta", "Timepoint"):
    sc.pl.umap(adata_ref, color=obs_key)

In [None]:
# Build a standalone object from the .raw slot of the integrated data
# (expected to hold the raw counts)
adata_ref_raw = adata_ref.raw.to_adata()
# Overwrite the var names of the raw object with those of the integrated object
# NOTE(review): this presumes both objects hold the same genes in the same order - confirm
adata_ref_raw.var_names = adata_ref.var_names

## Filter, annotate and process the merged raw timepoint data

In [189]:
# Read the merged raw timepoint data from an .h5ad file into an AnnData object
# NOTE(review): `merged_raw_data` is not assigned in any visible cell of this notebook;
# it has to hold the path to the merged raw .h5ad file before this cell runs - confirm where it is set.
adata_batches = sc.read_h5ad(merged_raw_data)

In [None]:
# Reformat the object to avoid errors when the annotated data is written to file:
# the '_index' column is dropped from .var and renamed to 'features' in the var table of .raw
del adata_batches.var['_index']

# The var table of the raw object is reached through the private attribute dicts,
# so the renamed frame is stored back under the same private '_var' key
raw_attrs = adata_batches.__dict__['_raw'].__dict__
raw_attrs['_var'] = raw_attrs['_var'].rename(columns={'_index': 'features'})

In [None]:
# Number of cells in the merged raw data, followed by the number of cells
# in the integrated data
print(adata_batches.n_obs)
print(adata_ref_raw.n_obs)

In [191]:
# Raw data processing
# Drop every cell of the raw data that does not occur in the integrated dataset,
# so that the raw data ends up "pre-processed" the same way as in the Herb paper.

# Cell names found in both the integrated and the raw object
shared_cells = adata_ref_raw.obs_names.intersection(adata_batches.obs_names)

# Subset the raw data to those cells; .copy() stores the subset as its own object
adata_batches = adata_batches[shared_cells].copy()

In [None]:
# Sanity check: the obs index of both datasets should be identical (same cells in both)
adata_ref_raw.obs_names.identical(adata_batches.obs_names)

In [None]:
# The dimensions of the filtered raw data and of the integrated data are expected to be equal
for dataset in (adata_batches, adata_ref_raw):
    print(dataset.shape)

In [194]:
# Transfer the cell type labels: copy the "CurrentMeta" column of the integrated data
# into a new "Cell_types" column of the raw adata_batches object
# NOTE(review): this relies on both objects sharing the same obs index (checked in the
# cells above); pandas presumably matches the rows by index label, so a cell without a
# match would be left without a label - confirm
adata_batches.obs["Cell_types"] = adata_ref_raw.obs["CurrentMeta"]


In [None]:
# Display the summary of the filtered and annotated raw object (notebook output of the bare expression)
adata_batches

In [None]:
# Keep a copy of the count matrix in layers["counts"] before X is modified
adata_batches.layers["counts"] = adata_batches.X.copy()
# Normalize every cell to a total of 1e4 counts and log1p-transform the result
# (no scaling is done here; sc.pp.scale is called in a later cell)
sc.pp.normalize_total(adata_batches, target_sum=1e4)
sc.pp.log1p(adata_batches)

In [None]:
# Display the entry stored under uns["log1p"] (presumably written by sc.pp.log1p above - confirm)
adata_batches.uns["log1p"]

In [None]:
# Check that the counts in layers["counts"] are non-normalized
# Only a 5x5 window is inspected, so the sparse matrix is sliced first and just that
# window is converted to dense; densifying the whole cells x genes matrix for 25 values
# wastes memory and can fail on large datasets
adata_batches.layers["counts"][185:190, 185:190].todense()

In [None]:
# Show the same 5x5 window of adata.X (the normalized, log-transformed values) for comparison
# The sparse matrix is sliced before it is converted to dense, so that only the window
# is densified instead of the whole cells x genes matrix
adata_batches.X[185:190, 185:190].todense()

In [None]:
# Save the processed object (normalized and log-transformed X, raw counts in layers["counts"])
adata_batches.write_h5ad("Processed_herb_adata.h5ad")

# NOTE: after reading this file back in, run
#   adata_batches.uns["log1p"] = {"base": None}
# (presumably the "base" entry is not restored from the file - confirm)

In [None]:
# Downstream processing of the log-normalized data; every step works on
# adata_batches with default settings and the order matters:
#   1. identify highly variable genes (hvg)
#   2. scale the data
#   3. PCA, neighbor graph and UMAP embedding
processing_steps = (
    sc.pp.highly_variable_genes,
    sc.pp.scale,
    sc.tl.pca,
    sc.pp.neighbors,
    sc.tl.umap,
)
for run_step in processing_steps:
    run_step(adata_batches)

In [None]:
# UMAP of the processed raw data: one plot colored by the transferred cell types,
# one colored by sample
for obs_column in ("Cell_types", "sample"):
    sc.pl.umap(adata_batches, color=obs_column)