In [1]:
import scvelo as scv

import pandas as pd
import glob
import os

# Read in loom files and combine them

In [19]:
data_dir = '../../data/mouse/'

In [20]:
month = "3_month"

In [27]:
raw_files = glob.glob(f"{data_dir}/data/10x/{month}/*")

In [22]:
samples_month = [os.path.basename(raw_file) for raw_file in raw_files]

In [23]:
# filter output files per month

In [24]:
files = [glob.glob(f"{data_dir}/pipeline_velocyto_output/velocyto/{sample}/*.loom") \
         for sample in samples_month]

In [25]:
files = []
for sample in samples_month:
    if os.path.isdir(f"{data_dir}/pipeline_velocyto_output/velocyto/{sample}"):
        files.append(glob.glob(f"{data_dir}/pipeline_velocyto_output/velocyto/{sample}/*.loom")) 
        #samples.append(sample)

## Read in adata

In [31]:
adata = scv.read(f"{data_dir}/official_data/tabula-muris-senis-droplet-official-raw-obj.h5ad")

In [32]:
adata

AnnData object with n_obs × n_vars = 245389 × 20138
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation'
    var: 'n_cells'

#### Test which samples are new

In [33]:
samples_dict = dict()
for file in files:
    sample = file[0].split('/')[-2]
    samples_dict[sample] = any(adata.obs.cell.str.contains(sample))

In [34]:
samples_metadata = pd.DataFrame.from_dict(samples_dict, orient='index', columns = ['present_in_metadata'])

In [35]:
samples_metadata

Unnamed: 0,present_in_metadata
10X_P7_4,True
10X_P4_0,True
10X_P7_3,True
10X_P4_1,True
10X_P4_4,True
10X_P7_14,True
10X_P7_9,True
10X_P7_10,True
10X_P7_11,True
10X_P7_12,True


In [36]:
print("Percentage of samples that are pesent in metadata {}".format(sum(samples_metadata.present_in_metadata)/samples_metadata.shape[0]))


Percentage of samples that are pesent in metadata 1.0


## Combine on obs.cell column


In [38]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
df_list = []
for file in tqdm(files):
    print(file)
    file = file[0]
    data = scv.read_loom(file, sparse=True)
    data.var_names_make_unique()
    sample = file.split('/')[-2]
    scv.utils.clean_obs_names(data, ID_length=16)
    if any(data.obs.index.map(len) > 16):
        print("some obs names larger than 16")
    data.obs['cell'] = [f'{sample}_{obs_name}' for obs_name in data.obs.index ]
    #data.obs.set_index('cell', inplace = True)
    df_list.append(data)


In [42]:
combined = df_list[0].concatenate(df_list[1:], join = "outer")

In [43]:
combined.obs.reset_index(inplace = True)
combined.obs = combined.obs.merge(adata.obs, on = 'cell', how = 'left', copy = False)
combined.obs.set_index('index', inplace = True)

In [44]:
combined.obs

Unnamed: 0_level_0,cell,batch,age,cell_ontology_class,cell_ontology_id,free_annotation,method,mouse.id,n_genes,sex,subtissue,tissue,tissue_free_annotation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ACGCAGCAGTTACGGG-0,10X_P7_4_ACGCAGCAGTTACGGG,0,,,,,,,,,,,
AAGGCAGGTCACCTAA-0,10X_P7_4_AAGGCAGGTCACCTAA,0,3m,fibroblast of cardiac tissue,CL:0000057,fibroblast,droplet,3-F-56,,female,,Heart_and_Aorta,Heart_and_Aorta
ACGATACCACAGCGTC-0,10X_P7_4_ACGATACCACAGCGTC,0,3m,smooth muscle cell,,smooth muscle cell,droplet,3-F-56,,female,,Heart_and_Aorta,Heart_and_Aorta
AAATGCCAGTCCATAC-0,10X_P7_4_AAATGCCAGTCCATAC,0,3m,fibroblast of cardiac tissue,CL:0000057,fibroblast,droplet,3-F-56,,female,,Heart_and_Aorta,Heart_and_Aorta
AGAGCGACAACACCCG-0,10X_P7_4_AGAGCGACAACACCCG,0,3m,erythrocyte,,,droplet,3-F-56,,female,,Heart_and_Aorta,Heart_and_Aorta
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTCTCGTCAAACTGTC-27,10X_P4_3_GTCTCGTCAAACTGTC,27,3m,bladder cell,CL:1001319,bladder mesenchymal cell (Scara5+),droplet,3-M-8,,male,,Bladder,Bladder
TGGCTGGGTCGCTTCT-27,10X_P4_3_TGGCTGGGTCGCTTCT,27,3m,bladder urothelial cell,CL:1001428,basal bladder epithelial cell (Krt5+Krt14+),droplet,3-M-8,,male,,Bladder,Bladder
TACGGATAGGTACTCT-27,10X_P4_3_TACGGATAGGTACTCT,27,3m,bladder cell,CL:1001319,bladder mesenchymal cell (Scara5+),droplet,3-M-8,,male,,Bladder,Bladder
GTAGTCAAGACGACGT-27,10X_P4_3_GTAGTCAAGACGACGT,27,3m,bladder cell,CL:1001319,bladder mesenchymal cell (Scara5+),droplet,3-M-8,,male,,Bladder,Bladder


In [46]:
# fraction that has no annotation
combined.obs.cell_ontology_class.isna().sum()/combined.obs.shape[0]

0.16427249166975194

In [47]:
combined_filtered = combined[~combined.obs.cell_ontology_class.isna()]

In [48]:
combined_filtered.shape

(45146, 31253)

In [64]:
combined.write(f"{data_dir}/count_matrices/{month}_annotated.h5ad")

... storing 'Chromosome' as categorical
... storing 'Strand' as categorical
