In [29]:
import anndata as ad
import scvelo as scv
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd

# Load the velocyto loom file
ldata = sc.read("data/103.self_workflow/velocyto_combined.loom", cache=True)

# Set plotting settings
scv.settings.set_figure_params('scvelo', figsize=(10, 8))
adata = ad.read_h5ad('data/110.adipo/sc_adipo.h5ad')

In [30]:
ldata

AnnData object with n_obs × n_vars = 37820 × 55348
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'

In [19]:
# print cell counts in adata
adata.obs['cell_type'].value_counts()

cell_type
PSC      5289
Fib      4793
Adipo    2434
Name: count, dtype: int64

In [28]:
# print cell names in ldata
print(ldata.obs.index)
# get prefix of cell names (ie first 3 characters), summarize
prefix = ldata.obs.index.str[-3:]
prefix_counts = prefix.value_counts()
print(prefix_counts)

Index(['N1_CELL10122_N1x', 'N1_CELL10029_N1x', 'N1_CELL10127_N1x',
       'N1_CELL10069_N1x', 'N1_CELL10018_N1x', 'N1_CELL10073_N1x',
       'N1_CELL10113_N1x', 'N1_CELL10118_N1x', 'N1_CELL10100_N1x',
       'N1_CELL1013_N1x',
       ...
       'N2_CELL989_N2x', 'N2_CELL990_N2x', 'N2_CELL982_N2x', 'N2_CELL971_N2x',
       'N4_CELL978_N4x', 'N3_CELL994_N3x', 'N2_CELL988_N2x', 'N2_CELL984_N2x',
       'N2_CELL987_N2x', 'N3_CELL976_N3x'],
      dtype='object', name='CellID', length=37820)
CellID
N1x    30866
N2x     5528
N3x     1199
N4x      201
N5x       21
N6x        5
Name: count, dtype: int64


In [21]:
# Create lists of cell names with suffixes removed
adata_names_clean = [
    (
        name[:-3]
        if (
            name.endswith("_N1")
            or name.endswith("_N2")
            or name.endswith("_N3")
            or name.endswith("_N4")
            or name.endswith("_T1")
            or name.endswith("_T2")
            or name.endswith("_T3")
            or name.endswith("_T4")
        )
        else name
    )
    for name in adata.obs.index
]

ldata_names_clean = [
    (
        name[:-4]
        if (
            name.endswith("_N1x")
            or name.endswith("_N2x")
            or name.endswith("_N3x")
            or name.endswith("_N4x")
            or name.endswith("_T1x")
            or name.endswith("_T2x")
            or name.endswith("_T3x")
            or name.endswith("_T4x")
        )
        else name
    )
    for name in ldata.obs.index
]

# Find intersection
intersection = set(adata_names_clean) & set(ldata_names_clean)

# Print results
print(f"Total number of cells in adata: {len(adata_names_clean)}")
print(f"Total number of cells in ldata: {len(ldata_names_clean)}")
print(f"Number of matching cells after removing suffixes: {len(intersection)}")

# Print some examples of matching cell names
print("\nSample of matching cell names:")
sample_matches = list(intersection)
for i, match in enumerate(sample_matches):
    print(f"  {i+1}. {match}")

Total number of cells in adata: 12516
Total number of cells in ldata: 37820
Number of matching cells after removing suffixes: 927

Sample of matching cell names:
  1. N1_CELL916
  2. N1_CELL3838
  3. N1_CELL10356
  4. N1_CELL1284
  5. N1_CELL207
  6. N1_CELL15091
  7. N1_CELL2286
  8. N1_CELL2415
  9. N1_CELL5195
  10. N1_CELL8334
  11. N1_CELL60
  12. N1_CELL582
  13. N1_CELL5422
  14. N2_CELL1209
  15. N1_CELL3436
  16. N1_CELL2920
  17. N1_CELL8688
  18. N1_CELL36
  19. N1_CELL8324
  20. N1_CELL1789
  21. N1_CELL10232
  22. N2_CELL2090
  23. N1_CELL6796
  24. N1_CELL13886
  25. N1_CELL360
  26. N1_CELL1794
  27. N1_CELL953
  28. N2_CELL125
  29. N1_CELL9828
  30. N1_CELL2577
  31. N1_CELL24464
  32. N2_CELL972
  33. N2_CELL1294
  34. N1_CELL7241
  35. N1_CELL18600
  36. N2_CELL310
  37. N1_CELL19359
  38. N1_CELL2497
  39. N1_CELL6828
  40. N1_CELL886
  41. N1_CELL1364
  42. N1_CELL4167
  43. N1_CELL1246
  44. N1_CELL6461
  45. N1_CELL10397
  46. N1_CELL2636
  47. N1_CELL6015
  48. 

In [9]:
# Make sure cell names match between anndata objects
ldata.obs.index = [x.split(':')[1] if ':' in x else x for x in ldata.obs.index]
ldata.obs.index = ldata.obs.index.str.replace('x', '')

# Check for and handle duplicates in ldata index
if ldata.obs.index.duplicated().any():
	print(f"Found {sum(ldata.obs.index.duplicated())} duplicated indices in ldata")
	# Make indices unique by adding a suffix to duplicates
	ldata.obs.index = ldata.obs.index.astype(str) + '_' + ldata.obs.groupby(level=0).cumcount().astype(str)
	# Remove the '_0' suffix from non-duplicated indices
	ldata.obs.index = [idx[:-2] if idx.endswith('_0') else idx for idx in ldata.obs.index]

# Print info about our datasets to verify
print(f"adata shape: {adata.shape}")
print(f"ldata shape: {ldata.shape}")
print(f"Common cells: {len(set(adata.obs.index) & set(ldata.obs.index))}")

# Add spliced/unspliced data to our original adata object
adata = scv.utils.merge(adata, ldata)

Found 19847 duplicated indices in ldata
adata shape: (12516, 33177)
ldata shape: (37820, 55348)
Common cells: 496


In [None]:

# Preprocessing and computing velocity
scv.pp.filter_and_normalize(adata)
scv.pp.moments(adata)
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)

# Plot velocity stream with cells colored by cell_type_dtl
scv.pl.velocity_embedding_stream(adata, basis='X_umap_integrated', color='cell_type_dtl',
                                legend_loc='right', title='RNA Velocity - Cell Types', 
                                size=15, alpha=0.8, dpi=120)

plt.tight_layout()
