# Filter a loom file by a column attribute
Note that this process works but it is quite slow and results in a very large loom file.

In [2]:
import loompy
import pprint
import numpy as np

In [3]:
# Base directory for the input and output loom files (machine-specific; the
# trailing slash matters because later cells append file names directly).
working_dir = "/Users/mshadbolt/Documents/wrangling_work/loom_file_update_test/loom_work/"

In [None]:
# Full path of the source loom file that will be filtered.
loom_path = f"{working_dir}Fetal_Maternal Interface homo_sapiens 2019-11-08 16.16.loom"

View the dimensions of the loom file

In [9]:
# Open the source loom only long enough to report its dimensions.
with loompy.connect(loom_path) as source_loom:
    print(source_loom.shape)

(58347, 546183)


Print all the available column attributes in the loom

In [16]:
# List the names of every column attribute stored in the source loom.
with loompy.connect(loom_path) as source_loom:
    pprint.pprint(source_loom.ca.keys())

['CellID',
 'analysis_protocol.protocol_core.protocol_id',
 'analysis_protocol.provenance.document_id',
 'analysis_working_group_approval_status',
 'barcode',
 'bundle_uuid',
 'bundle_version',
 'cell_suspension.genus_species.ontology',
 'cell_suspension.genus_species.ontology_label',
 'cell_suspension.provenance.document_id',
 'derived_organ_label',
 'derived_organ_ontology',
 'derived_organ_parts_label',
 'derived_organ_parts_ontology',
 'donor_organism.development_stage.ontology',
 'donor_organism.development_stage.ontology_label',
 'donor_organism.diseases.ontology',
 'donor_organism.diseases.ontology_label',
 'donor_organism.human_specific.ethnicity.ontology',
 'donor_organism.human_specific.ethnicity.ontology_label',
 'donor_organism.is_living',
 'donor_organism.provenance.document_id',
 'donor_organism.sex',
 'dss_bundle_fqid',
 'emptydrops_is_cell',
 'file_uuid',
 'file_version',
 'genes_detected',
 'library_preparation_protocol.end_bias',
 'library_preparation_protocol.input_n

## Filter an HCA loom file by organ label

First we need to get the column indices for the organ of interest. In this case we want all the columns with the `derived_organ_label` "blood"

In [19]:
# Collect the position of every column whose `derived_organ_label` is exactly "blood".
with loompy.connect(loom_path) as ds:
    organ_labels = ds.ca["derived_organ_label"]
    blood_indices = [
        col_idx
        for col_idx, organ in enumerate(organ_labels)
        if organ == "blood"
    ]


array(['decidua', 'decidua', 'decidua', ..., 'placenta', 'decidua',
       'decidua'], dtype=object)


We need to convert the list of indices into a numpy array to be able to use it with the `scan` method of the loom connection (`ds.scan`).

In [33]:
# Convert the selected column positions to a numpy index array.
# The dtype is forced to an integer index type: without it, an empty selection
# would become a float64 array, which cannot be used to index columns.
blood_indices = np.array(blood_indices, dtype=np.intp)

array([16, 50, 59, 61, 72])


Then we set the output file path where we want to save the new loom and print it out to check. Now we are ready to create our filtered loom.

In [31]:
# Destination path for the filtered loom; echoed so it can be checked before the long run.
out_file = f"{working_dir}test_scan.loom"
print(out_file)

/Users/mshadbolt/Documents/wrangling_work/loom_file_update_test/loom_work/test_scan.loom


The code below scans through the loom file given in `loom_path`, selects the columns whose indices are listed in the numpy array `blood_indices`, and writes the filtered loom to the file specified in `out_file`. Scanning through the loom file chunk by chunk prevents out-of-memory errors.
This may take a long time depending on the dimensions of the loom file.
Adapted from: http://linnarssonlab.org/loompy/cookbook/index.html#combining-data-using-scan-and-new

In [86]:
# Create a new, empty output loom and copy the selected columns into it,
# one scanned chunk at a time, taking layers, column attributes and row
# attributes from each chunk's view.
with loompy.new(out_file) as dsout, loompy.connect(loom_path) as ds:
    for (chunk_ix, _chunk_selection, chunk_view) in ds.scan(items=blood_indices, axis=1):
        dsout.add_columns(
            chunk_view.layers,
            col_attrs=chunk_view.ca,
            row_attrs=chunk_view.ra,
        )
        # Progress marker: the `ix` value that scan yields for this chunk.
        pprint.pprint(f"ix: {chunk_ix}")


'ix: 0'
'ix: 512'
'ix: 1024'
'ix: 1536'
'ix: 2048'
'ix: 2560'
'ix: 3072'
'ix: 3584'
'ix: 4096'
'ix: 4608'
'ix: 5120'
'ix: 5632'
'ix: 6144'
'ix: 6656'
'ix: 7168'
'ix: 7680'
'ix: 8192'
'ix: 8704'
'ix: 9216'
'ix: 9728'
'ix: 10240'
'ix: 10752'
'ix: 11264'
'ix: 11776'
'ix: 12288'
'ix: 12800'
'ix: 13312'
'ix: 13824'
'ix: 14336'
'ix: 14848'
'ix: 15360'
'ix: 15872'
'ix: 16384'
'ix: 16896'
'ix: 17408'
'ix: 17920'
'ix: 18432'
'ix: 18944'
'ix: 19456'
'ix: 19968'
'ix: 20480'
'ix: 20992'
'ix: 21504'
'ix: 22016'
'ix: 22528'
'ix: 23040'
'ix: 23552'
'ix: 24064'
'ix: 24576'
'ix: 25088'
'ix: 25600'
'ix: 26112'
'ix: 26624'
'ix: 27136'
'ix: 27648'
'ix: 28160'
'ix: 28672'
'ix: 29184'
'ix: 29696'
'ix: 30208'
'ix: 30720'
'ix: 31232'
'ix: 31744'
'ix: 32256'
'ix: 32768'
'ix: 33280'
'ix: 33792'
'ix: 34304'
'ix: 34816'
'ix: 35328'
'ix: 35840'
'ix: 36352'
'ix: 36864'
'ix: 37376'
'ix: 37888'
'ix: 38400'
'ix: 38912'
'ix: 39424'
'ix: 39936'
'ix: 40448'
'ix: 40960'
'ix: 41472'
'ix: 41984'
'ix: 42496'
'ix: 43008'
'ix:

'ix: 331776'
'ix: 332288'
'ix: 332800'
'ix: 333312'
'ix: 333824'
'ix: 334336'
'ix: 334848'
'ix: 335360'
'ix: 335872'
'ix: 336384'
'ix: 336896'
'ix: 337408'
'ix: 337920'
'ix: 338432'
'ix: 338944'
'ix: 339456'
'ix: 339968'
'ix: 340480'
'ix: 340992'
'ix: 341504'
'ix: 342016'
'ix: 342528'
'ix: 343040'
'ix: 343552'
'ix: 344064'
'ix: 344576'
'ix: 345088'
'ix: 345600'
'ix: 346112'
'ix: 346624'
'ix: 347136'
'ix: 347648'
'ix: 348160'
'ix: 348672'
'ix: 349184'
'ix: 349696'
'ix: 350208'
'ix: 350720'
'ix: 351232'
'ix: 351744'
'ix: 352256'
'ix: 352768'
'ix: 353280'
'ix: 353792'
'ix: 354304'
'ix: 354816'
'ix: 355328'
'ix: 355840'
'ix: 356352'
'ix: 356864'
'ix: 357376'
'ix: 357888'
'ix: 358400'
'ix: 358912'
'ix: 359424'
'ix: 359936'
'ix: 360448'
'ix: 360960'
'ix: 361472'
'ix: 361984'
'ix: 362496'
'ix: 363008'
'ix: 363520'
'ix: 364032'
'ix: 364544'
'ix: 365056'
'ix: 365568'
'ix: 366080'
'ix: 366592'
'ix: 367104'
'ix: 367616'
'ix: 368128'
'ix: 368640'
'ix: 369152'
'ix: 369664'
'ix: 370176'
'ix: 370688'

In [87]:
# Re-open the filtered loom to confirm its dimensions after the copy.
with loompy.connect(out_file) as filtered_loom:
    print(filtered_loom.shape)

(58347, 65275)


## Conclusion
The process works, but it is fairly slow (~2 hours) and results in a file that is huge compared to the original (> 55 GB). I posted an issue on the loompy GitHub page about the increase in size to see if it is expected (https://github.com/linnarsson-lab/loompy/issues/123).