## First Use-case

1. Attach/detach data assets
2. Create a result data asset from folders saved within the “results” folder
3. Is it possible to do so from another capsule?


### Let's import modules & authenticate

In [26]:
#Import Libs
import os
from codeocean import CodeOcean
from codeocean.capsule import CapsuleSearchParams
from codeocean.components import UserPermissions, EveryoneRole, UserRole, SearchFilter
from codeocean.computation import RunParams, DataAssetsRunParam, PipelineProcessParams, NamedRunParam
from codeocean.data_asset import AWSS3Source, ComputationSource, DataAssetParams, Permissions, Source, Target, DataAssetAttachParams
import datetime

In [27]:
# Authenticate with the Code Ocean API.
# The token is read from the CUSTOM_KEY environment variable so that no secret
# is stored in the notebook itself.
co_api_token = os.getenv("CUSTOM_KEY")
if not co_api_token:
    # os.getenv returns None when the variable is unset; stop here with a clear
    # message rather than constructing a client with token=None.
    raise RuntimeError(
        "Environment variable CUSTOM_KEY is not set; it must hold the Code Ocean API token."
    )
co_domain = "https://acmecorp-cfn-demo.codeocean.com"
client = CodeOcean(domain=co_domain, token=co_api_token)

### Search for a named Capsule and get its ID:

In [28]:
import json  # not used in this cell; kept in case other cells rely on it

# Search released capsules by name. The query can return more than one capsule,
# so the exact-name comparison inside the loop selects the one we want.
capsule_search_params = CapsuleSearchParams(
    limit=10,
    sort_order="desc",
    sort_field="name",
    status="release",
    archived=False,
    favorite=False,
    query="name:Personal Cancer Genome Reporter",
)

capsules = client.capsules.search_capsules(capsule_search_params)

# Initialise before the search so the confirmation step at the bottom prints
# "Capsule not found." instead of raising NameError (or silently reusing a
# value from a previous run of this cell) when nothing matches.
pcgr_capsule_id = None

# Ensure `capsules` is an object and has the `results` attribute
if hasattr(capsules, "results") and capsules.results:
    for capsule in capsules.results:
        print("\nCapsule Information:")
        print("-" * 50)
        print(f"Name: {capsule.name}")
        print(f"ID: {capsule.id}")
        print(f"Status: {capsule.status.value}")  # Enum to string
        print(f"Field: {capsule.field}")
        print(f"Owner ID: {capsule.owner}")
        print(f"Slug: {capsule.slug}")

        print("\nDescription:")
        print(capsule.description)

        print("\nAvailable Versions:")
        if hasattr(capsule, "versions") and capsule.versions:
            for version in capsule.versions:
                print(f"  - v{version['major_version']}.{version['minor_version']} (Released: {version['release_time']})")
        else:
            print("  No versions available.")

        # Save the ID of the capsule of interest
        if capsule.name == "Personal Cancer Genome Reporter":
            pcgr_capsule_id = capsule.id
            break  # Stop searching once found

    print("\n" + "-" * 50)
else:
    print("No capsules found.")

# Confirm PCGR ID extracted
if pcgr_capsule_id:
    print(f"PCGR Capsule ID: {pcgr_capsule_id}")
else:
    print("Capsule not found.")


Capsule Information:
--------------------------------------------------
Name: Personal Cancer Genome Reporter
ID: dc428b3a-823d-4939-b4c4-a644df1f51b3
Status: release
Field: Medical Sciences
Owner ID: ab7da5e2-ae96-437f-8085-693deea17f7c
Slug: 4613209

Description:
The Personal Cancer Genome Reporter (PCGR) is a stand-alone software package for functional annotation and translation of individual tumor genomes for precision cancer medicine. It interprets primarily somatic SNVs/InDels and copy number aberrations, and has additional support for interpretation of bulk RNA-seq expression data. The software classifies variants both with respect to oncogenicity, and actionability. Interactive HTML output reports allow the user to interrogate the clinical impact of the molecular findings in an individual tumor.

Available Versions:
  - v1.0 (Released: 1737454567)
  - v2.0 (Released: 1738675213)
  - v3.0 (Released: 1739110102)
  - v4.0 (Released: 1739276798)
  - v5.0 (Released: 1739407606)
  -

### Find Data Assets by Tag (e.g. patient ID)

In [29]:
patient_id = "SRR2089359"

from codeocean.data_asset import DataAssetSearchParams
from pprint import pprint

# Search data assets tagged with the patient ID.
data_asset_params = DataAssetSearchParams(
    offset=0,
    limit=10,  # limit to 10 results
    sort_order="desc",
    sort_field="name",
    archived=False,
    favorite=False,
    query=f"tag:{patient_id}",
)

data_assets = client.data_assets.search_data_assets(data_asset_params)
pprint(data_assets)

# Initialise before the loop so the check below reaches its "No matching
# DataAsset found." branch instead of raising NameError (or reusing a value
# from a previous run of this cell) when no asset name contains the patient ID.
my_data_asset_id = None

if hasattr(data_assets, "results") and data_assets.results:
    for asset in data_assets.results:
        # Keep the first asset whose name mentions the patient ID.
        if patient_id in asset.name:
            my_data_asset_id = asset.id
            break  # Stop searching once found


# Print the ID of the data asset if the patient ID was found
if my_data_asset_id:
    print("\n" + "-" * 50)
    print(f"DataAsset ID for {patient_id}: {my_data_asset_id}")
    print("\n" + "-" * 50)
else:
    print("\n" + "-" * 50)
    print("No matching DataAsset found.")
    print("\n" + "-" * 50)


DataAssetSearchResults(has_more=False,
                       results=[DataAsset(id='86cc8096-158d-4c09-99aa-9e7781970520',
                                          created=1739128478,
                                          name='Output Report and data - '
                                               'SRR2089359',
                                          mount='vcf',
                                          state=<DataAssetState.Ready: 'ready'>,
                                          type=<DataAssetType.Result: 'result'>,
                                          last_used=1739406876,
                                          files=16,
                                          size=45258917,
                                          description='Variant Calls and '
                                                      'associated report from '
                                                      'pancreatic tumour '
                                                      'sam

### List files in data asset

In [30]:
# Fetch the file listing of the selected data asset and display it.
file_paths = client.data_assets.list_data_asset_files(data_asset_id=my_data_asset_id)
pprint(file_paths)


Folder(items=[FolderItem(name='SRR2089359.pcgr.grch38.conf.yaml',
                         path='SRR2089359.pcgr.grch38.conf.yaml',
                         type='file',
                         size=181056),
              FolderItem(name='SRR2089359.pcgr.grch38.html',
                         path='SRR2089359.pcgr.grch38.html',
                         type='file',
                         size=17764353),
              FolderItem(name='SRR2089359.pcgr.grch38.msigs.tsv.gz',
                         path='SRR2089359.pcgr.grch38.msigs.tsv.gz',
                         type='file',
                         size=2526),
              FolderItem(name='SRR2089359.pcgr.grch38.pass.tsv.gz',
                         path='SRR2089359.pcgr.grch38.pass.tsv.gz',
                         type='file',
                         size=5724999),
              FolderItem(name='SRR2089359.pcgr.grch38.pass.vcf.gz',
                         path='SRR2089359.pcgr.grch38.pass.vcf.gz',
                         ty

### Validate the correct data asset is found

In [31]:
# Check that the listing contains a somatic .vcf / .vcf.gz pair for the patient.

# Names of every entry in the listing.
file_names = {item.name for item in file_paths.items}

# Only names mentioning both the patient ID and "somatic" are candidates.
candidates = [name for name in file_names if patient_id in name and "somatic" in name]

# Split the candidates by extension. A name ending in ".vcf.gz" never ends in
# ".vcf", so the two sets cannot overlap.
vcf_files = {name for name in candidates if name.endswith(".vcf")}
vcf_gz_files = {name for name in candidates if name.endswith(".vcf.gz")}

# Each name in vcf_files ends with ".vcf", so appending ".gz" yields the
# expected compressed counterpart with the same base name.
valid_pairs = [
    (plain, plain + ".gz")
    for plain in vcf_files
    if plain + ".gz" in vcf_gz_files
]

# Report the outcome.
if not valid_pairs:
    print("No valid VCF and VCF.GZ pairs found.")
else:
    print("Valid VCF and VCF.GZ pairs found:")
    for plain, compressed in valid_pairs:
        print(f" - {plain} and {compressed}")

Valid VCF and VCF.GZ pairs found:
 - SRR2089359_normal_marked_duplicates_marked_duplicates_BQSR_somatic.vcf and SRR2089359_normal_marked_duplicates_marked_duplicates_BQSR_somatic.vcf.gz


### Attach Data Assets to Capsule, and run with desired Parameters 


In [32]:
from codeocean.computation import RunParams, DataAssetsRunParam

# Attach the patient's data asset to the run with mount name "output_report".
data_assets = [DataAssetsRunParam(id=my_data_asset_id, mount="output_report")]

# Ordered (positional) parameters passed to the capsule.
ordered_parameters = [patient_id, "19", "1", "2", "WES"]

run_params = RunParams(
    capsule_id=pcgr_capsule_id,
    data_assets=data_assets,
    version=6,  # release version of the capsule to run
    parameters=ordered_parameters,
)

# Submit the run; `computation` is used afterwards to query status and results.
computation = client.computations.run_capsule(run_params)

### Run Capsule and check computation status 

In [33]:
# `time` must be imported before the first time.strftime() call below;
# importing it afterwards fails with NameError on a fresh kernel.
import time

# Get the current status of the computation
status = client.computations.get_computation(computation_id=computation.id)
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Computation started, current state: {status.state.value}")

# Poll every 30 seconds until the state is no longer 'initializing' or 'running'
while status.state.value in ("initializing", "running"):
    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Current state: {status.state.value}. Waiting 30 seconds before re-checking...")
    time.sleep(30)
    print("pinging platform")
    print("\n" + "-" * 50)
    status = client.computations.get_computation(computation_id=computation.id)


# When the loop exits, the computation is finished or in a terminal state
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Computation finished with state: {status.state.value}")
print("\n" + "-" * 50)
pprint(status)

[2025-02-13 11:38:02] Computation started, current state: running
[2025-02-13 11:38:02] Current state: running. Waiting 30 seconds before re-checking...
pinging platform

--------------------------------------------------
[2025-02-13 11:38:32] Current state: running. Waiting 30 seconds before re-checking...
pinging platform

--------------------------------------------------
[2025-02-13 11:39:02] Current state: running. Waiting 30 seconds before re-checking...
pinging platform

--------------------------------------------------
[2025-02-13 11:39:32] Current state: running. Waiting 30 seconds before re-checking...
pinging platform

--------------------------------------------------
[2025-02-13 11:40:02] Current state: running. Waiting 30 seconds before re-checking...
pinging platform

--------------------------------------------------
[2025-02-13 11:40:32] Current state: running. Waiting 30 seconds before re-checking...
pinging platform

-------------------------------------------------

### Preview Results

In [34]:
# List the result files of the computation and display the listing.
result_paths = client.computations.list_computation_results(computation_id=computation.id)
pprint(result_paths)

Folder(items=[FolderItem(name='SRR2089359.pcgr.grch38.conf.yaml',
                         path='SRR2089359.pcgr.grch38.conf.yaml',
                         type='file',
                         size=180894),
              FolderItem(name='SRR2089359.pcgr.grch38.html',
                         path='SRR2089359.pcgr.grch38.html',
                         type='file',
                         size=17764211),
              FolderItem(name='SRR2089359.pcgr.grch38.msigs.tsv.gz',
                         path='SRR2089359.pcgr.grch38.msigs.tsv.gz',
                         type='file',
                         size=2467),
              FolderItem(name='SRR2089359.pcgr.grch38.pass.tsv.gz',
                         path='SRR2089359.pcgr.grch38.pass.tsv.gz',
                         type='file',
                         size=5724999),
              FolderItem(name='SRR2089359.pcgr.grch38.pass.vcf.gz',
                         path='SRR2089359.pcgr.grch38.pass.vcf.gz',
                         ty

### Create Results Data Asset

In [35]:
# Create a result data asset from the computation's output.

DataAssetName = "Cancer Report"
# Adjacent string literals are concatenated by Python, so this is one
# description string split at sentence boundaries for readability.
DataAssetDescription = (
    "The PCGR workflow summarises and prioritizes the annotated variants in a structured and interactive report, adopting recently proposed recommendations (Dienstmann et al., 2014; Ritter et al., 2016). "
    "Specifically, a tiered report is constructed, starting from actionable markers in Tier 1, toward aberrations relevant for tumorigenesis in Tier 2 and 3, and ending with variants of unknown functional relevance in Tier 4 and 5. "
    "In addition to the tier structure, mutated genes in Tier 3-5 are prioritized by means of the above-mentioned literature-derived score for oncogenic potential, which draws attention to the most relevant findings."
)

# The asset's content comes from the computation started earlier; the empty
# path is passed through unchanged.
results_source = Source(computation=ComputationSource(id=computation.id, path=""))

data_asset_params = DataAssetParams(
    name=DataAssetName,
    description=DataAssetDescription,
    mount="Report",
    tags=["Project12345", "PCGR"],
    source=results_source,
)

data_asset = client.data_assets.create_data_asset(data_asset_params)

In [36]:
# Display the data asset record returned by create_data_asset.
pprint(data_asset)

DataAsset(id='384d13bc-92b2-4087-808a-bfc7e79cabb2',
          created=1739447282,
          name='',
          mount='Report',
          state=<DataAssetState.Draft: 'draft'>,
          type=<DataAssetType.Result: 'result'>,
          last_used=0,
          files=None,
          size=None,
          description='The PCGR workflow summarises and prioritizes the '
                      'annotated variants in a structured and interactive '
                      'report, adopting recently proposed recommendations '
                      '(Dienstmann et al., 2014; Ritter et al., 2016). '
                      'Specifically, a tiered report is constructed, starting '
                      'from actionable markers in Tier 1, toward aberrations '
                      'relevant for tumorigenesis in Tier 2 and 3, and ending '
                      'with variants of unknown functional relevance in Tier 4 '
                      'and 5. In addition to the tier structure, mutated genes '
        