# Bundle ID / Version column filler

## 1. Dataset info

In [6]:
# Dataset identity: the HCA project short name (matched against the manifest below)
# and the numeric part of the E-HCAD accession.
project_name = "HumanColonicMesenchymeIBD"
accession_index = 11

In [7]:
# Build the accession strings first, then derive the directory and file names from them.
accession = f"E-HCAD-{accession_index}"
protocol_accession = f"HCAD{accession_index}"

# Working directory is "<accession>_<project name>"; the IDF/SDRF files are named after the accession.
work_dir = f"{accession}_{project_name}"
sdrf_file_name = f"{accession}.sdrf.txt"
idf_file_name = f"{accession}.idf.txt"

## 2. Load SDRF

In [8]:
import pandas as pd
# Set both display limits to 0 (presumably so frames render without truncation in the
# notebook -- TODO confirm against the pandas options documentation).
pd.options.display.max_columns = 0
pd.options.display.max_rows = 0

In [9]:
# Read the tab-separated SDRF file from the working directory.
# keep_default_na=False turns off pandas' default NaN markers, so cell text is kept as-is.
sdrf = pd.read_table(f"{work_dir}/{sdrf_file_name}", keep_default_na=False)

In [5]:
# Read the tab-separated manifest file from the current working directory.
# keep_default_na=False turns off pandas' default NaN markers, so cell text is kept as-is.
# (The path is a plain literal: it has no placeholders, so no f-string is needed.)
manifest = pd.read_csv("hca_manifest.tsv", sep="\t", keep_default_na=False)

## 3. Select the required part of the manifest

In [10]:
# Manifest columns carried forward into the rest of the notebook.
required_cols = [
    "bundle_uuid",
    "bundle_version",
    "file_name",
    "sample.biomaterial_core.biomaterial_id",
]

# Keep only the rows of this project, restricted to the required columns.
is_project_row = manifest["project.project_core.project_short_name"] == project_name
manifest_chunk = manifest.loc[is_project_row, required_cols]

## 4. Filter bundle IDs
Take the earliest bundle version for each file:
1. Sort by bundle version date (ascending).
2. Drop duplicate file names, keeping the first (earliest) row for each file.

In [27]:
# R1 FASTQ files of interest: runs SRR7159836..SRR7159839, lanes L004..L007,
# listed run by run and, within each run, lane by lane.
required_file_names = [
    f"SRR{run}_S1_L{lane:03d}_R1_001.fastq.gz"
    for run in range(7159836, 7159840)
    for lane in range(4, 8)
]

# Narrow the chunk to rows whose file name is in the required list,
# keeping only the required columns.
is_required_file = manifest_chunk["file_name"].isin(required_file_names)
manifest_chunk = manifest_chunk.loc[is_required_file, required_cols]

In [28]:
# Parse the bundle version strings into timestamps and expose them as a "date" column.
# assign() returns a new frame rather than writing a column into a frame obtained by
# filtering, which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
manifest_chunk = manifest_chunk.assign(
    date=pd.to_datetime(manifest_chunk["bundle_version"])
)

In [30]:
# For every file keep the row with the earliest bundle version, then list by file name.
# The sort key is the parsed "date" column rather than the raw "bundle_version" text, so
# the order is chronological even if the version strings are not uniformly formatted.
# A stable sort plus keep="first" makes the surviving row deterministic when dates tie.
(
    manifest_chunk
    .sort_values(by="date", kind="stable")
    .drop_duplicates(subset=["file_name"], keep="first")
    .sort_values(by="file_name")
)

Unnamed: 0,bundle_uuid,bundle_version,file_name,sample.biomaterial_core.biomaterial_id,date
380,6ba77e54-c852-4799-9cf4-1a42d9321f96,2019-10-04T094331.590666Z,SRR7159836_S1_L004_R1_001.fastq.gz,Human_HC_1_colon_sample,2019-10-04 09:43:31.590666+00:00
345,6ba77e54-c852-4799-9cf4-1a42d9321f96,2019-10-04T094331.590666Z,SRR7159836_S1_L005_R1_001.fastq.gz,Human_HC_1_colon_sample,2019-10-04 09:43:31.590666+00:00
352,6ba77e54-c852-4799-9cf4-1a42d9321f96,2019-10-04T094331.590666Z,SRR7159836_S1_L006_R1_001.fastq.gz,Human_HC_1_colon_sample,2019-10-04 09:43:31.590666+00:00
381,6ba77e54-c852-4799-9cf4-1a42d9321f96,2019-10-04T094331.590666Z,SRR7159836_S1_L007_R1_001.fastq.gz,Human_HC_1_colon_sample,2019-10-04 09:43:31.590666+00:00
212,1d457990-da79-4c2f-9de6-f70d2c33b800,2019-10-04T094331.542689Z,SRR7159837_S1_L004_R1_001.fastq.gz,Human_HC_2_colon_sample,2019-10-04 09:43:31.542689+00:00
698,1d457990-da79-4c2f-9de6-f70d2c33b800,2019-10-04T094331.542689Z,SRR7159837_S1_L005_R1_001.fastq.gz,Human_HC_2_colon_sample,2019-10-04 09:43:31.542689+00:00
205,1d457990-da79-4c2f-9de6-f70d2c33b800,2019-10-04T094331.542689Z,SRR7159837_S1_L006_R1_001.fastq.gz,Human_HC_2_colon_sample,2019-10-04 09:43:31.542689+00:00
689,1d457990-da79-4c2f-9de6-f70d2c33b800,2019-10-04T094331.542689Z,SRR7159837_S1_L007_R1_001.fastq.gz,Human_HC_2_colon_sample,2019-10-04 09:43:31.542689+00:00
162,9ff12571-ab41-4246-806d-61db4baf347e,2019-10-04T094331.676137Z,SRR7159838_S1_L004_R1_001.fastq.gz,Human_UC_1_colon_sample,2019-10-04 09:43:31.676137+00:00
166,9ff12571-ab41-4246-806d-61db4baf347e,2019-10-04T094331.676137Z,SRR7159838_S1_L005_R1_001.fastq.gz,Human_UC_1_colon_sample,2019-10-04 09:43:31.676137+00:00
