# Bundle ID / Version column filler

## 1. Dataset info

In [1]:
# HCA project short name; used to select this project's rows from the manifest.
project_name = "SingleCellLiverLandscape"
# Numeric part of the E-HCAD accession for this dataset.
accession_index = 9

In [2]:
# Build every accession-based name from accession_index and project_name.
accession = f"E-HCAD-{accession_index}"
protocol_accession = f"HCAD{accession_index}"
# Same value as spelling the "E-HCAD-<index>" prefix out again.
work_dir = f"{accession}_{project_name}"
idf_file_name = f"{accession}.idf.txt"
sdrf_file_name = f"{accession}.sdrf.txt"

## 2. Load SDRF

In [41]:
import pandas as pd
# Show every column and row when a DataFrame is displayed.
# pandas documents None as "unlimited"; 0 requests terminal-size
# auto-detection, which pandas notes cannot work correctly in a notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Read the SDRF as tab-separated text. keep_default_na=False keeps the default
# NA markers (such as empty cells) as literal text instead of NaN.
sdrf = pd.read_csv(
    sdrf_file_name,
    sep="\t",
    keep_default_na=False,
)

In [10]:
# Number of rows in the SDRF table.
sdrf.shape[0]

40

In [6]:
# Load the tab-separated HCA manifest from the parent directory, keeping the
# default NA markers (such as empty cells) as literal text instead of NaN.
# The path is a plain string: it has no placeholders, so no f-prefix is needed.
manifest = pd.read_csv("../hca_manifest.tsv", sep="\t", keep_default_na=False)

## 3. Select required piece of the manifest

In [30]:
# Manifest columns carried forward for the bundle ID / version lookup.
required_cols = ["bundle_uuid", "bundle_version", "file_name", "sample.biomaterial_core.biomaterial_id"]
# Select this project's rows and the required columns in one .loc call, then
# copy: manifest_chunk gets new columns assigned later, and assigning into a
# chained-indexing slice of `manifest` raises pandas' SettingWithCopyWarning.
is_project_row = manifest['project.project_core.project_short_name'] == project_name
manifest_chunk = manifest.loc[is_project_row, required_cols].copy()

## 4. Filter bundle IDs
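Keep only the earliest bundle version for each group of files:
1. Parse the bundle version into a date and sort by it.
2. Derive a file-name prefix (`file_part`) and drop duplicates on it, keeping the first (earliest) row.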

In [32]:
# Parse the bundle version strings into timestamps so that rows can be
# ordered chronologically.
manifest_chunk['date'] = pd.to_datetime(
    manifest_chunk['bundle_version'],
)

In [34]:
# Drop the last 16 characters of each file name (the length of a suffix such
# as "_R1_001.fastq.gz") so files sharing the same leading name get the same
# `file_part` key.
manifest_chunk['file_part'] = manifest_chunk['file_name'].str[:-16]

In [43]:
# For each file_part keep the row with the earliest bundle version, then order
# the result by sample and file_part for readability.
# A stable sort (mergesort) is requested explicitly: many rows share the same
# date, and the default quicksort may order such ties arbitrarily, which makes
# the row kept by drop_duplicates arbitrary as well. With a stable sort, ties
# keep their original manifest order.
result = (
    manifest_chunk
    .sort_values(by='date', kind='mergesort')
    .drop_duplicates(subset=["file_part"])
    .sort_values(by=["sample.biomaterial_core.biomaterial_id", "file_part"])
)

In [45]:
# Write the selected bundle rows as TSV. The file is named after `accession`
# rather than a hard-coded "E-HCAD-9", so it follows the dataset settings.
result.to_csv(f"{accession} Bundle IDs.tsv", sep="\t")

In [46]:
# Display the selected rows for a visual check.
result

Unnamed: 0,bundle_uuid,bundle_version,file_name,sample.biomaterial_core.biomaterial_id,date,file_part
891,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L001_R1_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L001
885,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L002_R2_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L002
881,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L003_R2_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L003
877,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L004_R2_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L004
875,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L005_R2_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L005
888,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L006_R1_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L006
878,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L007_R2_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L007
882,c59a8de8-d4f3-424b-b716-06b7152b980a,2019-09-23T173114.106782Z,TLH_S11_L008_I1_001.fastq.gz,P1TLH_liver,2019-09-23 17:31:14.106782+00:00,TLH_S11_L008
1044,c65efd23-bbc4-459a-ac60-d3cde705193d,2019-09-23T173114.107641Z,TLH_S3_L001_R1_001.fastq.gz,P2TLH_liver,2019-09-23 17:31:14.107641+00:00,TLH_S3_L001
602,c65efd23-bbc4-459a-ac60-d3cde705193d,2019-09-23T173114.107641Z,TLH_S3_L002_R1_001.fastq.gz,P2TLH_liver,2019-09-23 17:31:14.107641+00:00,TLH_S3_L002
