# Bundle ID / Version column filler

## 1. Dataset info

In [1]:
# Dataset parameters: the numeric part of the E-HCAD accession and the
# project short name used to select rows from the manifest.
accession_index, project_name = 10, "KidneySingleCellAtlas"

In [10]:
# Identifiers and file names derived from the dataset parameters.
accession = f"E-HCAD-{accession_index}"
protocol_accession = f"HCAD{accession_index}"
# The working directory name is the accession followed by the project name.
work_dir = f"{accession}_{project_name}"
# IDF and SDRF file names share the accession as their stem.
idf_file_name = f"{accession}.idf.txt"
sdrf_file_name = f"{accession}.sdrf.txt"

## 2. Load SDRF

In [11]:
import pandas as pd
# Display limits for DataFrame output; both options are set to 0.
# NOTE(review): pandas documents 0 as "auto-detect the terminal size" and None
# as "unlimited". A notebook is not a terminal, so confirm that 0 (rather than
# None) is the intended value here.
pd.options.display.max_columns = 0
pd.options.display.max_rows = 0

In [12]:
# Read the tab-separated SDRF file. keep_default_na=False disables pandas'
# default NA markers, so empty cells and strings such as "NA" are not turned
# into NaN.
sdrf = pd.read_table(sdrf_file_name, keep_default_na=False)

In [13]:
# Load the tab-separated manifest from the parent directory, with pandas'
# default NA markers disabled (same parsing rules as the SDRF load).
manifest = pd.read_csv("../hca_manifest.tsv", delimiter="\t", keep_default_na=False)

## 3. Select required piece of the manifest

In [14]:
# Manifest columns needed by the rest of the notebook.
required_cols = ["bundle_uuid", "bundle_version", "file_name", "sample.biomaterial_core.biomaterial_id"]
# Keep only the rows of this project. Rows and columns are selected in a single
# .loc call (no chained indexing), and .copy() yields an independent frame so
# that adding columns later never aliases or warns about `manifest`.
manifest_chunk = manifest.loc[
    manifest['project.project_core.project_short_name'] == project_name,
    required_cols,
].copy()

In [21]:
# Discard rows whose file name starts with "CZI".
# .copy() makes the filtered result an independent frame: a column is assigned
# to it in a following cell, which would otherwise raise pandas'
# SettingWithCopyWarning on a boolean-filtered slice.
is_czi_file = manifest_chunk['file_name'].astype(str).str.startswith('CZI')
manifest_chunk = manifest_chunk.loc[~is_czi_file].copy()

## 4. Filter bundle IDs
Keep the earliest version of each file bundle:
1. Sort by bundle version (timestamp), oldest first.
2. Drop duplicate `file_part` entries, keeping the first (earliest) row.

In [22]:
# Cut the last 16 characters off each file name (e.g. "_R1_001.fastq.gz"),
# leaving a prefix such as "4834STDY7002875_S1_L001". This `file_part` column
# is the key used for de-duplication.
manifest_chunk = manifest_chunk.assign(
    file_part=manifest_chunk['file_name'].str[:-16]
)

In [23]:
# Keep one row per `file_part`: the one with the earliest bundle version.
# The rows must be ordered by version BEFORE de-duplicating; otherwise
# drop_duplicates keeps whichever row happens to come first in the manifest.
# NOTE(review): this sorts the raw `bundle_version` strings. That matches
# chronological order as long as every value uses the same fixed-width layout
# seen in the manifest (e.g. "2019-10-24T212959.067729Z") - confirm.
manifest_chunk = (
    manifest_chunk
    .sort_values(by='bundle_version', kind='mergesort')  # stable: ties keep manifest order
    .drop_duplicates(subset=['file_part'], keep='first')
)

In [24]:
# Parse the bundle version strings into timestamps and store them in a new
# `date` column.
manifest_chunk = manifest_chunk.assign(
    date=pd.to_datetime(manifest_chunk['bundle_version'])
)

In [25]:
# Display the chunk ordered by sample ID, then file part, then bundle version.
# The sorted frame is only shown, not stored back into `manifest_chunk`.
manifest_chunk.sort_values(
    by=["sample.biomaterial_core.biomaterial_id", "file_part", "bundle_version"]
)

Unnamed: 0,bundle_uuid,bundle_version,file_name,sample.biomaterial_core.biomaterial_id,file_part,date
216,4a93a63a-53b9-4625-ae17-102a94bc6e6b,2019-10-24T212959.067729Z,4834STDY7002875_S1_L001_R1_001.fastq.gz,F16_1,4834STDY7002875_S1_L001,2019-10-24 21:29:59.067729+00:00
1148,85f6d419-f938-4adf-8c28-df42c051d46c,2019-10-23T144855.605493Z,4834STDY7002876_S1_L001_I1_001.fastq.gz,F16_1,4834STDY7002876_S1_L001,2019-10-23 14:48:55.605493+00:00
865,be0dddfd-5b70-48f6-b6bd-ba4f43868cb8,2019-10-23T144855.605539Z,4834STDY7002881_S1_L001_I1_001.fastq.gz,F17_1,4834STDY7002881_S1_L001,2019-10-23 14:48:55.605539+00:00
154,1b19d618-a412-413a-acd6-fb9053b1e5df,2019-10-23T144855.605570Z,4834STDY7002885_S1_L001_R1_001.fastq.gz,F17_1,4834STDY7002885_S1_L001,2019-10-23 14:48:55.605570+00:00
1196,6d853375-b12a-4cbd-a60a-37e7324400a0,2019-10-24T171936.505978Z,4834STDY7002886_S1_L001_R1_001.fastq.gz,F17_1,4834STDY7002886_S1_L001,2019-10-24 17:19:36.505978+00:00
57,83ed57f4-9743-4f08-abfe-d2ebb8d9ff91,2019-10-23T144855.605620Z,FCAImmP7462242_S1_L001_R1_001.fastq.gz,F35_1,FCAImmP7462242_S1_L001,2019-10-23 14:48:55.605620+00:00
140,15479e62-238a-4c6a-ac39-b92a24e4b27f,2019-10-24T084729.869919Z,FCAImmP7462243_S1_L001_I1_001.fastq.gz,F35_1,FCAImmP7462243_S1_L001,2019-10-24 08:47:29.869919+00:00
610,49b83276-d663-4621-962d-7adb7af92f3d,2019-10-24T080821.903964Z,FCAImmP7528292_S1_L001_R1_001.fastq.gz,F38_1,FCAImmP7528292_S1_L001,2019-10-24 08:08:21.903964+00:00
126,66b75bd0-ec85-4777-b31b-f4f05077bdec,2019-10-24T075932.398586Z,FCAImmP7528293_S1_L001_R1_001.fastq.gz,F38_1,FCAImmP7528293_S1_L001,2019-10-24 07:59:32.398586+00:00
939,d4f75bfc-5813-4ca3-a04a-3a10c8997585,2019-10-23T144855.605737Z,FCAImmP7555849_S1_L001_I1_001.fastq.gz,F41_1,FCAImmP7555849_S1_L001,2019-10-23 14:48:55.605737+00:00
