# Bundle ID / Version column filler

## 1. Dataset info

In [1]:
accession_index = 13
project_name = "Reprogrammed_Dendritic_Cells"

In [2]:
work_dir = f"E-HCAD-{accession_index}_{project_name}"
accession = f"E-HCAD-{accession_index}"
protocol_accession = f"HCAD{accession_index}"
idf_file_name = f"{accession}.idf.txt"
sdrf_file_name = f"{accession}.sdrf.txt"

## 2. Load SDRF

In [3]:
import pandas as pd
pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 0)

In [5]:
sdrf = pd.read_csv(sdrf_file_name, sep="\t", keep_default_na=False)

In [16]:
manifest = pd.read_csv(f"../hca_manifest.tsv", sep="\t", keep_default_na=False)

## 3. Select required piece of the manifest

In [17]:
required_cols = ["bundle_uuid", "bundle_version", "file_name", "sample.biomaterial_core.biomaterial_id"]
manifest_chunk = manifest.loc[manifest['project.project_core.project_short_name'] == project_name][required_cols]

## 4. Filter bundle IDs
Take the earliest versions of the file bundles:
1. Sort by bundle version date.
2. Drop duplicates.

In [20]:
manifest_chunk['file_part'] = manifest_chunk['file_name'].str.slice(0, -12)

In [21]:
manifest_chunk = manifest_chunk.drop_duplicates(subset=['file_part'])

In [22]:
manifest_chunk

Unnamed: 0,bundle_uuid,bundle_version,file_name,sample.biomaterial_core.biomaterial_id,file_part
501,cb88115c-75ef-4e8e-bbd7-a48e18960344,2019-10-03T105524.915852Z,IDC9_L002_R1.fastq.gz,Cell_line_2 || Specimen1,IDC9_L002
515,cb88115c-75ef-4e8e-bbd7-a48e18960344,2019-10-03T105524.915852Z,IDC9_L001_I1.fastq.gz,Cell_line_2 || Specimen1,IDC9_L001
651,1307154a-0396-4f2c-a1bf-4a2bf593b1a0,2019-10-03T105524.914465Z,HEF_L004_I1.fastq.gz,Specimen1,HEF_L004
653,c6633095-68f9-4118-981b-16dc8b42a5a1,2019-10-03T105524.913003Z,HEF_L003_R1.fastq.gz,Specimen1,HEF_L003
655,1307154a-0396-4f2c-a1bf-4a2bf593b1a0,2019-10-03T105524.914465Z,HEF_L002_R1.fastq.gz,Specimen1,HEF_L002
659,c6633095-68f9-4118-981b-16dc8b42a5a1,2019-10-03T105524.913003Z,HEF_L001_R1.fastq.gz,Specimen1,HEF_L001
812,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L001_I1.fastq.gz,Cell_line_1 || Specimen1,IDC3_L001
814,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L003_R2.fastq.gz,Cell_line_1 || Specimen1,IDC3_L003
816,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L004_R2.fastq.gz,Cell_line_1 || Specimen1,IDC3_L004
820,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L002_R2.fastq.gz,Cell_line_1 || Specimen1,IDC3_L002


In [23]:
manifest_chunk['date'] = pd.to_datetime(manifest_chunk['bundle_version'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manifest_chunk['date'] = pd.to_datetime(manifest_chunk['bundle_version'])


In [24]:
manifest_chunk.sort_values(by='bundle_version').drop_duplicates(subset=["file_name"]).sort_values(by="file_name")

Unnamed: 0,bundle_uuid,bundle_version,file_name,sample.biomaterial_core.biomaterial_id,file_part,date
659,c6633095-68f9-4118-981b-16dc8b42a5a1,2019-10-03T105524.913003Z,HEF_L001_R1.fastq.gz,Specimen1,HEF_L001,2019-10-03 10:55:24.913003+00:00
655,1307154a-0396-4f2c-a1bf-4a2bf593b1a0,2019-10-03T105524.914465Z,HEF_L002_R1.fastq.gz,Specimen1,HEF_L002,2019-10-03 10:55:24.914465+00:00
653,c6633095-68f9-4118-981b-16dc8b42a5a1,2019-10-03T105524.913003Z,HEF_L003_R1.fastq.gz,Specimen1,HEF_L003,2019-10-03 10:55:24.913003+00:00
651,1307154a-0396-4f2c-a1bf-4a2bf593b1a0,2019-10-03T105524.914465Z,HEF_L004_I1.fastq.gz,Specimen1,HEF_L004,2019-10-03 10:55:24.914465+00:00
812,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L001_I1.fastq.gz,Cell_line_1 || Specimen1,IDC3_L001,2019-10-03 10:55:24.918205+00:00
820,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L002_R2.fastq.gz,Cell_line_1 || Specimen1,IDC3_L002,2019-10-03 10:55:24.918205+00:00
814,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L003_R2.fastq.gz,Cell_line_1 || Specimen1,IDC3_L003,2019-10-03 10:55:24.918205+00:00
816,52245757-73ba-43a2-a78d-0eba01cb4ce1,2019-10-03T105524.918205Z,IDC3_L004_R2.fastq.gz,Cell_line_1 || Specimen1,IDC3_L004,2019-10-03 10:55:24.918205+00:00
515,cb88115c-75ef-4e8e-bbd7-a48e18960344,2019-10-03T105524.915852Z,IDC9_L001_I1.fastq.gz,Cell_line_2 || Specimen1,IDC9_L001,2019-10-03 10:55:24.915852+00:00
501,cb88115c-75ef-4e8e-bbd7-a48e18960344,2019-10-03T105524.915852Z,IDC9_L002_R1.fastq.gz,Cell_line_2 || Specimen1,IDC9_L002,2019-10-03 10:55:24.915852+00:00
