In [1]:
import glob
import re
import json
import sys

from os.path import splitext, basename

import pandas as pd


# Protocol mapping: HCA protocol sheet name -> protocol type label written to the
# "Protocol Type" row of the IDF (see get_protocol_idf).
protocol_type_map = {
    'collection_protocol': "sample collection protocol",
    # Dissociation and enrichment protocols are both reported with the same label.
    'dissociation_protocol': "enrichment protocol",
    # NOTE(review): presumably a placeholder for a type with no HCA sheet; this key
    # is not listed in protocol_order, so it is never looked up through protocol_map.
    '??????????????????????': "nucleic acid extraction protocol",
    'enrichment_protocol': "enrichment protocol",
    'library_preparation_protocol': "nucleic acid library construction protocol",
    'sequencing_protocol': "nucleic acid sequencing protocol",
}

# Order of protocols. Also the order in which SCEA protocol ids are minted and the
# default set of protocol types handled by extract_protocol_info.
protocol_order = [
    'collection_protocol',
    'dissociation_protocol',
    'enrichment_protocol',
    'library_preparation_protocol',
    'sequencing_protocol',
]

# Columns where protocols are stored in the spreadsheet.
# Maps protocol type -> list of big table columns holding one protocol id each.
# Entries for the multi-valued protocol types below are appended later, once their
# `||`-separated lists have been split into separate columns.
protocol_columns = {
    'collection_protocol': ["collection_protocol.protocol_core.protocol_id"],
    'library_preparation_protocol': ["library_preparation_protocol.protocol_core.protocol_id"],
    'sequencing_protocol': ["sequencing_protocol.protocol_core.protocol_id"],
}

# Protocol types whose id column is split as a `||`-separated list of ids
# (protocol type -> column holding that list).
multiprotocols = {
    'dissociation_protocol': "dissociation_protocol.protocol_core.protocol_id",
    'enrichment_protocol': "enrichment_protocol.protocol_core.protocol_id",
}


# Convert sheet names to snake case.
def convert_to_snakecase(label):
    """Return `label` in lower case with its separators turned into underscores.

    A hyphen surrounded by whitespace (e.g. " - ") becomes a single underscore;
    every other whitespace character becomes one underscore each.
    """
    separator = re.compile(r'(\s-\s)|\s')
    underscored = separator.sub('_', label)
    return underscored.lower()


# Fetch all spreadsheet csv in a dir.
def get_all_spreadsheets(work_dir):
    """Load every spreadsheet CSV found directly inside `work_dir`.

    Each `*.csv` file is read with `;` as separator. Physical rows 0, 1, 2 and 4
    are skipped, so row 3 supplies the column names. Missing values become empty
    strings, every cell is converted to a stripped string, and columns whose name
    starts with "Unnamed" are dropped.

    Files whose name contains 'big_table.csv' are ignored, so a previously
    generated big table is not loaded back as a sheet. The check is made on the
    file name only, so a directory whose path happens to contain that text no
    longer hides every sheet.

    Args:
        work_dir: directory containing the exported CSV files.

    Returns:
        dict mapping the snake-cased file name (without extension) to its
        cleaned DataFrame.
    """
    spreadsheets = {}

    for file_name in glob.glob(f"{work_dir}/*.csv"):
        if 'big_table.csv' in basename(file_name):
            continue

        sheet = pd.read_csv(file_name, header=0, sep=";", skiprows=[0,1,2,4])
        sheet = sheet.fillna('')
        # Element-wise str() + strip() through Series.map: DataFrame.applymap is
        # deprecated in recent pandas, while this form works on old and new versions.
        sheet = sheet.apply(lambda column: column.map(lambda value: str(value).strip()))

        sheet_name = convert_to_snakecase(splitext(basename(file_name))[0])
        spreadsheets[sheet_name] = sheet.loc[:, ~sheet.columns.str.contains('^Unnamed')]

    return spreadsheets


# Extract lists of protocols
# Helpers to convert lists in HCA spreadsheets (items are separated with two
# pipes `||`) to python lists.
def splitlist(list_):
    """Split an HCA spreadsheet list cell (items separated by `||`) into a list.

    Args:
        list_: the cell value, normally a string.

    Returns:
        The list of items. A value that cannot be split as a string (e.g. None
        or a float NaN for a missing cell) yields an empty list.
    """
    try:
        return list_.split('||')
    except (AttributeError, TypeError):
        # Only "this is not a splittable string" is treated as an empty list;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return []

def split_multiprotocols(df, proto_column):
    """Expand a `||`-separated protocol column into one column per item.

    Args:
        df: DataFrame containing `proto_column`.
        proto_column: name of the column holding the `||`-separated lists.

    Returns:
        A tuple `(proto_df, proto_df_columns)` where `proto_df` has:
          * `<proto_column>_<n>` columns, one per list position (rows with
            fewer items are padded with missing values),
          * `<proto_column>_count` with the number of items in each row,
          * `<proto_column>_list` with the Python list itself,
        and `proto_df_columns` lists only the `<proto_column>_<n>` names.
        `proto_df` carries the same index as `df`.
    """
    proto_series = df[proto_column].apply(splitlist)
    # Build the frame on the source index. With the default RangeIndex the
    # index-aligned assignments below (and an index-based merge back into `df`)
    # would pair up the wrong rows whenever `df` is not indexed 0..n-1.
    proto_df = pd.DataFrame(proto_series.values.tolist(), index=proto_series.index)
    proto_df_columns = [f'{proto_column}_{y}' for y in range(len(proto_df.columns))]
    proto_df.columns = proto_df_columns
    proto_df[f'{proto_column}_count'] = proto_series.str.len()
    proto_df[f'{proto_column}_list'] = proto_series

    return (proto_df, proto_df_columns)


# Extracts info from the protocols spreadsheets
def extract_protocol_info(
    protocol_map,
    spreadsheets,
    column_to_extract,
    to_key,
    for_protocols = protocol_order
):
    """Copy one field from the protocol sheets into every entry of `protocol_map`.

    For each protocol type listed in `for_protocols`, every protocol of that type
    is looked up by id in `spreadsheets[<type>]` and the value of the column
    `<type>.<column_to_extract>` from the first matching row is stored under
    `to_key`. When no row matches, an empty string is stored instead.

    The entries of `protocol_map` are modified in place; nothing is returned.
    """
    for proto_type, proto_list in protocol_map.items():
        if proto_type not in for_protocols:
            continue

        id_column = f'{proto_type}.protocol_core.protocol_id'
        value_column = f'{proto_type}.{column_to_extract}'

        for proto_name, proto in proto_list.items():
            # The sheet is only fetched when there is a protocol to look up.
            sheet = spreadsheets[proto_type]
            matches = sheet.loc[sheet[id_column] == proto_name][value_column].tolist()
            proto[to_key] = matches[0] if len(matches) else ''


# Get protocol types from protocol map.
def get_protocol_idf(protocol_map):
    """Flatten `protocol_map` into the per-protocol fields of the IDF.

    Returns:
        A list with one `(type, name, description, hardware)` tuple per protocol,
        in `protocol_map` order. The type comes from `protocol_type_map`, the
        name is the protocol's 'scea_id', and hardware defaults to '' when the
        protocol has no 'hardware' entry.

    Raises:
        KeyError: if a protocol lacks 'scea_id' or 'description', or its type
            is not a key of `protocol_type_map`.
    """
    return [
        (
            protocol_type_map[protocol_type],
            protocol['scea_id'],
            protocol['description'],
            protocol.get('hardware', ''),
        )
        for protocol_type, protocols in protocol_map.items()
        for protocol in protocols.values()
    ]


# Maps a HCA protocol name to a SCEA ID.
def map_proto_to_id(protocol_name, protocol_map):
    """Return the SCEA id of the first protocol whose 'hca_ids' contains `protocol_name`.

    Returns '' when no protocol matches, and None when the matching protocol has
    no 'scea_id' entry. Raises KeyError if a visited protocol has no 'hca_ids'.
    """
    all_protocols = (
        proto
        for protocols_of_type in protocol_map.values()
        for proto in protocols_of_type.values()
    )
    return next(
        (proto.get('scea_id') for proto in all_protocols if protocol_name in proto['hca_ids']),
        '',
    )

In [2]:
# Show every row and column when a DataFrame is displayed (None removes the limit).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
# Parameters.
# Directory holding the spreadsheet CSVs and project_details.json.
# Note: the value already ends with "/", so the f"{work_dir}/..." paths built from
# it contain a double slash.
work_dir = f"./spreadsheets/GSE136103_ontologies_v2/"

print(f"working at {work_dir}")

# Get details and prepare ids.
with open(f"{work_dir}/project_details.json") as info_file:
    project_details = json.load(info_file)


# Accessions and output file names are all derived from the 'accession' entry of
# project_details.json.
accession_index = project_details['accession']
accession = f"E-HCAD-{accession_index}"
protocol_accession = f"HCAD{accession_index}"
idf_file_name = f"{accession}.idf.txt"
sdrf_file_name = f"{accession}.sdrf.txt"
# Placeholder written into IDF fields that are filled in by hand afterwards.
fill_this_label = "<FILL THIS>"

# Load all spreadsheets
spreadsheets = get_all_spreadsheets(work_dir)

print(f"{len(spreadsheets)} spreadsheets loaded")

working at ./spreadsheets/GSE136103_ontologies_v2/
13 spreadsheets loaded


In [4]:
# Merge sequence files with cell suspensions.
# Outer join on the cell suspension id, so rows without a partner on either side
# are kept.
big_table = pd.merge(
    spreadsheets['cell_suspension'],
    spreadsheets['sequence_file'],
    how="outer",
    on="cell_suspension.biomaterial_core.biomaterial_id",
)

In [5]:
# Display the merged cell suspension / sequence file table for inspection.
big_table

Unnamed: 0,cell_suspension.biomaterial_core.biomaterial_id,cell_suspension.biomaterial_core.biomaterial_name,cell_suspension.biomaterial_core.biomaterial_description,specimen_from_organism.biomaterial_core.biomaterial_id,cell_suspension.biomaterial_core.ncbi_taxon_id,cell_suspension.cell_morphology.cell_size_unit.ontology,cell_suspension.cell_morphology.cell_size_unit.ontology_label,cell_suspension.cell_morphology.cell_viability_method,cell_suspension.genus_species.text,cell_suspension.genus_species.ontology,cell_suspension.genus_species.ontology_label,cell_suspension.selected_cell_types.text,cell_suspension.selected_cell_types.ontology,cell_suspension.selected_cell_types.ontology_label,cell_suspension.biomaterial_core.biosamples_accession,dissociation_protocol.protocol_core.protocol_id,enrichment_protocol.protocol_core.protocol_id,sequence_file.file_core.file_name,sequence_file.file_core.format,sequence_file.file_core.content_description.text,sequence_file.file_core.content_description.ontology,sequence_file.file_core.content_description.ontology_label,sequence_file.read_index,sequence_file.lane_index,sequence_file.read_length,sequence_file.insdc_run_accessions,process.insdc_experiment.insdc_experiment_accession,library_preparation_protocol.protocol_core.protocol_id,sequencing_protocol.protocol_core.protocol_id,process.process_core.process_id,process.type.ontology,process.type.ontology_label
0,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_1_I1_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),index1,1.0,8,SRR10009410,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
1,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_1_R1_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),read1,1.0,75,SRR10009410,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
2,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_1_R2_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),read2,1.0,75,SRR10009410,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
3,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_2_I1_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),index1,2.0,8,SRR10009411,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
4,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_2_R1_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),read1,2.0,75,SRR10009411,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
5,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_2_R2_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),read2,2.0,75,SRR10009411,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
6,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_3_I1_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),index1,3.0,8,SRR10009412,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
7,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_3_R1_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),read1,3.0,75,SRR10009412,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
8,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_3_R2_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),read2,3.0,75,SRR10009412,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,
9,GSM4041150_cells,Healthy1_Cd45+_cells,healthy human donor 1 CD45+ cells,healthy_human_donor_1_liver,9606,,,staining with DAPI,Homo sapiens,NCBITaxon:9606,Homo sapiens,Leukocytes,CL:0000738,leukocyte,SAMN12614701,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,healthy1_cd45+_4_I1_001.fastq.gz,fastq.gz,DNA sequence,data:3494,DNA sequence (raw),index1,4.0,8,SRR10009413,SRX6747672,library_preparation_1,sequencing_protocol_1,process_1,,


In [6]:
# Display only the columns whose name ends with 'protocol_id'.
big_table.loc[:, big_table.columns.str.endswith('protocol_id')]

Unnamed: 0,dissociation_protocol.protocol_core.protocol_id,enrichment_protocol.protocol_core.protocol_id,library_preparation_protocol.protocol_core.protocol_id,sequencing_protocol.protocol_core.protocol_id
0,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
1,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
2,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
3,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
4,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
5,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
6,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
7,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
8,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
9,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1


In [7]:
# Fill in missing specimen ids from the cell line sheet, if there is one.
def get_specimen(cell_line_id):
    """Return the specimen id recorded for `cell_line_id` in the cell line sheet.

    The first matching row is used; an IndexError is raised when no row of the
    sheet has that cell line id.
    """
    cell_lines = spreadsheets['cell_line']
    is_requested_line = cell_lines['cell_line.biomaterial_core.biomaterial_id'] == cell_line_id
    specimen_ids = cell_lines.loc[is_requested_line]['specimen_from_organism.biomaterial_core.biomaterial_id']
    return specimen_ids.values[0]


# Only applies when a cell line sheet was loaded: rows whose specimen id is NaN get
# the specimen id looked up through their cell line id.
# NOTE(review): get_all_spreadsheets replaces missing values with '', so only NaNs
# introduced by the outer merge are caught by isna() here; confirm that empty-string
# specimen ids do not need the same treatment.
if 'cell_line' in spreadsheets.keys():
    big_table['specimen_from_organism.biomaterial_core.biomaterial_id'] = big_table['specimen_from_organism.biomaterial_core.biomaterial_id'].fillna(big_table.loc[big_table['specimen_from_organism.biomaterial_core.biomaterial_id'].isna()]['cell_line.biomaterial_core.biomaterial_id'].apply(get_specimen))

# Merge specimens into big table.
big_table = spreadsheets['specimen_from_organism'].merge(
    big_table,
    how="outer",
    on="specimen_from_organism.biomaterial_core.biomaterial_id"
)

# Merge donor organisms into big table.
big_table = spreadsheets['donor_organism'].merge(
    big_table,
    how="outer",
    on="donor_organism.biomaterial_core.biomaterial_id"
)

# Merge library preparation into big table.
big_table = spreadsheets['library_preparation_protocol'].merge(
    big_table,
    how="outer",
    on="library_preparation_protocol.protocol_core.protocol_id"
)

# Merge sequencing protocol into big table.
big_table = spreadsheets['sequencing_protocol'].merge(
    big_table,
    how="outer",
    on="sequencing_protocol.protocol_core.protocol_id"
)

# Merge the two rows for each read (read1 and read2).
# The read1 rows keep every column; from the read2 rows only the join keys plus the
# file name and read length are kept.
big_table_read1 = big_table.loc[big_table['sequence_file.read_index'] == 'read1']
big_table_read2 = big_table.loc[big_table['sequence_file.read_index'] == 'read2']

big_table_read2_short = big_table_read2[[
    'cell_suspension.biomaterial_core.biomaterial_id',
    'sequence_file.file_core.file_name',
    'sequence_file.read_length',
    'sequence_file.lane_index',
]]

# Default (inner) join on cell suspension id + lane: one row per read1/read2 pair.
# The columns present on both sides (file name, read length) receive the
# "_read1" / "_read2" suffixes.
big_table_joined = big_table_read1.merge(
    big_table_read2_short,
    on=['cell_suspension.biomaterial_core.biomaterial_id', 'sequence_file.lane_index'],
    suffixes=("_read1", "_read2")
)

# Merge index rows for each read.
# Only done when at least one sequence file is marked as 'index1'.
if ('index1' in big_table['sequence_file.read_index'].values):
    # Strip surrounding whitespace from every object (string) column.
    big_table = big_table.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

    big_table_index1 = big_table.loc[big_table['sequence_file.read_index'] == 'index1']

    big_table_index1_short = big_table_index1[[
        'cell_suspension.biomaterial_core.biomaterial_id',
        'sequence_file.file_core.file_name',
        'sequence_file.read_length',
        'sequence_file.lane_index',
    ]]

    # Every index1 column, including the two join keys, gets an "_index1" suffix,
    # which is why the merge below needs left_on / right_on.
    big_table_index1_short.columns = [f"{x}_index1" for x in big_table_index1_short.columns]

    big_table_joined2 = big_table_joined.merge(
        big_table_index1_short,
        left_on=['cell_suspension.biomaterial_core.biomaterial_id', 'sequence_file.lane_index'],
        right_on=["cell_suspension.biomaterial_core.biomaterial_id_index1", 'sequence_file.lane_index_index1'],
    )

    big_table_joined = big_table_joined2

# Fix up and sort big table.
# NOTE(review): reset_index() without drop=True inserts the previous index as a
# column named 'index', which then ends up in the big table; confirm this column
# is wanted.
big_table_joined.reset_index(inplace=True)
# NOTE(review): the read1/read2 merge above already suffixes the file name column
# with "_read1", so this rename looks like a no-op; verify before removing.
big_table_joined = big_table_joined.rename(columns={'sequence_file.file_core.file_name': 'sequence_file.file_core.file_name_read1'})
# Order the columns alphabetically, then strip whitespace from all string columns.
big_table_joined_sorted = big_table_joined.reindex(sorted(big_table_joined.columns), axis=1)
big_table_joined_sorted = big_table_joined_sorted.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# From here on `big_table` is the joined table with one row per read1/read2 pair.
big_table = big_table_joined_sorted

In [8]:
# Remove NAs in protocol spreadsheets.
# Every protocol type that has columns registered in protocol_columns gets its
# sheet's missing values replaced with empty strings.
for protocol_type in protocol_columns:
    protocol_sheet = spreadsheets[protocol_type]
    spreadsheets[protocol_type] = protocol_sheet.fillna('')

In [13]:
# Display only the columns whose name ends with 'protocol_id'.
big_table.loc[:, big_table.columns.str.endswith('protocol_id')]

Unnamed: 0,collection_protocol.protocol_core.protocol_id,dissociation_protocol.protocol_core.protocol_id,enrichment_protocol.protocol_core.protocol_id,library_preparation_protocol.protocol_core.protocol_id,sequencing_protocol.protocol_core.protocol_id
0,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
1,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
2,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
3,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1
4,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1
5,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1
6,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1
7,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1
8,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1
9,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1


In [14]:
# This extracts the lists from protocol types which can have more than one instance and creates extra columns in the
# Big Table for each of the items, as well as the count and the python-style list.
for (protocol_type, protocol_field) in multiprotocols.items():
    if spreadsheets.get(protocol_type) is None:
        # No sheet of this protocol type was loaded, so there is nothing to split.
        continue

    spreadsheets[protocol_type] = spreadsheets[protocol_type].fillna('')
    proto_df, proto_df_columns = split_multiprotocols(big_table, protocol_field)

    # Register the per-item columns so protocols can be collected from them later.
    for proto_column in proto_df_columns:
        if protocol_columns.get(protocol_type) is None:
            protocol_columns[protocol_type] = []
        protocol_columns[protocol_type].append(proto_column)

    # Attach the split columns to the big table row by row (index join).
    big_table = big_table.merge(proto_df, left_index=True, right_index=True)

# Save protocol columns for later use when creating sdrf.
# Done once, after the loop: it does not depend on the loop variables and must
# also happen when there are no multi-valued protocol types to process.
project_details['protocol_columns'] = protocol_columns

In [19]:
# Display every column whose name contains 'protocol_id', which also covers the
# split per-item, count and list columns.
big_table.loc[:, big_table.columns.str.contains('protocol_id', regex=False)]

Unnamed: 0,collection_protocol.protocol_core.protocol_id,dissociation_protocol.protocol_core.protocol_id,enrichment_protocol.protocol_core.protocol_id,library_preparation_protocol.protocol_core.protocol_id,sequencing_protocol.protocol_core.protocol_id,dissociation_protocol.protocol_core.protocol_id_0,dissociation_protocol.protocol_core.protocol_id_count,dissociation_protocol.protocol_core.protocol_id_list,enrichment_protocol.protocol_core.protocol_id_0,enrichment_protocol.protocol_core.protocol_id_count,enrichment_protocol.protocol_core.protocol_id_list
0,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_leukocytes,1,[dissociation_protocol_human_liver_leukocytes],enrichment_protocol_human_liver_CD45+,1,[enrichment_protocol_human_liver_CD45+]
1,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_leukocytes,1,[dissociation_protocol_human_liver_leukocytes],enrichment_protocol_human_liver_CD45+,1,[enrichment_protocol_human_liver_CD45+]
2,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_leukocytes,1,[dissociation_protocol_human_liver_leukocytes],enrichment_protocol_human_liver_CD45+,1,[enrichment_protocol_human_liver_CD45+]
3,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_leukocytes,1,[dissociation_protocol_human_liver_leukocytes],enrichment_protocol_human_liver_CD45+,1,[enrichment_protocol_human_liver_CD45+]
4,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_non_parenchymal,1,[dissociation_protocol_human_liver_non_parench...,enrichment_protocol_human_liver_CD45-,1,[enrichment_protocol_human_liver_CD45-]
5,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_non_parenchymal,1,[dissociation_protocol_human_liver_non_parench...,enrichment_protocol_human_liver_CD45-,1,[enrichment_protocol_human_liver_CD45-]
6,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_non_parenchymal,1,[dissociation_protocol_human_liver_non_parench...,enrichment_protocol_human_liver_CD45-,1,[enrichment_protocol_human_liver_CD45-]
7,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_non_parenchymal,1,[dissociation_protocol_human_liver_non_parench...,enrichment_protocol_human_liver_CD45-,1,[enrichment_protocol_human_liver_CD45-]
8,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_non_parenchymal,enrichment_protocol_human_liver_CD45-,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_non_parenchymal,1,[dissociation_protocol_human_liver_non_parench...,enrichment_protocol_human_liver_CD45-,1,[enrichment_protocol_human_liver_CD45-]
9,collection_protocol_healthy_human_liver,dissociation_protocol_human_liver_leukocytes,enrichment_protocol_human_liver_CD45+,library_preparation_1,sequencing_protocol_1,dissociation_protocol_human_liver_leukocytes,1,[dissociation_protocol_human_liver_leukocytes],enrichment_protocol_human_liver_CD45+,1,[enrichment_protocol_human_liver_CD45+]


In [20]:
# Saving the Big Table.
big_table.to_csv(f"{work_dir}/big_table.csv", index=False, sep=";")


# First, we prepare an ID minter for the protocols following SCEA MAGE-TAB standards.
protocol_id_counter = 0

# Then, protocol map is created: a dict containing types of protocols, and inside each, a map from HCA ids to SCEA ids.
protocol_map = {x: {} for x in protocol_order}

for proto_type in protocol_order:
    # Collect the protocol ids found in every big table column registered for this
    # type (types without registered columns simply contribute nothing).
    new_protos = []
    for proto_column in protocol_columns.get(proto_type, []):
        new_protos = new_protos + pd.unique(big_table[proto_column]).tolist()

    # A protocol can show up in more than one column (e.g. first item in one row,
    # second item in another). dict.fromkeys keeps the first occurrence only, so
    # each protocol gets exactly one SCEA id and the numbering has no gaps.
    for proto in dict.fromkeys(new_protos):
        # Skip the padding of rows with fewer items (None) and missing values (NaN).
        if pd.isna(proto):
            continue

        protocol_id_counter += 1
        new_proto_id = f"P-{protocol_accession}-{protocol_id_counter}"
        protocol_map[proto_type].update({proto: {'scea_id': new_proto_id}})


# Using that function, we get the description for all protocol types, and the hardware for sequencing protocols into
# the map.
extract_protocol_info(protocol_map, spreadsheets, "protocol_core.protocol_description", "description")
extract_protocol_info(protocol_map, spreadsheets, "instrument_manufacturer_model.ontology_label", "hardware", ["sequencing_protocol"])


# Prepare project details to dump into file
project_details['protocol_map'] = protocol_map
# First value of the project sheet's 'project.uuid' column, or '' when the sheet
# has no such column.
project_details['project_uuid'] = spreadsheets['project'].get('project.uuid', [''])[0]

# Prepare configurable fields.
# Columns that can act as an identifier source for the "column" type fields below.
biomaterial_id_columns = [x for x in big_table.columns if x.endswith(("biomaterial_id", "biosamples_accession"))]

# HCA barcode read labels -> SCEA read names.
read_map = {'': "", 'Read 1': "read1", 'Read 2': "read2"}

def get_or_default(source, default):
    """Return the first value of big table column `source` as a string.

    `default` is returned when the big table has no column named `source`.
    """
    return str(big_table[source].values[0]) if source in big_table.columns else default

def to_scea_read(hca_read):
    """Translate an HCA read label through `read_map`.

    Values that are not keys of `read_map` (including ones that already are SCEA
    names such as "read1") are returned unchanged instead of raising KeyError.
    """
    return read_map.get(hca_read, hca_read)

# The barcode fields are read from the library preparation protocol columns of the
# big table. The column names must match exactly, otherwise get_or_default
# silently falls back to the hard-coded default.
project_details['configurable_fields'] = [
    {'name': "Source Name", 'type': "column", 'source': biomaterial_id_columns},
    {'name': "Comment[biomaterial name]", 'type': "column", 'source': biomaterial_id_columns},
    {'name': "Material Type_1", 'type': "dropdown", 'source': ["whole organism", "organism part", "cell"]},
    {'name': "Extract Name", 'type': "column", 'source': biomaterial_id_columns},
    {'name': "Material Type_2", 'source': "RNA"},
    {'name': "Comment[primer]", 'source': "oligo-DT"},
    {'name': "Comment[umi barcode read]", 'source': to_scea_read(get_or_default('library_preparation_protocol.umi_barcode.barcode_read', "Read 1"))},
    {'name': "Comment[umi barcode offset]", 'source': get_or_default('library_preparation_protocol.umi_barcode.barcode_offset', "16")},
    {'name': "Comment[umi barcode size]", 'source': get_or_default('library_preparation_protocol.umi_barcode.barcode_length', "10")},
    # The sheet stores HCA labels ("Read 1"), so the value is translated like the
    # UMI barcode read; the "read1" default passes through unchanged.
    {'name': "Comment[cell barcode read]", 'source': to_scea_read(get_or_default('library_preparation_protocol.cell_barcode.barcode_read', "read1"))},
    {'name': "Comment[cell barcode offset]", 'source': get_or_default('library_preparation_protocol.cell_barcode.barcode_offset', "0")},
    {'name': "Comment[cell barcode size]", 'source': get_or_default('library_preparation_protocol.cell_barcode.barcode_length', "16")},
    {'name': "Comment[sample barcode read]", 'source': ""},
    {'name': "Comment[sample barcode offset]", 'source': "0"},
    {'name': "Comment[sample barcode size]", 'source': "8"},
    {'name': "Comment[single cell isolation]", 'source': "magnetic affinity cell sorting"},
    {'name': "Comment[cDNA read]", 'source': "read2"},
    {'name': "Comment[cDNA read offset]", 'source': "0"},
    {'name': "Comment[cDNA read size]", 'source': "98"},
    {'name': "Comment[LIBRARY_LAYOUT]", 'source': "PAIRED"},
    {'name': "Comment[LIBRARY_SOURCE]", 'source': "TRANSCRIPTOMIC SINGLE CELL"},
    {'name': "Comment[LIBRARY_STRATEGY]", 'source': "RNA-Seq"},
    {'name': "Comment[LIBRARY_SELECTION]", 'source': "cDNA"},
    {'name': "Technology Type", 'source': "sequencing assay"},
    {'name': "Scan Name", 'type': "column", 'source': biomaterial_id_columns},
    {'name': "Comment[RUN]", 'type': "column", 'source': biomaterial_id_columns},
]

# Save file
# Serialise first: project_details.json is also the input of this notebook, and
# opening it for writing truncates it. If serialisation fails, the existing file
# is left untouched.
project_details_json = json.dumps(project_details, indent=2)
with open(f"{work_dir}/project_details.json", "w") as project_details_file:
    project_details_file.write(project_details_json)

In [21]:
# Display the protocol map (protocol type -> HCA protocol id -> SCEA id and extracted info).
protocol_map

{'collection_protocol': {'collection_protocol_healthy_human_liver': {'scea_id': 'P-HCAD14-1',
   'description': 'Fresh non-ischaemic liver tissue obtained intraoperatively by wedge biopsy from patients undergoing surgical liver resection for solitary colorectal metastasis. The biopsy was taken before the interruption of the hepatic vascular inflow during liver surgery or transplantation. The tissue was then transported directly to the laboratory and dissociation routinely commenced within 20 min of the liver biopsy.'},
  'collection_protocol_cirrhotic_human_liver': {'scea_id': 'P-HCAD14-2',
   'description': 'Cirrhotic liver tissue was obtained intraoperatively from patients undergoing orthotopic liver transplantation.'},
  'collection_protocol_human_blood': {'scea_id': 'P-HCAD14-3',
   'description': 'Blood from patients with a confirmed diagnosis of liver cirrhosis were obtained. 4.9-ml peripheral venous blood samples were collected in EDTA-coated tubes and placed on ice.'},
  'colle

In [None]:
## Main Script ##
#
## Part 2: Create MAGE-TAB with data from the frontend.
#
# This part runs as a flat sequence of statements, like the rest of the notebook.
# The former `def create_magetab():` header was dropped: nothing below it was
# indented, so the cell could not even be parsed (IndentationError).

# Read the big table csv.
big_table = pd.read_csv(f"{work_dir}/big_table.csv", sep=";")

# Read project details.
with open(f"{work_dir}/project_details.json") as info_file:
    project_details = json.load(info_file)


# Separator used between the values of every IDF row.
tab = '\t'
protocol_map = project_details['protocol_map']
protocol_columns = project_details['protocol_columns']
configurable_fields = project_details['configurable_fields']


#
## IDF Part.
#
# One (type, name, description, hardware) tuple per protocol.
protocol_fields = get_protocol_idf(protocol_map)

def j(sheet, col_name, func=lambda x: x):
    """Return the values of column `col_name` of `sheet`, each passed through `func`, joined with tabs."""
    converted_values = map(func, g(sheet, col_name))
    return tab.join(converted_values)

def g(sheet, col_name):
    """Return column `col_name` of `sheet` as a list.

    Missing values become '' and every newline or carriage return character is
    replaced with a space.
    """
    column = spreadsheets[sheet][col_name]
    single_line_values = column.fillna('').replace(r'[\n\r]', ' ', regex=True)
    return list(single_line_values)

def first_letter(str):
    return str[0] if len(str) else ''

person_roles = g("project_contributors", "project.contributors.project_role.text")
person_roles_submitter = g("project_contributors", "project.contributors.corresponding_contributor")

for (i, elem) in enumerate(person_roles_submitter):
    person_roles[i] = person_roles[i].lower()
    if elem == "yes":
        person_roles[i] += ";submitter"


idf_file_contents = f"""\
MAGE-TAB Version\t1.1
Investigation Title\t{g("project", "project.project_core.project_title")}
Comment[Submitted Name]\t{g("project", "project.project_core.project_short_name")}
Experiment Description\t{g("project", "project.project_core.project_description")}
    Public Release Date\t{project_details['last_update_date']}
Person First Name\t{j("project_contributors", "project.contributors.name", lambda x: x.split(',')[0])}
Person Last Name\t{j("project_contributors", "project.contributors.name", lambda x: x.split(',')[2])}
Person Mid Initials\t{j("project_contributors", "project.contributors.name", lambda x: first_letter(x.split(',')[1]))}
Person Email\t{j("project_contributors", "project.contributors.email")}
Person Affiliation\t{j("project_contributors", "project.contributors.institution")}
Person Address\t{j("project_contributors", "project.contributors.address")}
Person Roles\t{tab.join(person_roles)}
Protocol Type\t{tab.join([field[0] for field in protocol_fields])}
Protocol Name\t{tab.join([field[1] for field in protocol_fields])}
Protocol Description\t{tab.join([field[2] for field in protocol_fields])}
Protocol Hardware\t{tab.join([field[3] for field in protocol_fields])}
Term Source Name\tEFO\tArrayExpress
Term Source File\thttp://www.ebi.ac.uk/efo/efo.owl\thttp://www.ebi.ac.uk/arrayexpress/
Comment[AEExperimentType]\tRNA-seq of coding RNA from single cells
Experimental Factor Name\t{fill_this_label}
Experimental Factor Type\t{fill_this_label}
Comment[EAAdditionalAttributes]\t{fill_this_label}
    Comment[EACurator]\t{tab.join(project_details['curators'])}
Comment[EAExpectedClusters]\t
Comment[ExpressionAtlasAccession]\t{accession}
    Comment[HCALastUpdateDate]\t{project_details['last_update_date']}
    Comment[SecondaryAccession]\t{project_details['project_uuid']}\t{tab.join(project_details['geo_accessions'])}
Comment[EAExperimentType]\t{fill_this_label}
SDRF File\t{sdrf_file_name}
"""

print(f"saving {work_dir}/{idf_file_name}")
with open(f"{work_dir}/{idf_file_name}", "w") as idf_file:
    idf_file.write(idf_file_contents)

In [None]:
    #
    ## SDRF Part.
    #

    # Placeholder column: every row holds fill_this_label. Mapping entries that
    # point at "UNDEFINED_FIELD" (and, via get_from_bigtable, any source column
    # missing from the big table) end up with this value in the SDRF.
    big_table['UNDEFINED_FIELD'] = fill_this_label

    # SDRF column name -> big table column name, split into ordered chunks.
    # Chunks 0, 2 and 4 are turned into data frames below. The 'Protocol REF'
    # entries at positions 1 and 3 are not read in this section; the protocol
    # columns are built from protocol_columns instead.
    # 'Material Type_1' / 'Material Type_2' are temporary unique keys that are
    # renamed to "Material Type" before the file is saved.
    convert_map_chunks = [{
        # Chunk 0: source / donor / specimen characteristics.
        'Source Name': "UNDEFINED_FIELD",
        'Characteristics[organism]': "donor_organism.genus_species.ontology_label",
        'Characteristics[individual]': "donor_organism.biomaterial_core.biomaterial_id",
        'Characteristics[sex]': "donor_organism.sex",
        'Characteristics[age]': "donor_organism.organism_age",
        'Unit [time unit]': "donor_organism.organism_age_unit.text",
        'Characteristics[developmental stage]': "donor_organism.development_stage.text",
        'Characteristics[organism part]': "specimen_from_organism.organ.ontology_label",
        'Characteristics[sampling site]': "specimen_from_organism.organ_parts.ontology_label",
        'Characteristics[cell type]': "cell_suspension.selected_cell_types.ontology_label",
        'Characteristics[disease]': "donor_organism.diseases.ontology_label",
        'Characteristics[organism status]': "donor_organism.is_living",
        'Characteristics[cause of death]': "donor_organism.death.cause_of_death",
        'Characteristics[clinical history]': "donor_organism.medical_history.test_results",
        'Description': "specimen_from_organism.biomaterial_core.biomaterial_description",
        'Material Type_1': "UNDEFINED_FIELD",
    }, {
        # Chunk 1: position of the first group of protocol references.
        'Protocol REF': "GENERIC_PROTOCOL_FIELD",
    }, {
        # Chunk 2: extract and library preparation details.
        'Extract Name': "UNDEFINED_FIELD",
        'Material Type_2': "UNDEFINED_FIELD",
        'Comment[library construction]': "library_preparation_protocol.library_construction_method.ontology_label",
        'Comment[input molecule]': "library_preparation_protocol.input_nucleic_acid_molecule.ontology_label",
        'Comment[primer]': "UNDEFINED_FIELD",
        'Comment[end bias]': "library_preparation_protocol.end_bias",
        'Comment[umi barcode read]': "UNDEFINED_FIELD",
        'Comment[umi barcode offset]': "UNDEFINED_FIELD",
        'Comment[umi barcode size]': "UNDEFINED_FIELD",
        'Comment[cell barcode read]': "UNDEFINED_FIELD",
        'Comment[cell barcode offset]': "UNDEFINED_FIELD",
        'Comment[cell barcode size]': "UNDEFINED_FIELD",
        'Comment[sample barcode read]': "UNDEFINED_FIELD",
        'Comment[sample barcode offset]': "UNDEFINED_FIELD",
        'Comment[sample barcode size]': "UNDEFINED_FIELD",
        'Comment[single cell isolation]': "UNDEFINED_FIELD",
        'Comment[cDNA read]': "UNDEFINED_FIELD",
        'Comment[cDNA read offset]': "UNDEFINED_FIELD",
        'Comment[cDNA read size]': "UNDEFINED_FIELD",
        'Comment[LIBRARY_STRAND]': "library_preparation_protocol.strand",
        'Comment[LIBRARY_LAYOUT]': "UNDEFINED_FIELD",
        'Comment[LIBRARY_SOURCE]': "UNDEFINED_FIELD",
        'Comment[LIBRARY_STRATEGY]': "UNDEFINED_FIELD",
        'Comment[LIBRARY_SELECTION]': "UNDEFINED_FIELD",
    }, {
        # Chunk 3: position of the second group of protocol references.
        'Protocol REF': "GENERIC_PROTOCOL_FIELD",
    }, {
        # Chunk 4: assay and sequence file columns.
        'Assay Name': "specimen_from_organism.biomaterial_core.biomaterial_id",
        'Technology Type': "UNDEFINED_FIELD",
        'Scan Name': "UNDEFINED_FIELD",
        'Comment[RUN]': "UNDEFINED_FIELD",
        'Comment[read1 file]': "sequence_file.file_core.file_name_read1",
        'Comment[read2 file]': "sequence_file.file_core.file_name_read2",
        'Comment[index1 file]': "sequence_file.file_core.file_name_index",
    }]

    def get_from_bigtable(column):
        """Return the big table column ``column``, or the placeholder column when it is absent."""
        if column not in big_table.columns:
            return big_table['UNDEFINED_FIELD']
        return big_table[column]

    # Chunk 1: donor info.
    sdrf_1 = pd.DataFrame({k: get_from_bigtable(v) for k, v in convert_map_chunks[0].items()})
    sdrf_1 = sdrf_1.fillna('')

    # Fixes for chunk 1.
    # Organism status: convert the yes/no answer into 'alive' / 'dead'.
    def to_organism_status(answer):
        """Translate an explicit yes/no answer; leave every other value untouched.

        Blank cells and the placeholder returned by get_from_bigtable for a
        missing column must stay visible for curation instead of being
        reported as 'dead'.
        """
        normalised = str(answer).strip().lower()
        if normalised in ('yes', 'y'):
            return 'alive'
        if normalised in ('no', 'n'):
            return 'dead'
        return answer

    sdrf_1['Characteristics[organism status]'] = sdrf_1['Characteristics[organism status]'].apply(to_organism_status)


    # Chunk 2: collection/dissociation/enrichment protocols
    def convert_term(term, name):
        """Resolve ``term`` through ``map_proto_to_id`` and ``protocol_map``; ``name`` is accepted but unused."""
        protocol_id = map_proto_to_id(term, protocol_map)
        return protocol_id

    def convert_row(row):
        """Apply ``convert_term`` to every value of the Series ``row``."""
        def convert_value(value):
            return convert_term(value, row.name)

        return row.apply(convert_value)

    protocols_for_sdrf_2 = ['collection_protocol', 'dissociation_protocol', 'enrichment_protocol', 'library_preparation_protocol']

    # Gather the big table columns of every protocol type that belongs to this chunk.
    sdrf_2_source_columns = []
    for proto_type, cols in protocol_columns.items():
        if proto_type in protocols_for_sdrf_2:
            sdrf_2_source_columns.extend(cols)
    sdrf_2 = big_table[sdrf_2_source_columns]

    pd.set_option('display.max_columns', 0)
    pd.set_option('display.expand_frame_repr', False)

    # Convert the protocol names to ids, then keep, per row, the sorted set of
    # non-empty ids so they are packed into the leftmost columns.
    sdrf_2_list = [
        sorted({proto_id for proto_id in row.tolist() if proto_id != ''})
        for _, row in sdrf_2.apply(convert_row).iterrows()
    ]

    # Rows may have different lengths; the gaps are filled with ''.
    sdrf_2 = pd.DataFrame.from_records(sdrf_2_list)
    sdrf_2.columns = ["Protocol REF"] * len(sdrf_2.columns)
    sdrf_2 = sdrf_2.fillna(value='')

    # Chunk 3: Library prep protocol info
    sdrf_3 = pd.DataFrame({k: get_from_bigtable(v) for k, v in convert_map_chunks[2].items()})
    sdrf_3 = sdrf_3.fillna('')

    # Fixes for chunk 3:
    # In column Comment[library construction], apply library_constuction_map.
    # In column Comment[input molecule], apply input_molecule_map.
    # In column Comment[LIBRARY_STRAND] add " strand" to the contents.
    library_constuction_map = {'': "", '10X 3\' v2 sequencing': "10xV2"}
    input_molecule_map = {'': "", 'polyA RNA extract': "polyA RNA"}

    # Values without a mapping (other library types, or the placeholder that
    # get_from_bigtable returns for a missing column) are kept unchanged
    # instead of raising KeyError.
    sdrf_3['Comment[library construction]'] = sdrf_3['Comment[library construction]'].apply(lambda x: library_constuction_map.get(x, x))
    sdrf_3['Comment[input molecule]'] = sdrf_3['Comment[input molecule]'].apply(lambda x: input_molecule_map.get(x, x))
    # Empty cells stay empty rather than becoming a bare " strand".
    sdrf_3['Comment[LIBRARY_STRAND]'] = sdrf_3['Comment[LIBRARY_STRAND]'].apply(lambda x: x + " strand" if x != '' else x)

    # Chunk 4: sequencing protocol ids.
    protocols_for_sdrf_4 = ['sequencing_protocol']

    # Gather the big table columns of the protocol types handled in this chunk.
    sdrf_4_source_columns = []
    for proto_type, cols in protocol_columns.items():
        if proto_type in protocols_for_sdrf_4:
            sdrf_4_source_columns.extend(cols)

    # Convert protocol names to ids and label every column "Protocol REF".
    sdrf_4 = big_table[sdrf_4_source_columns].apply(convert_row)
    sdrf_4.columns = ["Protocol REF"] * len(sdrf_4.columns)

    # Chunk 5: Sequence files.
    sdrf_5_columns = {}
    for sdrf_label, source_column in convert_map_chunks[4].items():
        sdrf_5_columns[sdrf_label] = get_from_bigtable(source_column)
    sdrf_5 = pd.DataFrame(sdrf_5_columns)

    # Merge all chunks, left to right on the row index. Column names that
    # clash while joining chunks 3 and 4 get the "_1" suffix.
    sdrf = sdrf_1.join(sdrf_2)
    sdrf = sdrf.join(sdrf_3, rsuffix="_1")
    sdrf = sdrf.join(sdrf_4, rsuffix="_1")
    sdrf = sdrf.join(sdrf_5)

    # Put in configurable fields: "column" fields copy a big table column,
    # every other field writes its value into all rows.
    for config_field in configurable_fields:
        if config_field.get('type') != "column":
            print(config_field['value'])
            sdrf[config_field['name']] = config_field['value']
            continue
        sdrf[config_field['name']] = get_from_bigtable(config_field['value'])

    # Fix column names: drop the temporary suffixes used to keep names unique.
    final_column_names = {
        'Protocol REF_1': "Protocol REF",
        'Material Type_1': "Material Type",
        'Material Type_2': "Material Type",
    }
    sdrf = sdrf.rename(columns=final_column_names)

    # Save SDRF file as tab-separated text without the index column.
    print(f"saving {work_dir}/{sdrf_file_name}")
    sdrf.to_csv(f"{work_dir}/{sdrf_file_name}", sep="\t", index=False)


#
#
#
#


## Main script ##
# Decide what to do depending on command.
# Run the part of the pipeline selected by process_part; any other value runs nothing.
if process_part == "magetab":
    create_magetab()
elif process_part == "protocolmap":
    prepare_protocol_map()