ASAP CRN Metadata validation - wave 1

# ASAP CRN Metadata validation - wave 1

15 September 2023
Andy Henrie



## STEPS

### imports
- pandas
- pathlib

### Load CDE for validation
- check all columns

### Team Lee
- load .tsv, csv tables
- fix format
- load additional metadata

- add batch columns
- add missing columns


### Team Hafler
- load excel file with tables
- add batch info
- add missing columns



### Team Hardy
- load excel file with tables
- add batch info
- add missing columns


In [27]:
import pandas as pd

from pathlib import Path


# local helpers
from utils.qcutils import validate_table, force_enum_string, reorder_table_to_CDE
from utils.io import ReportCollector, get_dtypes_dict, read_meta_table

## Load CDEs

In [29]:
CDE_path = Path.cwd() / "ASAP_CDE_v1.csv" 

CDEv1 = pd.read_csv( Path.cwd() / "ASAP_CDE_v1.csv" )
CDEv2 = pd.read_csv( Path.cwd() / "ASAP_CDE_v2.csv" )



# create helpers - port_v1_tables

In [6]:
tables_path = Path.cwd() / "clean/team-Hardy"

# Initialize the data types dictionary
dtypes_dict = get_dtypes_dict(CDEv1)

SUBJECT = read_meta_table(f"{tables_path}/SUBJECT.csv", dtypes_dict)
CLINPATH = read_meta_table(f"{tables_path}/CLINPATH.csv", dtypes_dict)
STUDY = read_meta_table(f"{tables_path}/STUDY.csv", dtypes_dict)
PROTOCOL = read_meta_table(f"{tables_path}/PROTOCOL.csv", dtypes_dict)
SAMPLE = read_meta_table(f"{tables_path}/SAMPLE.csv", dtypes_dict)


In [30]:
# STUDY: add preprocessing_references
STUDYv2 = STUDY.copy()

assert len(SAMPLE['preprocessing_references'].unique()) == 1
STUDYv2['preprocessing_references'] = SAMPLE['preprocessing_references'][0]

STUDYv2['team_dataset_id'] = STUDYv2['project_dataset'].str.replace(" ", "_").str.replace("-", "_")


# PROTOCOL
# no change
PROTOCOLv2 = PROTOCOL.copy()    


In [8]:
SAMP_CLIN = SAMPLE.merge(CLINPATH, on="sample_id", how="left")

SAMP_CLIN['source_sample_id'] = SAMP_CLIN['source_sample_id_x']
SAMP_CLIN = SAMP_CLIN.drop(columns=['source_sample_id_x','source_sample_id_y'])


In [9]:

SUBJ_SAMP_CLIN = SUBJECT.merge(SAMP_CLIN, on="subject_id", how="left")
SUBJ_SAMP_CLIN.head()

Unnamed: 0,subject_id,source_subject_id,biobank_name,organism,sex,age_at_collection,race,ethnicity,duration_pmi,primary_diagnosis,...,path_nia_aa_a,path_nia_aa_b,path_nia_aa_c,TDP43,arteriolosclerosis_severity_scale,amyloid_angiopathy_severity_scale,path_ad_level,dig_slide_avail,quant_path_avail,source_sample_id
0,babom,P2/14,QSBB_UK,Human,Female,78,,,46.0,Idiopathic PD,...,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes,
1,babom,P2/14,QSBB_UK,Human,Female,78,,,46.0,Idiopathic PD,...,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes,
2,babom,P2/14,QSBB_UK,Human,Female,78,,,46.0,Idiopathic PD,...,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes,
3,babom,P2/14,QSBB_UK,Human,Female,78,,,46.0,Idiopathic PD,...,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes,
4,babom,P2/14,QSBB_UK,Human,Female,78,,,46.0,Idiopathic PD,...,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes,


In [15]:

SUBJECT_cde_df = CDEv2[CDEv2['Table'] == "SUBJECT"]
SUBJECT_cols = SUBJECT_cde_df["Field"].to_list()
SUBJECTv2 = SUBJ_SAMP_CLIN[SUBJECT_cols].drop_duplicates()


In [11]:

CLINPATH_cde_df = CDEv2[CDEv2['Table'] == "CLINPATH"]

CLINPATH_cols = CLINPATH_cde_df["Field"].to_list()
CLINPATHv2 = SUBJ_SAMP_CLIN[CLINPATH_cols]
CLINPATHv2.head()

Unnamed: 0,subject_id,source_subject_id,duration_pmi,path_autopsy_dx_main,path_autopsy_second_dx,path_autopsy_third_dx,path_autopsy_fourth_dx,path_autopsy_fifth_dx,path_autopsy_sixth_dx,path_autopsy_seventh_dx,...,path_nia_ri,path_nia_aa_a,path_nia_aa_b,path_nia_aa_c,TDP43,arteriolosclerosis_severity_scale,amyloid_angiopathy_severity_scale,path_ad_level,dig_slide_avail,quant_path_avail
0,babom,P2/14,46.0,Parkinson's disease with dementia,none,none,none,none,none,none,...,,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes
1,babom,P2/14,46.0,Parkinson's disease with dementia,none,none,none,none,none,none,...,,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes
2,babom,P2/14,46.0,Parkinson's disease with dementia,none,none,none,none,none,none,...,,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes
3,babom,P2/14,46.0,Parkinson's disease with dementia,none,none,none,none,none,none,...,,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes
4,babom,P2/14,46.0,Parkinson's disease with dementia,none,none,none,none,none,none,...,,A2,B1,C2,,,,Low level Alzheimer's disease neuropathologica...,Yes,Yes


In [12]:

SAMPLE_cde_df = CDEv2[CDEv2['Table'] == "SAMPLE"]
SAMPLE_cols = SAMPLE_cde_df["Field"].to_list()
SAMPLEv2 = SUBJ_SAMP_CLIN[SAMPLE_cols].drop_duplicates(inplace=False)


In [13]:
DATA_cde_df = CDEv2[CDEv2['Table'] == "DATA"]
DATA_cols = DATA_cde_df["Field"].to_list()
DATAv2 = SAMPLE[DATA_cols]


In [24]:
def update_tables_to_CDEv2(tables_path: str|Path, out_dir: str, CDEv1: pd.DataFrame, CDEv2: pd.DataFrame):
    """
    load the tables from the tables_path, and update them to the CDEv2 schema.  export the new tables to a datstamped out_dir
    """
    import datetime

    # Get the current date and time
    current_date = datetime.datetime.now()

   
    # Initialize the data types dictionary
    dtypes_dict = get_dtypes_dict(CDEv1)
        
    STUDY = read_meta_table(f"{tables_path}/STUDY.csv", dtypes_dict)
    PROTOCOL = read_meta_table(f"{tables_path}/PROTOCOL.csv", dtypes_dict)
    SUBJECT = read_meta_table(f"{tables_path}/SUBJECT.csv", dtypes_dict)
    CLINPATH = read_meta_table(f"{tables_path}/CLINPATH.csv", dtypes_dict)
    SAMPLE = read_meta_table(f"{tables_path}/SAMPLE.csv", dtypes_dict)

    # STUDY
    STUDYv2 = STUDY.copy() # don't really need to copy here
    assert len(SAMPLE['preprocessing_references'].unique()) == 1
    STUDYv2['preprocessing_references'] = SAMPLE['preprocessing_references'][0]
    STUDYv2['team_dataset_id'] = STUDYv2['project_dataset'].str.replace(" ", "_").str.replace("-", "_")

    # PROTOCOL
    PROTOCOLv2 = PROTOCOL.copy()  

    SAMP_CLIN = SAMPLE.merge(CLINPATH, on="sample_id", how="left")
    SAMP_CLIN['source_sample_id'] = SAMP_CLIN['source_sample_id_x']
    SAMP_CLIN = SAMP_CLIN.drop(columns=['source_sample_id_x','source_sample_id_y'])

    SUBJ_SAMP_CLIN = SUBJECT.merge(SAMP_CLIN, on="subject_id", how="left")


    SUBJECT_cde_df = CDEv2[CDEv2['Table'] == "SUBJECT"]
    SUBJECT_cols = SUBJECT_cde_df["Field"].to_list()
    SUBJECTv2 = SUBJ_SAMP_CLIN[SUBJECT_cols]
    SUBJECTv2 = SUBJ_SAMP_CLIN[SUBJECT_cols].drop_duplicates(inplace=False).reset_index()

    CLINPATH_cde_df = CDEv2[CDEv2['Table'] == "CLINPATH"]
    CLINPATH_cols = CLINPATH_cde_df["Field"].to_list()
    CLINPATHv2 = SUBJ_SAMP_CLIN[CLINPATH_cols]

    SAMPLE_cde_df = CDEv2[CDEv2['Table'] == "SAMPLE"]
    SAMPLE_cols = SAMPLE_cde_df["Field"].to_list()
    # SAMPLEv2 = SUBJ_SAMP_CLIN[SAMPLE_cols]
    SAMPLEv2 = SUBJ_SAMP_CLIN[SAMPLE_cols].drop_duplicates(inplace=False).reset_index()

    DATA_cde_df = CDEv2[CDEv2['Table'] == "DATA"]
    DATA_cols = DATA_cde_df["Field"].to_list()
    DATAv2 = SAMPLE[DATA_cols]


    STUDYv2 = reorder_table_to_CDE(STUDYv2, "STUDY", CDEv2)
    PROTOCOLv2 = reorder_table_to_CDE(PROTOCOLv2, "PROTOCOL", CDEv2)
    CLINPATHv2 = reorder_table_to_CDE(CLINPATHv2, "CLINPATH", CDEv2)
    SAMPLEv2 = reorder_table_to_CDE(SAMPLEv2, "SAMPLE", CDEv2)
    SUBJECTv2 = reorder_table_to_CDE(SUBJECTv2, "SUBJECT", CDEv2)
    DATAv2 = reorder_table_to_CDE(DATAv2, "DATA", CDEv2)

    # Format the date as a string in the format 'YYYYMMDD'
    date_str = current_date.strftime('%Y%m%d')

    tables_path = Path(tables_path)

    export_root = tables_path / f"{out_dir}_{date_str}"
    if not export_root.exists():
        export_root.mkdir(parents=True, exist_ok=True)


    STUDYv2.to_csv( export_root / "STUDY.csv")
    PROTOCOLv2.to_csv(export_root / "PROTOCOL.csv")
    SAMPLEv2.to_csv(export_root / "SAMPLE.csv")
    SUBJECTv2.to_csv(export_root / "SUBJECT.csv")
    CLINPATHv2.to_csv(export_root / "CLINPATH.csv")
    DATAv2.to_csv(export_root / "DATA.csv")



    return STUDYv2, PROTOCOLv2, SAMPLEv2, SUBJECTv2, CLINPATHv2, DATAv2




In [25]:
CDE_path = Path.cwd() / "ASAP_CDE_v1.csv" 

CDEv1 = pd.read_csv( Path.cwd() / "ASAP_CDE_v1.csv" )
CDEv2 = pd.read_csv( Path.cwd() / "ASAP_CDE_v2.csv" )

STUDYv2, PROTOCOLv2, SAMPLEv2, SUBJECTv2, CLINPATHv2, DATAv2 = update_tables_to_CDEv2(tables_path, "v2", CDEv1, CDEv2)

In [26]:
SAMPLEv2.head()

Unnamed: 0,sample_id,subject_id,source_sample_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,hemisphere,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV200,pm_PH,donor_id
0,babom_ACG,babom,,,1,0,BATCH_2,brain,ACG,Left,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
1,babom_IPL,babom,,,1,0,BATCH_2,brain,IPL,Left,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
2,borah_ACG,borah,,,1,0,BATCH_2,brain,ACG,Right,...,PATO:0000384 (male),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
3,borah_IPL,borah,,rep1,2,1,BATCH_2,brain,IPL,Right,...,PATO:0000384 (male),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
4,borah_IPL,borah,,rep2,2,1,BATCH_5,brain,IPL,Right,...,PATO:0000384 (male),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,


## write clean metadata tables according to CDE 

In [189]:
# SAMPLE = SAMPLE[CDE[CDE["Table"]=="SAMPLE"].Field.tolist()]


def reorder_table_to_CDE(df, df_name, CDE):
    col_order = CDE[CDE["Table"]==df_name].Field.tolist()
    
    df_out = pd.DataFrame()
    for col in col_order:
        if col in df.columns:   
            df_out[col] = df[col]
        else:
            df_out[col] = ""

    return df_out



In [190]:
## write clean metadata tables according to CDE 
# SAMPLE = SAMPLE[CDE[CDE["Table"]=="SAMPLE"].Field.tolist()]


def reorder_table_to_CDE(df, df_name, CDE):
    col_order = CDE[CDE["Table"]==df_name].Field.tolist()
    
    df_out = pd.DataFrame()
    for col in col_order:
        if col in df.columns:   
            df_out[col] = df[col]
        else:
            df_out[col] = ""

    return df_out



# Clean each Team Table


## Team Lee

In [219]:
# fix the column order
STUDY = reorder_table_to_CDE(STUDY, "STUDY", CDE)
SAMPLE = reorder_table_to_CDE(SAMPLE, "SAMPLE", CDE)
PROTOCOL = reorder_table_to_CDE(PROTOCOL, "PROTOCOL", CDE)
SUBJECT = reorder_table_to_CDE(SUBJECT, "SUBJECT", CDE)     
CLINPATH = reorder_table_to_CDE(CLINPATH, "CLINPATH", CDE)



export_root = Path.cwd() / "clean/team-Lee"
if not export_root.exists():
    export_root.mkdir(parents=True, exist_ok=True)



## Team Hafler

In [241]:
# fix the column order
STUDY = reorder_table_to_CDE(STUDY, "STUDY", CDE)
SAMPLE = reorder_table_to_CDE(SAMPLE, "SAMPLE", CDE)
PROTOCOL = reorder_table_to_CDE(PROTOCOL, "PROTOCOL", CDE)
SUBJECT = reorder_table_to_CDE(SUBJECT, "SUBJECT", CDE)     
CLINPATH = reorder_table_to_CDE(CLINPATH, "CLINPATH", CDE)

export_root = Path.cwd() / "clean/team-Hafler"
if not export_root.exists():
    export_root.mkdir(parents=True, exist_ok=True)


## Team Hardy

In [245]:
## convert 
data_path = Path.home() / ("Projects/ASAP/team-hardy")
metadata_path = data_path / "metadata"

SUBJECT = pd.read_csv(f"{metadata_path}/SUBJECT.csv")
CLINPATH = pd.read_csv(f"{metadata_path}/CLINPATH.csv")
STUDY = pd.read_csv(f"{metadata_path}/STUDY.csv")
PROTOCOL = pd.read_csv(f"{metadata_path}/PROTOCOL.csv")
SAMPLE = pd.read_csv(f"{metadata_path}/SAMPLE.csv")


In [246]:

validate_table(STUDY, "STUDY", CDE)
STUDY

	Missing Required Fields in STUDY: project_name, project_dataset, project_description, ASAP_team_name, ASAP_lab_name, PI_full_name, PI_email, contributor_names, submitter_name, submittor_email, ASAP_grant_id, other_funding_source, publication_DOI, publication_PMID, number_of_brain_samples, brain_regions, types_of_samples, DUA_version
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in STUDY.


Unnamed: 0,name,value
0,project_name,Understanding mechanisms of Parkinson's diseas...
1,project_dataset,Hardy snRNA-seq
2,project_description,Genetic analysis has identified many risk gene...
3,ASAP_team_name,TEAM-HARDY
4,ASAP_lab_name,Ryten Lab
5,PI_full_name,Mina Ryten
6,PI_email,mina.ryten@ucl.ac.uk
7,contributor_names,"Aine Fairbrother-Browne, Jonathan Brenton, Mel..."
8,submitter_name,Aine Fairbrother-Browne
9,submitter_email,aine.fairbrother-browne.18@ucl.ac.uk


In [247]:
# there seems to be something funky with SAMPLE
# SAMPLE = SAMPLE[SAMPLE["batch"]=="B1"]
# SAMPLE.drop_duplicates(inplace=True) #, subset=[ "file_name"])
SAMPLE

Unnamed: 0,sample_id,source_subject_id,subject_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,source_RIN,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV2000,pm_PH,donor_id
0,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
1,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
2,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
3,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
4,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3611,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
3612,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
3613,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
3614,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,


In [248]:
STUDY.head()

Unnamed: 0,name,value
0,project_name,Understanding mechanisms of Parkinson's diseas...
1,project_dataset,Hardy snRNA-seq
2,project_description,Genetic analysis has identified many risk gene...
3,ASAP_team_name,TEAM-HARDY
4,ASAP_lab_name,Ryten Lab


In [249]:

# fix STUDY formatting
tmp = pd.DataFrame()
tmp = STUDY[["name","value"]].transpose().reset_index().drop(columns=["index"])
tmp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,contributor_names,submitter_name,submitter_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Understanding mechanisms of Parkinson's diseas...,Hardy snRNA-seq,Genetic analysis has identified many risk gene...,TEAM-HARDY,Ryten Lab,Mina Ryten,mina.ryten@ucl.ac.uk,"Aine Fairbrother-Browne, Jonathan Brenton, Mel...",Aine Fairbrother-Browne,aine.fairbrother-browne.18@ucl.ac.uk,...,,,,128,"Inferior Parietal Lobule (IPL), Anterior Cingu...",Late stage (Braak 5-6) PD and control post-mor...,0000-0001-9520-6957,https://scholar.google.co.uk/citations?user=lt...,,"Version 1, 09/2023"


In [250]:

tmp.columns = tmp.iloc[0]
STUDY = tmp.drop([0])
STUDY.head()


Unnamed: 0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,contributor_names,submitter_name,submitter_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Understanding mechanisms of Parkinson's diseas...,Hardy snRNA-seq,Genetic analysis has identified many risk gene...,TEAM-HARDY,Ryten Lab,Mina Ryten,mina.ryten@ucl.ac.uk,"Aine Fairbrother-Browne, Jonathan Brenton, Mel...",Aine Fairbrother-Browne,aine.fairbrother-browne.18@ucl.ac.uk,...,,,,128,"Inferior Parietal Lobule (IPL), Anterior Cingu...",Late stage (Braak 5-6) PD and control post-mor...,0000-0001-9520-6957,https://scholar.google.co.uk/citations?user=lt...,,"Version 1, 09/2023"


In [251]:


# fix the column order
STUDY = reorder_table_to_CDE(STUDY, "STUDY", CDE)
validate_table(STUDY, "STUDY", CDE)



	All required fields are present in STUDY.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in STUDY.


1

In [252]:
PROTOCOL.head()

Unnamed: 0,name,value
0,sample_collection_summary,"This dataset contains cortical regions only, p..."
1,cell_extraction_summary,From protocols.io: This protocol is used to is...
2,lib_prep_summary,'Nuclei were extracted from homogenised post-m...
3,data_processing_summary,Cell ranger was used to convert raw sequencing...
4,github_url,Raw to fastq to mapped: https://github.com/RHR...


In [253]:
# fix STUDY formatting
tmp = pd.DataFrame()
tmp = PROTOCOL[["name","value"]].transpose().reset_index().drop(columns=["index"])
tmp.columns = tmp.iloc[0]
PROTOCOL = tmp.drop([0])
PROTOCOL.head()

Unnamed: 0,sample_collection_summary,cell_extraction_summary,lib_prep_summary,data_processing_summary,github_url,protocols_io_DOI,other_reference
1,"This dataset contains cortical regions only, p...",From protocols.io: This protocol is used to is...,'Nuclei were extracted from homogenised post-m...,Cell ranger was used to convert raw sequencing...,Raw to fastq to mapped: https://github.com/RHR...,Nuclear extraction protocol: 10.17504/protocol...,


In [254]:
# fix the column order
PROTOCOL = reorder_table_to_CDE(PROTOCOL, "PROTOCOL", CDE)
validate_table(PROTOCOL, "PROTOCOL", CDE)



	All required fields are present in PROTOCOL.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in PROTOCOL.


1

In [255]:

SUBJECT = reorder_table_to_CDE(SUBJECT, "SUBJECT", CDE)

# Testing the function with SUBJECT.csv and CDE.csv
validate_table(SUBJECT, "SUBJECT", CDE)

	All required fields are present in SUBJECT.
		Required Fields with Empty or NaN values:
			- ethnicity: 64 rows
			- duration_pmi: 1 rows
No empty or NaN values found in Optional fields.
	Invalid Field/Value pairs:
			- race: Nan


0

In [256]:
SUBJECT.replace("Nan", "", inplace=True)
SUBJECT.replace("nan", "", inplace=True)
SUBJECT.fillna("", inplace=True)


In [257]:
SAMPLE = reorder_table_to_CDE(SAMPLE, "SAMPLE", CDE)

# force the right sex_ontology_term_id
SAMPLE["organism_ontology_term_id"] = "NCBITaxon:9606"

# allos sequence_length == 190 for now
validate_table(SAMPLE, "SAMPLE", CDE)


# add 'replicate' coding (nans)

	All required fields are present in SAMPLE.
		Required Fields with Empty or NaN values:
			- source_RIN: 3616 rows
		Optional Fields with Empty or NaN values:
			- pm_PH: 3616 rows
	Invalid Field/Value pairs:
			- sequencing_length: 190


0

In [258]:
SAMPLE.replace("nan", "", inplace=True)
# sequence length will need to be converted to a string
SAMPLE.fillna("", inplace=True)
# Testing the function with SAMPLE.csv and CDE.csv


In [259]:
CLINPATH = reorder_table_to_CDE(CLINPATH, "CLINPATH", CDE)


# Testing the function with CLINPATH.csv and CDE.csv
validate_table(CLINPATH, "CLINPATH", CDE)

	All required fields are present in CLINPATH.
		Required Fields with Empty or NaN values:
			- age_at_onset: 138 rows
			- age_at_diagnosis: 10 rows
			- first_motor_symptom: 138 rows
			- path_year_death: 138 rows
			- brain_weight: 138 rows
		Optional Fields with Empty or NaN values:
			- smoking_years: 138 rows
	Invalid Field/Value pairs:
			- family_history: Nan
			- hx_dementia_mci: Nan
			- hx_melanoma: Nan
			- education_level: Nan
			- smoking_status: Nan
			- APOE_e4_status: Nan
			- cognitive_status: Nan
			- path_autopsy_dx_main: Control brain, Pathological ageing, Control brain / Path ageing, Argyrophilic grain disease, Control brain, Cerebrovascular disease (small vessel), Cerebrovascular disease (small vessel), Control brain, Alzheimer`s disease (intermediate level AD pathological change), Control brain / Path ageing, CAA, Nan
			- path_braak_nft: 2.0, 1.0, 3.0, 0.0, 4.0, 6.0, Nan
			- path_braak_asyn: 6.0, 0.0, 5.0, Nan
			- path_cerad: Nan
			- path_thal: At least 4, Na

0

In [260]:
# hack to make sure the replace below works.
CLINPATH.replace("", pd.NA, inplace=True)


In [261]:
# replace 'path_braak_asyn' with with string of the numeric. converte nan to ""
CLINPATH['path_braak_asyn'] = CLINPATH['path_braak_asyn'].apply(lambda val: str(int(val)) if pd.notna(val) else "")

# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_braak_nft'] = CLINPATH['path_braak_nft'].apply(lambda val: str(int(val)) if pd.notna(val) else "").replace({"0":"0", 
                                                                                                                           "1":"I", 
                                                                                                                           "2": "II", 
                                                                                                                           "3":"III", 
                                                                                                                           "4":"IV", 
                                                                                                                           "5":"V", 
                                                                                                                           "6":"VI"})

# code family_history as "Not Reported" (currently empty)
CLINPATH['family_history'] = "Not Reported"



# check APOE_e4_status ? currently empty

# `path_autopsy_dx_main`  actually seems good parser might be wrong

# code "at least 4" as "4/5" 

CLINPATH['path_thal'] = CLINPATH['path_thal'].replace({'At least 4':"4/5"})


CLINPATH['path_mckeith'] = CLINPATH['path_mckeith'].replace({'Diffuse neocortical': "Diffuse, neocortical (brainstem, limbic and neocortical involvement)", 
                                                        'Limbic transitional': "Limbic (transitional)" ,
                                                        'Diffuse Neocortical':"Diffuse, neocortical (brainstem, limbic and neocortical involvement)"})

# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_nia_aa_a'] = CLINPATH['path_nia_aa_a'].apply(lambda val: str(int(val)) if pd.notna(val) else "")

CLINPATH['path_nia_aa_a'] = CLINPATH['path_nia_aa_a'].replace({"0":"A0", 
                                                                                                                           "1":"A1", 
                                                                                                                           "2": "A2", 
                                                                                                                           "3":"A3"})


In [262]:

# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_nia_aa_b'] = CLINPATH['path_nia_aa_b'].apply(lambda val: str(int(val)) if pd.notna(val) else "").replace({"0":"B0", 
                                                                                                                           "1":"B1", 
                                                                                                                           "2": "B2", 
                                                                                                                           "3":"B3"})


# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_nia_aa_c'] = CLINPATH['path_nia_aa_c'].apply(lambda val: str(int(val)) if pd.notna(val) else "").replace({"0":"C0", 
                                                                                                                           "1":"C1", 
                                                                                                                           "2": "C2", 
                                                                                                                           "3":"C3"})



In [263]:



CLINPATH['path_ad_level'] = CLINPATH['path_ad_level'].replace({"No evidence": "No evidence of Alzheimer\'s disease neuropathological change"})


# empty 'hx_dementia_mci', 'hx_melanoma', 'education_level', 'cognitive_status'
#  coded as nan will be fixed with .fillna("") below
CLINPATH.replace("nan", "", inplace=True)
CLINPATH.fillna("", inplace=True)



In [264]:
CDE[CDE["Field"]=="dig_slide_avail"].Validation.unique(), CLINPATH['dig_slide_avail'].unique()

(array(['["Yes, No"]'], dtype=object), array(['Yes'], dtype=object))

In [265]:
# Testing the function with CLINPATH.csv and CDE.csv
validate_table(CLINPATH, "CLINPATH", CDE)

	All required fields are present in CLINPATH.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	Invalid Field/Value pairs:
			- path_autopsy_dx_main: Control brain, Pathological ageing, Control brain / Path ageing, Argyrophilic grain disease, Control brain, Cerebrovascular disease (small vessel), Cerebrovascular disease (small vessel), Control brain, Alzheimer`s disease (intermediate level AD pathological change), Control brain / Path ageing, CAA, 
			- path_nia_aa_a: A0, 
			- path_nia_aa_b: B0, 
			- path_nia_aa_c: C0, 
			- dig_slide_avail: Yes
			- quant_path_avail: Yes


0

In [266]:


# # write the clean metadata
# STUDY.to_csv(data_path / "metadata/STUDY.csv")
# PROTOCOL.to_csv(data_path / "metadata/PROTOCOL.csv")
# CLINPATH.to_csv(data_path / "metadata/CLINPATH.csv")
# SAMPLE.to_csv(data_path / "metadata/SAMPLE.csv")
# SUBJECT.to_csv(data_path / "metadata/SUBJECT.csv")

# # also writh them to clean...
# 
#  

export_root = Path.cwd() / "clean/team-Hardy"
if not export_root.exists():
    export_root.mkdir(parents=True, exist_ok=True)


STUDY.to_csv( export_root / "STUDY.csv")
PROTOCOL.to_csv(export_root / "PROTOCOL.csv")
SAMPLE.to_csv(export_root / "SAMPLE.csv")
SUBJECT.to_csv(export_root / "SUBJECT.csv")
CLINPATH.to_csv(export_root / "CLINPATH.csv")



basically hold the list of the GP2ID and the original clinical ID pairs + how many samples are in the GP2 (s1 only or s1, s2,...) for all GP2 submitted individuals. It takes the sample manifest, scan the clinical ID to check if this is the additional submission of those already in the GP2 or not and then if its new, give new GP2ID and GP2sampleID. If the clinical_id is already existing in the GP2 then only provide GP2sampleID (GP2ID_sX+1). Also it errors if the original sample ID submitted is equal to the one in the list. (No duplication of sample ID from the same cohort)

In [4]:
team_names = ["team-lee", "team-hafler", "team-hardy", "team-jakobsson", "team-sherzer","team-sulzer", "tam-voet","team-wood"]

In [5]:
team_code = ["LEE", "HAF", "HAR", "JAK", "SHE", "SUL", "VOE", "WOO"]




In [7]:
[x.upper() for x in team_names]


['TEAM-LEE',
 'TEAM-HAFLER',
 'TEAM-HARDY',
 'TEAM-JAKOBSSON',
 'TEAM-SHERZER',
 'TEAM-SULZER',
 'TAM-VOET',
 'TEAM-WOOD']

In [None]:
    uids = [str(id) for id in df_nodups['sample_id'].unique()]
    mapid = {}
    for uid in uids:
        mapid[uid]= n
        n += 1

In [None]:


def getgp2idsv2(dfproc, n, study_code):
    df_dups = dfproc[dfproc.duplicated(keep=False, subset=['clinical_id'])].sort_values('clinical_id').reset_index(drop = True).copy()
    if df_dups.shape[0]>0:
        dupids_mapper = dict(zip(df_dups.clinical_id.unique(),
                            [num+n for num in range(len(df_dups.clinical_id.unique()))]))
        
        df_dup_chunks = []
        for clin_id, gp2id in dupids_mapper.items():
            df_dups_subset = df_dups[df_dups.clinical_id==clin_id].copy()
            df_dups_subset['GP2ID'] = [f'{study_code}_{gp2id:06}' for i in range(df_dups_subset.shape[0])]
            df_dups_subset['SampleRepNo'] = ['s'+str(i+1) for i in range(df_dups_subset.shape[0])]
            df_dups_subset['GP2sampleID'] = df_dups_subset['GP2ID'] + '_' + df_dups_subset['SampleRepNo']
            df_dup_chunks.append(df_dups_subset)
        df_dups_wids = pd.concat(df_dup_chunks)

    df_nodups = dfproc[~dfproc.duplicated(keep=False, subset=['clinical_id'])].sort_values('clinical_id').reset_index(drop = True).copy()

    if df_dups.shape[0]>0:
        n =  len(list(dupids_mapper.values())) + n
    else:
        n = n

    uids = [str(id) for id in df_nodups['sample_id'].unique()]
    mapid = {}
    for uid in uids:
        mapid[uid]= n
        n += 1
    df_nodups_wids = df_nodups.copy()
    df_nodups_wids['uid_idx'] = df_nodups_wids['sample_id'].map(mapid)
    df_nodups_wids['GP2ID'] = [f'{study_code}_{i:06}' for i in df_nodups_wids.uid_idx]
    df_nodups_wids['uid_idx_cumcount'] = df_nodups_wids.groupby('GP2ID').cumcount() + 1
    df_nodups_wids['GP2sampleID'] = df_nodups_wids.GP2ID + '_s' + df_nodups_wids.uid_idx_cumcount.astype('str')
    df_nodups_wids['SampleRepNo'] = 's' + df_nodups_wids.uid_idx_cumcount.astype('str')
    df_nodups_wids.drop(['uid_idx','uid_idx_cumcount'], axis = 1, inplace = True)

    if df_dups.shape[0]>0:
        df_newids = pd.concat([df_dups_wids, df_nodups_wids])
    else:
        df_newids = df_nodups_wids
    
    return(df_newids)

def assign_unique_gp2clinicalids(df, clinicalid_subset):

    if isinstance(clinicalid_subset, pd.Series):
        clinicalid_subset = clinicalid_subset.to_frame().T

    sampleid = clinicalid_subset.sort_values(by=['master_GP2sampleID'])\
                                .reset_index(drop = True)\
                                .dropna(subset=['master_GP2sampleID'], axis = 0)
    sampleid = sampleid.loc[sampleid.index[-1], 'master_GP2sampleID'].split("_")
    getuniqueid = sampleid[0] + "_" + sampleid[1]
    get_sidrepno = int(sampleid[2].replace("s","")) + 1

    index_modify = clinicalid_subset['index'].unique() #clinicalid_subset[clinicalid_subset['GP2sampleID'].isnull()] #.index
    assign_gp2sampleid = [getuniqueid + "_s" + str(get_sidrepno + i) for i in range(len(index_modify))]
    df.loc[index_modify, 'GP2sampleID'] = assign_gp2sampleid
    getnewidrows = df.loc[index_modify].copy()
    return (getnewidrows)

def master_keyv2(studies):
    # ACCESS MASTERGP2IDS_JSON IN GP2 BUCKET
    client = storage.Client()
    bucket = client.get_bucket('eu-samplemanifest')
    blob = bucket.blob('IDSTRACKER/GP2IDSMAPPER.json')
    
    ids_tracker = {}
    with blob.open("r") as f:
        for k, v in ijson.kvitems(f, ''):
            if k in studies:
                ids_tracker.update({k:v})
    
    return(ids_tracker)

In [1]:
40*"-"

'----------------------------------------'

In [None]:
        # GENERATE GP2 IDs #
        jumptwice()
        st.subheader('GP2 IDs assignment...')
        studynames = list(df['study'].unique())

        if st.session_state['master_get'] == None: # TO ONLY RUN ONCE
            #ids_tracker = generategp2ids.master_key(studies = studynames)
            ids_tracker = generategp2ids.master_keyv2(studies = studynames)
            study_subsets = []
            log_new = []
            df['GP2sampleID'] = None
            # GP2 ID ASSIGNMENT CODE BLOCK
            for study in studynames:
                st.write(f"Getting GP2IDs for {study} samples")
                df_subset = df[df.study==study].copy()
                try:
                    #study_tracker = st.session_state['store_tracker'][study]
                    study_tracker = ids_tracker[study]
                    study_tracker_df = pd.DataFrame.from_dict(study_tracker,
                                                            orient='index',
                                                            columns = ['master_GP2sampleID','clinical_id'])\
                                                    .rename_axis('master_sample_id').reset_index()\
                                                    .astype(str)

                    # Check if any sample ID exists in df_subset.
                    sample_id_unique = pd.merge(study_tracker_df, df_subset,
                                                left_on=['master_sample_id'], right_on=['sample_id'], how='inner')
                    if not sample_id_unique.empty:
                        st.error('We have detected sample ids submitted on previous versions')
                        st.error('Please, correct these sample IDs so that they are unique and resubmit the sample manifest.')
                        sample_id_unique = sample_id_unique.rename(columns={"clinical_id_y": "clinical_id"})
                        st.dataframe(
                        sample_id_unique[['study','sample_id','clinical_id']].style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                        )
                        stopapp=True
                    else:
                        stopapp=False
                except:
                    study_tracker = None
                    stopapp = False
                if stopapp:
                    st.stop()

                if bool(study_tracker):
                    # WORK ON DUPLICATED IDS
                    df_subset = df_subset.reset_index()
                    data_duplicated = pd.merge(df_subset, study_tracker_df, on=['clinical_id'], how='inner')
                    df_subset = df_subset.set_index('index')
                    df_subset.index.name = None

                    if data_duplicated.shape[0]>0:
                        new_clinicaldups = True
                        newids_clinicaldups = data_duplicated.groupby('clinical_id')\
                                                        .apply(lambda x: generategp2ids.assign_unique_gp2clinicalids(df_subset,x))

                        if newids_clinicaldups.shape[0]>0:
                            newids_clinicaldups = newids_clinicaldups.reset_index(drop=True)[['study','clinical_id','sample_id','GP2sampleID']]
                            log_new.append(newids_clinicaldups)
                    else:
                        new_clinicaldups = False
                        newids_clinicaldups = pd.DataFrame()

                    # GET GP2 IDs METADATA for new CLINICAL-SAMPLE ID pairs
                    df_newids = df_subset[df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
                    if not df_newids.empty: # Get new GP2 IDs
                        df_wids = df_subset[~df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
                        df_wids['GP2ID'] = df_wids['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
                        df_wids['SampleRepNo'] = df_wids['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))

                        n=int(max(study_tracker_df['master_GP2sampleID'].to_list()).split("_")[1])+1
                        df_newids = generategp2ids.getgp2idsv2(df_newids, n, study)
                        df_subset = pd.concat([df_newids, df_wids], axis = 0)
                        study_subsets.append(df_subset)
                        log_new.append(df_newids[['study','clinical_id','sample_id','GP2sampleID']])
                        
                    else: # TO CONSIDER THE CASE IN WHICH WE ONLY HAD DUPLICATE IDS MAPPED ON THE MASTER FILE
                        df_subset['GP2ID'] = df_subset['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
                        df_subset['SampleRepNo'] = df_subset['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))
                        study_subsets.append(df_subset)

                # Brand new data - NO STUDY TRACKER FOR THIS COHORT
                else:
                    study = study
                    new_clinicaldups = False # Duplicates from master key json are treated differently to brand new data
                    n = 1
                    df_newids = generategp2ids.getgp2idsv2(df_subset, n, study)
                    study_subsets.append(df_newids)


                # CODE TO UPDATE THE GET FILE WE WILL USE TO UPDATE MASTER JSON
                if (new_clinicaldups) and (newids_clinicaldups.shape[0]>0):
                    tmp = pd.concat([df_newids[['study','clinical_id','sample_id','GP2sampleID']], newids_clinicaldups])
                    tmp['master_value'] = list(zip(tmp['GP2sampleID'],
                                                    tmp['clinical_id']))
                    ids_log = tmp.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                                            x['master_value']))).to_dict()
                else:
                    df_update_master = df_newids.copy()
                    df_update_master['master_value'] = list(zip(df_update_master['GP2sampleID'],
                                                            df_update_master['clinical_id']))
                    ids_log = df_update_master.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                                                    x['master_value']))).to_dict()

                #generategp2ids.update_masterids(ids_log, study_tracker) # THIS WILL BE UPDATED ONCE THE USET CONFIRMS THE QC ( AT THE END)
                
                #if st.session_state['master_get'] == None:
                if (isinstance(st.session_state['all_ids'], list)):
                    st.session_state['all_ids'].append( [ids_log, study_tracker] )
                if st.session_state['all_ids'] == None:
                    st.session_state['all_ids'] = [ [ids_log, study_tracker] ]
            

            # OUT OF FOR LOOP // END OF GP2 IDS ASSIGNMENT. LET'S RESUME df.
            df = pd.concat(study_subsets, axis = 0)
            df = df[list(df)[-3:] + list(df)[:-3]]
            st.write("GPS IDs assignment... OK")

            #if st.session_state['master_get'] == None:
            st.session_state['df_copy'] = df
            if len(log_new) > 0:
                allnew = pd.concat(log_new, axis = 0).reset_index(drop=True)
                st.write("Thanks for uploading a new version of the sample manifest")
                st.write(f'We have detected a total of {allnew.shape[0]} new samples')
                st.write("We have assigned new GP2IDs to those. Showing them below...")
                st.dataframe(
                allnew.style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                #allnew.style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                )
            else:
                aggridPlotter(df)

            st.session_state['df_finalids'] = df
            st.session_state['master_get'] = 'DONE'

        else:
            df = st.session_state['df_finalids']
            aggridPlotter(df)
            # df_builder = GridOptionsBuilder.from_dataframe(st.session_state['df_copy'])
            # df_builder.configure_grid_options(alwaysShowHorizontalScroll = True,
            #                                     enableRangeSelection=True,
            #                                     pagination=True,
            #                                     paginationPageSize=10000,
            #                                     domLayout='normal')
            # godf = df_builder.build()
            # AgGrid(st.session_state['df_copy'],gridOptions=godf, theme='streamlit', height=300)
            #df = st.session_state['df_finalids']
        #st.session_state['master_get'] = 'DONE'
