ASAP CRN Metadata validation - wave 1

# ASAP CRN Metadata validation - wave 1

15 September 2023
Andy Henrie



## STEPS

### imports
- pandas
- pathlib

### Load CDE for validation
- check all columns

### Team Lee
- load .tsv, csv tables
- fix format
- load additional metadata

- add batch columns
- add missing columns


### Team Hafler
- load excel file with tables
- add batch info
- add missing columns



### Team Hardy
- load excel file with tables
- add batch info
- add missing columns


In [25]:
import pandas as pd

from pathlib import Path


## Load CDE

In [187]:
CDE_path = Path.cwd() / "CDE.csv" 
CDE = pd.read_csv(CDE_path )

CDE.head()



Unnamed: 0,Table,Field,Description,DataType,Required,Validation,Unnamed: 6,ClinPath field,team_Hafler type,ClinPath description,Unnamed: 10
0,STUDY,project_name,Project Name: A Title of the overall project...,String,Required,,,,,,bard
1,STUDY,project_dataset,Dataset Name: A unique name is required for ...,String,Required,,,,,,
2,STUDY,project_description,Project Description: Brief description of th...,String,Required,,,,,,
3,STUDY,ASAP_team_name,ASAP Team Name: Name of the ASAP CRN Team. i...,Enum,Required,"[""TEAM-LEE"",""TEAM-HAFLER"",""TEAM-HARDY"", ""TEAM-...",,,,,
4,STUDY,ASAP_lab_name,Lab Name. : Lab name that is submitting data...,String,Required,,,,,,


# create helpers - validate_table

In [188]:
def validate_table(table: pd.DataFrame, table_name: str, CDE: pd.DataFrame):

    retval = 1

    # Filter out rows specific to the given table_name from the CDE
    specific_cde_df = CDE[CDE['Table'] == table_name]
    
    # Extract fields that have a data type of "Enum" and retrieve their validation entries
    enum_fields_dict = dict(zip(specific_cde_df[specific_cde_df['DataType'] == "Enum"]['Field'], 
                               specific_cde_df[specific_cde_df['DataType'] == "Enum"]['Validation']))
    
    # Extract fields that are marked as "Required"
    required_fields = specific_cde_df[specific_cde_df['Required'] == "Required"]['Field'].tolist()
    optional_fields = specific_cde_df[specific_cde_df['Required'] == "Optional"]['Field'].tolist()

    table = force_enum_string(table, table_name, CDE)

    # Check for missing "Required" fields
    missing_required_fields = [field for field in required_fields if field not in table.columns]
    
    if missing_required_fields:
        print(f"\tMissing Required Fields in {table_name}: {', '.join(missing_required_fields)}")
    else:
        print(f"\tAll required fields are present in {table_name}.")

    # Check for empty or NaN values
    for test_field,test_name in zip([required_fields, optional_fields], ["Required", "Optional"]):
        empty_or_nan_fields = {}
        for field in test_field:
            if field in table.columns:
                invalid_count = table[field].isna().sum()
                if invalid_count > 0:
                    empty_or_nan_fields[field] = invalid_count
                    
        if empty_or_nan_fields:
            print(f"\t\t{test_name} Fields with Empty or NaN values:")
            for field, count in empty_or_nan_fields.items():
                print(f"\t\t\t- {field}: {count} rows")
            retval = 0
        else:
            print(f"No empty or NaN values found in {test_name} fields.")
    


    # Check for invalid Enum field values
    invalid_field_values = {}
    for field, validation_str in enum_fields_dict.items():
        valid_values = eval(validation_str)
        if field in table.columns:
            invalid_values = table[~table[field].isin(valid_values)][field].unique()
            if invalid_values.any():
                invalid_field_values[field] = invalid_values
    
    if invalid_field_values:
        print("\tInvalid Field/Value pairs:")
        for field, values in invalid_field_values.items():
            print(f"\t\t\t- {field}: {', '.join(map(str, values))}")
        retval = 0
    else:
        print(f"\tAll Enum fields have valid values in {table_name}.")

    return retval

######## HELPERS ########
# Define a function to only capitalize the first letter of a string
def capitalize_first_letter(s):
    if not isinstance(s, str) or len(s) == 0:  # Check if the value is a string and non-empty
        return s
    return s[0].upper() + s[1:]

def force_enum_string(df, df_name, CDE):

    string_enum_fields = CDE[(CDE["Table"] == df_name) & 
                                (CDE["DataType"].isin(["Enum", "String"]))]["Field"].tolist()
    # Convert the specified columns to string data type using astype() without a loop
    columns_to_convert = {col: 'str' for col in string_enum_fields if col in df.columns}
    df = df.astype(columns_to_convert)

    # enum_fields = CDE[ (CDE["Table"] == df_name) & 
    #                             (CDE["DataType"]=="Enum") ]["Field"].tolist()
    
    for col in string_enum_fields:
        if col in df.columns and col not in ["assay", "file_type"]:
            df[col] = df[col].apply(capitalize_first_letter)

    return df



## write clean metadata tables according to CDE 

In [189]:
# SAMPLE = SAMPLE[CDE[CDE["Table"]=="SAMPLE"].Field.tolist()]


def reorder_table_to_CDE(df, df_name, CDE):
    col_order = CDE[CDE["Table"]==df_name].Field.tolist()
    
    df_out = pd.DataFrame()
    for col in col_order:
        if col in df.columns:   
            df_out[col] = df[col]
        else:
            df_out[col] = ""

    return df_out



In [190]:
## write clean metadata tables according to CDE 
# SAMPLE = SAMPLE[CDE[CDE["Table"]=="SAMPLE"].Field.tolist()]


def reorder_table_to_CDE(df, df_name, CDE):
    col_order = CDE[CDE["Table"]==df_name].Field.tolist()
    
    df_out = pd.DataFrame()
    for col in col_order:
        if col in df.columns:   
            df_out[col] = df[col]
        else:
            df_out[col] = ""

    return df_out



# Clean each Team Table


## Team Lee

In [191]:
## convert 
data_path = Path.home() / ("Projects/ASAP/team-lee")
metadata_path = data_path / "metadata/ogmetadata"

SUBJECT = pd.read_csv(f"{metadata_path}/SUBJECT.tsv", delimiter="\t")
SAMPLE = pd.read_csv(f"{metadata_path}/SAMPLE.tsv",delimiter="\t")

CLINPATH = pd.read_csv(f"{metadata_path}/CLINPATH.csv",delimiter=",")
STUDY = pd.read_csv(f"{metadata_path}/STUDY.tsv",delimiter="\t")
PROTOCOL = pd.read_csv(f"{metadata_path}/PROTOCOL.tsv",delimiter="\t")


In [192]:
STUDY = pd.read_csv(metadata_path / "STUDY.tsv",delimiter="\t")
STUDY.to_csv(data_path / "STUDY_.csv")
STUDY = pd.read_csv(data_path / "STUDY_.csv")
STUDY.head()


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Team-Lee-Bras-Lab-Info,Field,Description,Data type,Validation,Note,Required/Optional
0,Is senescence a component of human PD and does...,project_name,Project Name/Title,String,,Unique and clear title.,Required,,,
1,Human snRNA-seq PD Senesence Jose Bras Team Lee,project_dataset,Dataset name,String,,A Dataset name is required for each submission...,Required,,,
2,Characterize the neuropathological progression...,project_description,Brief description of the goals and objectives ...,String,,,Required,,,
3,TEAM-LEE,ASAP_team_name,"ASAP Team e.g. ""Scherzer""",Enum,"[""TEAM-LEE"",""TEAM-HAFLER"",""TEAM-HARDY"",....]",,Required,,,
4,Bras,ASAP_lab_name,"ASAP Lab under the above team e.g. ""Dong""",String,,,Required,,,


In [193]:

# fix STUDY formatting
tmp = pd.DataFrame()
tmp = STUDY[["Unnamed: 1","Unnamed: 0"]].transpose().reset_index().drop(columns=["index"])
tmp.columns = tmp.iloc[0]
STUDY = tmp.drop([0])
# STUDY[["Unnamed: 1"]].transpose().reset_index().drop(columns=["index"]), tmp
STUDY.head()

Unnamed: 0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,submitter_id,submitter_name,submittor_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Is senescence a component of human PD and does...,Human snRNA-seq PD Senesence Jose Bras Team Lee,Characterize the neuropathological progression...,TEAM-LEE,Bras,"Jose, Bras",jose.bras@vai.org,"Lee, L, Marshall ; Kimberly, E, Paquette ; Kai...",Kaitlyn E Westra,kaitlyn.westra@vai.org,...,,,,75,hippocampus; middle frontal gyrus; substantia ...,human PD and control postmortem brains,,,unsure,


Collect additional metadata from covar.csv .. i.e. batch

In [194]:

metadata_path = Path.home() / ("Projects/ASAP/team-lee/metadata")
HIP_covar = pd.read_csv(f"{metadata_path}/HIP/covar.csv")
HIP_cases = pd.read_csv(f"{metadata_path}/HIP/PD_ASAP_Sample_batch_information_banner_cases.csv").dropna(axis=0,how='all')
HIP_control = pd.read_csv(f"{metadata_path}/HIP/PD_ASAP_Sample_batch_information_banner_controls.csv")

MFG_covar = pd.read_csv(f"{metadata_path}/MFG/covar.csv") # includes 'PMI' ?
MFG_cases = pd.read_csv(f"{metadata_path}/MFG/PD_ASAP_Sample_batch_information_banner_cases.csv").dropna(axis=0,how='all')
MFG_control = pd.read_csv(f"{metadata_path}/MFG/PD_ASAP_Sample_batch_information_banner_controls.csv")


SN_covar = pd.read_csv(f"{metadata_path}/SN/covar.csv")
SN_cases = pd.read_csv(f"{metadata_path}/SN/PD_ASAP_Sample_batch_information_banner_cases.csv").dropna(axis=0,how='all')
SN_control = pd.read_csv(f"{metadata_path}/SN/PD_ASAP_Sample_batch_information_banner_controls.csv")

In [195]:
# Hippocampus samples
# HIP_cases["GROUPcv"]="PD"
# HIP_control["GROUPcv"]="HC"

HIP_meta = pd.concat([HIP_cases, HIP_control], axis=0, ignore_index=True)
HIP_meta["GROUPcv"]= HIP_meta["PD"].apply(lambda x: "PD" if (x=="yes") else "HC")


In [196]:


HIP_meta['MERGE_ID'] = "HIP_" + HIP_meta['GROUPcv'] +"_" + HIP_meta['CaseID'].str.replace('-','')
HIP_covar['MERGE_ID'] = HIP_covar['COUNT_ID']
# the fastqs follow COUNT_ID insteald of SEQ_ID naming convention
HIP_covar['SEQ_ID'] = HIP_covar['COUNT_ID']



In [197]:
# there's a bug in the meta table... skip for now
HIP_TABLE = pd.merge(HIP_covar, HIP_meta, on='MERGE_ID', how='outer')

# HIP_TABLE = HIP_covar
HIP_TABLE['subdir']="HIP"


In [198]:
test = HIP_TABLE[["MERGE_ID","SEQ_ID","GROUPcv","subdir",'PD']]

In [199]:
### medial frontal gyrus samples
MFG_meta = pd.concat([MFG_cases, MFG_control], axis=0, ignore_index=True)
MFG_meta["GROUPcv"]= MFG_meta["PD"].apply(lambda x: "PD" if (x=="yes") else "HC")

# make a MERGE_ID column because the formatting is inconsistent
MFG_meta['MERGE_ID'] = "MFG_" + MFG_meta['GROUPcv'] +"_" + MFG_meta['CaseID'].str.replace('-','')
MFG_covar['MERGE_ID'] = MFG_covar['SAMPLE']
# the fastqs are in SEQ_ID 

# there's a bug in the meta table... skip for now
MFG_TABLE = pd.merge(MFG_covar, MFG_meta, on='MERGE_ID', how='inner')
MFG_TABLE['subdir']="MFG"



# Substantia Nigra
SN_meta = pd.concat([SN_cases, SN_control], axis=0, ignore_index=True)
SN_meta["GROUPcv"] = SN_meta["PD"].apply(lambda x: "PD" if (x=="yes") else "HC")

SN_meta['MERGE_ID'] = "SN_" + MFG_meta['GROUPcv'] +"_" + MFG_meta['CaseID'].str.replace('-','')
SN_covar['MERGE_ID'] = SN_covar['SAMPLE']

# there's a bug in the meta table... skip for now
SN_TABLE = pd.merge(SN_covar, SN_meta, on='MERGE_ID', how='outer')
SN_TABLE['subdir']="SN"


### concatenate SN, MSG, and HIP tables into one 'all_samples' table
all_samples = pd.concat([HIP_TABLE, MFG_TABLE, SN_TABLE], axis=0, ignore_index=True)


In [200]:

SAMPLE_ALL = SAMPLE.merge(all_samples, left_on='sample_id', right_on='MERGE_ID', how='left')
SAMPLE_ALL.to_csv("alternate_metadata.csv")

In [201]:
SAMPLE_og = SAMPLE.copy()
SAMPLE['batch'] = SAMPLE_ALL['BATCH']

In [202]:
# Extract the fields with DataType as "Enum" or "String" for the "sample" table from CDE.csv

SAMPLE = force_enum_string(SAMPLE, "SAMPLE", CDE)
# for field in string_enum_fields:
#     if field in SAMPLE.columns:
#         SAMPLE[field] = SAMPLE[field].astype(str)


In [203]:
STUDY

Unnamed: 0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,submitter_id,submitter_name,submittor_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Is senescence a component of human PD and does...,Human snRNA-seq PD Senesence Jose Bras Team Lee,Characterize the neuropathological progression...,TEAM-LEE,Bras,"Jose, Bras",jose.bras@vai.org,"Lee, L, Marshall ; Kimberly, E, Paquette ; Kai...",Kaitlyn E Westra,kaitlyn.westra@vai.org,...,,,,75,hippocampus; middle frontal gyrus; substantia ...,human PD and control postmortem brains,,,unsure,


In [204]:
# Testing the function with STUDY.csv and CDE.csv
STUDY.replace("Nan", "", inplace=True)

validate_table(STUDY, "STUDY", CDE)



	Missing Required Fields in STUDY: contributor_names
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in STUDY.


1

In [205]:

# Need to rename submitter_id to contributor_names
STUDY = STUDY.rename(columns={"submitter_id": "contributor_names"})
validate_table(STUDY, "STUDY", CDE)


	All required fields are present in STUDY.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in STUDY.


1

In [206]:
# Testing the function with PROTOCOL.csv and CDE.csv

validate_table(PROTOCOL, "PROTOCOL", CDE)

	All required fields are present in PROTOCOL.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in PROTOCOL.


1

In [207]:
# Extract the fields with DataType as "Enum" or "String" for the "sample" table from CDE.csv
SUBJECT.replace("Nan", "", inplace=True)


SUBJECT = force_enum_string(SUBJECT, "SUBJECT", CDE)

# Testing the function with SUBJECT.csv and CDE.csv
validate_table(SUBJECT, "SUBJECT", CDE)

	All required fields are present in SUBJECT.
No empty or NaN values found in Required fields.
		Optional Fields with Empty or NaN values:
			- primary_diagnosis_text: 23 rows
	All Enum fields have valid values in SUBJECT.


0

In [208]:
SAMPLE.replace("Nan", "", inplace=True)

SAMPLE = force_enum_string(SAMPLE, "SAMPLE", CDE)


# Testing the function with SAMPLE.csv and CDE.csv
validate_table(SAMPLE, "SAMPLE", CDE)

# sequence length will need to be converted to a string




	Missing Required Fields in SAMPLE: file_MD5
		Required Fields with Empty or NaN values:
			- source_RIN: 75 rows
			- RIN: 75 rows
		Optional Fields with Empty or NaN values:
			- pm_PH: 75 rows
	All Enum fields have valid values in SAMPLE.


0

In [209]:
SAMPLE['file_type'] = SAMPLE['file_type'].replace({"Fastq":"fastq"})


In [210]:
# make the colunn order of SAMPLE match the CDE.Field
# SAMPLE = SAMPLE[CDE.Field.tolist()]
SAMPLE.head()

Unnamed: 0,sample_id,source_sample_id,subject_id,replicate,replicate_count,repeated_sample,tissue,brain_region,source_RIN,RIN,...,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV2000,pm_PH,donor_id,batch
0,MFG_HC_1225,12-25,12-25,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
1,MFG_HC_0602,06-02,06-02,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
2,MFG_PD_0009,00-09,00-09,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
3,MFG_PD_1921,19-21,19-21,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
4,MFG_PD_2058,20-58,20-58,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4


In [211]:
# fix file_name and file_MD5 which need to be exploded (do this last for simplicity. i.e. to keep one sample per row rather than one file per row)

# Step 1: Split the values in the columns based on commas
SAMPLE['file_name'] = SAMPLE['file_name'].str.split(',')
SAMPLE['file_MD5(R1,R2)'] = SAMPLE['file_MD5(R1,R2)'].str.split(',')

# Step 2: Explode both 'file_name' and 'file_MD5(R1,R2)' columns together
SAMPLE = SAMPLE.explode(['file_name', 'file_MD5(R1,R2)'])

# Step 3: Rename the "file_MD5(R1,R2)" column to "file_MD5"
SAMPLE = SAMPLE.rename(columns={"file_MD5(R1,R2)": "file_MD5"})



In [212]:
SAMPLE.head()

Unnamed: 0,sample_id,source_sample_id,subject_id,replicate,replicate_count,repeated_sample,tissue,brain_region,source_RIN,RIN,...,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV2000,pm_PH,donor_id,batch
0,MFG_HC_1225,12-25,12-25,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
0,MFG_HC_1225,12-25,12-25,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
1,MFG_HC_0602,06-02,06-02,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
1,MFG_HC_0602,06-02,06-02,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
2,MFG_PD_0009,00-09,00-09,Rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4


In [213]:
CLINPATH = force_enum_string(CLINPATH, "CLINPATH", CDE)
CLINPATH.replace("Nan", "", inplace=True)

# Testing the function with CLINPATH.csv and CDE.csv
validate_table(CLINPATH, "CLINPATH", CDE)

	All required fields are present in CLINPATH.
		Required Fields with Empty or NaN values:
			- age_at_onset: 75 rows
			- age_at_diagnosis: 75 rows
			- first_motor_symptom: 75 rows
			- path_year_death: 75 rows
			- brain_weight: 75 rows
		Optional Fields with Empty or NaN values:
			- smoking_years: 75 rows
	Invalid Field/Value pairs:
			- region_level_2: Hippocampus
			- path_braak_asyn: L. Olfactory Bulb-Only, Lla. Brainstem Predominant, Llb. Limbic Predominant, LV. Neocortical, Lll. Brainstem/Limbic, 0. No Lewy bodies
			- path_mckeith: L. Olfactory Bulb-Only, Lla. Brainstem Predominant, Llb. Limbic Predominant, LV. Neocortical, Lll. Brainstem/Limbic, 0. No Lewy bodies
			- path_nia_ri: Criteria not met, Not AD
			- TDP43: , Na
			- amyloid_angiopathy_severity_scale: , Cerebral amyloid angiopathy, temporal and occipital lobe, Cerebral amyloid angiopathy, frontal lobe
			- path_ad_level: Microscopic changes of Alzheimer's disease, insufficient for diagnosis, Microscopic lesions of 

0

In [214]:
CLINPATH['region_level_2'].unique()

# change "Hippocampus" to "CA1-CA4"
CLINPATH['region_level_2'] = CLINPATH['region_level_2'].replace('Hippocampus', 'CA1-CA4')

# skip hx_melanoma and education level for now as there is not a "Unknown" or "Not Reported" option in the CDE

# leave te APOE_e4_status as is for now . multiple are coded as "2,3" 
# leave cognitive status as is, since there is no "Unknown" or "Not Reported" option in the CDE

# potential "path_braak_asyn" coding 
braak_map = {'L. Olfactory Bulb-Only':"1/2", 'Lla. Brainstem Predominant':"3",
       'Llb. Limbic Predominant':"3/4", 'LV. Neocortical':"5",
       'Lll. Brainstem/Limbic':"3/4", '0. No Lewy bodies':"0"}
# set to NaN for now since this is actualy path_mckeith coding

CLINPATH['path_braak_asyn'] = ""

mckeith_map = {'L. Olfactory Bulb-Only':"Olfactory bulb only", 'Lla. Brainstem Predominant':"Brainstem",
       'Llb. Limbic Predominant':"Limbic (transitional)", 'LV. Neocortical':"Neocortical",
       'Lll. Brainstem/Limbic':"Amygdala Predominant", '0. No Lewy bodies':"Absent"}


CLINPATH['path_mckeith'] = CLINPATH['path_mckeith'].replace(mckeith_map)

# leave path_nia_ri like this for now. not sure how to map "criteria not met" and "Not AD"

# leave amyloid_angiopathy_severity_scale like this for now. not sure how to map 'Cerebral amyloid angiopathy, temporal and occipital lobe','Cerebral amyloid angiopathy, frontal lobe']


In [215]:
validate_table(CLINPATH, "CLINPATH", CDE)

	All required fields are present in CLINPATH.
		Required Fields with Empty or NaN values:
			- age_at_onset: 75 rows
			- age_at_diagnosis: 75 rows
			- first_motor_symptom: 75 rows
			- path_year_death: 75 rows
			- brain_weight: 75 rows
		Optional Fields with Empty or NaN values:
			- smoking_years: 75 rows
	Invalid Field/Value pairs:
			- path_nia_ri: Criteria not met, Not AD
			- TDP43: , Na
			- amyloid_angiopathy_severity_scale: , Cerebral amyloid angiopathy, temporal and occipital lobe, Cerebral amyloid angiopathy, frontal lobe
			- path_ad_level: Microscopic changes of Alzheimer's disease, insufficient for diagnosis, Microscopic lesions of Alzheimer's disease, insufficient for diagnosis


0

In [216]:
CLINPATH

Unnamed: 0,sample_id,source_sample_id,time_from_baseline,GP2_id,hemisphere,region_level_1,region_level_2,region_level_3,AMPPD_id,family_history,...,sn_neuronal_loss,path_infarcs,path_nia_ri,path_nia_aa_a,path_nia_aa_b,path_nia_aa_c,TDP43,arteriolosclerosis_severity_scale,amyloid_angiopathy_severity_scale,path_ad_level
0,MFG_HC_1225,12-25,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,,,Criteria not met,,,,,,,No evidence of Alzheimer's disease neuropathol...
1,MFG_HC_0602,06-02,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,,,Criteria not met,,,,,,,No evidence of Alzheimer's disease neuropathol...
2,MFG_PD_0009,00-09,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,,,Not AD,,,,,,,No evidence of Alzheimer's disease neuropathol...
3,MFG_PD_1921,19-21,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,,,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...
4,MFG_PD_2058,20-58,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,,,Low,,,,None in medial temporal lobe,,,"Microscopic changes of Alzheimer's disease, in..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,SN_PD_1858,18-58,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,,,Criteria not met,,,,Unknown,,,"Microscopic changes of Alzheimer's disease, in..."
71,SN_PD_1902,19-02,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,,,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...
72,SN_PD_1973,19-73,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,,Yes,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...
73,SN_PD_2005,20-05,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,,Yes,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...


In [217]:

SAMPLE_ALL_CP = SAMPLE_ALL.merge(CLINPATH, on='sample_id', how='outer')


In [218]:
SAMPLE_ALL_CP.to_csv("./clean/team-Lee/auxiluary_metadata.csv")

In [219]:
# fix the column order
STUDY = reorder_table_to_CDE(STUDY, "STUDY", CDE)
SAMPLE = reorder_table_to_CDE(SAMPLE, "SAMPLE", CDE)
PROTOCOL = reorder_table_to_CDE(PROTOCOL, "PROTOCOL", CDE)
SUBJECT = reorder_table_to_CDE(SUBJECT, "SUBJECT", CDE)     
CLINPATH = reorder_table_to_CDE(CLINPATH, "CLINPATH", CDE)

# write the clean metadata
STUDY.to_csv(data_path / "metadata/STUDY.csv")
PROTOCOL.to_csv(data_path / "metadata/PROTOCOL.csv")
CLINPATH.to_csv(data_path / "metadata/CLINPATH.csv")
SAMPLE.to_csv(data_path / "metadata/SAMPLE.csv")
SUBJECT.to_csv(data_path / "metadata/SUBJECT.csv")

# also writh them to clean...
# 
#  

export_root = Path.cwd() / "clean/team-Lee"
if not export_root.exists():
    export_root.mkdir(parents=True, exist_ok=True)


STUDY.to_csv( export_root / "STUDY.csv")
PROTOCOL.to_csv(export_root / "PROTOCOL.csv")
SAMPLE.to_csv(export_root / "SAMPLE.csv")
SUBJECT.to_csv(export_root / "SUBJECT.csv")
CLINPATH.to_csv(export_root / "CLINPATH.csv")


In [220]:
CLINPATH

Unnamed: 0,sample_id,source_sample_id,time_from_baseline,GP2_id,hemisphere,region_level_1,region_level_2,region_level_3,AMPPD_id,family_history,...,path_nia_ri,path_nia_aa_a,path_nia_aa_b,path_nia_aa_c,TDP43,arteriolosclerosis_severity_scale,amyloid_angiopathy_severity_scale,path_ad_level,dig_slide_avail,quant_path_avail
0,MFG_HC_1225,12-25,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,Criteria not met,,,,,,,No evidence of Alzheimer's disease neuropathol...,,
1,MFG_HC_0602,06-02,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,Criteria not met,,,,,,,No evidence of Alzheimer's disease neuropathol...,,
2,MFG_PD_0009,00-09,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,Not AD,,,,,,,No evidence of Alzheimer's disease neuropathol...,,
3,MFG_PD_1921,19-21,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...,,
4,MFG_PD_2058,20-58,0,,Unknown,Frontal lobe,Middle frontal gyrus,Unknown,,Not Reported,...,Low,,,,None in medial temporal lobe,,,"Microscopic changes of Alzheimer's disease, in...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,SN_PD_1858,18-58,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,Criteria not met,,,,Unknown,,,"Microscopic changes of Alzheimer's disease, in...",,
71,SN_PD_1902,19-02,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...,,
72,SN_PD_1973,19-73,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...,,
73,SN_PD_2005,20-05,0,,Unknown,Midbrain,Substantia nigra,Unknown,,Not Reported,...,Criteria not met,,,,None in medial temporal lobe,,,No evidence of Alzheimer's disease neuropathol...,,


## Team Hafler

In [221]:
## convert to seurat Object
data_path = Path.home() / ("Projects/ASAP")
metadata_path = data_path / "team-hafler/metadata"

sheets = ["SAMPLE","SUBJECT","CLINPATH","STUDY","PROTOCOL"]
excel_path = data_path / "ASAP_CDE_ALL_Team_Hafler_v1.xlsx"
STUDY = pd.read_excel(excel_path,sheet_name="STUDY",header=1).drop(columns="Field")
CLINPATH = pd.read_excel(excel_path,sheet_name="CLINPATH",header=1).drop(columns="Field")
SUBJECT = pd.read_excel(excel_path,sheet_name="SUBJECT",header=1).drop(columns="Field")
SAMPLE = pd.read_excel(excel_path,sheet_name="SAMPLE",header=1).drop(columns="Field")
PROTOCOL = pd.read_excel(excel_path,sheet_name="PROTOCOL",header=1).drop(columns="Field")
metadata_path

PosixPath('/Users/ergonyc/Projects/ASAP/team-hafler/metadata')

In [222]:

def add_hafler_batch(sample_df):

    # First batch: HSDG07HC HSDG10HC HSDG148PD HSDG199PD
    # batch[batch.sample_id in ['hSDG07HC', 'hSDG10HC', 'hSDG148PD', 'hSDG199PD']]=1
    Batch_1 = ['hSDG07', 'hSDG10', 'hSDG148', 'hSDG199'] 
    # Second batch: hsDG101HC hsDG13HC hsDG151PD hsDG197PD hsDG30HC hsDG99HC
    Batch_2 = ['hSDG101', 'hSDG13', 'hSDG151', 'hSDG197', 'hSDG30', 'hSDG99']
    # Third batch: hsDG142PD hsDG208PD
    Batch_3 = ['hSDG142', 'hSDG208'] 


    batch_col = []
    for row in sample_df.sample_id:
        if row in Batch_1:
            batch_col.append("Batch_1")
        elif row in Batch_2:
            batch_col.append("Batch_2")
        elif row in Batch_3:
            batch_col.append("Batch_3")
        else:
            print("ERROR >>>>>>>> not no batch info")
            batch_col.append("")


    sample_df['batch'] = batch_col
    return sample_df

SAMPLE = add_hafler_batch(SAMPLE)

In [223]:
# fix replicate & replicate_count
SAMPLE.replace("Nan", "", inplace=True)

SAMPLE['replicate'] = "Rep1"
SAMPLE['replicate_count'] = 1

SAMPLE.head(50)

Unnamed: 0,sample_id,source_sample_id,subject_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,source_RIN,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV2000,pm_PH,donor_id
0,hSDG07,hSDG07,HC01,Rep1,1,0,Batch_1,Brain,Prefrontal Cortex,,...,,,,,,,,,,
1,hSDG07,hSDG07,HC01,Rep1,1,0,Batch_1,Brain,Prefrontal Cortex,,...,,,,,,,,,,
2,hSDG07,hSDG07,HC01,Rep1,1,0,Batch_1,Brain,Prefrontal Cortex,,...,,,,,,,,,,
3,hSDG101,hSDG101,HC03,Rep1,1,0,Batch_2,Brain,Prefrontal Cortex,,...,,,,,,,,,,
4,hSDG101,hSDG101,HC03,Rep1,1,0,Batch_2,Brain,Prefrontal Cortex,,...,,,,,,,,,,
5,hSDG101,hSDG101,HC03,Rep1,1,0,Batch_2,Brain,Prefrontal Cortex,,...,,,,,,,,,,
6,hSDG10,hSDG10,HC04,Rep1,1,0,Batch_1,Brain,Prefrontal Cortex,,...,,,,,,,,,,
7,hSDG10,hSDG10,HC04,Rep1,1,0,Batch_1,Brain,Prefrontal Cortex,,...,,,,,,,,,,
8,hSDG10,hSDG10,HC04,Rep1,1,0,Batch_1,Brain,Prefrontal Cortex,,...,,,,,,,,,,
9,hSDG13,hSDG13,HC02,Rep1,1,0,Batch_2,Brain,Prefrontal Cortex,,...,,,,,,,,,,


In [224]:

STUDY = force_enum_string(STUDY, "STUDY", CDE)


In [225]:
# Testing the function with STUDY.csv and CDE.csv
validate_table(STUDY, "STUDY", CDE)


# Need to add contributor_names

	All required fields are present in STUDY.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in STUDY.


1

In [226]:
# Testing the function with PROTOCOL.csv and CDE.csv
validate_table(PROTOCOL, "PROTOCOL", CDE)

	All required fields are present in PROTOCOL.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in PROTOCOL.


1

In [227]:
SUBJECT.replace("Nan", "", inplace=True)
# Testing the function with SUBJECT.csv and CDE.csv
validate_table(SUBJECT, "SUBJECT", CDE)

	All required fields are present in SUBJECT.
No empty or NaN values found in Required fields.
		Optional Fields with Empty or NaN values:
			- primary_diagnosis_text: 12 rows
	Invalid Field/Value pairs:
			- sex: F, M
			- race: B, W
			- primary_diagnosis: Normal control, Idiopathic Parkinson's disease


0

In [228]:

SUBJECT = force_enum_string(SUBJECT, "SUBJECT", CDE)


In [229]:
SUBJECT['sex'] = SUBJECT['sex'].replace({'F':"Female", 'M':"Male"})
SUBJECT['race'] = SUBJECT['race'].replace({'W':"White", 'B':"Black or African American"})

SUBJECT['primary_diagnosis'] = SUBJECT['primary_diagnosis'].replace({'Normal control':"Healthy Control", "Idiopathic Parkinson's disease":"Idiopathic PD"})
validate_table(SUBJECT, "SUBJECT", CDE)

	All required fields are present in SUBJECT.
No empty or NaN values found in Required fields.
		Optional Fields with Empty or NaN values:
			- primary_diagnosis_text: 12 rows
	All Enum fields have valid values in SUBJECT.


0

In [230]:
SAMPLE.replace("Nan", "", inplace=True)
SAMPLE.replace("nan", "", inplace=True)

# Testing the function with SAMPLE.csv and CDE.csv
validate_table(SAMPLE, "SAMPLE", CDE)

# sequence length will need to be converted to a string

	All required fields are present in SAMPLE.
		Required Fields with Empty or NaN values:
			- source_RIN: 36 rows
			- RIN: 36 rows
			- time: 36 rows
			- suspension_type: 36 rows
		Optional Fields with Empty or NaN values:
			- pm_PH: 36 rows
	Invalid Field/Value pairs:
			- assay: v3.1 - Single Index, 10x Genomics 
			- sequencing_length: 150bp x2
			- organism_ontology_term_id: Nan
			- sex_ontology_term_id: Nan


0

In [231]:
SAMPLE = force_enum_string(SAMPLE, "SAMPLE", CDE)


In [232]:
# force the right sex_ontology_term_id
SAMPLE["organism_ontology_term_id"] = "NCBITaxon:9606"

# set time == 0 for all samples
SAMPLE['time'] = 0

SAMPLE['file_type'] = SAMPLE['file_type'].replace({"Fastq":"fastq"})


In [233]:

# need to join with subject to get "sex" and convert to ontology term
SAMPLE_SUBJECT = SAMPLE.merge(SUBJECT, on='subject_id',  how='left')
SAMPLE_og = SAMPLE.copy()
SAMPLE['sex_ontology_term_id'] = SAMPLE_SUBJECT['sex'].replace({"Male":"PATO:0000384 (male)", "Female":"PATO:0000383 (female)" })

# ignore development_stage_ontology_term_id, self_reported_ethnicity_ontology_term_id, assay_ontology_term_id, etc for now. (Check wiht Le)

In [234]:
# fix assay
SAMPLE['assay'] = SAMPLE['assay'].replace({'v3.1 - Single Index, 10x Genomics ':"v3.1 - Single Index"})
# fix assay
SAMPLE['sequencing_length'] = SAMPLE['sequencing_length'].replace({'150bp x2':"150"})


In [235]:
validate_table(SAMPLE, "SAMPLE", CDE)

	All required fields are present in SAMPLE.
		Required Fields with Empty or NaN values:
			- source_RIN: 36 rows
			- RIN: 36 rows
			- suspension_type: 36 rows
		Optional Fields with Empty or NaN values:
			- pm_PH: 36 rows
	All Enum fields have valid values in SAMPLE.


0

In [236]:
CLINPATH.replace("Nan", "", inplace=True)
CLINPATH.replace("nan", "", inplace=True)

# Testing the function with CLINPATH.csv and CDE.csv
validate_table(CLINPATH, "CLINPATH", CDE)

	Missing Required Fields in CLINPATH: path_thal
		Required Fields with Empty or NaN values:
			- age_at_onset: 12 rows
			- age_at_diagnosis: 12 rows
			- first_motor_symptom: 12 rows
			- path_year_death: 12 rows
			- brain_weight: 12 rows
		Optional Fields with Empty or NaN values:
			- smoking_years: 12 rows
	Invalid Field/Value pairs:
			- region_level_2: Prefrontal cortex
			- hx_dementia_mci: Nan
			- hx_melanoma: Nan
			- education_level: Nan
			- smoking_status: Nan
			- APOE_e4_status: 3,3, 2,3
			- path_autopsy_dx_main: Nan
			- path_braak_nft: Nan
			- path_braak_asyn: Nan
			- path_cerad: Nan
			- known_pathogenic_mutation: Nan
			- path_mckeith: Nan
			- sn_neuronal_loss: Nan
			- path_infarcs: Nan
			- path_nia_ri: Nan
			- path_nia_aa_a: Nan
			- path_nia_aa_b: Nan
			- path_nia_aa_c: Nan
			- TDP43: Nan
			- arteriolosclerosis_severity_scale: Nan
			- amyloid_angiopathy_severity_scale: Nan
			- path_ad_level: Nan


0

In [237]:
CLINPATH = force_enum_string(CLINPATH, "CLINPATH", CDE)


In [238]:
# redact "Prefrontal Cortex" from region_level_2 for now
CLINPATH['region_level_2'] = CLINPATH['region_level_2'].replace({'Prefrontal Cortex':""})

# leave te APOE_e4_status as is for now . multiple are coded as "2,3" 
# but remove commas
CLINPATH["APOE_e4_status"] = CLINPATH["APOE_e4_status"].str.replace(",","")

# need to fix the path_autopsy_dx_main

In [239]:
validate_table(CLINPATH, "CLINPATH", CDE)
CLINPATH.head()

	Missing Required Fields in CLINPATH: path_thal
		Required Fields with Empty or NaN values:
			- age_at_onset: 12 rows
			- age_at_diagnosis: 12 rows
			- first_motor_symptom: 12 rows
			- path_year_death: 12 rows
			- brain_weight: 12 rows
		Optional Fields with Empty or NaN values:
			- smoking_years: 12 rows
	Invalid Field/Value pairs:
			- region_level_2: Prefrontal cortex
			- hx_dementia_mci: Nan
			- hx_melanoma: Nan
			- education_level: Nan
			- smoking_status: Nan
			- path_autopsy_dx_main: Nan
			- path_braak_nft: Nan
			- path_braak_asyn: Nan
			- path_cerad: Nan
			- known_pathogenic_mutation: Nan
			- path_mckeith: Nan
			- sn_neuronal_loss: Nan
			- path_infarcs: Nan
			- path_nia_ri: Nan
			- path_nia_aa_a: Nan
			- path_nia_aa_b: Nan
			- path_nia_aa_c: Nan
			- TDP43: Nan
			- arteriolosclerosis_severity_scale: Nan
			- amyloid_angiopathy_severity_scale: Nan
			- path_ad_level: Nan


Unnamed: 0,sample_id,source_sample_id,time_from_baseline,GP2_id,hemisphere,region_level_1,region_level_2,region_level_3,AMPPD_id,family_history,...,sn_neuronal_loss,path_infarcs,path_nia_ri,path_nia_aa_a,path_nia_aa_b,path_nia_aa_c,TDP43,arteriolosclerosis_severity_scale,amyloid_angiopathy_severity_scale,path_ad_level
0,HC01,HSDG07,Nan,Nan,Unknown,Frontal lobe,Prefrontal cortex,Grey matter,Nan,Unknown,...,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan
1,HC02,HSDG13,Nan,Nan,Unknown,Frontal lobe,Prefrontal cortex,Grey matter,Nan,Unknown,...,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan
2,HC03,HSDG101,Nan,Nan,Unknown,Frontal lobe,Prefrontal cortex,Grey matter,Nan,Unknown,...,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan
3,HC04,HSDG10,Nan,Nan,Unknown,Frontal lobe,Prefrontal cortex,Grey matter,Nan,Unknown,...,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan
4,HC05,HSDG30,Nan,Nan,Unknown,Frontal lobe,Prefrontal cortex,Grey matter,Nan,Unknown,...,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan


In [240]:
data_path = data_path / "team-hafler"
data_path = Path.home() / ("Projects/ASAP/team-hafler")

In [241]:
# fix the column order
STUDY = reorder_table_to_CDE(STUDY, "STUDY", CDE)
SAMPLE = reorder_table_to_CDE(SAMPLE, "SAMPLE", CDE)
PROTOCOL = reorder_table_to_CDE(PROTOCOL, "PROTOCOL", CDE)
SUBJECT = reorder_table_to_CDE(SUBJECT, "SUBJECT", CDE)     
CLINPATH = reorder_table_to_CDE(CLINPATH, "CLINPATH", CDE)

# write the clean metadata
STUDY.to_csv(data_path / "metadata/STUDY.csv")
PROTOCOL.to_csv(data_path / "metadata/PROTOCOL.csv")
CLINPATH.to_csv(data_path / "metadata/CLINPATH.csv")
SAMPLE.to_csv(data_path / "metadata/SAMPLE.csv")
SUBJECT.to_csv(data_path / "metadata/SUBJECT.csv")

# also writh them to clean...
# 
#  

export_root = Path.cwd() / "clean/team-Hafler"
if not export_root.exists():
    export_root.mkdir(parents=True, exist_ok=True)


STUDY.to_csv( export_root / "STUDY.csv")
PROTOCOL.to_csv(export_root / "PROTOCOL.csv")
SAMPLE.to_csv(export_root / "SAMPLE.csv")
SUBJECT.to_csv(export_root / "SUBJECT.csv")
CLINPATH.to_csv(export_root / "CLINPATH.csv")


In [242]:
STUDY.to_csv( metadata_path / "STUDY.csv")
CLINPATH.to_csv( metadata_path / "CLINPATH.csv")
SUBJECT.to_csv( metadata_path / "SUBJECT.csv")
SAMPLE.to_csv( metadata_path / "SAMPLE.csv")
PROTOCOL.to_csv( metadata_path / "PROTOCOL.csv")

In [243]:
CLINPATH["APOE_e4_status"].unique()

CDE[CDE["Field"]== "APOE_e4_status"].Validation.unique()

array(['["22", "23","24","33", "34", "44", "Unknown"]'], dtype=object)

In [244]:

export_root = Path.cwd() / "clean/team-Lee"
SAMPLE_lee = pd.read_csv(export_root / "SAMPLE.csv")
n_samples_lee = SAMPLE_lee["sample_id"].nunique()

export_root = Path.cwd() / "clean/team-Hafler"
SAMPLE_hafler = pd.read_csv(export_root / "SAMPLE.csv")
n_samples_hafler = SAMPLE_hafler["sample_id"].nunique()


total_samples = n_samples_lee + n_samples_hafler
total_samples

87

## Team Hardy

In [245]:
## convert 
data_path = Path.home() / ("Projects/ASAP/team-hardy")
metadata_path = data_path / "metadata"

SUBJECT = pd.read_csv(f"{metadata_path}/SUBJECT.csv")
CLINPATH = pd.read_csv(f"{metadata_path}/CLINPATH.csv")
STUDY = pd.read_csv(f"{metadata_path}/STUDY.csv")
PROTOCOL = pd.read_csv(f"{metadata_path}/PROTOCOL.csv")
SAMPLE = pd.read_csv(f"{metadata_path}/SAMPLE.csv")


In [246]:

validate_table(STUDY, "STUDY", CDE)
STUDY

	Missing Required Fields in STUDY: project_name, project_dataset, project_description, ASAP_team_name, ASAP_lab_name, PI_full_name, PI_email, contributor_names, submitter_name, submittor_email, ASAP_grant_id, other_funding_source, publication_DOI, publication_PMID, number_of_brain_samples, brain_regions, types_of_samples, DUA_version
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in STUDY.


Unnamed: 0,name,value
0,project_name,Understanding mechanisms of Parkinson's diseas...
1,project_dataset,Hardy snRNA-seq
2,project_description,Genetic analysis has identified many risk gene...
3,ASAP_team_name,TEAM-HARDY
4,ASAP_lab_name,Ryten Lab
5,PI_full_name,Mina Ryten
6,PI_email,mina.ryten@ucl.ac.uk
7,contributor_names,"Aine Fairbrother-Browne, Jonathan Brenton, Mel..."
8,submitter_name,Aine Fairbrother-Browne
9,submitter_email,aine.fairbrother-browne.18@ucl.ac.uk


In [247]:
# there seems to be something funky with SAMPLE
# SAMPLE = SAMPLE[SAMPLE["batch"]=="B1"]
# SAMPLE.drop_duplicates(inplace=True) #, subset=[ "file_name"])
SAMPLE

Unnamed: 0,sample_id,source_subject_id,subject_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,source_RIN,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV2000,pm_PH,donor_id
0,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
1,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
2,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
3,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
4,babom_ACG,P2/14,babom,,1,0,B1,brain,ACG,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0009835,,EFO:0008913,nucleus,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3611,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
3612,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
3613,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,
3614,zupam_IPL,P78/11,zupam,rep2,2,1,B2,brain,IPL,,...,PATO:0000383 (female),,MONDO:0005180,UBERON:0006088,,EFO:0008913,nucleus,,,


In [248]:
STUDY.head()

Unnamed: 0,name,value
0,project_name,Understanding mechanisms of Parkinson's diseas...
1,project_dataset,Hardy snRNA-seq
2,project_description,Genetic analysis has identified many risk gene...
3,ASAP_team_name,TEAM-HARDY
4,ASAP_lab_name,Ryten Lab


In [249]:

# fix STUDY formatting
tmp = pd.DataFrame()
tmp = STUDY[["name","value"]].transpose().reset_index().drop(columns=["index"])
tmp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,contributor_names,submitter_name,submitter_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Understanding mechanisms of Parkinson's diseas...,Hardy snRNA-seq,Genetic analysis has identified many risk gene...,TEAM-HARDY,Ryten Lab,Mina Ryten,mina.ryten@ucl.ac.uk,"Aine Fairbrother-Browne, Jonathan Brenton, Mel...",Aine Fairbrother-Browne,aine.fairbrother-browne.18@ucl.ac.uk,...,,,,128,"Inferior Parietal Lobule (IPL), Anterior Cingu...",Late stage (Braak 5-6) PD and control post-mor...,0000-0001-9520-6957,https://scholar.google.co.uk/citations?user=lt...,,"Version 1, 09/2023"


In [250]:

tmp.columns = tmp.iloc[0]
STUDY = tmp.drop([0])
STUDY.head()


Unnamed: 0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,contributor_names,submitter_name,submitter_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Understanding mechanisms of Parkinson's diseas...,Hardy snRNA-seq,Genetic analysis has identified many risk gene...,TEAM-HARDY,Ryten Lab,Mina Ryten,mina.ryten@ucl.ac.uk,"Aine Fairbrother-Browne, Jonathan Brenton, Mel...",Aine Fairbrother-Browne,aine.fairbrother-browne.18@ucl.ac.uk,...,,,,128,"Inferior Parietal Lobule (IPL), Anterior Cingu...",Late stage (Braak 5-6) PD and control post-mor...,0000-0001-9520-6957,https://scholar.google.co.uk/citations?user=lt...,,"Version 1, 09/2023"


In [251]:


# fix the column order
STUDY = reorder_table_to_CDE(STUDY, "STUDY", CDE)
validate_table(STUDY, "STUDY", CDE)



	All required fields are present in STUDY.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in STUDY.


1

In [252]:
PROTOCOL.head()

Unnamed: 0,name,value
0,sample_collection_summary,"This dataset contains cortical regions only, p..."
1,cell_extraction_summary,From protocols.io: This protocol is used to is...
2,lib_prep_summary,'Nuclei were extracted from homogenised post-m...
3,data_processing_summary,Cell ranger was used to convert raw sequencing...
4,github_url,Raw to fastq to mapped: https://github.com/RHR...


In [253]:
# fix STUDY formatting
tmp = pd.DataFrame()
tmp = PROTOCOL[["name","value"]].transpose().reset_index().drop(columns=["index"])
tmp.columns = tmp.iloc[0]
PROTOCOL = tmp.drop([0])
PROTOCOL.head()

Unnamed: 0,sample_collection_summary,cell_extraction_summary,lib_prep_summary,data_processing_summary,github_url,protocols_io_DOI,other_reference
1,"This dataset contains cortical regions only, p...",From protocols.io: This protocol is used to is...,'Nuclei were extracted from homogenised post-m...,Cell ranger was used to convert raw sequencing...,Raw to fastq to mapped: https://github.com/RHR...,Nuclear extraction protocol: 10.17504/protocol...,


In [254]:
# fix the column order
PROTOCOL = reorder_table_to_CDE(PROTOCOL, "PROTOCOL", CDE)
validate_table(PROTOCOL, "PROTOCOL", CDE)



	All required fields are present in PROTOCOL.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	All Enum fields have valid values in PROTOCOL.


1

In [255]:

SUBJECT = reorder_table_to_CDE(SUBJECT, "SUBJECT", CDE)

# Testing the function with SUBJECT.csv and CDE.csv
validate_table(SUBJECT, "SUBJECT", CDE)

	All required fields are present in SUBJECT.
		Required Fields with Empty or NaN values:
			- ethnicity: 64 rows
			- duration_pmi: 1 rows
No empty or NaN values found in Optional fields.
	Invalid Field/Value pairs:
			- race: Nan


0

In [256]:
SUBJECT.replace("Nan", "", inplace=True)
SUBJECT.replace("nan", "", inplace=True)
SUBJECT.fillna("", inplace=True)


In [257]:
SAMPLE = reorder_table_to_CDE(SAMPLE, "SAMPLE", CDE)

# force the right sex_ontology_term_id
SAMPLE["organism_ontology_term_id"] = "NCBITaxon:9606"

# allos sequence_length == 190 for now
validate_table(SAMPLE, "SAMPLE", CDE)


# add 'replicate' coding (nans)

	All required fields are present in SAMPLE.
		Required Fields with Empty or NaN values:
			- source_RIN: 3616 rows
		Optional Fields with Empty or NaN values:
			- pm_PH: 3616 rows
	Invalid Field/Value pairs:
			- sequencing_length: 190


0

In [258]:
SAMPLE.replace("nan", "", inplace=True)
# sequence length will need to be converted to a string
SAMPLE.fillna("", inplace=True)
# Testing the function with SAMPLE.csv and CDE.csv


In [259]:
CLINPATH = reorder_table_to_CDE(CLINPATH, "CLINPATH", CDE)


# Testing the function with CLINPATH.csv and CDE.csv
validate_table(CLINPATH, "CLINPATH", CDE)

	All required fields are present in CLINPATH.
		Required Fields with Empty or NaN values:
			- age_at_onset: 138 rows
			- age_at_diagnosis: 10 rows
			- first_motor_symptom: 138 rows
			- path_year_death: 138 rows
			- brain_weight: 138 rows
		Optional Fields with Empty or NaN values:
			- smoking_years: 138 rows
	Invalid Field/Value pairs:
			- family_history: Nan
			- hx_dementia_mci: Nan
			- hx_melanoma: Nan
			- education_level: Nan
			- smoking_status: Nan
			- APOE_e4_status: Nan
			- cognitive_status: Nan
			- path_autopsy_dx_main: Control brain, Pathological ageing, Control brain / Path ageing, Argyrophilic grain disease, Control brain, Cerebrovascular disease (small vessel), Cerebrovascular disease (small vessel), Control brain, Alzheimer`s disease (intermediate level AD pathological change), Control brain / Path ageing, CAA, Nan
			- path_braak_nft: 2.0, 1.0, 3.0, 0.0, 4.0, 6.0, Nan
			- path_braak_asyn: 6.0, 0.0, 5.0, Nan
			- path_cerad: Nan
			- path_thal: At least 4, Na

0

In [260]:
# hack to make sure the replace below works.
CLINPATH.replace("", pd.NA, inplace=True)


In [261]:
# replace 'path_braak_asyn' with with string of the numeric. converte nan to ""
CLINPATH['path_braak_asyn'] = CLINPATH['path_braak_asyn'].apply(lambda val: str(int(val)) if pd.notna(val) else "")

# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_braak_nft'] = CLINPATH['path_braak_nft'].apply(lambda val: str(int(val)) if pd.notna(val) else "").replace({"0":"0", 
                                                                                                                           "1":"I", 
                                                                                                                           "2": "II", 
                                                                                                                           "3":"III", 
                                                                                                                           "4":"IV", 
                                                                                                                           "5":"V", 
                                                                                                                           "6":"VI"})

# code family_history as "Not Reported" (currently empty)
CLINPATH['family_history'] = "Not Reported"



# check APOE_e4_status ? currently empty

# `path_autopsy_dx_main`  actually seems good parser might be wrong

# code "at least 4" as "4/5" 

CLINPATH['path_thal'] = CLINPATH['path_thal'].replace({'At least 4':"4/5"})


CLINPATH['path_mckeith'] = CLINPATH['path_mckeith'].replace({'Diffuse neocortical': "Diffuse, neocortical (brainstem, limbic and neocortical involvement)", 
                                                        'Limbic transitional': "Limbic (transitional)" ,
                                                        'Diffuse Neocortical':"Diffuse, neocortical (brainstem, limbic and neocortical involvement)"})

# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_nia_aa_a'] = CLINPATH['path_nia_aa_a'].apply(lambda val: str(int(val)) if pd.notna(val) else "")

CLINPATH['path_nia_aa_a'] = CLINPATH['path_nia_aa_a'].replace({"0":"A0", 
                                                                                                                           "1":"A1", 
                                                                                                                           "2": "A2", 
                                                                                                                           "3":"A3"})


In [262]:

# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_nia_aa_b'] = CLINPATH['path_nia_aa_b'].apply(lambda val: str(int(val)) if pd.notna(val) else "").replace({"0":"B0", 
                                                                                                                           "1":"B1", 
                                                                                                                           "2": "B2", 
                                                                                                                           "3":"B3"})


# replace 'path_braak_nft' with with string of the numeric. converte nan to ""
CLINPATH['path_nia_aa_c'] = CLINPATH['path_nia_aa_c'].apply(lambda val: str(int(val)) if pd.notna(val) else "").replace({"0":"C0", 
                                                                                                                           "1":"C1", 
                                                                                                                           "2": "C2", 
                                                                                                                           "3":"C3"})



In [263]:



CLINPATH['path_ad_level'] = CLINPATH['path_ad_level'].replace({"No evidence": "No evidence of Alzheimer\'s disease neuropathological change"})


# empty 'hx_dementia_mci', 'hx_melanoma', 'education_level', 'cognitive_status'
#  coded as nan will be fixed with .fillna("") below
CLINPATH.replace("nan", "", inplace=True)
CLINPATH.fillna("", inplace=True)



In [264]:
CDE[CDE["Field"]=="dig_slide_avail"].Validation.unique(), CLINPATH['dig_slide_avail'].unique()

(array(['["Yes, No"]'], dtype=object), array(['Yes'], dtype=object))

In [265]:
# Testing the function with CLINPATH.csv and CDE.csv
validate_table(CLINPATH, "CLINPATH", CDE)

	All required fields are present in CLINPATH.
No empty or NaN values found in Required fields.
No empty or NaN values found in Optional fields.
	Invalid Field/Value pairs:
			- path_autopsy_dx_main: Control brain, Pathological ageing, Control brain / Path ageing, Argyrophilic grain disease, Control brain, Cerebrovascular disease (small vessel), Cerebrovascular disease (small vessel), Control brain, Alzheimer`s disease (intermediate level AD pathological change), Control brain / Path ageing, CAA, 
			- path_nia_aa_a: A0, 
			- path_nia_aa_b: B0, 
			- path_nia_aa_c: C0, 
			- dig_slide_avail: Yes
			- quant_path_avail: Yes


0

In [266]:


# # write the clean metadata
# STUDY.to_csv(data_path / "metadata/STUDY.csv")
# PROTOCOL.to_csv(data_path / "metadata/PROTOCOL.csv")
# CLINPATH.to_csv(data_path / "metadata/CLINPATH.csv")
# SAMPLE.to_csv(data_path / "metadata/SAMPLE.csv")
# SUBJECT.to_csv(data_path / "metadata/SUBJECT.csv")

# # also writh them to clean...
# 
#  

export_root = Path.cwd() / "clean/team-Hardy"
if not export_root.exists():
    export_root.mkdir(parents=True, exist_ok=True)


STUDY.to_csv( export_root / "STUDY.csv")
PROTOCOL.to_csv(export_root / "PROTOCOL.csv")
SAMPLE.to_csv(export_root / "SAMPLE.csv")
SUBJECT.to_csv(export_root / "SUBJECT.csv")
CLINPATH.to_csv(export_root / "CLINPATH.csv")



basically hold the list of the GP2ID and the original clinical ID pairs + how many samples are in the GP2 (s1 only or s1, s2,...) for all GP2 submitted individuals. It takes the sample manifest, scan the clinical ID to check if this is the additional submission of those already in the GP2 or not and then if its new, give new GP2ID and GP2sampleID. If the clinical_id is already existing in the GP2 then only provide GP2sampleID (GP2ID_sX+1). Also it errors if the original sample ID submitted is equal to the one in the list. (No duplication of sample ID from the same cohort)

In [4]:
team_names = ["team-lee", "team-hafler", "team-hardy", "team-jakobsson", "team-sherzer","team-sulzer", "tam-voet","team-wood"]

In [5]:
team_code = ["LEE", "HAF", "HAR", "JAK", "SHE", "SUL", "VOE", "WOO"]




In [7]:
[x.upper() for x in team_names]


['TEAM-LEE',
 'TEAM-HAFLER',
 'TEAM-HARDY',
 'TEAM-JAKOBSSON',
 'TEAM-SHERZER',
 'TEAM-SULZER',
 'TAM-VOET',
 'TEAM-WOOD']

In [None]:
    uids = [str(id) for id in df_nodups['sample_id'].unique()]
    mapid = {}
    for uid in uids:
        mapid[uid]= n
        n += 1

In [None]:


def getgp2idsv2(dfproc, n, study_code):
    df_dups = dfproc[dfproc.duplicated(keep=False, subset=['clinical_id'])].sort_values('clinical_id').reset_index(drop = True).copy()
    if df_dups.shape[0]>0:
        dupids_mapper = dict(zip(df_dups.clinical_id.unique(),
                            [num+n for num in range(len(df_dups.clinical_id.unique()))]))
        
        df_dup_chunks = []
        for clin_id, gp2id in dupids_mapper.items():
            df_dups_subset = df_dups[df_dups.clinical_id==clin_id].copy()
            df_dups_subset['GP2ID'] = [f'{study_code}_{gp2id:06}' for i in range(df_dups_subset.shape[0])]
            df_dups_subset['SampleRepNo'] = ['s'+str(i+1) for i in range(df_dups_subset.shape[0])]
            df_dups_subset['GP2sampleID'] = df_dups_subset['GP2ID'] + '_' + df_dups_subset['SampleRepNo']
            df_dup_chunks.append(df_dups_subset)
        df_dups_wids = pd.concat(df_dup_chunks)

    df_nodups = dfproc[~dfproc.duplicated(keep=False, subset=['clinical_id'])].sort_values('clinical_id').reset_index(drop = True).copy()

    if df_dups.shape[0]>0:
        n =  len(list(dupids_mapper.values())) + n
    else:
        n = n

    uids = [str(id) for id in df_nodups['sample_id'].unique()]
    mapid = {}
    for uid in uids:
        mapid[uid]= n
        n += 1
    df_nodups_wids = df_nodups.copy()
    df_nodups_wids['uid_idx'] = df_nodups_wids['sample_id'].map(mapid)
    df_nodups_wids['GP2ID'] = [f'{study_code}_{i:06}' for i in df_nodups_wids.uid_idx]
    df_nodups_wids['uid_idx_cumcount'] = df_nodups_wids.groupby('GP2ID').cumcount() + 1
    df_nodups_wids['GP2sampleID'] = df_nodups_wids.GP2ID + '_s' + df_nodups_wids.uid_idx_cumcount.astype('str')
    df_nodups_wids['SampleRepNo'] = 's' + df_nodups_wids.uid_idx_cumcount.astype('str')
    df_nodups_wids.drop(['uid_idx','uid_idx_cumcount'], axis = 1, inplace = True)

    if df_dups.shape[0]>0:
        df_newids = pd.concat([df_dups_wids, df_nodups_wids])
    else:
        df_newids = df_nodups_wids
    
    return(df_newids)

def assign_unique_gp2clinicalids(df, clinicalid_subset):

    if isinstance(clinicalid_subset, pd.Series):
        clinicalid_subset = clinicalid_subset.to_frame().T

    sampleid = clinicalid_subset.sort_values(by=['master_GP2sampleID'])\
                                .reset_index(drop = True)\
                                .dropna(subset=['master_GP2sampleID'], axis = 0)
    sampleid = sampleid.loc[sampleid.index[-1], 'master_GP2sampleID'].split("_")
    getuniqueid = sampleid[0] + "_" + sampleid[1]
    get_sidrepno = int(sampleid[2].replace("s","")) + 1

    index_modify = clinicalid_subset['index'].unique() #clinicalid_subset[clinicalid_subset['GP2sampleID'].isnull()] #.index
    assign_gp2sampleid = [getuniqueid + "_s" + str(get_sidrepno + i) for i in range(len(index_modify))]
    df.loc[index_modify, 'GP2sampleID'] = assign_gp2sampleid
    getnewidrows = df.loc[index_modify].copy()
    return (getnewidrows)

def master_keyv2(studies):
    # ACCESS MASTERGP2IDS_JSON IN GP2 BUCKET
    client = storage.Client()
    bucket = client.get_bucket('eu-samplemanifest')
    blob = bucket.blob('IDSTRACKER/GP2IDSMAPPER.json')
    
    ids_tracker = {}
    with blob.open("r") as f:
        for k, v in ijson.kvitems(f, ''):
            if k in studies:
                ids_tracker.update({k:v})
    
    return(ids_tracker)

In [1]:
40*"-"

'----------------------------------------'

In [None]:
        # GENERATE GP2 IDs #
        jumptwice()
        st.subheader('GP2 IDs assignment...')
        studynames = list(df['study'].unique())

        if st.session_state['master_get'] == None: # TO ONLY RUN ONCE
            #ids_tracker = generategp2ids.master_key(studies = studynames)
            ids_tracker = generategp2ids.master_keyv2(studies = studynames)
            study_subsets = []
            log_new = []
            df['GP2sampleID'] = None
            # GP2 ID ASSIGNMENT CODE BLOCK
            for study in studynames:
                st.write(f"Getting GP2IDs for {study} samples")
                df_subset = df[df.study==study].copy()
                try:
                    #study_tracker = st.session_state['store_tracker'][study]
                    study_tracker = ids_tracker[study]
                    study_tracker_df = pd.DataFrame.from_dict(study_tracker,
                                                            orient='index',
                                                            columns = ['master_GP2sampleID','clinical_id'])\
                                                    .rename_axis('master_sample_id').reset_index()\
                                                    .astype(str)

                    # Check if any sample ID exists in df_subset.
                    sample_id_unique = pd.merge(study_tracker_df, df_subset,
                                                left_on=['master_sample_id'], right_on=['sample_id'], how='inner')
                    if not sample_id_unique.empty:
                        st.error('We have detected sample ids submitted on previous versions')
                        st.error('Please, correct these sample IDs so that they are unique and resubmit the sample manifest.')
                        sample_id_unique = sample_id_unique.rename(columns={"clinical_id_y": "clinical_id"})
                        st.dataframe(
                        sample_id_unique[['study','sample_id','clinical_id']].style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                        )
                        stopapp=True
                    else:
                        stopapp=False
                except:
                    study_tracker = None
                    stopapp = False
                if stopapp:
                    st.stop()

                if bool(study_tracker):
                    # WORK ON DUPLICATED IDS
                    df_subset = df_subset.reset_index()
                    data_duplicated = pd.merge(df_subset, study_tracker_df, on=['clinical_id'], how='inner')
                    df_subset = df_subset.set_index('index')
                    df_subset.index.name = None

                    if data_duplicated.shape[0]>0:
                        new_clinicaldups = True
                        newids_clinicaldups = data_duplicated.groupby('clinical_id')\
                                                        .apply(lambda x: generategp2ids.assign_unique_gp2clinicalids(df_subset,x))

                        if newids_clinicaldups.shape[0]>0:
                            newids_clinicaldups = newids_clinicaldups.reset_index(drop=True)[['study','clinical_id','sample_id','GP2sampleID']]
                            log_new.append(newids_clinicaldups)
                    else:
                        new_clinicaldups = False
                        newids_clinicaldups = pd.DataFrame()

                    # GET GP2 IDs METADATA for new CLINICAL-SAMPLE ID pairs
                    df_newids = df_subset[df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
                    if not df_newids.empty: # Get new GP2 IDs
                        df_wids = df_subset[~df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
                        df_wids['GP2ID'] = df_wids['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
                        df_wids['SampleRepNo'] = df_wids['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))

                        n=int(max(study_tracker_df['master_GP2sampleID'].to_list()).split("_")[1])+1
                        df_newids = generategp2ids.getgp2idsv2(df_newids, n, study)
                        df_subset = pd.concat([df_newids, df_wids], axis = 0)
                        study_subsets.append(df_subset)
                        log_new.append(df_newids[['study','clinical_id','sample_id','GP2sampleID']])
                        
                    else: # TO CONSIDER THE CASE IN WHICH WE ONLY HAD DUPLICATE IDS MAPPED ON THE MASTER FILE
                        df_subset['GP2ID'] = df_subset['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
                        df_subset['SampleRepNo'] = df_subset['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))
                        study_subsets.append(df_subset)

                # Brand new data - NO STUDY TRACKER FOR THIS COHORT
                else:
                    study = study
                    new_clinicaldups = False # Duplicates from master key json are treated differently to brand new data
                    n = 1
                    df_newids = generategp2ids.getgp2idsv2(df_subset, n, study)
                    study_subsets.append(df_newids)


                # CODE TO UPDATE THE GET FILE WE WILL USE TO UPDATE MASTER JSON
                if (new_clinicaldups) and (newids_clinicaldups.shape[0]>0):
                    tmp = pd.concat([df_newids[['study','clinical_id','sample_id','GP2sampleID']], newids_clinicaldups])
                    tmp['master_value'] = list(zip(tmp['GP2sampleID'],
                                                    tmp['clinical_id']))
                    ids_log = tmp.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                                            x['master_value']))).to_dict()
                else:
                    df_update_master = df_newids.copy()
                    df_update_master['master_value'] = list(zip(df_update_master['GP2sampleID'],
                                                            df_update_master['clinical_id']))
                    ids_log = df_update_master.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                                                    x['master_value']))).to_dict()

                #generategp2ids.update_masterids(ids_log, study_tracker) # THIS WILL BE UPDATED ONCE THE USET CONFIRMS THE QC ( AT THE END)
                
                #if st.session_state['master_get'] == None:
                if (isinstance(st.session_state['all_ids'], list)):
                    st.session_state['all_ids'].append( [ids_log, study_tracker] )
                if st.session_state['all_ids'] == None:
                    st.session_state['all_ids'] = [ [ids_log, study_tracker] ]
            

            # OUT OF FOR LOOP // END OF GP2 IDS ASSIGNMENT. LET'S RESUME df.
            df = pd.concat(study_subsets, axis = 0)
            df = df[list(df)[-3:] + list(df)[:-3]]
            st.write("GPS IDs assignment... OK")

            #if st.session_state['master_get'] == None:
            st.session_state['df_copy'] = df
            if len(log_new) > 0:
                allnew = pd.concat(log_new, axis = 0).reset_index(drop=True)
                st.write("Thanks for uploading a new version of the sample manifest")
                st.write(f'We have detected a total of {allnew.shape[0]} new samples')
                st.write("We have assigned new GP2IDs to those. Showing them below...")
                st.dataframe(
                allnew.style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                #allnew.style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                )
            else:
                aggridPlotter(df)

            st.session_state['df_finalids'] = df
            st.session_state['master_get'] = 'DONE'

        else:
            df = st.session_state['df_finalids']
            aggridPlotter(df)
            # df_builder = GridOptionsBuilder.from_dataframe(st.session_state['df_copy'])
            # df_builder.configure_grid_options(alwaysShowHorizontalScroll = True,
            #                                     enableRangeSelection=True,
            #                                     pagination=True,
            #                                     paginationPageSize=10000,
            #                                     domLayout='normal')
            # godf = df_builder.build()
            # AgGrid(st.session_state['df_copy'],gridOptions=godf, theme='streamlit', height=300)
            #df = st.session_state['df_finalids']
        #st.session_state['master_get'] = 'DONE'
