ASAP CRN Unique ID generation - wave 1

# ASAP CRN Unique ID generation - wave 1


Postmortem-derived Brain Sequencing Collection


25 OCT 2023
Andy Henrie


### Dataset ID
- "ASAP_PMBDS" to identify that it is part of the Postmortem-derived Brain Sequencing Collection
- `ASAP_dataset_id`
    - also need to generate a "team_dataset_id" (Add to CDE/DataDictionary). TeamCODE+"one to two word descriptor"

### Team ID
- hardcoded definitions
- `ASAP_team_id`

### Subject ID
- unique for ASAP
- could exist across several Teams / Datasets
- `ASAP_subject_id`

### Sample ID
- unique for each sample
- multiple could derive from same `ASAP_subject_id`.  
    - multiple brain regions from a single team
    - multiple teams from same biobank
    - "other" repeated samples??
- `ASAP_sample_id`
- Unique ASAP_subject_id + "sample repeat number"



###  Issues

- storing "master" IDs for lookup:  pandas vs. json... 


In [None]:
# conda create -n lw10 python=3.10 notebook ipykernel pip pandas ijson -y && conda activate lw10

In [4]:
import pandas as pd
import json
import ijson
from pathlib import Path


from asap_ids import (read_meta_table, get_dtypes_dict, STUDY_PREFIX, DATASET_ID, 
                      load_id_mapper, write_id_mapper, generate_asap_sample_ids,
                      generate_asap_subject_ids, process_meta_files)


                       

%load_ext autoreload
%autoreload 2


Load CDE for properly reading the team tables.

In [6]:
CDE_path = Path.cwd() / "ASAP_CDE.csv" 
CDE = pd.read_csv(CDE_path )
# Initialize the data types dictionary
dtypes_dict = get_dtypes_dict(CDE)


In [7]:



## test with team Lee
export_root = Path.cwd() / "clean/team-Lee"
subject_mapper_path = Path.cwd() / "ASAP_subj_map.json"
sample_mapper_path = Path.cwd() / "ASAP_samp_map.json"

try:
    subj_id_mapper = load_id_mapper(subject_mapper_path)
except FileNotFoundError:
    subj_id_mapper = {}
    print(f"{subject_mapper_path} not found... starting from scratch")

try:
    samp_id_mapper = load_id_mapper(sample_mapper_path)
except FileNotFoundError:
    samp_id_mapper = {}
    print(f"{sample_mapper_path} not found... starting from scratch")

if CDE_path.exists():
    CDE = pd.read_csv(CDE_path )
else:
    print(f"{CDE_path} not found... aborting")



id_mapper not found at /Users/ergonyc/Projects/ASAP/meta-clean/ASAP_subj_map.json
id_mapper not found at /Users/ergonyc/Projects/ASAP/meta-clean/ASAP_samp_map.json


In [17]:
subj_id_mapper = {}
samp_id_mapper = {}

In [18]:
table_path = export_root
# dtypes_dict = get_dtypes_dict(CDE)

# Defined up front so the PROTOCOL step below can tell whether STUDY supplied a team id
# (previously `team_id` was simply undefined when STUDY.csv was missing -> NameError).
team_id = None

# add ASAP_team_id to the STUDY and PROTOCOL tables
study_path = table_path / "STUDY.csv"
if study_path.exists():
    study_df = read_meta_table(study_path, dtypes_dict)
    # `.str.replace` substitutes inside each string; plain `Series.replace('-', '_')`
    # only swaps cells whose entire value is '-', so hyphens in team names were kept.
    team_id = study_df['ASAP_team_name'].str.upper().str.replace('-', '_', regex=False)
    study_df['ASAP_team_id'] = team_id
    # add ASAP_dataset_id = DATASET_ID to the STUDY tables
    study_df['ASAP_dataset_id'] = DATASET_ID
else:
    study_df = None
    print(f"{study_path} not found... aborting")

protocol_path = table_path / "PROTOCOL.csv"
if protocol_path.exists():
    protocol_df = read_meta_table(protocol_path, dtypes_dict)
    if team_id is None:
        print(f"{study_path} not found... cannot add ASAP_team_id to {protocol_path.name}")
    else:
        # Assigning the STUDY Series directly aligns on the row index, so PROTOCOL rows
        # without a matching STUDY row would get NaN. When STUDY holds a single team id,
        # broadcast it as a scalar to every PROTOCOL row instead.
        team_id_values = team_id.dropna().unique()
        protocol_df['ASAP_team_id'] = team_id_values[0] if len(team_id_values) == 1 else team_id
else:
    protocol_df = None
    print(f"{protocol_path} not found... aborting")


In [19]:

# attach ASAP_subject_id and ASAP_dataset_id to the SUBJECT table (when it exists)
subject_path = table_path / "SUBJECT.csv"
if not subject_path.exists():
    subject_df = None
    print(f"{subject_path} not found... aborting")
else:
    # generate_asap_subject_ids returns (mapper, table with ids, next free counter)
    subj_id_mapper, subject_df, n = generate_asap_subject_ids(
        subj_id_mapper,
        read_meta_table(subject_path, dtypes_dict),
    )
    # every SUBJECT row belongs to the same dataset
    subject_df['ASAP_dataset_id'] = DATASET_ID



In [20]:
subj_id_mapper

{'HC_1225': 'ASAP_PMBDS_000001',
 'HC_0602': 'ASAP_PMBDS_000002',
 'PD_0009': 'ASAP_PMBDS_000003',
 'PD_1921': 'ASAP_PMBDS_000004',
 'PD_2058': 'ASAP_PMBDS_000005',
 'PD_1441': 'ASAP_PMBDS_000006',
 'PD_1344': 'ASAP_PMBDS_000007',
 'HC_1939': 'ASAP_PMBDS_000008',
 'HC_1308': 'ASAP_PMBDS_000009',
 'HC_1862': 'ASAP_PMBDS_000010',
 'HC_1864': 'ASAP_PMBDS_000011',
 'HC_2057': 'ASAP_PMBDS_000012',
 'HC_2061': 'ASAP_PMBDS_000013',
 'HC_2062': 'ASAP_PMBDS_000014',
 'HC_2067': 'ASAP_PMBDS_000015',
 'PD_0348': 'ASAP_PMBDS_000016',
 'PD_0413': 'ASAP_PMBDS_000017',
 'PD_1312': 'ASAP_PMBDS_000018',
 'PD_1317': 'ASAP_PMBDS_000019',
 'PD_1504': 'ASAP_PMBDS_000020',
 'PD_1858': 'ASAP_PMBDS_000021',
 'PD_1902': 'ASAP_PMBDS_000022',
 'PD_1973': 'ASAP_PMBDS_000023',
 'PD_2005': 'ASAP_PMBDS_000024',
 'PD_2038': 'ASAP_PMBDS_000025'}

In [21]:
subject_df.head()

Unnamed: 0,ASAP_subject_id,subject_id,source_subject_id,biobank_name,organism,sex,age_at_collection,race,ethnicity,duration_pmi,primary_diagnosis,primary_diagnosis_text,uid_idx,uid_idx_cumcount,ASAP_dataset_id
0,ASAP_PMBDS_000001,HC_1225,12-25,Banner Sun Health Research Institute,Human,Male,80,White,Not Reported,3.5,No PD nor other neurological disorder,,1,1,ASAP_PMBDS
1,ASAP_PMBDS_000002,HC_0602,06-02,Banner Sun Health Research Institute,Human,Male,84,White,Not Reported,2.66,Other neurological disorder,Mild Cognitive Impairment,2,1,ASAP_PMBDS
2,ASAP_PMBDS_000003,PD_0009,00-09,Banner Sun Health Research Institute,Human,Male,64,White,Not Reported,4.0,Idiopathic PD,,3,1,ASAP_PMBDS
3,ASAP_PMBDS_000004,PD_1921,19-21,Banner Sun Health Research Institute,Human,Male,82,White,Not Reported,3.93,Idiopathic PD,,4,1,ASAP_PMBDS
4,ASAP_PMBDS_000005,PD_2058,20-58,Banner Sun Health Research Institute,Human,Male,87,White,Not Reported,3.17,Idiopathic PD,,5,1,ASAP_PMBDS


In [23]:

# attach ASAP_sample_id and ASAP_dataset_id to the SAMPLE table (when it exists)
sample_path = table_path / "SAMPLE.csv"
if not sample_path.exists():
    sample_df = None
    print(f"{sample_path} not found... aborting")
else:
    # samp_id_mapper is passed in as the last argument; `n` comes from the SUBJECT step
    subj_id_mapper, sample_df = generate_asap_sample_ids(
        subj_id_mapper,
        read_meta_table(sample_path, dtypes_dict),
        n,
        samp_id_mapper,
    )
    sample_df['ASAP_dataset_id'] = DATASET_ID



In [24]:
subj_id_mapper

{'HC_1225': 'ASAP_PMBDS_000001',
 'HC_0602': 'ASAP_PMBDS_000002',
 'PD_0009': 'ASAP_PMBDS_000003',
 'PD_1921': 'ASAP_PMBDS_000004',
 'PD_2058': 'ASAP_PMBDS_000005',
 'PD_1441': 'ASAP_PMBDS_000006',
 'PD_1344': 'ASAP_PMBDS_000007',
 'HC_1939': 'ASAP_PMBDS_000008',
 'HC_1308': 'ASAP_PMBDS_000009',
 'HC_1862': 'ASAP_PMBDS_000010',
 'HC_1864': 'ASAP_PMBDS_000011',
 'HC_2057': 'ASAP_PMBDS_000012',
 'HC_2061': 'ASAP_PMBDS_000013',
 'HC_2062': 'ASAP_PMBDS_000014',
 'HC_2067': 'ASAP_PMBDS_000015',
 'PD_0348': 'ASAP_PMBDS_000016',
 'PD_0413': 'ASAP_PMBDS_000017',
 'PD_1312': 'ASAP_PMBDS_000018',
 'PD_1317': 'ASAP_PMBDS_000019',
 'PD_1504': 'ASAP_PMBDS_000020',
 'PD_1858': 'ASAP_PMBDS_000021',
 'PD_1902': 'ASAP_PMBDS_000022',
 'PD_1973': 'ASAP_PMBDS_000023',
 'PD_2005': 'ASAP_PMBDS_000024',
 'PD_2038': 'ASAP_PMBDS_000025'}

In [25]:


# add ASAP_sample_id to the CLINPATH tables
clinpath_path = table_path / "CLINPATH.csv"
if clinpath_path.exists():
    clinpath_df = read_meta_table(clinpath_path, dtypes_dict)
    # sample_ids missing from samp_id_mapper map to NaN
    clinpath_df['ASAP_sample_id'] = clinpath_df['sample_id'].map(samp_id_mapper)
else:
    # mirror the other tables: always leave the name defined and report the gap
    clinpath_df = None
    print(f"{clinpath_path} not found... aborting")

# once we update the CDE so CLINPATH has subject level data we can add this
# # add ASAP_subject_id to the CLINPATH tables
# clinpath_path = table_path / "CLINPATH.csv"
# if clinpath_path.exists():
#     clinpath_df = read_meta_table(clinpath_path, dtypes_dict)

#     clinpath_df['ASAP_subject_id'] = clinpath_df['subject_id'].map(id_mapper)

# export updated tables
asap_tables_path = Path.cwd() / "ASAP_tables"
# exist_ok makes the cell safe to re-run without a separate existence check
asap_tables_path.mkdir(exist_ok=True)


In [26]:

# Write each table that was found on disk into the ASAP_tables folder, keeping the
# original file name. Each write is guarded by the *input* file's existence because the
# corresponding *_df is None (or undefined) when its source CSV was missing.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is written
# as an extra unnamed first column — confirm read_meta_table expects that on re-read.
if study_path.exists():
    study_df.to_csv(asap_tables_path / study_path.name)
if protocol_path.exists():
    protocol_df.to_csv(asap_tables_path / protocol_path.name)
if subject_path.exists():
    subject_df.to_csv(asap_tables_path / subject_path.name)
if sample_path.exists():
    sample_df.to_csv(asap_tables_path / sample_path.name)
if clinpath_path.exists():
    clinpath_df.to_csv(asap_tables_path / clinpath_path.name)


# write the updated id_mapper to file
# (both the subject and the sample mapper are persisted to their json paths)
write_id_mapper(subj_id_mapper, subject_mapper_path)
write_id_mapper(samp_id_mapper, sample_mapper_path)


0

In [None]:
MASTER_TEAM_ids = pd.DataFrame()


## STUDY: Postmortem-derived Brain Sequencing Collection (PMBDS) 

All ASAP_dataset_id, and ASAP_subject_id here will start with "ASAP_PMBDS_"


In [None]:
STUDY_PREFIX = "ASAP_PMBDS_"

In [None]:



# function to assign the study prefix to the id



## `ASAP_team_id`

On meta-data ingest, add this to:
- STUDY, PROTOCOL

In [None]:
# Lower-case team names; the upper-cased form is derived where ASAP_team_id is built.
# (The bare `[x.upper() for x in team_names]` expression that used to sit here computed
# a list and threw it away, so it has been dropped.)
team_names = ["lee", "hafler", "hardy", "jakobsson", "sherzer","sulzer", "voet","wood"]
MASTER_TEAM_ids['team_name'] = team_names



In [None]:
team_codes = ["LEE", "HAF", "HAR", "JAK", "SHE", "SUL", "VOE", "WOO"]

MASTER_TEAM_ids['team_code'] = team_codes



In [None]:
MASTER_TEAM_ids['ASAP_team_id'] = "TEAM_" + MASTER_TEAM_ids['team_name'].str.upper() 
MASTER_TEAM_ids

## `ASAP_dataset_id`

This compares with the GP2 "study code".

This is done by hand for now. On meta-data ingest, add this (?) to:
- STUDY, PROTOCOL, SAMPLE



Currently we have:
- Team Lee 
- Team Hardy
- Team Hafler



In [None]:


# Build the frame with its one row directly: assigning a scalar to a column of an
# empty DataFrame creates the column but no rows, so the id was never actually stored.
MASTER_DATASET_IDs = pd.DataFrame({'ASAP_dataset_id': ["ASAP_PMBDS"]})



## `ASAP_subject_id`


### Subject ID
- unique for ASAP
- could exist across several Teams / Datasets
- `ASAP_subject_id`


On meta-data ingest, add this to:
- SUBJECT

"ASAP_XXXXXXX"

Team Lee:  

Team Hardy:

Team Hafler:



We need to define a function that creates the _master_archive_ (if it doesn't exist), and assigns a new `ASAP_subject_id` to every subject that is not already in it.



In [None]:
MASTER_SUBJECT_IDs = pd.DataFrame()



In [None]:

export_root = Path.cwd() / "clean/team-Lee"
# make sure cleaned files are correct


SUBJECT = read_meta_table(f"{export_root}/SUBJECT.csv", dtypes_dict)
CLINPATH = read_meta_table(f"{export_root}/CLINPATH.csv", dtypes_dict)
STUDY = read_meta_table(f"{export_root}/STUDY.csv", dtypes_dict)
PROTOCOL = read_meta_table(f"{export_root}/PROTOCOL.csv", dtypes_dict)
SAMPLE = read_meta_table(f"{export_root}/SAMPLE.csv", dtypes_dict)


In [None]:
subject_df = SUBJECT


def generate_asap_subject_ids(subj_id_mapper, subject_df):
    """
    Assign an ASAP_subject_id to every row of the `subject_df` table.

    A subject_id that is already a key of `subj_id_mapper` keeps the ASAP_subject_id
    recorded there; only subject_ids not seen before receive a new id. New ids are
    numbered upward from one past the highest number currently in the mapper (or from 1
    when the mapper is empty). Previously every subject in the table was renumbered,
    which overwrote ids that had already been issued.

    Parameters
    ----------
    subj_id_mapper : dict
        subject_id -> ASAP_subject_id. Updated in place with the ids used for this table.
    subject_df : pandas.DataFrame
        Must contain a 'subject_id' column. It is copied, not modified.

    Returns
    -------
    subj_id_mapper : dict
        The same dict object, updated.
    df_nodups_wids : pandas.DataFrame
        Copy of `subject_df` with 'ASAP_subject_id' inserted as the first column and
        'uid_idx' / 'uid_idx_cumcount' appended.
    n : int
        The next unused id number.
    """
    # extract the max value of the mapper's third (last) section to get our n;
    # `default=0` covers both an empty mapper and a mapper holding only empty values
    used_numbers = [int(v.split("_")[2]) for v in subj_id_mapper.values() if v]
    n = max(used_numbers, default=0) + 1

    df_nodups_wids = subject_df.copy()
    # might want to use 'source_subject_id' instead of 'subject_id' since we want to find matches across teams
    # shouldn't actually matter but logically cleaner
    asap_id_mapper = {}
    for subject_id in df_nodups_wids['subject_id'].unique():
        asap_id = subj_id_mapper.get(subject_id)
        if not asap_id:
            # unseen subject: issue the next free number
            asap_id = f'{STUDY_PREFIX}{n:06}'
            n += 1
        asap_id_mapper[subject_id] = asap_id

    # numeric part of each id, kept as its own column as before
    mapid = {subject_id: int(asap_id.split("_")[2]) for subject_id, asap_id in asap_id_mapper.items()}
    df_nodups_wids['uid_idx'] = df_nodups_wids['subject_id'].map(mapid)
    # make a new column with the ASAP_subject_id
    # and insert it at the beginning of the dataframe
    ASAP_subject_id = df_nodups_wids['subject_id'].map(asap_id_mapper)
    df_nodups_wids.insert(0, 'ASAP_subject_id', ASAP_subject_id)
    # running count of rows that share the same ASAP_subject_id (1-based)
    df_nodups_wids['uid_idx_cumcount'] = df_nodups_wids.groupby('ASAP_subject_id').cumcount() + 1

    subj_id_mapper.update(asap_id_mapper)

    return subj_id_mapper, df_nodups_wids, n



def generate_asap_sample_ids(subj_id_mapper, sample_df, n, samp_id_mapper):
    """
    Assign an ASAP_sample_id to every row of the `sample_df` table.

    Each id has the form ``<ASAP_subject_id>_<counter:06>_s<k>`` where the counter is
    `n` plus the position of the subject among the table's unique subject_ids, and `k`
    numbers that subject's (sample_id, replicate) rows starting at 1. The
    ASAP_subject_id already begins with the study prefix, so the prefix is not added
    again (it used to be, producing a doubled prefix).

    Parameters
    ----------
    subj_id_mapper : dict
        subject_id -> ASAP_subject_id. Read only; returned unchanged.
    sample_df : pandas.DataFrame
        Must contain 'sample_id', 'replicate' and 'subject_id' columns. Not modified.
    n : int
        Starting value of the per-subject counter embedded in the id.
    samp_id_mapper : dict
        sample_id -> ASAP_sample_id. Updated in place.

    Returns
    -------
    subj_id_mapper : dict
        The dict that was passed in.
    out_df : pandas.DataFrame
        Copy of `sample_df` with an 'ASAP_sample_id' column added.

    Raises
    ------
    KeyError
        If a subject_id in `sample_df` has no entry in `subj_id_mapper`.
    """
    # NOTE(review): the notebook's design notes describe the sample id as
    # "ASAP_subject_id + sample repeat number"; the extra counter segment is kept here
    # because `n` is part of the signature — confirm whether it should stay.

    # since the current SAMPLE tables can have multiple rows per sample_id, drop
    # duplicates, with the caveat of replicates
    df_nodups = sample_df.drop_duplicates(subset=['sample_id', 'replicate'])

    uniq_subj = df_nodups['subject_id'].unique()

    # fail early, naming every subject that has not been given an ASAP_subject_id
    missing = [subj_id for subj_id in uniq_subj if subj_id not in subj_id_mapper]
    if missing:
        raise KeyError(f"subject_id(s) missing from subj_id_mapper: {missing}")

    # building the mapping directly also copes with an empty table
    # (pd.concat of an empty list of chunks raised ValueError)
    id_mapper = {}
    for offset, subj_id in enumerate(uniq_subj):
        samp_n = n + offset
        asap_id = subj_id_mapper[subj_id]
        subject_samples = df_nodups.loc[df_nodups['subject_id'] == subj_id, 'sample_id']
        for rep_no, sample_id in enumerate(subject_samples, start=1):
            # a sample_id with several replicates keeps the id of its last replicate row
            id_mapper[sample_id] = f'{asap_id}_{samp_n:06}_s{rep_no}'

    out_df = sample_df.copy()
    out_df['ASAP_sample_id'] = out_df['sample_id'].map(id_mapper)

    samp_id_mapper.update(id_mapper)
    return subj_id_mapper, out_df




subj_id_mapper = {}
samp_id_mapper = {}



In [None]:
ud_subj_id_mapper, ud_subject_df, n = generate_asap_subject_ids(subj_id_mapper, subject_df)
ud_subj_id_mapper

In [None]:
ud_subject_df.head()

In [None]:
sample_df = SAMPLE.copy()
sample_df.head()


In [None]:

ud_subj_id_mapper, ud_sample_df = generate_asap_sample_ids(ud_subj_id_mapper, SAMPLE, n, samp_id_mapper)

In [None]:

export_root = Path.cwd() / "clean/team-Hardy"
# make sure cleaned files are correct


SUBJECT = read_meta_table(f"{export_root}/SUBJECT.csv", dtypes_dict)
CLINPATH = read_meta_table(f"{export_root}/CLINPATH.csv", dtypes_dict)
STUDY = read_meta_table(f"{export_root}/STUDY.csv", dtypes_dict)
PROTOCOL = read_meta_table(f"{export_root}/PROTOCOL.csv", dtypes_dict)
SAMPLE = read_meta_table(f"{export_root}/SAMPLE.csv", dtypes_dict)

subject_df = SUBJECT


In [None]:
ud_subj_id_mapper, ud_subject_df, n = generate_asap_subject_ids(ud_subj_id_mapper, subject_df)

In [None]:
ud_subj_id_mapper

In [None]:

# Scratch version of the subject-id numbering, keyed on 'source_subject_id'.
# NOTE(review): `df_nodups` is only assigned at notebook top level in a later cell
# (`sample_df.drop_duplicates(...)`), so this cell depends on out-of-order execution and
# would raise NameError on a fresh Restart & Run All.
# set starting n = 3
n = 3

df_nodups_wids = df_nodups.copy()
# number each unique source_subject_id consecutively, starting at n
uids = [str(id) for id in df_nodups['source_subject_id'].unique()]
mapid = {}
for uid in uids:
    mapid[uid]= n
    n += 1

df_nodups_wids['uid_idx'] = df_nodups_wids['source_subject_id'].map(mapid)
# zero-padded 6-digit number appended to the study prefix
df_nodups_wids['ASAP_subject_id'] = [f'{STUDY_PREFIX}{i:06}' for i in df_nodups_wids.uid_idx]
# 1-based running count of rows sharing an ASAP_subject_id
df_nodups_wids['uid_idx_cumcount'] = df_nodups_wids.groupby('ASAP_subject_id').cumcount() + 1



In [None]:

asap_id_mapper = dict(zip(df_nodups_wids['source_subject_id'], df_nodups_wids['ASAP_subject_id']))

ASAPsubject_df = df_nodups_wids.copy()


In [None]:
max([int(v.split("_")[2]) for v in asap_id_mapper.values()])+1
asap_id_mapper

In [None]:
sample_df = SAMPLE.copy()
sample_df.head()


In [None]:
# sample_df.duplicated(keep=False, subset=['sample_id'])
# ~sample_df.duplicated(keep=False, subset=['sample_id'])
df_nodups = sample_df.drop_duplicates(subset=['sample_id','replicate'])
df_nodups.head()


In [None]:



# #  could do this  easier with groupby:
# df_nodups_wids['uid_idx'] = df_nodups_wids['source_subject_id'].map(mapid)
# df_nodups_wids['ASAP_subject_id'] = [f'{STUDY_PREFIX}{i:06}' for i in df_nodups_wids.uid_idx]
# df_nodups_wids['uid_idx_cumcount'] = df_nodups_wids.groupby('ASAP_subject_id').cumcount() + 1


# 
uniq_subj = df_nodups.subject_id.unique()

dupids_mapper = dict(zip(uniq_subj,
                    [num+n for num in range(len(uniq_subj))] ))

df_dup_chunks = []
for subj_id, samp_n in dupids_mapper.items():
    df_dups_subset = df_nodups[df_nodups.subject_id==subj_id].copy()
    df_dups_subset['asap_sample'] = [f'{STUDY_PREFIX}{subj_id}_{samp_n:06}' for i in range(df_dups_subset.shape[0])]
    df_dups_subset['samp_rep_no'] = ['s'+str(i+1) for i in range(df_dups_subset.shape[0])]
    df_dups_subset['ASAP_sample_id'] = df_dups_subset['asap_sample'] + '_' + df_dups_subset['samp_rep_no']
    df_dup_chunks.append(df_dups_subset)
df_dups_wids = pd.concat(df_dup_chunks)


In [None]:

sample_id_mapper = dict(zip(df_dups_wids.sample_id,
                    df_dups_wids.ASAP_sample_id))


sample_df['ASAP_sample_id'] = sample_df['sample_id'].map(sample_id_mapper)

ASAPsample_df = sample_df.copy()

In [None]:

df_sorted = sample_df.sort_values('sample_id').reset_index(drop = True).copy()
df_sorted.head()


In [None]:


df_sorted['ASAP_subject_id'] = df_sorted['subject_id'].map(asap_id_mapper)
# df_sorted['ASAP_subject_id'] = [f'{STUDY_PREFIX}{i:06}' for i in df_nodups_wids.uid_idx]
# df_sorted['uid_idx_cumcount'] = df_sorted.groupby('sample_id').cumcount() + 1

data_duplicated = pd.merge(df_sorted, ASAPsubject_df, on=['ASAP_subject_id'], how='right')



In [None]:
data_duplicated[['ASAP_subject_id','sample_id', 'source_sample_id', 'subject_id_x', 'replicate',
       'replicate_count', 'repeated_sample', 'batch', 
       'uid_idx_cumcount_x', 'subject_id_y', 'source_subject_id',
      'uid_idx', 'uid_idx_cumcount_y']]

In [None]:

df_sorted['asap_samp_id'] = df_sorted.ASAP_subject_id + '_s' + df_nodups_wids.uid_idx_cumcount.astype('str')
df_nodups_wids['sample_rep_num'] = 's' + df_nodups_wids.uid_idx_cumcount.astype('str')
df_nodups_wids.drop(['uid_idx','uid_idx_cumcount'], axis = 1, inplace = True)
df_sorted['uid_idx_cumcount'] = df_sorted.groupby('sample_id').cumcount() + 1

df_sorted[['sample_id', 'subject_id', 'ASAP_subject_id','uid_idx_cumcount']].head()

ASAPsample_df = df_sorted.copy()


In [None]:

# set starting n = 3
n = 3


uids = [str(id) for id in df_nodups['sample_id'].unique()]
mapid = {}
for uid in uids:
    mapid[uid]= n
    n += 1


df_nodups_wids = df_nodups.copy()
df_nodups_wids['uid_idx'] = df_nodups_wids['sample_id'].map(mapid)
df_nodups_wids['ASAP_subject_id'] = [f'{STUDY_PREFIX}{i:06}' for i in df_nodups_wids.uid_idx]
df_nodups_wids['uid_idx_cumcount'] = df_nodups_wids.groupby('ASAP_subject_id').cumcount() + 1
df_nodups_wids['GP2sampleID'] = df_nodups_wids.ASAP_subject_id + '_s' + df_nodups_wids.uid_idx_cumcount.astype('str')
df_nodups_wids['SampleRepNo'] = 's' + df_nodups_wids.uid_idx_cumcount.astype('str')
df_nodups_wids.drop(['uid_idx','uid_idx_cumcount'], axis = 1, inplace = True)



In [None]:
# NOTE(review): this cell appears to be pasted from the GP2 id-assignment code as a
# reference. `df` (and, further down, `df_subset`, `generategp2ids`, `study_tracker_df`,
# `log_new`, `study_subsets`, `study`) are not defined anywhere in the visible notebook,
# so the cell is not runnable as-is.
#ids_tracker = generategp2ids.master_key(studies = studynames)

df['GP2sampleID'] = None

# True only once the master subject table carries an 'ASAP_sample_id' column
tracking = 'ASAP_sample_id' in MASTER_SUBJECT_IDs.columns


if tracking:
    # check for DUPLICATED IDS

    # df_subset = df_subset.reset_index()
    # data_duplicated = pd.merge(df_subset, study_tracker_df, on=['clinical_id'], how='inner')
    # df_subset = df_subset.set_index('index')
    # df_subset.index.name = None

    if data_duplicated.shape[0]>0:
        new_clinicaldups = True
        newids_clinicaldups = data_duplicated.groupby('clinical_id')\
                                        .apply(lambda x: generategp2ids.assign_unique_gp2clinicalids(df_subset,x))

        if newids_clinicaldups.shape[0]>0:
            newids_clinicaldups = newids_clinicaldups.reset_index(drop=True)[['study','clinical_id','sample_id','GP2sampleID']]
            log_new.append(newids_clinicaldups)
    else:
        new_clinicaldups = False
        newids_clinicaldups = pd.DataFrame()

    # GET GP2 IDs METADATA for new CLINICAL-SAMPLE ID pairs
    df_newids = df_subset[df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
    if not df_newids.empty: # Get new GP2 IDs
        df_wids = df_subset[~df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
        df_wids['GP2ID'] = df_wids['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
        df_wids['SampleRepNo'] = df_wids['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))

        n=int(max(study_tracker_df['master_GP2sampleID'].to_list()).split("_")[1])+1
        df_newids = generategp2ids.getgp2idsv2(df_newids, n, study)
        df_subset = pd.concat([df_newids, df_wids], axis = 0)
        study_subsets.append(df_subset)
        log_new.append(df_newids[['study','clinical_id','sample_id','GP2sampleID']])
        
    else: # TO CONSIDER THE CASE IN WHICH WE ONLY HAD DUPLICATE IDS MAPPED ON THE MASTER FILE
        df_subset['GP2ID'] = df_subset['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
        df_subset['SampleRepNo'] = df_subset['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))
        study_subsets.append(df_subset)

# Brand new data. create IDs and export tracking
else:
    study = study
    new_clinicaldups = False # Duplicates from master key json are treated differently to brand new data
    n = 1
    df_newids = generategp2ids.getgp2idsv2(df_subset, n, study)
    study_subsets.append(df_newids)


# CODE TO UPDATE THE GET FILE WE WILL USE TO UPDATE MASTER JSON
if (new_clinicaldups) and (newids_clinicaldups.shape[0]>0):
    tmp = pd.concat([df_newids[['study','clinical_id','sample_id','GP2sampleID']], newids_clinicaldups])
    tmp['master_value'] = list(zip(tmp['GP2sampleID'],
                                    tmp['clinical_id']))
    ids_log = tmp.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                            x['master_value']))).to_dict()
else:
    df_update_master = df_newids.copy()
    df_update_master['master_value'] = list(zip(df_update_master['GP2sampleID'],
                                            df_update_master['clinical_id']))
    ids_log = df_update_master.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                                    x['master_value']))).to_dict()



## `ASAP_sample_id`

- unique for each sample
- multiple could derive from same `ASAP_subject_id`
- `ASAP_sample_id`
- Unique ASAP_subject_id + "sample repeat number"


On meta-data ingest, add this to:
- SAMPLE

In [None]:
MASTER_SAMPLE_IDs = pd.DataFrame()


In [None]:
# . ASAP_sample_id, ASAP_dataset_id 
# ASAP_sample_id: “ASAP generated unique sample ID”
# “ASAP_” + incrementing 6 digit number.  e.g. “ASAP_000001”
# maybe add a sample number (e.g. for replicates)
# maybe add a 3 digit team code (e.g. “HAF”)
# ASAP_dataset_id:  “ASAP generated unique dataset ID”
# “{team_name}_” + one or two word description combined with “_”
# caveat: 
# mechanism where to access “master” list of ASAP_sample_id’s to test for collisions.  POSTGRES?
# need to add entries for these to the CDE


# Clean each Team Table


## Team Lee

In [None]:


export_root = Path.cwd() / "clean/team-Lee"
# make sure cleaned files are correct


SUBJECT = read_meta_table(f"{export_root}/SUBJECT.csv", dtypes_dict)
CLINPATH = read_meta_table(f"{export_root}/CLINPATH.csv", dtypes_dict)
STUDY = read_meta_table(f"{export_root}/STUDY.csv", dtypes_dict)
PROTOCOL = read_meta_table(f"{export_root}/PROTOCOL.csv", dtypes_dict)
SAMPLE = read_meta_table(f"{export_root}/SAMPLE.csv", dtypes_dict)


In [None]:
len(SUBJECT['source_subject_id'].unique()),len(SUBJECT['subject_id'].unique())

In [None]:
SAMPLE['sample_id'].unique()

In [None]:
SUBJECT[['subject_id', 'source_subject_id', 'biobank_name','primary_diagnosis']]


In [None]:


CLINPATH[['sample_id', 'source_sample_id', 'GP2_id']]


In [None]:

SAMPLE[['sample_id', 'source_sample_id', 'subject_id', 'replicate',
       'replicate_count', 'repeated_sample', 'batch', 'tissue','donor_id']]

## Team Hafler

In [None]:
## convert to seurat Object
data_path = Path.home() / ("Projects/ASAP")
metadata_path = data_path / "team-hafler/metadata"


In [None]:

export_root = Path.cwd() / "clean/team-Hafler"
# make sure cleaned files are correct

SUBJECT = read_meta_table(f"{export_root}/SUBJECT.csv", dtypes_dict)
CLINPATH = read_meta_table(f"{export_root}/CLINPATH.csv", dtypes_dict)
STUDY = read_meta_table(f"{export_root}/STUDY.csv", dtypes_dict)
PROTOCOL = read_meta_table(f"{export_root}/PROTOCOL.csv", dtypes_dict)
SAMPLE = read_meta_table(f"{export_root}/SAMPLE.csv", dtypes_dict)



## Team Hardy

In [None]:
## convert 
data_path = Path.home() / ("Projects/ASAP/team-hardy")
metadata_path = data_path / "metadata"


In [None]:

export_root = Path.cwd() / "clean/team-Hardy"
# make sure cleaned files are correct

SUBJECT = read_meta_table(f"{export_root}/SUBJECT.csv", dtypes_dict)
CLINPATH = read_meta_table(f"{export_root}/CLINPATH.csv", dtypes_dict)
STUDY = read_meta_table(f"{export_root}/STUDY.csv", dtypes_dict)
PROTOCOL = read_meta_table(f"{export_root}/PROTOCOL.csv", dtypes_dict)
SAMPLE = read_meta_table(f"{export_root}/SAMPLE.csv", dtypes_dict)



The GP2 master key basically holds the list of GP2ID / original clinical ID pairs, plus how many samples each individual has in GP2 (s1 only, or s1, s2, ...), for all individuals submitted to GP2. The ID assignment takes the sample manifest and scans the clinical IDs to check whether each one is an additional submission for an individual already in GP2. If the clinical ID is new, it is given a new GP2ID and GP2sampleID. If the clinical_id already exists in GP2, only a new GP2sampleID (GP2ID_sX+1) is provided. It also errors if a submitted original sample ID is equal to one already in the list (no duplication of sample IDs within the same cohort).

In [None]:
    uids = [str(id) for id in df_nodups['sample_id'].unique()]
    mapid = {}
    for uid in uids:
        mapid[uid]= n
        n += 1


def master_keyv2(studies):
    """
    Load the GP2 id tracker for the requested studies.

    Streams the top-level key/value pairs of the tracker JSON stored in the GP2 bucket
    and keeps only the entries whose key is in `studies`.
    """
    # the master GP2 ids json lives in the GP2 bucket
    gp2_bucket = storage.Client().get_bucket('eu-samplemanifest')
    tracker_blob = gp2_bucket.blob('IDSTRACKER/GP2IDSMAPPER.json')

    with tracker_blob.open("r") as handle:
        ids_tracker = {
            study: entries
            for study, entries in ijson.kvitems(handle, '')
            if study in studies
        }

    return ids_tracker


In [None]:


def getgp2idsv2(dfproc, n, study_code):
    """
    Assign GP2ID / SampleRepNo / GP2sampleID to every row of `dfproc`.

    Rows whose clinical_id occurs more than once share one GP2ID and are numbered
    s1, s2, ... within it. All remaining rows get one GP2ID per unique sample_id.
    GP2IDs are ``<study_code>_<number:06>`` with numbers counted up from `n`.

    Duplicates are now detected on 'clinical_id', the same column used to group them and
    to select the non-duplicated rows. Detection used to be on 'sample_id', so rows with
    a repeated clinical_id but distinct sample_ids fell into neither set and were
    silently dropped from the result.

    Parameters
    ----------
    dfproc : pandas.DataFrame
        Must contain 'sample_id' and 'clinical_id' columns. Not modified.
    n : int
        First id number to use.
    study_code : str
        Prefix for the generated ids.

    Returns
    -------
    pandas.DataFrame
        The rows of `dfproc` (duplicated-clinical_id rows first) with 'GP2ID',
        'SampleRepNo' and 'GP2sampleID' columns added.
    """
    is_dup = dfproc.duplicated(keep=False, subset=['clinical_id'])

    # --- individuals with several samples: one GP2ID each, s1..sk within it ---
    # the secondary sort key makes the s-numbering deterministic
    df_dups = dfproc[is_dup].sort_values(['clinical_id', 'sample_id']).reset_index(drop = True).copy()
    df_dup_chunks = []
    for clin_id in df_dups['clinical_id'].unique():
        df_dups_subset = df_dups[df_dups.clinical_id==clin_id].copy()
        df_dups_subset['GP2ID'] = f'{study_code}_{n:06}'
        df_dups_subset['SampleRepNo'] = ['s'+str(i+1) for i in range(df_dups_subset.shape[0])]
        df_dups_subset['GP2sampleID'] = df_dups_subset['GP2ID'] + '_' + df_dups_subset['SampleRepNo']
        df_dup_chunks.append(df_dups_subset)
        n += 1  # numbering of the remaining rows continues after the duplicated ones

    # --- individuals with a single sample ---
    df_nodups_wids = dfproc[~is_dup].sort_values('clinical_id').reset_index(drop = True).copy()

    # keys are str, so the lookup is done on the str-cast column as well
    sample_keys = df_nodups_wids['sample_id'].astype(str)
    mapid = {}
    for uid in sample_keys.unique():
        mapid[uid] = n
        n += 1

    gp2_ids = [f'{study_code}_{i:06}' for i in sample_keys.map(mapid)]
    df_nodups_wids['GP2ID'] = gp2_ids
    # rows that share a sample_id (hence a GP2ID) are numbered s1, s2, ...
    rep_numbers = (df_nodups_wids.groupby('GP2ID').cumcount() + 1).tolist()
    df_nodups_wids['GP2sampleID'] = [f'{gp2_id}_s{rep}' for gp2_id, rep in zip(gp2_ids, rep_numbers)]
    df_nodups_wids['SampleRepNo'] = [f's{rep}' for rep in rep_numbers]

    if df_dup_chunks:
        df_newids = pd.concat(df_dup_chunks + [df_nodups_wids])
    else:
        df_newids = df_nodups_wids

    return(df_newids)

def assign_unique_gp2clinicalids(df, clinicalid_subset):
    """
    Give the next free GP2sampleID(s) to new samples of an already-known individual.

    The highest repeat number among the group's existing 'master_GP2sampleID' values is
    found and the rows of `df` listed in the group's 'index' column are assigned the
    following repeat numbers, in order. The highest repeat is chosen numerically; it
    used to be chosen by string sort, under which e.g. 's10' sorts before 's2' and an
    id that was already issued could be handed out again.

    Parameters
    ----------
    df : pandas.DataFrame
        Table being assigned ids; its 'GP2sampleID' column is modified in place.
    clinicalid_subset : pandas.DataFrame or pandas.Series
        Rows for one clinical_id, with 'master_GP2sampleID' and 'index' columns
        (a single row may arrive as a Series).

    Returns
    -------
    pandas.DataFrame
        Copy of the rows of `df` that were assigned an id.

    Raises
    ------
    IndexError
        If the group has no non-null 'master_GP2sampleID'.
    """
    if isinstance(clinicalid_subset, pd.Series):
        clinicalid_subset = clinicalid_subset.to_frame().T

    master_ids = clinicalid_subset['master_GP2sampleID'].dropna()
    if master_ids.empty:
        # same exception type the positional lookup raised before, with a message
        raise IndexError("no master_GP2sampleID available for this clinical_id group")

    def _rep_no(gp2_sample_id):
        # '<study>_<number>_s<k>' -> k
        return int(gp2_sample_id.split("_")[2].replace("s", ""))

    sampleid = max(master_ids, key=_rep_no).split("_")
    getuniqueid = sampleid[0] + "_" + sampleid[1]
    get_sidrepno = _rep_no("_".join(sampleid)) + 1

    # row labels of `df` that still need an id
    index_modify = clinicalid_subset['index'].unique()
    assign_gp2sampleid = [getuniqueid + "_s" + str(get_sidrepno + i) for i in range(len(index_modify))]
    df.loc[index_modify, 'GP2sampleID'] = assign_gp2sampleid
    getnewidrows = df.loc[index_modify].copy()
    return (getnewidrows)

def master_keyv2(studies):
    """Return the master ID-tracker entries for the requested studies.

    Opens the blob ``IDSTRACKER/GP2IDSMAPPER.json`` in the bucket
    ``eu-samplemanifest`` for reading and walks its top-level key/value pairs
    with ``ijson.kvitems``, keeping only the pairs whose key is in ``studies``.

    Parameters
    ----------
    studies : container
        Study names to keep; membership is tested with ``in``.

    Returns
    -------
    dict
        ``{study: tracker_entry}`` for every top-level key found in
        ``studies``; empty if none match.
    """
    # ACCESS MASTERGP2IDS_JSON IN GP2 BUCKET
    # NOTE(review): `storage` is presumably google.cloud.storage - it is not
    # imported in the visible part of the notebook; confirm.
    tracker_blob = (
        storage.Client()
        .get_bucket('eu-samplemanifest')
        .blob('IDSTRACKER/GP2IDSMAPPER.json')
    )

    with tracker_blob.open("r") as handle:
        return {
            study_name: study_entries
            for study_name, study_entries in ijson.kvitems(handle, '')
            if study_name in studies
        }

In [None]:
# Visual separator: evaluates to a string of 40 dashes, shown as this cell's output.
40*"-"

In [None]:
        # GENERATE GP2 IDs #
        # Assigns GP2 identifiers to the rows of `df`, one study at a time, using
        # the master ID tracker returned by generategp2ids.master_keyv2.
        # The assignment runs only while st.session_state['master_get'] is None;
        # on later passes the stored result in st.session_state['df_finalids']
        # is re-displayed instead.
        # NOTE(review): this cell is indented as if lifted from inside a function.
        # `st`, `df`, `jumptwice`, `aggridPlotter` and `generategp2ids` are not
        # defined in this cell - confirm where they come from before running.
        jumptwice()
        st.subheader('GP2 IDs assignment...')
        studynames = list(df['study'].unique())

        # NOTE(review): `== None` works here, but `is None` is the idiomatic check.
        if st.session_state['master_get'] == None: # TO ONLY RUN ONCE
            #ids_tracker = generategp2ids.master_key(studies = studynames)
            ids_tracker = generategp2ids.master_keyv2(studies = studynames)
            study_subsets = []  # per-study frames with IDs assigned, concatenated after the loop
            log_new = []        # frames of newly assigned IDs, shown to the user after the loop
            # Start with no IDs; rows still null later are the ones needing brand-new IDs.
            df['GP2sampleID'] = None
            # GP2 ID ASSIGNMENT CODE BLOCK
            for study in studynames:
                st.write(f"Getting GP2IDs for {study} samples")
                df_subset = df[df.study==study].copy()
                # Look the study up in the tracker. Any exception raised inside this
                # try (including a missing study key) is treated as "no tracker".
                # NOTE(review): the bare `except` below also hides unrelated errors
                # from the DataFrame construction / merge - consider narrowing it.
                try:
                    #study_tracker = st.session_state['store_tracker'][study]
                    study_tracker = ids_tracker[study]
                    # Tracker dict -> frame: dict keys become column 'master_sample_id',
                    # values supply 'master_GP2sampleID' and 'clinical_id'; everything
                    # is cast to str.
                    study_tracker_df = pd.DataFrame.from_dict(study_tracker,
                                                            orient='index',
                                                            columns = ['master_GP2sampleID','clinical_id'])\
                                                    .rename_axis('master_sample_id').reset_index()\
                                                    .astype(str)

                    # Check if any sample ID exists in df_subset.
                    # An inner join on sample id is non-empty exactly when a submitted
                    # sample_id is already present in the tracker.
                    sample_id_unique = pd.merge(study_tracker_df, df_subset,
                                                left_on=['master_sample_id'], right_on=['sample_id'], how='inner')
                    if not sample_id_unique.empty:
                        st.error('We have detected sample ids submitted on previous versions')
                        st.error('Please, correct these sample IDs so that they are unique and resubmit the sample manifest.')
                        # Both merged frames carry 'clinical_id'; the '_y' copy is the
                        # one from df_subset (right-hand side of the merge).
                        sample_id_unique = sample_id_unique.rename(columns={"clinical_id_y": "clinical_id"})
                        st.dataframe(
                        sample_id_unique[['study','sample_id','clinical_id']].style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                        )
                        stopapp=True
                    else:
                        stopapp=False
                except:
                    study_tracker = None
                    stopapp = False
                # The stop is issued outside the try block, so it is not affected by
                # the bare `except` above.
                if stopapp:
                    st.stop()

                # bool() is False both for None (lookup failed) and for an empty tracker.
                if bool(study_tracker):
                    # WORK ON DUPLICATED IDS
                    # reset_index() exposes the original row labels as column 'index'
                    # so they survive the merge; the helper below uses them to write
                    # back into df_subset. The original index is then restored.
                    df_subset = df_subset.reset_index()
                    data_duplicated = pd.merge(df_subset, study_tracker_df, on=['clinical_id'], how='inner')
                    df_subset = df_subset.set_index('index')
                    df_subset.index.name = None

                    if data_duplicated.shape[0]>0:
                        new_clinicaldups = True
                        # For each clinical_id already in the tracker, assign the next
                        # sample-repeat IDs. The helper sets df_subset['GP2sampleID']
                        # in place and returns the updated rows.
                        newids_clinicaldups = data_duplicated.groupby('clinical_id')\
                                                        .apply(lambda x: generategp2ids.assign_unique_gp2clinicalids(df_subset,x))

                        if newids_clinicaldups.shape[0]>0:
                            newids_clinicaldups = newids_clinicaldups.reset_index(drop=True)[['study','clinical_id','sample_id','GP2sampleID']]
                            log_new.append(newids_clinicaldups)
                    else:
                        new_clinicaldups = False
                        newids_clinicaldups = pd.DataFrame()

                    # GET GP2 IDs METADATA for new CLINICAL-SAMPLE ID pairs
                    # Rows still without a GP2sampleID were not matched to a known
                    # clinical_id above and need brand-new IDs.
                    df_newids = df_subset[df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
                    if not df_newids.empty: # Get new GP2 IDs
                        # Rows that already received an ID: derive GP2ID (everything
                        # before the last "_") and SampleRepNo (the last "_" field).
                        df_wids = df_subset[~df_subset['GP2sampleID'].isnull()].reset_index(drop = True).copy()
                        df_wids['GP2ID'] = df_wids['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
                        df_wids['SampleRepNo'] = df_wids['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))

                        # Next free counter: second "_" field of the largest tracked ID, plus one.
                        # NOTE(review): max() compares strings, and the [1] index assumes
                        # the study code contains no "_" - presumably safe because the
                        # counter is zero-padded; confirm.
                        n=int(max(study_tracker_df['master_GP2sampleID'].to_list()).split("_")[1])+1
                        df_newids = generategp2ids.getgp2idsv2(df_newids, n, study)
                        df_subset = pd.concat([df_newids, df_wids], axis = 0)
                        study_subsets.append(df_subset)
                        log_new.append(df_newids[['study','clinical_id','sample_id','GP2sampleID']])
                        
                    else: # TO CONSIDER THE CASE IN WHICH WE ONLY HAD DUPLICATE IDS MAPPED ON THE MASTER FILE
                        df_subset['GP2ID'] = df_subset['GP2sampleID'].apply(lambda x: ("_").join(x.split("_")[:-1]))
                        df_subset['SampleRepNo'] = df_subset['GP2sampleID'].apply(lambda x: x.split("_")[-1])#.replace("s",""))
                        study_subsets.append(df_subset)

                # Brand new data - NO STUDY TRACKER FOR THIS COHORT
                else:
                    # NOTE(review): the self-assignment below is a no-op.
                    study = study
                    new_clinicaldups = False # Duplicates from master key json are treated differently to brand new data
                    n = 1  # numbering for a new study starts at 1
                    df_newids = generategp2ids.getgp2idsv2(df_subset, n, study)
                    study_subsets.append(df_newids)


                # CODE TO UPDATE THE GET FILE WE WILL USE TO UPDATE MASTER JSON
                # Builds ids_log = {study: {sample_id: (GP2sampleID, clinical_id)}} from
                # the newly assigned IDs (plus the clinical-duplicate IDs, when present).
                if (new_clinicaldups) and (newids_clinicaldups.shape[0]>0):
                    tmp = pd.concat([df_newids[['study','clinical_id','sample_id','GP2sampleID']], newids_clinicaldups])
                    tmp['master_value'] = list(zip(tmp['GP2sampleID'],
                                                    tmp['clinical_id']))
                    ids_log = tmp.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                                            x['master_value']))).to_dict()
                else:
                    df_update_master = df_newids.copy()
                    df_update_master['master_value'] = list(zip(df_update_master['GP2sampleID'],
                                                            df_update_master['clinical_id']))
                    ids_log = df_update_master.groupby('study').apply(lambda x: dict(zip(x['sample_id'],
                                                                                    x['master_value']))).to_dict()

                #generategp2ids.update_masterids(ids_log, study_tracker) # THIS WILL BE UPDATED ONCE THE USER CONFIRMS THE QC ( AT THE END)
                
                #if st.session_state['master_get'] == None:
                # Accumulate one [ids_log, study_tracker] pair per study in session
                # state: append when the list already exists, create it when None.
                if (isinstance(st.session_state['all_ids'], list)):
                    st.session_state['all_ids'].append( [ids_log, study_tracker] )
                if st.session_state['all_ids'] == None:
                    st.session_state['all_ids'] = [ [ids_log, study_tracker] ]
            

            # OUT OF FOR LOOP // END OF GP2 IDS ASSIGNMENT. LET'S RESUME df.
            df = pd.concat(study_subsets, axis = 0)
            # Move the last three columns to the front.
            # NOTE(review): presumably these are the GP2 ID columns; the resulting
            # order depends on how the per-study frames were built - confirm.
            df = df[list(df)[-3:] + list(df)[:-3]]
            st.write("GPS IDs assignment... OK")

            #if st.session_state['master_get'] == None:
            st.session_state['df_copy'] = df
            # A non-empty log_new means IDs were added for a study that already had a
            # tracker; show only those new assignments. Otherwise plot the full frame.
            if len(log_new) > 0:
                allnew = pd.concat(log_new, axis = 0).reset_index(drop=True)
                st.write("Thanks for uploading a new version of the sample manifest")
                st.write(f'We have detected a total of {allnew.shape[0]} new samples')
                st.write("We have assigned new GP2IDs to those. Showing them below...")
                st.dataframe(
                allnew.style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                #allnew.style.set_properties(**{"background-color": "brown", "color": "lawngreen"})
                )
            else:
                aggridPlotter(df)

            # Store the result and flip the guard so this branch is skipped next time.
            st.session_state['df_finalids'] = df
            st.session_state['master_get'] = 'DONE'

        else:
            # IDs were already assigned on an earlier pass: reuse the stored frame.
            df = st.session_state['df_finalids']
            aggridPlotter(df)
            # df_builder = GridOptionsBuilder.from_dataframe(st.session_state['df_copy'])
            # df_builder.configure_grid_options(alwaysShowHorizontalScroll = True,
            #                                     enableRangeSelection=True,
            #                                     pagination=True,
            #                                     paginationPageSize=10000,
            #                                     domLayout='normal')
            # godf = df_builder.build()
            # AgGrid(st.session_state['df_copy'],gridOptions=godf, theme='streamlit', height=300)
            #df = st.session_state['df_finalids']
        #st.session_state['master_get'] = 'DONE'
