In [3]:
import numpy as np
import pandas as pd
import regex
from dateutil.parser import parse

In [4]:
# Read clinical data files
path = '../data/'
clinical_all_df = pd.read_csv(path + 'MAARS_all_Fri_Apr_04_14h_CEST_2014.csv', sep='\t')
clinical_ad_df = pd.read_csv(path + 'MAARS_AD_full_20190131_12-34-49.csv', sep='\t')
clinical_ctrl_df = pd.read_csv(path + 'MAARS_Control_full_20190131_12-40-12.csv', sep='\t')
clinical_pso_df = pd.read_csv(path + 'MAARS_PSO_full_20190131_12-40-53.csv', sep='\t')

# Create function to change column names
def changeColName(col):
    """ This function takes a string as an input and applies following transformations:
        1) Finds and removes last substring between parenthesis
        2) Splits the string based on '#' and removes duplicate items
        3) Trims strings, collates back into one string and replaces spaces with _"""
    # Extract latest string between outermost parenthesis
    final = regex.findall('\(((?>[^\(\)]+|(?R))*)\)', col)
    if len(final): col = col.replace('(' + final[-1] + ')','')
    # Deconstruct into list of tags
    tags = col.split('#')
    # Remove duplicated tags
    tags, idx = np.unique(tags,return_index=True)
    tags = tags[np.argsort(idx)].tolist()
    # Trim strings, collate into one string and replace spaces with _
    tags = list(map(str.strip, tags))
    s = '->'.join(tags)
    s = s.replace(' ', '_')
    return s

# Merge all columns into a single array and write mapping
all_cols = pd.concat([pd.Series(clinical_ad_df.columns), pd.Series(clinical_pso_df.columns), pd.Series(clinical_ctrl_df.columns)]).drop_duplicates()
pd.concat([all_cols.map(lambda x: changeColName(x)), pd.Series(all_cols)], axis=1).rename(columns={0:'new_name',1:'old_name'}).to_csv('columns_mapping.csv')

# Change column names using changeColName
clinical_ad_df.columns = clinical_ad_df.columns.map(lambda x: changeColName(x))
clinical_pso_df.columns = clinical_pso_df.columns.map(lambda x: changeColName(x))
clinical_ctrl_df.columns = clinical_ctrl_df.columns.map(lambda x: changeColName(x))

In [5]:
# Ayoub: Reformatter les colonnes Treatment/medication/disease

def get_unique_values(df, col_index):
    values = np.setdiff1d(np.unique(df.iloc[:,col_index].astype('str').values),'nan')
    values = pd.Series(values).str.replace(' ', '_').values
    return values

def get_unique(values):
    _, idx = np.unique(values, return_index=True)
    return values[np.sort(idx)]

def generate_col_names(prefix, values):
    prefix_unique = get_unique(prefix)
    col_names = []
    for pref in prefix_unique:
        col_names.extend([pref + '->' + s for s in values])
    return col_names

def convert_columns(df, col_group_index, pattern_len):
    """ This function takes a dataframe, a range of columns and a pattern length as input.
        It dummifies columns generating one column for each pair sub_group/value."""
    length = pattern_len
    group_name_index = col_group_index[0:length]
    
    group = clinical_ad_df.iloc[:,col_group_index].columns
    tree = np.vstack(group.str.split('->'))
    group_name = tree[:,1]
    sub_group_order = tree[:,2]
    sub_group_name = get_unique(tree[:,3])
    sub_group_length = len(sub_group_name)
    
    # 1 - Generate temporary dataframe with columns = sub_group->item (e.g: (med.)_Medication_name->Acetylsalicylic_acid)
    # Find unique items (e.g. list of possible treatments, concomitant medication...)
    values = get_unique_values(df,group_name_index)
    # Create columns based on sub_group->item and generate temporary dataframe
    cols = generate_col_names(sub_group_name,values)
    temp_df = pd.DataFrame(np.nan, columns=cols, index=df.index)
    
    # Populate temp dataframe
    for i, row in df.iterrows():
        for col in group_name_index:
            item = row[col]
            col_name = row.index[col]
            sub_group = col_name.split('->')[3]
            if item is not np.nan:
                item = item.replace(' ', '_')
                temp_df.loc[i, sub_group + '->' + item] = True
                k = 1
                for sub_group in sub_group_name[1:]:
                    temp_df.loc[i, sub_group + '->' + item] = row[col + length*k]
                    k+=1
    return temp_df

def replace_yes_no_na(df_in, col_list):
    """ This function takes a dataframe and list of column names as an input. It creates a _was_missing column and replaces
    missing values with 0. It also changes 'No' values to 0 and 'Yes' values to 1"""
    df = df_in.copy()
    # make new columns indicating what will be imputed
    cols_with_missing = (col for col in col_list if df[col].isnull().any() | df[col].astype('str').str.contains('Unk').any())
    for col in cols_with_missing:
        df[col + '_was_missing'] = df[col].isnull() | df[col].astype('str').str.contains('Unk')
    df.loc[:,col_list] = df.loc[:,col_list].fillna(False)
    df.loc[:,col_list] = df.loc[:,col_list].replace(['Yes','No','Unknown'],[True,False,False])
    df.loc[:,col_list] = df.loc[:,col_list].astype('bool')
    return df

# Get group of columns to be categorized and apply convert_columns
print('-----------Converting concomitant medication/treatment/concurrent disease columns-----------')
group_index1 = range(57,111)
length = 9
ad_medication_df = convert_columns(clinical_ad_df, group_index1, length)
print("INFO: Created dataframe with converted cocomitant medication columns")
group_index2 = range(111,188)
length = 11
ad_treatment_df = convert_columns(clinical_ad_df, group_index2, length)
print("INFO: Created dataframe with converted treatment columns")
group_index3 = range(188,208)
length = 4
ad_concurrent_dis_df = convert_columns(clinical_ad_df, group_index3, length)
print("INFO: Created dataframe with converted concurrent disease columns")

# Concatenate new dataframes and merge into one
ad_col_change_df = pd.concat([ad_medication_df,ad_treatment_df,ad_concurrent_dis_df],axis=1, sort=False)
ad_df = clinical_ad_df.merge(ad_col_change_df, how='left', left_index=True, right_index=True)
print("INFO: Merged dataframes into ad_df. Shape is: {}".format(ad_df.shape))

print('-----------Formatting newly created columns-----------')
# Drop initial columns
ad_df = ad_df.drop(ad_df.columns[pd.np.r_[group_index1, group_index2, group_index3]],axis=1)
print("INFO: Dropped initial converted columns in ad_df. Shape is: {}".format(ad_df.shape))

# Manage NAN values: 1- Get name columns and replace Nan by 0, 2- Get "on_going" columns and apply replace_yes_no_na(), 3- Drop "Freq" and "Dose" columns
name_columns = ad_col_change_df.columns[ad_col_change_df.isin([True]).any()]
ad_df.loc[:,name_columns] = ad_df.loc[:,name_columns].fillna(False)
print("INFO: Replaced Nan by 0 in converted columns. Shape is: {}".format(ad_df.shape))
on_going_columns = ad_df.columns[ad_df.columns.str.contains('going')]
ad_df = replace_yes_no_na(ad_df, on_going_columns)
print("INFO: Changed on_going columns and added _was_missing feature. Shape is: {}".format(ad_df.shape))
responder_columns = ad_df.columns[ad_df.columns.str.lower().str.contains('responder')]
ad_df = replace_yes_no_na(ad_df, responder_columns)
print("INFO: Changed responder columns and added _was_missing feature. Shape is: {}".format(ad_df.shape))
dose_freq_columns = ad_df.columns[ad_df.columns.str.lower().str.contains('dose') | ad_df.columns.str.lower().str.contains('freq')]
ad_df = ad_df.drop(columns=dose_freq_columns)
print("INFO: Dropped Freq and Dose columns. Shape is: {}".format(ad_df.shape))

-----------Converting concomitant medication/treatment/concurrent disease columns-----------


INFO: Created dataframe with converted cocomitant medication columns


INFO: Created dataframe with converted treatment columns


INFO: Created dataframe with converted concurrent disease columns
INFO: Merged dataframes into ad_df. Shape is: (94, 808)
-----------Formatting newly created columns-----------
INFO: Dropped initial converted columns in ad_df. Shape is: (94, 657)
INFO: Replaced Nan by 0 in converted columns. Shape is: (94, 657)


INFO: Changed on_going columns and added _was_missing feature. Shape is: (94, 743)
INFO: Changed responder columns and added _was_missing feature. Shape is: (94, 765)
INFO: Dropped Freq and Dose columns. Shape is: (94, 619)


In [6]:
# Guillaume 
# dataframe : ad_df
#

def test_line(line):
    if line =='empty':
        return line
    elif (len(str.split(line, ";")) == 1) and (line !='empty'):
         return 'single'
    else:
        return 'multiple'

def encode_columns_with_semi(df, col_list):
    """Encode lines with e.g. multiple allergies separated by a semicolon ;
    Parameters:
    :df : input dataframe
    :col_list: list of columns to process
    
    returns: dataframe with dummyfied columns
    """
    col_list_encoded = []
    for col in col_list:
        col_list_encoded.append(col+'_encoded')
        df[col].fillna('empty', inplace = True)
        df[col+'_encoded']=df[col].apply(lambda line : test_line(line))
    return pd.get_dummies(df.drop(columns=col_list), columns=col_list_encoded, drop_first=True, dtype='bool')

## Sarra : a binariser et catégoriser
def binarize(base_elem, elem):
    if str.lower(elem) in str(base_elem):
        return True
    else:
        return False
def to_binarize_and_categorize(df, columns):
    for col in columns:
        values = df[col].unique().tolist()
        values = list(map(lambda x: str(x).split(';'), values))
        new_columns = set([ elem.strip().lower() for v in values for elem in v])
        new_columns.discard('nan')
        for elem in new_columns:
            df[col + '_' + str(elem).replace(' ', '_')] = df[col].apply(lambda x: binarize(x, elem))
    return df.drop(columns=columns)

def to_binarize(df, col_list):
    new_cols = [str(col) + '_binarized' for col in col_list]
    df[new_cols] = ~df[col_list].isna()
    return df.drop(columns=col_list)

def remove_constant_columns(df):
    list_col_unique = df.columns[ df.nunique()==1 ].tolist()
    list_col_unique.extend(df.columns[df.isna().all()].tolist())
    df = df.drop(list_col_unique, axis=1)
    return df
def remove_dates(df):
    for col in df.columns.unique():
        if ('date' in col.lower()):
            df = df.drop([col], axis=1)
    return df

In [7]:
# Ayoub: Replace nan values based on guidelines

# Get guidelines for nan columns and standardize instructions
nan_columns = pd.read_csv('./nan_columns.csv', header=None)
nan_columns.columns = nan_columns.iloc[2,:]
nan_columns.loc[0,:] = nan_columns.loc[0,:].str.lower().str.replace('é','e').str.strip()

# Drop dates and constant columns
ad_df = remove_constant_columns(remove_dates(ad_df))

# Extract Scorad related columns
scorad_columns = list(ad_df.columns[ad_df.columns.str.lower().str.contains('scorad')])
scorad_columns.append('patient->Clinical_Assessment->Global_Assessment_Score')
scorad_ad_df = ad_df[scorad_columns]
ad_df = ad_df.drop(columns=scorad_columns)

# Get Nan values in main dataset and do sanity check
nan_ad_cols = ad_df.columns[ad_df.isna().any()]
print('{} columns containing Nan values not present in instructions file'.format(len(nan_ad_cols[~nan_ad_cols.isin(nan_columns.columns)])))
nan_ad_df = nan_columns.loc[0:1,nan_ad_cols[nan_ad_cols.isin(nan_columns.columns)]]
print('Remaining Nan columns: {}'.format(len(nan_ad_cols)))

# Replace Nan values based on instructions
cols_to_remove = nan_ad_df.loc[:,nan_ad_df.loc[0,:].isin(['on vire', 'a virer', np.nan]) | nan_ad_df.loc[0,:].str.contains('vire')].columns
ad_df = ad_df.drop(columns=cols_to_remove)
nan_ad_cols = ad_df.columns[ad_df.isna().any()]
nan_ad_df = nan_columns.loc[0:1,nan_ad_cols[nan_ad_cols.isin(nan_columns.columns)]]
print('Removing columns - Remaining Nan columns: {}'.format(len(nan_ad_cols)))

# Categorize patient->Diagnostic_&_Phenotypic_Data->Ethnicity/Family_History->Known_Allergies column
ad_df = encode_columns_with_semi(ad_df, ['patient->Diagnostic_&_Phenotypic_Data->Ethnicity/Family_History->Known_Allergies'])
nan_ad_cols = ad_df.columns[ad_df.isna().any()]
nan_ad_df = nan_columns.loc[0:1,nan_ad_cols[nan_ad_cols.isin(nan_columns.columns)]]
print('Categorizing empty/single/multiple - Remaining Nan columns: {}'.format(len(nan_ad_cols)))

# Categorize and binarize
cols_to_categorize_and_binarize = nan_ad_df.loc[:,nan_ad_df.loc[0,:].str.contains('categorise')].columns #& nan_ad_df.loc[0,:].str.contains('binarise')
ad_df = to_binarize_and_categorize(ad_df,cols_to_categorize_and_binarize)
nan_ad_cols = ad_df.columns[ad_df.isna().any()]
nan_ad_df = nan_columns.loc[0:1,nan_ad_cols[nan_ad_cols.isin(nan_columns.columns)]]
print('Encoding categorical - Remaining Nan columns: {}'.format(len(nan_ad_cols)))

# Binarize columns
cols_to_categorize_and_binarize = nan_ad_df.loc[:,nan_ad_df.loc[0,:].str.contains('binarise')].columns
ad_df = to_binarize(ad_df,cols_to_categorize_and_binarize)
nan_ad_cols = ad_df.columns[ad_df.isna().any()]
nan_ad_df = nan_columns.loc[0:1,nan_ad_cols[nan_ad_cols.isin(nan_columns.columns)]]
print('Binarizing - Remaining Nan columns: {}'.format(len(nan_ad_cols)))
print('Shape of AD clinical dataset after transformation on Nan: {}'.format(ad_df.shape))

# Binarize Yes/No columns
object_cols = ad_df.columns[ad_df.dtypes == 'object']
object_yes_cols = [col for col in object_cols if ad_df[col].str.contains('Yes').any() | ad_df[col].str.contains('No').any()]
ad_df = replace_yes_no_na(ad_df, object_yes_cols)

# Drop sample identifier columns
#id_cols = list(ad_df.columns[ad_df.columns.str.contains('MAARS_Sample_identifier')]).remove(['involved_skin_biopsy->MAARS_Sample_identifier','uninvolved_skin_biopsy->MAARS_Sample_identifier'])
#id_cols.extend(['patient->Identification->Institution','patient->Identification->Physician'])
#ad_df = ad_df.drop(columns=id_cols)

# Categorize remaining columns
#ad_df.set_index('patient->Identification->MAARS_identifier')
object_cols = list(ad_df.columns[ad_df.dtypes == 'object'])
object_cols = [col for col in object_cols if col not in ad_df.columns[ad_df.columns.str.contains('MAARS')]]
ad_df = to_binarize_and_categorize(ad_df, object_cols)

print(ad_df.dtypes.value_counts())
ad_df = pd.concat([ad_df, scorad_ad_df], axis=1)
ad_df.to_csv('ad_df_with_scorad.csv')

0 columns containing Nan values not present in instructions file
Remaining Nan columns: 36
Removing columns - Remaining Nan columns: 14
Categorizing empty/single/multiple - Remaining Nan columns: 13
Encoding categorical - Remaining Nan columns: 8
Binarizing - Remaining Nan columns: 3
Shape of AD clinical dataset after transformation on Nan: (94, 323)


bool      339
object      4
int64       1
dtype: int64


In [8]:
final_data = ad_df.copy()
scorad_cols = [col for col in final_data.columns if ('scorad' in col.lower() or col == 'patient->Clinical_Assessment->Global_Assessment_Score')and (col != 'patient->SCORAD_index->SCORAD->SCORAD_Score')]
final_data = remove_constant_columns(final_data)
final_data = final_data.drop(scorad_cols, axis=1)
final_data.head()

Unnamed: 0,patient->Identification->MAARS_identifier,patient->Inclusion/Exclusion_criteria->Exclusion_criteria->Subject_has_no_allergen-specific_IgE_and_no_allergen-specific_immediate_type_reactions,patient->Inclusion/Exclusion_criteria->Exclusion_criteria->Subjects_who_have_received_systemic_antibiotics_within_the_previous_4_weeks_prior_to_screening,patient->Diagnostic_&_Phenotypic_Data->Year_of_birth,patient->Diagnostic_&_Phenotypic_Data->Ethnicity/Family_History->Smoking,patient->Hanifin_and_Rajka_diagnostic_criteria->2._Typical_morphology_and_distribution,patient->Hanifin_and_Rajka_diagnostic_criteria->4._White_dermographismus,patient->Hanifin_and_Rajka_diagnostic_criteria->5._Xerosis,patient->Hanifin_and_Rajka_diagnostic_criteria->6.Palmar_hyperlinearity/_Keratosis_pilaris,patient->Hanifin_and_Rajka_diagnostic_criteria->9._Early_age_of_onset,...,patient->Hanifin_and_Rajka_diagnostic_criteria->14._Intolerance_to:_lipid_solvents,patient->Hanifin_and_Rajka_diagnostic_criteria->8._Raised_serum_IgE_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Allergic_rhinitis_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Allergic_conjunctivitis_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Asthma_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Urticaria_binarized,patient->Inclusion/Exclusion_criteria->Exclusion_criteria->Subject_has_no_allergen-specific_IgE_and_no_allergen-specific_immediate_type_reactions_was_missing,patient->Diagnostic_&_Phenotypic_Data->Ethnicity/Family_History->Smoking_was_missing,patient->Diagnostic_&_Phenotypic_Data->Gender_male,patient->SCORAD_index->SCORAD->SCORAD_Score
0,MAARS_1_003,False,False,1961,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,43.5
1,MAARS_1_008,False,False,1947,False,True,True,True,True,True,...,False,True,False,False,False,False,False,False,False,57.5
2,MAARS_1_016,False,False,1953,True,True,True,True,True,True,...,False,True,False,False,False,False,False,False,False,73.0
3,MAARS_1_017,False,False,1967,False,True,True,True,False,True,...,False,True,False,False,False,False,False,False,False,78.5
4,MAARS_1_044,False,False,1989,True,True,True,True,True,False,...,False,True,False,False,False,False,False,False,False,64.5


In [9]:
final_data.shape

(94, 302)

In [10]:
# Create Masks to encode specific columns
def apply_mask(mask, dataframe):
    all_columns = dataframe.apply( lambda x: mask(x.name, dataframe))
    columns = all_columns.index[all_columns == True]
    return columns
def mask_clinic(x, dataframe):
    if x != 'sample_id' and x != 'MAARS_identifier' and x != 'CUSTOM_Age':
        return True
    else:
        return False
def mask_all(x, dataframe):
    identifier = ('maars'in x.lower())
    date_of_birth = (x == 'patient->Diagnostic_&_Phenotypic_Data->Year_of_birth')
    scorad = (x == 'patient->SCORAD_index->SCORAD->SCORAD_Score')
    
    if  scorad | identifier| date_of_birth:
        return False
    else:
        return True
    

In [11]:
# Function to encode specific columns
from sklearn.preprocessing import LabelEncoder
def encode(dataframe, mask):
    columns = apply_mask(mask, dataframe)
    new = pd.DataFrame()
    lookup_table = pd.DataFrame(columns = ['column', 'lookup_values'], index = range(len(columns)))
    index = 0
    for col in dataframe.columns:
        if col in columns:
            mask_null = dataframe[col].isnull()
            le = LabelEncoder()
            encoded = le.fit_transform(dataframe[col].astype(str).apply(lambda x: x.lower()))
            le_name_mapping = dict(zip(le.transform(le.classes_), le.classes_))
            if 'nan' in le_name_mapping.keys():
                del le_name_mapping['nan']
            lookup_table.loc[index, :] = [col,le_name_mapping]
            index+=1        
            new[col] = encoded
            new[col]  = new[col].where(~mask_null, dataframe[col])
        else:
            new[col] = dataframe[col]
    return new, lookup_table

In [12]:
# Encode clinical data ( =can be useful)
new_clinic, lt_clinic = encode(clinical_all_df, mask_clinic)

In [13]:
# Encode ad_full data
new_ad_full, lt_ad_full = encode(final_data, mask_all)

In [14]:
new_ad_full.head()

Unnamed: 0,patient->Identification->MAARS_identifier,patient->Inclusion/Exclusion_criteria->Exclusion_criteria->Subject_has_no_allergen-specific_IgE_and_no_allergen-specific_immediate_type_reactions,patient->Inclusion/Exclusion_criteria->Exclusion_criteria->Subjects_who_have_received_systemic_antibiotics_within_the_previous_4_weeks_prior_to_screening,patient->Diagnostic_&_Phenotypic_Data->Year_of_birth,patient->Diagnostic_&_Phenotypic_Data->Ethnicity/Family_History->Smoking,patient->Hanifin_and_Rajka_diagnostic_criteria->2._Typical_morphology_and_distribution,patient->Hanifin_and_Rajka_diagnostic_criteria->4._White_dermographismus,patient->Hanifin_and_Rajka_diagnostic_criteria->5._Xerosis,patient->Hanifin_and_Rajka_diagnostic_criteria->6.Palmar_hyperlinearity/_Keratosis_pilaris,patient->Hanifin_and_Rajka_diagnostic_criteria->9._Early_age_of_onset,...,patient->Hanifin_and_Rajka_diagnostic_criteria->14._Intolerance_to:_lipid_solvents,patient->Hanifin_and_Rajka_diagnostic_criteria->8._Raised_serum_IgE_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Allergic_rhinitis_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Allergic_conjunctivitis_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Asthma_binarized,patient->Hanifin_and_Rajka_diagnostic_criteria->33_b)_Family_history_of_atopy->Fam._hist._Urticaria_binarized,patient->Inclusion/Exclusion_criteria->Exclusion_criteria->Subject_has_no_allergen-specific_IgE_and_no_allergen-specific_immediate_type_reactions_was_missing,patient->Diagnostic_&_Phenotypic_Data->Ethnicity/Family_History->Smoking_was_missing,patient->Diagnostic_&_Phenotypic_Data->Gender_male,patient->SCORAD_index->SCORAD->SCORAD_Score
0,MAARS_1_003,0,0,1961,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,43.5
1,MAARS_1_008,0,0,1947,0,1,1,1,1,1,...,0,1,0,0,0,0,0,0,0,57.5
2,MAARS_1_016,0,0,1953,1,1,1,1,1,1,...,0,1,0,0,0,0,0,0,0,73.0
3,MAARS_1_017,0,0,1967,0,1,1,1,0,1,...,0,1,0,0,0,0,0,0,0,78.5
4,MAARS_1_044,0,0,1989,1,1,1,1,1,0,...,0,1,0,0,0,0,0,0,0,64.5


In [16]:
lt_ad_full

Unnamed: 0,column,lookup_values
0,patient->Inclusion/Exclusion_criteria->Exclusi...,"{0: 'false', 1: 'true'}"
1,patient->Inclusion/Exclusion_criteria->Exclusi...,"{0: 'false', 1: 'true'}"
2,patient->Diagnostic_&_Phenotypic_Data->Ethnici...,"{0: 'false', 1: 'true'}"
3,patient->Hanifin_and_Rajka_diagnostic_criteria...,"{0: 'false', 1: 'true'}"
4,patient->Hanifin_and_Rajka_diagnostic_criteria...,"{0: 'false', 1: 'true'}"
...,...,...
291,patient->Hanifin_and_Rajka_diagnostic_criteria...,"{0: 'false', 1: 'true'}"
292,patient->Hanifin_and_Rajka_diagnostic_criteria...,"{0: 'false', 1: 'true'}"
293,patient->Inclusion/Exclusion_criteria->Exclusi...,"{0: 'false', 1: 'true'}"
294,patient->Diagnostic_&_Phenotypic_Data->Ethnici...,"{0: 'false', 1: 'true'}"


In [15]:
# Export ad_full encoded data
new_ad_full.to_csv('ad_df_encoded.csv', index=False)