In [46]:
import numpy as np
import pandas as pd
import regex

In [7]:
# Load the tab-separated clinical exports that live under the data directory
path = './data/'


def _read_clinical_tsv(filename):
    """Read one tab-separated file located under `path` into a DataFrame."""
    return pd.read_csv(path + filename, sep='\t')


clinical_all_df = _read_clinical_tsv('MAARS_all_Fri_Apr_04_14h_CEST_2014.csv')
clinical_ad_df = _read_clinical_tsv('MAARS_AD_full_20190131_12-34-49.csv')
clinical_ctrl_df = _read_clinical_tsv('MAARS_Control_full_20190131_12-40-12.csv')
clinical_pso_df = _read_clinical_tsv('MAARS_PSO_full_20190131_12-40-53.csv')

In [135]:
# Create function to change column names
def _last_parenthesised_span(text):
    """Locate the last outermost balanced '(...)' group in `text`.

    Returns a `(start, end)` tuple where `start` is the index of the opening
    parenthesis and `end` is one past its matching closing parenthesis, or
    `None` when `text` contains no balanced pair. Unmatched parentheses are
    ignored.
    """
    open_positions = []
    span = None
    for pos, char in enumerate(text):
        if char == '(':
            open_positions.append(pos)
        elif char == ')' and open_positions:
            # The pair that closes last can never be nested inside another
            # balanced pair (that pair would have to close even later), so the
            # final value of `span` is the last outermost group.
            span = (open_positions.pop(), pos + 1)
    return span


def changeColName(col):
    """Normalise a '#'-separated column name into a compact tag path.

    The following transformations are applied, in order:
        1) The last outermost balanced parenthesised group (parentheses
           included) is removed. Only that single occurrence is removed, even
           if the same text appears elsewhere in the name.
        2) The string is split on '#' and every tag is trimmed.
        3) Duplicate tags are dropped, keeping the first occurrence and the
           original order.
        4) The tags are joined with '->' and remaining spaces become '_'.

    Parameters
    ----------
    col : str
        Original column name.

    Returns
    -------
    str
        The transformed column name.
    """
    # Remove the group by position so that identical text elsewhere in the
    # name is left untouched.
    span = _last_parenthesised_span(col)
    if span is not None:
        start, end = span
        col = col[:start] + col[end:]

    # Trim BEFORE de-duplicating: dropping a trailing "(...)" leaves a trailing
    # space on the last tag, which would otherwise hide it from the duplicate
    # check (e.g. 'Foo#Foo (bar)').
    tags = [tag.strip() for tag in col.split('#')]

    # dict.fromkeys keeps the first occurrence of each tag in insertion order.
    tags = list(dict.fromkeys(tags))

    return '->'.join(tags).replace(' ', '_')

# Collect every distinct column name from the AD, PSO and control frames and
# save the new-name / old-name correspondence produced by changeColName
all_cols = pd.concat(
    [pd.Series(frame.columns) for frame in (clinical_ad_df, clinical_pso_df, clinical_ctrl_df)]
).drop_duplicates()
columns_mapping = pd.concat([all_cols.map(changeColName), pd.Series(all_cols)], axis=1)
columns_mapping = columns_mapping.rename(columns={0:'new_name',1:'old_name'})
columns_mapping.to_csv('columns_mapping.csv')

# Apply changeColName to the column labels of each frame
for frame in (clinical_ad_df, clinical_pso_df, clinical_ctrl_df):
    frame.columns = frame.columns.map(changeColName)

# Write the renamed frames as tab-separated files (no index column)
file_suffix = '_new_cols'
ext = '.csv'
renamed_outputs = (
    ('MAARS_AD_full_20190131_12-34-49', clinical_ad_df),
    ('MAARS_Control_full_20190131_12-40-12', clinical_ctrl_df),
    ('MAARS_PSO_full_20190131_12-40-53', clinical_pso_df),
)
for stem, frame in renamed_outputs:
    frame.to_csv(stem + file_suffix + ext, sep='\t', index=False)