# Formatting the raw json data obtained from the database

In [1]:
"""
Updated on 20th December 2023
@author: Dragos Gruia
"""

import json
import os
import pickle
import re
import warnings
from base64 import b64decode
from base64 import b64encode

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')

Once the data has been downloaded via the API, specify the following information, in order: the website linked to the data, the name of the file containing the data, whether or not speech was downloaded, and the output directory for your formatted data.

In [None]:
# Notebook settings: edit these before running the cells below.
set_domain = "ic3v2.cognitron.co.uk"  # Website from which the data was downloaded
output_file = "patientsv2-23-11-23-cog.obj"  # Progress report file from cognitron; its .json counterpart is the file that gets loaded
download_speech = False  # If true, only speech files will download (not read by the cells shown in this notebook)
output_dir = "./data_temp/"  # Folder that receives the formatted CSV files

Import in-house functions used for parsing and formatting the data.

In [16]:
from parsing_functions import *

In [None]:
# Load the downloaded data and parse it.
# The name of the JSON file is derived from `output_file` by swapping
# the '.obj' part of the name for '.json'.
path_to_file = output_file.replace('.obj', '.json')

df = load_json(path_to_file)
dfdata = extract_from_data(df, "data")

In [None]:
# Discard every record that has no task identifier, then renumber the rows.
dfdata = dfdata.dropna(subset=["taskID"]).reset_index(drop=True)

# Harmonise the data across clinical tests.
dfdata = task_specific_cleaning(dfdata)

# Create summary metrics for each test, then renumber the rows once more.
dfdatascores = separate_score(dfdata, "Scores")
dfdata = dfdata.reset_index(drop=True)

### Create files for each task containing summaries of performance

In [7]:
# Pre-process summary task data
# Output each task as a separate file

# DataFrame.to_csv does not create missing folders, so make sure the
# output directory exists before the first file is written.
os.makedirs(output_dir, exist_ok=True)

# Everything in dfdata except the bulky "Rawdata" and "Scores" columns.
# This does not depend on the task, so it is computed once, outside the loop.
dfdata_info = dfdata.drop(["Rawdata", "Scores"], axis=1)

summarydfs = []
for task in np.unique(dfdatascores.taskID):
    # Keep only this task's rows, then drop every column that contains a
    # missing value for it.
    dfexamp = dfdatascores[dfdatascores["taskID"] == task].dropna(axis=1)

    # Attach the remaining per-session information to the scores.
    finddf = pd.merge(dfexamp, dfdata_info, on=["user_id", "taskID"])
    finddf.to_csv(f"{output_dir}/{task}.csv")
    summarydfs.append(finddf)

In [None]:
# Pre-process summary questionnaire data
# Output each questionnaire as a separate file

# Questionnaires are the tasks whose identifier begins with "q".
questions = [task_id for task_id in dfdata.taskID if task_id.startswith("q")]
unique_questions = list(np.unique(questions))

# NOTE: this rebinds `summarydfs`, so the list built in the task-summary
# cell is no longer reachable under that name after this cell has run.
summarydfs = []

for task in unique_questions:
    if task not in questions:
        continue

    # Rows belonging to the current questionnaire, renumbered from zero.
    is_current = dfdata["taskID"] == task
    dfexamp = dfdata[is_current].reset_index(drop=True)

    dfexamp_resp = separate_response_obj(dfexamp, col_response="RespObject")
    dfexamp_resp.to_csv(f"{output_dir}/{task}_questionnaire.csv")
    summarydfs.append(dfexamp_resp)

### Create files for each task containing raw trial by trial data.

Because the format of the clinical data changes after a specific timepoint, we separate the data into two dataframes, and any further formatting is done separately to each one. The two dataframes are merged at the very end.

In [17]:
# Cut-off separating the two data formats. Sessions that started at or before
# this value go to dfdata1, later ones to dfdata2.
# NOTE(review): presumably a Unix timestamp in milliseconds (which would be
# 23 Feb 2022, 18:05 UTC) - confirm.
FORMAT_CHANGE_TIME = 1645639554944

# Cast explicitly to 64 bits: plain `int` can resolve to a 32-bit integer
# (e.g. on Windows with NumPy < 2), which cannot hold a 13-digit timestamp.
start_times = dfdata.startTime.astype("int64")

dfdata1 = dfdata[start_times <= FORMAT_CHANGE_TIME]
dfdata2 = dfdata[start_times > FORMAT_CHANGE_TIME]

Raw data is then pre-processed separately for each dataframe.

In [None]:
# Run the raw-data formatting on each half of the data, early format first.
# Each half is renumbered from zero before being handed to rawdata().
raws1, raws2 = (
    rawdata(subset.reset_index(drop=True)) for subset in (dfdata1, dfdata2)
)

The formatted raw data from the two dataframes is then merged and saved separately for each task and questionnaire.

In [None]:
# Merge the formatted raw data of both time periods and write one CSV per task.

# Group every non-empty raw dataframe under the single task it belongs to.
# raws1 is visited before raws2, so within a task the early-format rows come
# first, followed by the late-format rows.
frames_by_task = {}
for raw_frames in (raws1, raws2):
    for raw_frame in raw_frames:
        try:
            if raw_frame.shape[0] == 0:
                continue
            task_ids = np.unique(raw_frame["taskID"])
        except (AttributeError, KeyError, TypeError) as err:
            # A malformed entry is reported and skipped; the remaining
            # dataframes are still processed.
            print(f"Skipping unreadable raw dataframe ({err!r})")
            print(raw_frame)
            continue

        if len(task_ids) != 1:
            # A dataframe is expected to hold exactly one task.
            print(f"Skipping raw dataframe with several taskID values: {list(task_ids)}")
            print(raw_frame)
            continue

        frames_by_task.setdefault(task_ids.item(), []).append(raw_frame)

rawdfs = []
for task in tqdm(np.unique(dfdata.taskID)):
    dfs_task = frames_by_task.get(task, [])

    # pd.concat raises ValueError on an empty list, so a task without any
    # raw data is reported and skipped.
    if not dfs_task:
        print(f"No raw data found for task {task}")
        continue

    dff = pd.concat(dfs_task)
    dff.to_csv(f"{output_dir}/{task}_raw3.csv")
    rawdfs.append(dff)

## If the data contains speech, decode it, save it as .wav files and annotate each speech file

Create a dictionary of all possible trials in the speech tasks, to be used for annotations.

In [None]:
# Every stimulus that can be presented in each speech task, keyed by task ID.
# Used to label recordings whose trial information is missing.
# Every item is followed by a comma: without one, Python silently joins two
# adjacent string literals into a single word.
speech_stimuli = {
    "IC3_Repetition": [  # 20 words
        'VILLAGE',
        'MANNER',
        'GRAVITY',
        'AUDIENCE',
        'COFFEE',
        'PURPOSE',
        'CONCEPT',
        'MOMENT',
        'TREASON',
        'FIRE',
        'ELEPHANT',
        'CHARACTER',
        'BONUS',
        'RADIO',
        'TRACTOR',
        'HOSPITAL',
        'FUNNEL',
        'EFFORT',
        'TRIBUTE',
        'STUDENT',
    ],
    "IC3_Reading": [  # 11 words
        'if',
        'frilt',
        'home',
        'to',
        'dwelb',
        'or',
        'listening',
        'and',
        'concert',
        'blosp',
        'treasure',
    ],
    "IC3_NamingTest": [  # 30 pictures
        'funnel',
        'tree',
        'dominos',
        'toothbrush',
        'boomerang',
        'mask',
        'snail',
        'acorn',
        'scroll',
        'seahorse',
        'raquet',
        'unicorn',
        'bed',
        'scissors',
        'harmonica',
        'whistle',
        'canoe',
        'helicopter',
        'volcano',
        'house',
        'harp',
        'dart',
        'igloo',
        'pencil',
        'mushroom',
        'saw',
        'comb',
        'bench',
        'camel',
        'hanger',
    ],
    "IC3_SpokenPicture": [  # 2 pictures
        '0',
        '1',
    ],
}

## Decode and annotate speech files

In [None]:
# Decode every participant's base64-encoded recordings into .wav files stored as
#   <output_speech_files>/<user_id>/<taskID>/<TARGET>_<n>_<taskID>_<user_id>.wav
#
# NOTE(review): `output_speech_files` is not assigned in any cell shown in this
# notebook; it has to be defined before this cell is run (presumably the
# folder that receives the speech files - confirm).

# Resolve the output folder once. Files are written through explicit paths
# instead of repeatedly changing the working directory, so relative paths work
# and the working directory is left untouched if something fails half-way.
speech_root = os.path.abspath(output_speech_files)

for task, stimuli_values in speech_stimuli.items():

    # Open the formatted trial data for each task. A relative
    # `output_speech_files` is resolved from inside the output folder; an
    # absolute one is used as is (os.path.join drops `speech_root` then).
    speech_data = pd.read_csv(os.path.join(speech_root, f"{output_speech_files}_raw/{task}.csv"))
    trial_data = pd.read_csv(os.path.join(speech_root, f"{output_speech_files}_raw/{task}_raw.csv"))

    for _, sub in speech_data.iterrows():

        if sub.empty:
            continue

        voiceData = sub["media"]
        user_id = sub["user_id"]
        path2Task = sub["taskID"]

        # Create a directory for each subject, with one sub-directory per task.
        task_dir = os.path.join(speech_root, str(user_id), str(path2Task))
        os.makedirs(task_dir, exist_ok=True)

        # Trial information of this subject. Copied so that the "Target"
        # column can be assigned below without writing into a slice of
        # trial_data.
        temp_trial_data = trial_data[trial_data.loc[:, "user_id"].isin([user_id])].copy()

        # Exception for when the subject did not consent to the speech task.
        # Checked before the recordings are parsed, so a missing "media"
        # value cannot make the parsing step fail first.
        if len(temp_trial_data) == 0:
            with open(os.path.join(task_dir, "no_speech.txt"), "w") as empty_file:
                empty_file.write("No speech files for this task")
            continue

        # Clean the encoded speech data: split on the data-URI prefix, drop
        # whatever precedes the first recording, and strip the "'," separator
        # left at the end of each chunk.
        voiceData = re.split("'data:audio/wav;base64,", voiceData)[1:]
        voiceData = [chunk.replace("',", "") for chunk in voiceData]

        if task == "IC3_SpokenPicture":
            temp_trial_data["Target"] = temp_trial_data["Level"].astype(str)

        # Address bug where there are more recordings than trial rows.
        # NOTE: a single placeholder row is prepended, so this only repairs
        # the case where exactly one trial row is missing.
        if len(voiceData) > len(temp_trial_data.Target):
            temp_stimuli = pd.Series(stimuli_values)
            if task == "IC3_Repetition":
                new_row = pd.DataFrame({'Target': "Unknown_stimuli"}, index=[0])
            else:
                # First stimulus of the task that has no trial row.
                missing_stimuli = temp_stimuli[(~temp_stimuli.isin(temp_trial_data.Target)).to_list().index(True)].upper()
                new_row = pd.DataFrame({'Target': missing_stimuli}, index=[0])

            temp_trial_data = pd.concat([new_row, temp_trial_data.loc[:]]).reset_index(drop=True)

        # Decode and output each speech file.
        for count, tempVoice in enumerate(voiceData):
            temp_name = (
                temp_trial_data.Target.iloc[count].upper()
                + '_' + str(count)
                + '_' + str(path2Task)
                + '_' + str(user_id)
                + '.wav'
            )
            with open(os.path.join(task_dir, temp_name), "wb") as test_wav:
                test_wav.write(b64decode(tempVoice))
                
            
