# Data extraction from server using progress reports

#### We load the necessary libraries and download the data from our database. Progress reports are compiled via database queries, and contain information pertaining to participants' progress in the IC3 digital health platform. The information contained in the progress reports is then used to download data points that meet certain inclusion criteria via in-house API requests.   

In [None]:
"""
Updated on 5th of April 2024
@author: Dragos Gruia
"""

from http.client import HTTPSConnection
import json
import tqdm
from base64 import b64encode
import pandas as pd
import json
import pickle
import os 
import warnings
warnings.filterwarnings('ignore')

## Specify which data to download

One needs to specify (in this order) the specific website to which the data is linked, the progress report associated with that website, whether you wish to download speech or cognitive/questionnaire data, the name of the output file, and the name of the directory where the data will be saved. 

In [2]:
# Download settings read by the cells below.

# Domain whose records are kept: after each participant's data is fetched,
# rows whose `domain` differs from this value are dropped.
set_domain = 'ic3.cognitron.co.uk' # Website from which we want to download the data
# Progress report for that website. The participant-selection cell reads its
# 'status.task_count' and 'hash' columns.
ids = pd.read_csv("progress_reports/progreportic3_patients_v2_231123.csv")
# True  -> keep only the speech tasks (Reading, Repetition, SpokenPicture, NamingTest).
# False -> keep everything except those speech tasks.
download_speech = False # If true, only speech files will download
# Name of the pickle file holding the raw download; the final JSON-lines file
# uses the same name with '.obj' replaced by '.json'.
output_file = 'patientsv2-23-11-23-cog.obj'
# NOTE(review): the visible save/load code opens `output_file` directly and
# never references `output_dir` -- confirm whether files are meant to be
# written under this directory.
output_dir = "./data_temp/"

In [None]:
# Choose which participants to download: only those with more than two
# recorded tasks. To keep everyone, comment out the filtering line below.
ids = ids[ids['status.task_count'] > 2]
# From here on `ids` is just the column of participant hashes.
ids = ids['hash']
len(ids)

## Download the data via HTTP request

In [None]:
# Download every selected participant's data over HTTPS, one request per
# participant, then pickle the list of per-participant DataFrames.
#
# Reads:  ids, set_domain, download_speech, output_file (defined in earlier cells)
# Writes: data_dragos (list of DataFrames, deleted after being pickled),
#         weird_ids (participants whose download failed after all retries)

# Set up the https connection

base_url = "www.cognitron.co.uk"
c = HTTPSConnection(base_url)

# Log in with your username and password. HTTP Basic authentication only
# base64-*encodes* the credentials (this is NOT encryption); they are protected
# in transit by HTTPS. Never commit real credentials to version control.

user = "username"
passw = "password"
userAndPass = b64encode((f"{user}:{passw}").encode("ascii")).decode("ascii")
headers = { 'Authorization' : f'Basic {userAndPass}' }
data_dragos = []
weird_ids = []
i=0
attempts = 0
max_attempts = 3  # Tries per participant before giving up on them
speech_tasks = 'IC3_Reading|IC3_Repetition|IC3_SpokenPicture|IC3_NamingTest'

while i < len(ids):
    user_id = ids.iloc[i]
    try:

        # Download one participant's data at a time and check for data storage errors or corrupted files

        user_id = user_id.replace("'",'"')
        # Re-insert the dashes of the 8-4-4-4-12 identifier layout used in the API path
        user_id = '-'.join([user_id[0:8], user_id[8:12], user_id[12:16],user_id[16:20],user_id[20:]])
        query = f"/api/v1/data/user/{user_id}/"
        c.request('GET', query, headers=headers)
        res = c.getresponse()
        data = res.read()
        data_parsed = json.loads(data.decode())

        # An unsuccessful API answer is treated like any other failed attempt.
        # (Previously it changed neither `i` nor `attempts`, so the loop
        # requested the same participant forever.)
        if not data_parsed["Success"]:
            raise RuntimeError(f'API reported failure (HTTP status {res.status})')

        # Select only the data from the set domain, and the data type (speech or non-speech)

        df = pd.DataFrame(data_parsed["Data"])
        df = df[df.domain == set_domain].copy()
        if not df.empty:
            df["user_id"] = user_id
            is_speech = df["taskname"].str.contains(speech_tasks, regex=True)
            df = df.loc[is_speech] if download_speech else df.loc[~is_speech]
            df.reset_index(drop=True,inplace=True)
            data_dragos.append(df)
        else:
            # Nothing recorded on this domain: skip the participant instead of
            # re-requesting them indefinitely.
            print(f'No data for {user_id} on {set_domain}. Skipping.')

        # Move on to the next participant and report progress
        i=i+1
        attempts = 0
        print(f'Downloading progress {i} / {len(ids)}')

    except Exception as err:  # not a bare except, so Ctrl-C still interrupts the loop

        # Store information about the participants for which the download failed.
        # Re-attempt the download up to a maximum of 3 times before moving on to the next data point.

        attempts = attempts + 1
        # Reset the connection: after a failed request/response cycle it may be
        # left in an unusable state. It re-opens automatically on the next request.
        c.close()
        if attempts >= max_attempts:
            i=i+1
            attempts = 0  # Give the next participant a full set of attempts
            weird_ids.append(user_id)
            print(f'Attempted to get data for {user_id} 3 times but failed. Moving on.')
        else:
            print(f'Something went wrong with {user_id} ({err}). Will reattempt {max_attempts-attempts} more times.')

c.close()

# Save the data to a pickle file

with open(output_file, 'wb') as fileObj:
    pickle.dump(data_dragos,fileObj)
print('Done.')
del data_dragos

## Add header information to the raw data and save it to a json file

In [5]:

# Convert the pickled download into a JSON-lines file: one JSON object per
# completed task, with participant/device header fields added to the raw data.

# Open newly created pickle file
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by the download step of this notebook.

with open(output_file, 'rb') as fileObj:
    exampleObj = pickle.load(fileObj)

# Add header information and write the data to a json file
# NOTE(review): the file is opened in append mode, so re-running this cell adds
# the same records again -- delete the .json file first if that is not intended.

with open(output_file.replace('.obj','.json'), "a") as data_file:
    # `import tqdm` binds the module, so the progress bar is `tqdm.tqdm`
    for subj in tqdm.tqdm(exampleObj):
        temp_subj = subj
        temp_subj.reset_index(inplace=True)
        tasks_no = len(temp_subj)
        for tasks in range(0,tasks_no):
            # Bracket access avoids clashes between column names and pandas attributes
            raw_data = temp_subj["data"].iloc[tasks]
            raw_data['user_id'] = temp_subj["user"][tasks]
            raw_data['domain'] = temp_subj["domain"][tasks]
            # os / device / browser: only the first element of each entry is stored
            raw_data['os'] = temp_subj["os"][tasks][0]
            raw_data['device'] = temp_subj["device"][tasks][0]
            raw_data['browser'] = temp_subj["browser"][tasks][0]
            raw_data = json.dumps(raw_data)
            data_file.write(raw_data)
            data_file.write("\n")

100%|██████████| 57/57 [00:00<00:00, 431.63it/s]
