# Data extraction from server using progress reports

#### We load the necessary libraries and download the data from our database. Progress reports are compiled via database queries, and contain information pertaining to participants' progress in the IC3 digital health platform. The information contained in the progress reports is then used to download data points that meet certain inclusion criteria via in-house API requests.   

In [2]:
"""
Updated on 5th of April 2024
@author: Dragos Gruia
"""

from http.client import HTTPSConnection
import json
import tqdm
from base64 import b64encode
import pandas as pd
import json
from tqdm import tqdm
import pickle
import os 
import warnings
warnings.filterwarnings('ignore')

## Specify which data to download

Specify, in this order: the working directory, the website (domain) to which the data is linked, the progress report associated with that website, whether to download speech data or cognitive/questionnaire data, and the name of the output file.

In [3]:
# Working directory: the progress report is read from here and the output files
# are written here, because the later cells use relative file names.
# The path is specific to one machine; change it before running elsewhere.
setwd = '/Users/dg519/Documents/normative_paper_github/data/data_patients_v2/raw_data'
os.chdir(setwd)

In [4]:
set_domain = 'ic3v2.cognitron.co.uk' # Website from which we want to download the data; only rows whose domain equals this value are kept
ids = pd.read_csv("progreportic3_patients_v2_160424.csv") # Progress report associated with that website, read from the working directory
download_speech = False # If False, tasks whose name contains IC3_Reading, IC3_Repetition, IC3_SpokenPicture or IC3_NamingTest are removed; if True, no task filter is applied
output_file = 'patientsv2-16-04-24-cog.obj' # Name of the pickle file that the downloaded data is written to

In [5]:
# Preview the first rows of the progress report to check that it loaded as expected
ids.head()

Unnamed: 0.1,Unnamed: 0,user_hash,user_id,token,date_activated,date_activated_unix,battery_count,completed,completion_date,completion_date_unix,consent_status,last_email_date,last_email_date_unix,last_email_template,last_login,last_login_unix,last_task_date,last_task_date_unix,response_count
0,0,68fa20c3-aa21-466f-b442-a41021029b2f,Anon-25100722F24F449FBD3BC7164292EF71,rpKa-yfEc,2024-04-16 17:38:27.375178+00:00,1713289000.0,0,False,,,True,,,,2024-04-16 17:38:27.185174+00:00,1713289000.0,,,0
1,1,035a5ff0-405d-4f18-b3a7-025179f53c44,Anon-E1E5CCA9B2164BE5B41CF940D823B9F0,WcYZ-gHap,2024-04-16 15:36:19.958490+00:00,1713282000.0,0,False,,,True,,,,2022-02-04 11:00:36.857649+00:00,1643972000.0,2024-04-16 15:36:19.981897+00:00,1713282000.0,4
2,2,1979433a-0c10-469b-8b97-d6f56baba556,00014-session1-versiona,sPbK-qjtc,2024-04-16 15:36:18.727216+00:00,1713282000.0,0,False,,,True,,,,2022-02-18 19:27:25.188942+00:00,1645212000.0,2024-04-16 15:36:18.752231+00:00,1713282000.0,16
3,3,7d1ff07a-c867-4a56-b599-0e8f4154d7a8,0007-session1-versionA,V45V-aBqP,2024-04-16 15:36:18.650982+00:00,1713282000.0,0,False,,,True,,,,2022-02-08 13:35:46.504204+00:00,1644327000.0,2024-04-16 15:36:18.682346+00:00,1713282000.0,1
4,4,40caccb3-b354-49ab-9037-d3b7918c473d,ic300018-session1-versionA,569c-6AZH,2024-04-16 15:36:18.029021+00:00,1713282000.0,0,False,,,True,,,,2022-02-14 09:38:27.606892+00:00,1644832000.0,2024-04-16 15:36:18.058262+00:00,1713282000.0,1


In [5]:
# Choose the participants to download: keep those with more than two recorded
# responses, then reduce the table to the user hashes that are sent to the API.
# To keep every participant, comment out the first line below.
ids = ids[ids['response_count'] > 2]
ids = ids['user_hash']
len(ids)

90

In [6]:
# Show the first user hash that will be requested from the API
ids.iloc[0]

'3e72528d-ea6a-488e-af27-af62a5ce7652'

## Download the data via HTTP request

In [7]:
# Set up the https connection

base_url = "ic3.cognitron.co.uk"
c = HTTPSConnection(base_url)

# Log in with your username and password.
# Credentials are deliberately NOT stored in the notebook: they are read from the
# IC3_API_USER / IC3_API_PASSWORD environment variables and, when those are not
# set, requested interactively. Any password that was ever saved inside this
# notebook should be treated as compromised and changed.

from getpass import getpass

user = os.environ.get("IC3_API_USER") or input("IC3 API username: ")
passw = os.environ.get("IC3_API_PASSWORD") or getpass("IC3 API password: ")

# HTTP Basic authentication header. base64 is only an encoding, not encryption;
# the credentials are protected in transit by the HTTPS connection.
userAndPass = b64encode(f"{user}:{passw}".encode("utf-8")).decode("ascii")
headers = { 'Authorization' : f'Basic {userAndPass}' }

max_attempts = 3   # download attempts per participant before moving on
data_dragos = []   # one DataFrame per participant that has data for set_domain
weird_ids = []     # participants whose download failed max_attempts times
empty_ids = []     # participants that have no data for set_domain
i=0
attempts = 0

while i < len(ids):
    user_id = ids.iloc[i]
    try:

        # Download one participant's data at a time and check for data storage errors or corrupted files

        query = f"/api/v1/data/user/{user_id}/"
        c.request('GET', query, headers=headers)
        res = c.getresponse()
        data = res.read()
        data_parsed = json.loads(data.decode())

        # An unsuccessful API answer counts as a failed attempt. Without this the
        # loop would request the same participant forever.
        if not data_parsed["Success"]:
            raise RuntimeError(f'API returned Success=False (HTTP status {res.status})')

        # Select only the data from the set domain, and the data type (speech or non-speech)

        df = pd.DataFrame(data_parsed["Data"])
        df = df[df.domain == set_domain].copy()

        if df.empty:
            # Nothing to store for this participant, but the loop must still advance.
            empty_ids.append(user_id)
            print(f'No data from {set_domain} for {user_id}. Skipping.')
        else:
            df["user_id"] = user_id

            # NOTE(review): when download_speech is True no task filter is applied,
            # so every task type is kept, not only speech tasks - confirm this is intended.
            if not download_speech:
                df = df.loc[~df["taskname"].str.contains('IC3_Reading|IC3_Repetition|IC3_SpokenPicture|IC3_NamingTest', regex=True)]

            df.reset_index(drop=True,inplace=True)
            data_dragos.append(df)

        # Track the download progress
        i=i+1
        attempts = 0
        print(f'Downloading progress {i} / {len(ids)}')

    except Exception as err:

        # Catching Exception (not a bare except) keeps the cell interruptible.
        # Store information about the participants for which the download failed.
        # Re-attempt the download up to max_attempts times before moving on to the next participant.

        # Drop the connection so the next request starts from a clean state;
        # http.client opens a new connection automatically on the next request.
        c.close()
        attempts = attempts + 1
        if attempts >= max_attempts:
            weird_ids.append(user_id)
            print(f'Attempted to get data for {user_id} {max_attempts} times but failed ({err!r}). Moving on.')
            i=i+1
            attempts = 0  # the next participant gets a full set of attempts
        else:
            print(f'Something went wrong with {user_id} ({err!r}). Will reattempt {max_attempts-attempts} more times.')

c.close()

# Save the data to a pickle file

with open(output_file, 'wb') as fileObj:
    pickle.dump(data_dragos,fileObj)
print('Done.')
del data_dragos

Downloading progress 1 / 90
Downloading progress 2 / 90
Downloading progress 3 / 90
Downloading progress 4 / 90
Downloading progress 5 / 90
Downloading progress 6 / 90
Downloading progress 7 / 90
Downloading progress 8 / 90
Downloading progress 9 / 90
Downloading progress 10 / 90
Downloading progress 11 / 90
Downloading progress 12 / 90
Downloading progress 13 / 90
Downloading progress 14 / 90
Downloading progress 15 / 90
Downloading progress 16 / 90
Downloading progress 17 / 90
Downloading progress 18 / 90
Downloading progress 19 / 90
Downloading progress 20 / 90
Downloading progress 21 / 90
Downloading progress 22 / 90
Downloading progress 23 / 90
Downloading progress 24 / 90
Downloading progress 25 / 90
Downloading progress 26 / 90
Downloading progress 27 / 90
Downloading progress 28 / 90
Downloading progress 29 / 90
Downloading progress 30 / 90
Downloading progress 31 / 90
Downloading progress 32 / 90
Downloading progress 33 / 90
Downloading progress 34 / 90
Downloading progress 35

In [None]:
# Display the most recent data frame assigned to `df` by the download loop
df

## Add header information to the raw data and save it to a json file

In [8]:

# Open newly created pickle file
# NOTE: unpickling can execute arbitrary code, so only load files produced by this notebook.

with open(output_file, 'rb') as fileObj:
    exampleObj = pickle.load(fileObj)

# Add header information and write the data to a json file, one JSON object per line.
# The file is opened in append mode: running this cell again for the same output_file
# adds the same records a second time, so remove the old .json file first if a
# clean export is needed.

with open(output_file.replace('.obj','.json'), "a") as data_file:
    for subj in tqdm(exampleObj):
        temp_subj = subj
        temp_subj.reset_index(inplace=True)
        for tasks in range(len(temp_subj)):
            # Columns are read with brackets rather than attributes so that a column
            # name can never be confused with a pandas attribute of the same name.
            raw_data = temp_subj["data"].iloc[tasks]
            # The identifier written out comes from the frame's `user` column.
            raw_data['user_id'] = temp_subj["user"].iloc[tasks]
            raw_data['domain'] = temp_subj["domain"].iloc[tasks]
            # Only the first element of the os / device / browser entries is kept.
            raw_data['os'] = temp_subj["os"].iloc[tasks][0]
            raw_data['device'] = temp_subj["device"].iloc[tasks][0]
            raw_data['browser'] = temp_subj["browser"].iloc[tasks][0]
            data_file.write(json.dumps(raw_data))
            data_file.write("\n")

100%|██████████| 90/90 [00:00<00:00, 478.72it/s]
