# Raw ActiGraph and activPAL data

The aim of this notebook is to combine the individual patients' time-series data into a single dataset so that it can be analyzed more easily.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Root folder of the raw data. Later cells list this folder, descend two levels
# into its sub-folders to find .csv files, and read the .xlsx file stored directly in it.
SOURCE_DATASET_DIRECTORY = '../0-dataset/0-ra-data/'

In [2]:
# Walk SOURCE_DATASET_DIRECTORY/<directory>/<subdirectory>/*.csv and build one
# combined DataFrame per device:
#   * directories whose name ends with '-ap' feed df_activpal (only files ending
#     in 'Events.csv' are read, starting from the first line);
#   * every other directory feeds df_actigraph (the first 10 lines of each file
#     are skipped — presumably an export preamble; TODO confirm).
# Each loaded frame is tagged with the directory and sub-directory it came from.
#
# Frames are collected in lists and concatenated once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row on each
# iteration and concatenates with an empty placeholder frame.
activpal_frames = []
actigraph_frames = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined datasets reproducible between runs and machines.
for directory_name in sorted(os.listdir(SOURCE_DATASET_DIRECTORY)):
    directory_path = os.path.join(SOURCE_DATASET_DIRECTORY, directory_name)

    # Skip the .xlsx workbook and any other stray file (e.g. .DS_Store) that
    # would make os.listdir fail on the next level.
    if directory_name.endswith('.xlsx') or not os.path.isdir(directory_path):
        continue

    is_activpal = directory_name.endswith('-ap')
    skipped_rows = 0 if is_activpal else 10

    for subdirectory_name in sorted(os.listdir(directory_path)):
        subdirectory_path = os.path.join(directory_path, subdirectory_name)
        if not os.path.isdir(subdirectory_path):
            continue

        for subdirectory_content in sorted(os.listdir(subdirectory_path)):
            if not subdirectory_content.endswith('.csv'):
                continue
            # In '-ap' directories only the '...Events.csv' export is used.
            if is_activpal and not subdirectory_content.endswith('Events.csv'):
                continue

            df = pd.read_csv(
                os.path.join(subdirectory_path, subdirectory_content),
                skiprows = skipped_rows
            )
            df['directory_name'] = directory_name
            df['subdirectory_name'] = subdirectory_name

            if is_activpal:
                activpal_frames.append(df)
            else:
                actigraph_frames.append(df)

# Number of files loaded per device.
activpal_count = len(activpal_frames)
actigraph_count = len(actigraph_frames)

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing was found for a device.
df_activpal = pd.concat(activpal_frames) if activpal_frames else pd.DataFrame()
df_actigraph = pd.concat(actigraph_frames) if actigraph_frames else pd.DataFrame()

In [3]:
import re

def get_patient_id(subdirectory_name):
    """
    Derive the numeric patient id from a sub-directory name.

    The name is cleaned in this order:
      1. parentheses are removed;
      2. the tokens 'wk28', '28wk', 'wk2', 'BTOS' and 'AG' are removed;
      3. only the text before the first underscore is kept and stripped of
         surrounding whitespace;
      4. the first two characters are dropped (presumably a two-letter
         prefix — TODO confirm against the directory naming scheme).

    Returns the remaining text converted with ``int``; ``int`` raises
    ``ValueError`` if that text is not a valid integer.
    """
    cleaned_name = re.sub('[()]', '', subdirectory_name)

    # Order matters: 'wk28' has to be removed before the shorter 'wk2'.
    noise_tokens = ('wk28', '28wk', 'wk2', 'BTOS', 'AG')
    for token in noise_tokens:
        cleaned_name = cleaned_name.replace(token, '')

    # Keep everything in front of the first underscore (whole string if none).
    leading_part, _, _ = cleaned_name.partition('_')
    leading_part = leading_part.strip()

    id_digits = leading_part[2:]
    return int(id_digits)

In [4]:
# Add a numeric patient_id column to both combined datasets, derived from the
# sub-directory each row was loaded from.
for recordings in (df_actigraph, df_activpal):
    recordings['patient_id'] = recordings['subdirectory_name'].apply(get_patient_id)

In [5]:
# Report the number of rows and columns of each combined dataset.
actigraph_rows, actigraph_columns = df_actigraph.shape
print("Actigraph data size: {:,.0f} records with {:,.0f} columns.".format(
    actigraph_rows,
    actigraph_columns
))

activpal_rows, activpal_columns = df_activpal.shape
print("Activpal data size: {:,.0f} records with {:,.0f} columns.".format(
    activpal_rows,
    activpal_columns
))

Actigraph data size: 30,120 records with 15 columns.
Activpal data size: 80,030 records with 10 columns.


In [6]:
# Count the non-null 'Date' values for every (patient_id, directory_name)
# combination, then report the mean and standard deviation of those counts.
actigraph_group_keys = ['patient_id', 'directory_name']
obs_actigraph = (
    df_actigraph
    .groupby(actigraph_group_keys)
    .agg({'Date': 'count'})
    .reset_index()
)

actigraph_record_counts = obs_actigraph['Date']
print("On average, there are {:,.0f} +- {:,.0f} Actigraph records per patient in each observation period".format(
    actigraph_record_counts.mean(),
    actigraph_record_counts.std()
))

On average, there are 10,040 +- 69 Actigraph records per patient in each observation period


In [7]:
# Count the non-null 'Time' values for every (patient_id, directory_name)
# combination, then report the mean and standard deviation of those counts.
activpal_group_keys = ['patient_id', 'directory_name']
obs_activpal = (
    df_activpal
    .groupby(activpal_group_keys)
    .agg({'Time': 'count'})
    .reset_index()
)

activpal_record_counts = obs_activpal['Time']
print("On average, there are {:,.0f} +- {:,.0f} Activpal records per patient in each observation period".format(
    activpal_record_counts.mean(),
    activpal_record_counts.std()
))

On average, there are 26,677 +- 3,945 Activpal records per patient in each observation period


In [6]:
# Write both combined datasets to OUTPUT_DIRECTORY as CSV files without the
# DataFrame index.
OUTPUT_DIRECTORY = '../0-dataset/1-preprocessed/'

# to_csv does not create missing folders; make sure the target exists so the
# notebook also runs on a fresh checkout.
os.makedirs(OUTPUT_DIRECTORY, exist_ok = True)

df_actigraph.to_csv(
    os.path.join(OUTPUT_DIRECTORY, 'actigraph_combined.csv'),
    index = False
)

df_activpal.to_csv(
    os.path.join(OUTPUT_DIRECTORY, 'activpal_combined.csv'),
    index = False
)

# Patient metadata

In [7]:
# Load 'Sheet1' of the single .xlsx workbook stored directly in
# SOURCE_DATASET_DIRECTORY into df_patient_metadata.
#
# Files starting with '~$' are the lock files Excel creates while a workbook is
# open; they also end in '.xlsx' but cannot be read, so they are ignored.
metadata_workbooks = sorted(
    entry_name
    for entry_name in os.listdir(SOURCE_DATASET_DIRECTORY)
    if entry_name.endswith('.xlsx') and not entry_name.startswith('~$')
)

# Fail here with a clear message instead of a NameError in a later cell (no
# workbook) or a silent, listing-order-dependent choice (several workbooks).
if not metadata_workbooks:
    raise FileNotFoundError(
        'No .xlsx metadata workbook found in ' + SOURCE_DATASET_DIRECTORY
    )
if len(metadata_workbooks) > 1:
    raise ValueError(
        'Expected exactly one .xlsx metadata workbook in '
        + SOURCE_DATASET_DIRECTORY
        + ', found: '
        + ', '.join(metadata_workbooks)
    )

df_patient_metadata = pd.read_excel(
    os.path.join(SOURCE_DATASET_DIRECTORY, metadata_workbooks[0]),
    sheet_name = 'Sheet1'
)

In [8]:
## keep only the first 19 columns of the workbook and work on an independent
## copy, so later changes do not touch df_patient_metadata
used_metadata_columns = df_patient_metadata.columns[:19]
df_patient_metadata_trimmed = df_patient_metadata.loc[:, used_metadata_columns].copy()

In [9]:
## Replace the workbook headers of the 19 kept columns with short snake_case
## names, listed in the same left-to-right order as the columns.
metadata_column_names = (
    [
        'patient_id',
        'visit',
        'gender', ## gender is encoded as 0 (34) or 1 (105), not 1 or 2
        'include_activpal_data',
        'notes',
    ]
    + ['age', 'height', 'weight', 'bmi']
    + [
        'sf_pf',
        'sf_role',
        'sf_pain',
        'sf_social',
        'sf_mental',
        'sf_emot',
        'sf_vitality',
        'sf_gen_health',
    ]
    + ['sf36_total', 'haq']
)

df_patient_metadata_trimmed.columns = metadata_column_names

In [10]:
# Write the trimmed and renamed patient metadata to OUTPUT_DIRECTORY as a CSV
# file without the DataFrame index.
OUTPUT_DIRECTORY = '../0-dataset/1-preprocessed/'

# to_csv does not create missing folders; make sure the target exists so this
# cell also works on a fresh checkout.
os.makedirs(OUTPUT_DIRECTORY, exist_ok = True)

df_patient_metadata_trimmed.to_csv(
    os.path.join(OUTPUT_DIRECTORY, 'patient_metadata_trimmed.csv'),
    index = False
)