In [1]:
import numpy as np
import pandas as pd
import glob

from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from matplotlib import pyplot

In [2]:
# transform a time series dataset into a supervised learning dataset
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols = list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    # put it all together
    agg = concat(cols, axis=1)
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg.values

In [3]:
def load_and_transform(load_path, save_path, inVal_unit, outVal, days_of_training = 4, rowLimit = None):
    
    libre_files = glob.glob(f"{load_path}/*")
    print(len(libre_files))
    
    file_n = 1
    
    for f in libre_files:
        print(f)
        series = read_csv(f, header=0, index_col=1, nrows = rowLimit)
        values = series.values

        # values = values[0:20000]

        for j in range(1, (days_of_training + 1)):
            # transform the time series data into supervised learning
            inVal  = inVal_unit*j
            outVal = outVal
            data = series_to_supervised(values, n_in=inVal, n_out = outVal)

            # save out data as supervised data format
            np.savetxt(f"{save_path}/supervised_{file_n}_{inVal}_{outVal}_vals.csv", data, delimiter=",")

        file_n = file_n + 1
        
    return(len(libre_files))
    

In [7]:
def generate_labeled_data(load_path, rowLimit, n_files, train_bins, test_bins, gap_bins, outVal, hypo_thresh, prevelance_ratio):
    
    for i in range(1, n_files+1):
        # print(i)
        sub = pd.read_csv(f"{load_path}/supervised_{i}_{train_bins}_{outVal}_vals.csv",
                     nrows = rowLimit, header = None)

        if i==1:
            concat_output = sub
        else:
            frames = [concat_output, sub]
            concat_output = pd.concat(frames)
            
    print(concat_output.shape)
    
    # take every nth row; where n = test_bins
    all_data = concat_output.iloc[::test_bins, :]

    # divide into train and test splits (cols)
    train_cols = all_data.iloc[:, 0:train_bins]
    test_cols  = all_data.iloc[:, train_bins:train_bins + test_bins + gap_bins]
    
    # create labels
    minvalue = test_cols.min(axis = 1)
    label = np.where(minvalue < hypo_thresh, 1, 0)

    df_X = train_cols.copy()
    df_X['label'] = label
    
    # downsample to a specified ratio of case:control
    df_X_case = df_X[label==1]
    df_X_control = df_X[label==0]

    sample = df_X_control.sample(n=(df_X_case.shape[0] * prevelance_ratio))

    frame = [df_X_case, sample]
    output_frame = pd.concat(frame)
    
    return(output_frame)

In [5]:
# transform a time series dataset into a supervised learning dataset
# return the number of source files (sourceFile_n)
# (ie individuals libre data in the processed raw data folder (load_path))
sourceFile_n = load_and_transform(load_path = "/media/psf/Home/Documents/data/processed_libre",
                  save_path = "/media/psf/Home/Documents/data/libre_as_supervised2",
                  inVal_unit = 96,
                  outVal = 96,
                  days_of_training = 4,
                  rowLimit = 10000)

15
/media/psf/Home/Documents/data/processed_libre/proc_6.csv
/media/psf/Home/Documents/data/processed_libre/proc_7.csv
/media/psf/Home/Documents/data/processed_libre/proc_5.csv
/media/psf/Home/Documents/data/processed_libre/proc_4.csv
/media/psf/Home/Documents/data/processed_libre/proc_1.csv
/media/psf/Home/Documents/data/processed_libre/proc_3.csv
/media/psf/Home/Documents/data/processed_libre/proc_2.csv
/media/psf/Home/Documents/data/processed_libre/proc_14.csv
/media/psf/Home/Documents/data/processed_libre/proc_15.csv
/media/psf/Home/Documents/data/processed_libre/proc_11.csv
/media/psf/Home/Documents/data/processed_libre/proc_10.csv
/media/psf/Home/Documents/data/processed_libre/proc_12.csv
/media/psf/Home/Documents/data/processed_libre/proc_13.csv
/media/psf/Home/Documents/data/processed_libre/proc_9.csv
/media/psf/Home/Documents/data/processed_libre/proc_8.csv


In [5]:
# read in the supervised files and concatenate into a single file
# select by nth row to avoid overfitting / data leakage from one training set into another
# take interval to be duration of test window
# add progressive gaps

In [None]:
# set variables
n_files = sourceFile_n

test_duration_hours = 4
gap_hours = 0

train_bins = 192
test_bins = test_duration_hours * 4
gap_bins = gap_hours * 4

hypo_thresh = 3

prevelance_ratio = 4

# concatenate the data files, add label
data_lab = generate_labeled_data("/media/psf/Home/Documents/data/libre_as_supervised",
                                 None,
                                 n_files,
                                 train_bins,
                                 test_bins,
                                 gap_bins,
                                 96,
                                 hypo_thresh,
                                 prevelance_ratio)

In [187]:
data_lab.shape

(3182, 193)

In [190]:
for i in range(1, n_files+1):
    # print(i)
    sub = pd.read_csv(f"/media/psf/Home/Documents/data/libre_as_supervised/supervised_{i}_{train_bins}_{outVal}_vals.csv",
                 nrows = None, header = None)
    
    if i==1:
        concat_output = sub
    else:
        frames = [concat_output, sub]
        concat_output = pd.concat(frames)

In [191]:
concat_output.shape

(1104107, 288)

In [192]:
# take every nth row; where n = test_bins
all_data = concat_output.iloc[::test_bins, :]

# divide into train and test splits (cols)
train_cols = all_data.iloc[:, 0:train_bins]
test_cols  = all_data.iloc[:, train_bins:train_bins + test_bins + gap_bins]

In [193]:
all_data.shape

(69007, 288)

In [196]:
# create labels
minvalue = test_cols.min(axis = 1)
label = np.where(minvalue < hypo_thresh, 1, 0)

df_X = train_cols.copy()
df_X['label'] = label

In [197]:
df_X.shape

(69007, 193)

In [200]:
# downsample to a specified ratio of case:control
df_X_case = df_X[label==1]
df_X_control = df_X[label==0]

sample = df_X_control.sample(n=(df_X_case.shape[0] * prevelance_ratio))

frame = [df_X_case, sample]
output_frame = pd.concat(frame)

In [201]:
df_X_case.shape

(2410, 193)

In [203]:
output_frame.shape

(12050, 193)