## Detecting Heavy Drinking Data Preprocessing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

## Gathering Accelerometer Data

In [3]:
accel_data = pd.read_parquet("all_accelerometer_data.parquet")

In [4]:

accel_data = accel_data[2:].reset_index(drop=True)
accel_data['time'] = pd.to_datetime(accel_data['time'], unit='ms')
year_2017 = pd.to_datetime('2017')
accel_data = accel_data.loc[accel_data['time'] > year_2017]
accel_data

Unnamed: 0,time,pid,x,y,z
0,2017-05-02 14:04:42.409,SA0297,0.075800,0.027300,-0.010200
1,2017-05-02 14:04:42.455,SA0297,-0.035900,0.079400,0.003700
2,2017-05-02 14:04:42.500,SA0297,-0.242700,-0.086100,-0.016300
3,2017-05-02 14:04:43.945,SA0297,-0.288800,0.051400,-0.014500
4,2017-05-02 14:04:43.953,SA0297,-0.041300,-0.018400,-0.010500
...,...,...,...,...,...
14057560,2017-05-03 16:34:08.196,CC6740,-0.133956,0.124726,-0.010736
14057561,2017-05-03 16:34:08.220,CC6740,-0.100764,0.180872,0.046449
14057562,2017-05-03 16:34:08.245,CC6740,-0.131853,0.195934,0.181088
14057563,2017-05-03 16:34:08.270,CC6740,-0.149704,0.194482,0.202393


## Gathering TAC Data

In [5]:
pids = list(accel_data['pid'].unique())
pids

['SA0297',
 'PC6771',
 'BK7610',
 'DC6359',
 'MC7070',
 'MJ8002',
 'BU4707',
 'JR8022',
 'HV0618',
 'SF3079',
 'JB3156',
 'CC6740',
 'DK3500']

In [6]:
pid_tac_dfs = []

for pid in pids:
    pid_tac = pd.read_csv(f"data/clean_tac/{pid}_clean_TAC.csv")
    pid_tac = pid_tac.rename(columns={"timestamp":"time"})
    pid_tac['time'] = pd.to_datetime(pid_tac['time'], unit='s')
    pid_tac['pid'] = pid
    pid_tac = pid_tac[['time', 'pid', 'TAC_Reading']]

    pid_tac_dfs.append(pid_tac)

In [7]:
all_tac_df = pd.concat(pid_tac_dfs, axis=0)

In [8]:
all_tac_df['TAC_class'] = 0
all_tac_df.loc[all_tac_df['TAC_Reading'] >= 0.08, 'TAC_class'] = 1

In [9]:
all_tac_df

Unnamed: 0,time,pid,TAC_Reading,TAC_class
0,2017-05-02 09:18:43,SA0297,-0.010229,0
1,2017-05-02 09:49:06,SA0297,-0.002512,0
2,2017-05-02 09:54:23,SA0297,0.003249,0
3,2017-05-02 09:59:39,SA0297,0.005404,0
4,2017-05-02 10:04:55,SA0297,0.003377,0
...,...,...,...,...
46,2017-05-03 07:34:28,DK3500,0.011820,0
47,2017-05-03 08:04:52,DK3500,0.011012,0
48,2017-05-03 08:35:16,DK3500,0.010660,0
49,2017-05-03 09:05:38,DK3500,0.011030,0


In [26]:
# all_tac_df.to_parquet("all_tac_data.parquet")

In [10]:
all_tac_df = pd.read_parquet("all_tac_data.parquet")

## Reconciling Feature and Target Data

In [12]:
# adding a mergable "second" column
accel_data['seconds'] = accel_data['time'].dt.floor('s')
all_tac_df = all_tac_df.rename(columns={"time": "seconds"})

In [13]:
merged_df = pd.merge(accel_data, all_tac_df, on=['pid', 'seconds'], how="left")

In [14]:
merged_df.drop(columns=['seconds', 'TAC_Reading'], inplace=True)

In [15]:
# assigning the first reading of each pid to be 0 (sober) before forward filling

merged_df['pid_count'] = merged_df.groupby('pid').cumcount()
merged_df.loc[merged_df['pid_count'] == 0, 'TAC_class'] = 0
merged_df.drop(columns='pid_count', inplace=True)

In [15]:
# forward filling the TAC_class field
# start sober, sober util intoxicated read, intoxicated until next sober reading

merged_df['TAC_class'] = merged_df['TAC_class'].fillna(method='ffill')

  merged_df['TAC_class'] = merged_df['TAC_class'].fillna(method='ffill')


In [16]:
merged_df

Unnamed: 0,time,pid,x,y,z,TAC_class
0,2017-05-02 14:04:42.409,SA0297,0.075800,0.027300,-0.010200,0.0
1,2017-05-02 14:04:42.455,SA0297,-0.035900,0.079400,0.003700,0.0
2,2017-05-02 14:04:42.500,SA0297,-0.242700,-0.086100,-0.016300,0.0
3,2017-05-02 14:04:43.945,SA0297,-0.288800,0.051400,-0.014500,0.0
4,2017-05-02 14:04:43.953,SA0297,-0.041300,-0.018400,-0.010500,0.0
...,...,...,...,...,...,...
14057560,2017-05-03 16:34:08.196,CC6740,-0.133956,0.124726,-0.010736,0.0
14057561,2017-05-03 16:34:08.220,CC6740,-0.100764,0.180872,0.046449,0.0
14057562,2017-05-03 16:34:08.245,CC6740,-0.131853,0.195934,0.181088,0.0
14057563,2017-05-03 16:34:08.270,CC6740,-0.149704,0.194482,0.202393,0.0


In [17]:
# Percentage of the data representing sober readings
round((merged_df['TAC_class'] == 0).sum() / len(merged_df) * 100, 2)

82.53

In [18]:
# merged_df.to_parquet("merged_data.parquet")

In [16]:
merged_df = pd.read_parquet("merged_data.parquet")

merged_df

Unnamed: 0,time,pid,x,y,z,TAC_class
0,2017-05-02 14:04:42.409,SA0297,0.075800,0.027300,-0.010200,0.0
1,2017-05-02 14:04:42.455,SA0297,-0.035900,0.079400,0.003700,0.0
2,2017-05-02 14:04:42.500,SA0297,-0.242700,-0.086100,-0.016300,0.0
3,2017-05-02 14:04:43.945,SA0297,-0.288800,0.051400,-0.014500,0.0
4,2017-05-02 14:04:43.953,SA0297,-0.041300,-0.018400,-0.010500,0.0
...,...,...,...,...,...,...
14057560,2017-05-03 16:34:08.196,CC6740,-0.133956,0.124726,-0.010736,0.0
14057561,2017-05-03 16:34:08.220,CC6740,-0.100764,0.180872,0.046449,0.0
14057562,2017-05-03 16:34:08.245,CC6740,-0.131853,0.195934,0.181088,0.0
14057563,2017-05-03 16:34:08.270,CC6740,-0.149704,0.194482,0.202393,0.0


## Upsampling Accelerometer Data

In [20]:
def upsample_data_250hz(df):

    upsampled_dfs = {}
    pids = list(df['pid'].unique())

    for pid in tqdm(pids):
        pid_df = df[df['pid'] == pid]

        # there are sometimes multiple instances of the same milisecond.
        pid_df = pid_df.drop_duplicates("time")
        
        pid_df.set_index('time', inplace=True)
        
        upsampled_df = pid_df.resample('4ms').ffill()
        upsampled_df['pid'] = pid
        upsampled_df.reset_index(inplace=True)
        upsampled_df = upsampled_df.dropna()

        print(f"{pid} dataframe length - {len(upsampled_df)}")
        upsampled_dfs[pid] = upsampled_df

    return upsampled_dfs


In [None]:
df_list_250hz = upsample_data_250hz(merged_df)

In [18]:
df_list_250hz['BU4707']

NameError: name 'df_list_250hz' is not defined

In [19]:
for key in df_list_250hz.keys():
    df_list_250hz[key].to_parquet(f"{key}_250hz.parquet")

NameError: name 'df_list_250hz' is not defined