## Detecting Heavy Drinking: Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

## Gathering Accelerometer Data

In [17]:
# Load the accelerometer readings CSV (path is relative to the working directory).
accel_data = pd.read_csv("data/all_accelerometer_data_pids_13.csv")

In [47]:
# Interpret the raw 'time' values as milliseconds since the epoch.
accel_data['time'] = pd.to_datetime(accel_data['time'], unit='ms')

# Keep only the rows stamped strictly after the parsed '2017' cutoff.
year_2017 = pd.to_datetime('2017')
is_after_2017 = accel_data['time'].gt(year_2017)
accel_data = accel_data.loc[is_after_2017]

In [51]:
# Persist the current accel_data frame as parquet (relative path, so it is
# written to the working directory).
accel_data.to_parquet("all_accelerometer_data.parquet")

In [52]:
# Reload accel_data from the parquet file of the same name.
accel_data = pd.read_parquet("all_accelerometer_data.parquet")

In [53]:
# Display the frame (pandas truncates the rendering for large frames).
accel_data

Unnamed: 0,time,pid,x,y,z
2,2017-05-02 14:04:42.409,SA0297,0.075800,0.027300,-0.010200
3,2017-05-02 14:04:42.455,SA0297,-0.035900,0.079400,0.003700
4,2017-05-02 14:04:42.500,SA0297,-0.242700,-0.086100,-0.016300
5,2017-05-02 14:04:43.945,SA0297,-0.288800,0.051400,-0.014500
6,2017-05-02 14:04:43.953,SA0297,-0.041300,-0.018400,-0.010500
...,...,...,...,...,...
14057562,2017-05-03 16:34:08.196,CC6740,-0.133956,0.124726,-0.010736
14057563,2017-05-03 16:34:08.220,CC6740,-0.100764,0.180872,0.046449
14057564,2017-05-03 16:34:08.245,CC6740,-0.131853,0.195934,0.181088
14057565,2017-05-03 16:34:08.270,CC6740,-0.149704,0.194482,0.202393


## Resampling Accelerometer Data

In [62]:
def resample_data_frequency(df, freq='100ms'):
    """Downsample readings to one row per participant per time bin.

    Rows are grouped by ``pid`` and binned on the ``time`` column; for every
    other column the first value observed in each bin is kept. Bins between a
    participant's first and last reading that contain no rows are still
    emitted, with missing values in the data columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``time`` column and a ``pid`` column.
        The frame is left unmodified.
    freq : str, default '100ms'
        Bin width as a pandas offset alias.

    Returns
    -------
    pandas.DataFrame
        A new frame with a default 0..n-1 index and ``pid`` and ``time`` as
        the leading columns.
    """
    # Use the non-inplace form so the caller's frame keeps its 'time' column
    # and the function can be called repeatedly on the same input.
    indexed = df.set_index('time')

    resampled = indexed.groupby('pid').resample(freq).first()

    # Depending on the pandas version, the grouping column is carried through
    # as a regular column in addition to the index level; reset_index() would
    # then fail with "cannot insert pid, already exists".
    if 'pid' in resampled.columns:
        resampled = resampled.drop(columns='pid')

    # reset_index() already produces a default RangeIndex.
    return resampled.reset_index()


## Gathering TAC Data

In [23]:
# Distinct participant ids found in the accelerometer data, as a plain list.
pids = list(pd.unique(accel_data['pid']))
pids

['JB3156',
 'CC6740',
 'SA0297',
 'PC6771',
 'BK7610',
 'DC6359',
 'MC7070',
 'MJ8002',
 'BU4707',
 'JR8022',
 'HV0618',
 'SF3079',
 'DK3500']

In [40]:
# Build one TAC frame per participant id; each frame ends up with exactly
# the columns listed in tac_columns, in that order.
tac_columns = ['time', 'pid', 'TAC_Reading']
pid_tac_dfs = []

for pid in pids:
    pid_tac = (
        pd.read_csv(f"data/clean_tac/{pid}_clean_TAC.csv")
        .rename(columns={"timestamp": "time"})
        .assign(
            # The renamed column is parsed as seconds since the epoch.
            time=lambda tac: pd.to_datetime(tac['time'], unit='s'),
            # Tag every row with the participant the file belongs to.
            pid=pid,
        )
        .loc[:, tac_columns]
    )
    pid_tac_dfs.append(pid_tac)

In [41]:
# Stack the per-participant frames row-wise. ignore_index is not set, so each
# frame keeps its own row labels and labels repeat across participants.
all_tac_df = pd.concat(pid_tac_dfs, axis=0)

In [42]:
# Display the combined TAC frame.
all_tac_df

Unnamed: 0,time,pid,TAC_Reading
0,2017-05-02 11:00:16,JB3156,0.002387
1,2017-05-02 11:30:39,JB3156,0.000271
2,2017-05-02 11:35:54,JB3156,-0.000839
3,2017-05-02 11:41:08,JB3156,-0.000651
4,2017-05-02 11:46:23,JB3156,0.000234
...,...,...,...
46,2017-05-03 07:34:28,DK3500,0.011820
47,2017-05-03 08:04:52,DK3500,0.011012
48,2017-05-03 08:35:16,DK3500,0.010660
49,2017-05-03 09:05:38,DK3500,0.011030
