In [3]:
import pandas as pd
import numpy as np

In [4]:
dg_df = pd.read_csv('./big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.2/Demographics.csv')
dg_df.head(16)

Unnamed: 0,ID,Gender,HbA1c
0,13,MALE,5.7
1,1,FEMALE,5.5
2,3,FEMALE,5.9
3,4,FEMALE,6.4
4,5,FEMALE,5.7
5,2,MALE,5.6
6,6,FEMALE,5.8
7,7,FEMALE,5.3
8,8,FEMALE,5.6
9,10,FEMALE,6.0


In [29]:
df = pd.read_csv('./features_na_included.csv', sep='\t', encoding='utf-8')

In [30]:
df['acc_mean'].isna().sum()

4336

In [31]:
def calculate_acc_mag(acc_df):
    acc_x = acc_df[' acc_x']
    acc_y = acc_df[' acc_y']
    acc_z = acc_df[' acc_z']
    acc_mag = np.sqrt(acc_x ** 2 + acc_y ** 2 + acc_z ** 2)
    return acc_mag

: 

In [32]:
for id in range(1, len(dg_df)+1):
    if id == 15:
        continue
    acc_df = pd.read_csv(f'./big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.2/{id:03}/ACC_{id:03}.csv')

    print(f'[ID: {id}] Now processing ...')

    # 데이터 처리
    acc_df['datetime'] = pd.to_datetime(acc_df['datetime'])
    acc_df['vector_mag'] = calculate_acc_mag(acc_df)
    acc_df.drop(columns=[' acc_x', ' acc_y', ' acc_z'], inplace=True)
    acc_df.set_index('datetime', inplace=True)

    # 5분 간격으로 리샘플링하고 각 구간의 개수 계산
    resampled = acc_df['vector_mag'].resample('5T').count()

    # 결과를 DataFrame으로 변환
    resampled_df = resampled.reset_index()
    resampled_df.columns = ['datetime', 'count']

    # 결과를 파일로 저장
    resampled_df.to_csv(f'./acc_resampled_counts/ID_{id:03}_5min_counts.csv', index=False)
    
    print(f'[ID: {id}] Resampled data saved.')

In [5]:
max_count = 9600

In [8]:
import os

# 저장된 CSV 파일 경로
directory = './acc_resampled_counts/'

# 디렉토리 내의 모든 CSV 파일 처리
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)

        # 9600개 미만인 시간 구간 찾기
        insufficient_intervals = df[df['count'] < max_count]

        if not insufficient_intervals.empty:
            print(f'{filename} contains insufficient intervals:')
            
            # 연속된 구간 묶기
            start_time = None
            end_time = None
            total_missing_count = 0
            for i in range(len(insufficient_intervals)):
                current_time = pd.to_datetime(insufficient_intervals.iloc[i]['datetime'])

                if start_time is None:
                    start_time = current_time
                    end_time = current_time
                else:
                    # 5분 간격이 연속적인지 확인
                    if current_time == end_time + pd.Timedelta(minutes=5):
                        end_time = current_time
                        total_missing_count = total_missing_count + (max_count - insufficient_intervals.iloc[i]['count'])
                    else:
                        # 연속 구간이 끝났을 때 출력
                        #print(df.iloc[i]['count'])
                        total_missing_count = total_missing_count + (max_count - insufficient_intervals.iloc[i]['count'])
                        print(f"{start_time} ~ {end_time}  -  minutes: {(total_missing_count / max_count) * 5}")
                        start_time = current_time
                        end_time = current_time

            # 마지막 연속 구간 출력
            if start_time is not None and end_time is not None:
                print(f"{start_time} ~ {end_time}")
            print('---')

ID_001_5min_counts.csv contains insufficient intervals:
2020-02-13 15:25:00 ~ 2020-02-13 15:25:00  -  minutes: 3.494791666666667
2020-02-15 06:20:00 ~ 2020-02-15 09:25:00  -  minutes: 185.4885416666667
2020-02-15 16:15:00 ~ 2020-02-16 15:05:00  -  minutes: 1557.5166666666667
2020-02-16 15:55:00 ~ 2020-02-16 16:50:00  -  minutes: 1616.9885416666668
2020-02-17 21:05:00 ~ 2020-02-17 21:50:00  -  minutes: 1662.6510416666667
2020-02-19 07:15:00 ~ 2020-02-19 12:15:00  -  minutes: 1964.3135416666667
2020-02-21 00:30:00 ~ 2020-02-21 07:40:00  -  minutes: 2396.6416666666664
2020-02-21 17:20:00 ~ 2020-02-21 19:40:00  -  minutes: 2540.110416666667
2020-02-22 17:55:00 ~ 2020-02-22 17:55:00
---
ID_015_5min_counts.csv contains insufficient intervals:
2020-07-05 15:10:00 ~ 2020-07-05 15:10:00  -  minutes: 3.111458333333333
2020-07-07 11:15:00 ~ 2020-07-19 13:45:00  -  minutes: 17432.158333333333
2020-07-20 07:40:00 ~ 2020-07-22 02:50:00  -  minutes: 20023.94583333333
2020-07-22 09:35:00 ~ 2020-07-24 