In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dg_df = pd.read_csv('../big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.2/Demographics.csv')
dg_df.head(16)

Unnamed: 0,ID,Gender,HbA1c
0,13,MALE,5.7
1,1,FEMALE,5.5
2,3,FEMALE,5.9
3,4,FEMALE,6.4
4,5,FEMALE,5.7
5,2,MALE,5.6
6,6,FEMALE,5.8
7,7,FEMALE,5.3
8,8,FEMALE,5.6
9,10,FEMALE,6.0


In [4]:
# 연속적인 결측치 구간 식별
missing_ranges = []

for id in range(1, len(dg_df)+1):
    if id == 15:
        continue
    df = pd.read_csv(f'../big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.2/{id:03}/BVP_{id:03}.csv')

    print(f'[ID: {id}] Now processing ...')

    # 데이터 처리
    df['datetime'] = pd.to_datetime(df['datetime'])

    # 전체 시간 범위를 250ms 간격으로 생성
    full_time_range = pd.date_range(start=df['datetime'].min(), end=df['datetime'].max(), freq='15.625ms')

    # 누락된 시간대 찾기
    missing_times = full_time_range.difference(df['datetime'])

    if not missing_times.empty:
        start_time = missing_times[0]
        for i in range(1, len(missing_times)):
            if missing_times[i] != missing_times[i-1] + pd.Timedelta('15.625ms'):
                end_time = missing_times[i-1]
                #missing_ranges.append((start_time, end_time, (end_time - start_time) / pd.Timedelta('15.625 ms') + 1))
                missing_ranges.append({
                    'id': id,
                    'start': start_time,
                    'end': end_time,
                    'count': int((end_time - start_time) / pd.Timedelta('15.625ms') + 1)
                })
                start_time = missing_times[i]
        # 마지막 구간 추가
        end_time = missing_times[-1]
        #missing_ranges.append((start_time, end_time, (end_time - start_time) / pd.Timedelta('15.625 ms') + 1))
        missing_ranges.append({
            'id': id,
            'start': start_time,
            'end': end_time,
            'count': int((end_time - start_time) / pd.Timedelta('15.625ms') + 1)
        })

# 결과를 데이터프레임으로 변환
missing_ranges_df = pd.DataFrame(missing_ranges)

# CSV 파일로 저장
missing_ranges_df.to_csv('bvp_missing_ranges.csv', index=False)

[ID: 1] Now processing ...
[ID: 2] Now processing ...
[ID: 3] Now processing ...
[ID: 4] Now processing ...
[ID: 5] Now processing ...
[ID: 6] Now processing ...
[ID: 7] Now processing ...
[ID: 8] Now processing ...
[ID: 9] Now processing ...
[ID: 10] Now processing ...
[ID: 11] Now processing ...
[ID: 12] Now processing ...
[ID: 13] Now processing ...
[ID: 14] Now processing ...
[ID: 16] Now processing ...


In [5]:
range_df = pd.read_csv(f'./bvp_missing_ranges.csv')

range_df['time_in_seconds'] = range_df['count']

# 초를 분과 시간으로 변환 (선택 사항)
range_df['time_in_minutes'] = range_df['time_in_seconds'] / 64
range_df['time_in_hours'] = round(range_df['time_in_seconds'] / 3600, 2)

range_df.drop(columns=['time_in_seconds', 'time_in_minutes'], inplace=True)

range_df.to_csv('bvp_missing_ranges_with_duration.csv', index=False)

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Demographics 데이터를 불러옴
dg_df = pd.read_csv('../big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.2/Demographics.csv')

# 각 id에 대해 그래프를 생성하고 저장
for id in range(1, len(dg_df) + 1):
    if id == 15:
        continue
    
    # 각 id의 데이터 읽기
    df = pd.read_csv(f'../big-ideas-lab-glycemic-variability-and-wearable-device-data-1.1.2/{id:03}/BVP_{id:03}.csv')
    
    # datetime 컬럼을 datetime 타입으로 변환
    df['datetime'] = pd.to_datetime(df['datetime'])
    
    # 전체 데이터 범위 설정
    full_range_start = df['datetime'].min()
    full_range_end = df['datetime'].max()

    # 출력하고자 하는 id 선택
    selected_id = id

    # 선택된 id의 데이터만 필터링
    selected_df = range_df[range_df['id'] == selected_id]
    
    # start와 end 컬럼을 datetime으로 변환
    selected_df['start'] = pd.to_datetime(selected_df['start'])
    selected_df['end'] = pd.to_datetime(selected_df['end'])

    # 그래프 생성
    fig, ax = plt.subplots(figsize=(12, 6))

    # 전체 데이터 범위 그리기
    ax.plot([full_range_start, full_range_end], [1, 1], color='gray', linewidth=5, alpha=0.3, label='Full Data Range')

    # 데이터의 시작시간과 끝시간 표시
    ax.text(full_range_start, 1.1, full_range_start.strftime('%Y-%m-%d %H:%M:%S'), ha='center', fontsize=10, color='gray')
    ax.text(full_range_end, 1.1, full_range_end.strftime('%Y-%m-%d %H:%M:%S'), ha='center', fontsize=10, color='gray')

    # 선택된 id에 대해 시간 범위 그리기
    for i, row in selected_df.iterrows():
        ax.plot([row['start'], row['end']], [1, 1], marker='o', color='b')

    # 그래프 설정
    ax.set_yticks([1])
    ax.set_yticklabels([f'ID {selected_id} Ranges'])
    ax.set_xlabel('Time')
    ax.set_title(f'Time Ranges for ID {selected_id} with Full Data Range')

    # 날짜 포맷 설정
    date_format = DateFormatter("%Y-%m-%d %H:%M")
    ax.xaxis.set_major_formatter(date_format)
    plt.xticks(rotation=45)

    plt.legend()
    plt.tight_layout()
    
    # 그래프를 이미지로 저장
    plt.savefig(f'./bvp_img/{id:03}_bvp_missing_ranges.png')

    # 그래프 닫기
    plt.close(fig)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['start'] = pd.to_datetime(selected_df['start'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['end'] = pd.to_datetime(selected_df['end'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['start'] = pd.to_datetime(selected_df['start'])
A value is trying to be set o