Required Packages:
- pandas  : 1.5.3
- openpyxl: 3.1.5
- xlrd    : 2.0.1

# 📚 | Import Libraries 

In [1]:
import config as cfg

import pandas as pd
import numpy as np
import openpyxl
import xlrd
import os
import zipfile
from io import BytesIO
from datetime import datetime

from library.path_utils import get_file_path, to_absolute_path

print("pandas  :", pd.__version__)
print("openpyxl:", openpyxl.__version__)
print("xlrd    :", xlrd.__version__)

pandas  : 2.3.1
openpyxl: 3.1.5
xlrd    : 2.0.2


# ⚙️ | Settings

In [2]:
RAW_PXPN_DIR = "./raw_data/PXPN"

# 엑셀 파일 경로 (실제 경로로 수정)
enroll_file_name = "1. 픽셀패닉 enroll 정보_250516"

zip_file_name = "pixelpanic_raw_data.zip"
output_folder_name = "./_tmp/PXPN"

# Code

In [3]:
enroll_path = get_file_path(RAW_PXPN_DIR, f"{enroll_file_name}.xlsx")
zip_path = get_file_path(RAW_PXPN_DIR, f"{zip_file_name}")
csv_path = get_file_path(RAW_PXPN_DIR, f"{enroll_file_name}.csv")
output_folder = to_absolute_path(output_folder_name)

# 엑셀 읽고 csv로 저장
df = pd.read_excel(enroll_path)
df.to_csv(csv_path, index=False)

enroll_path = get_file_path(RAW_PXPN_DIR, f"{enroll_file_name}.csv")

In [4]:
# 설문 리스트
top_5 = [
    '특성 불안 설문', '한국형 회복탄력성 지수', '한국어판 아침형-저녁형 설문지',
    '한글판 생물학적 리듬 평가 설문지', '유년기 외상 척도', '한국형 기분장애 설문지',
    '광장공포 인지 설문지', '알바니 공황-공포 질문지', '신체감각 설문지',
    '한글판 범불안 장애', '한국어판 우울증 선별도구'
]

# 결과 DataFrame 초기화: patient_code, date 컬럼 확보
result = pd.DataFrame(columns=['patient_code', '날짜'])


with zipfile.ZipFile(zip_path, 'r') as outer_zip:
    for i in range(6, 41):
        formatted_index = f'{i:02d}'
        patient_code = f'PXPN_100{formatted_index}'

        # 내부 zip 파일 경로
        inner_zip_name = f'ActiveData/{patient_code}_ActiveData.zip'
        if inner_zip_name not in outer_zip.namelist():
            continue

        inner_zip_bytes = BytesIO(outer_zip.read(inner_zip_name))
        with zipfile.ZipFile(inner_zip_bytes, 'r') as inner_zip:
            inner_file_name = f'{patient_code}_SurveyResponse.csv'
            if inner_file_name not in inner_zip.namelist():
                continue

            with inner_zip.open(inner_file_name) as f:
                df = pd.read_csv(f)

                # 작성일 컬럼에서 날짜만 추출
                date_value = pd.to_datetime(df['작성일'].iloc[0]).date()

                # 새로운 환자-작성일 행 추가
                if not ((result['patient_code'] == patient_code) & (result['날짜'] == date_value)).any():
                    new_row = pd.DataFrame({
                        'patient_code': [patient_code],
                        '날짜': [date_value]
                    })
                    result = pd.concat([result, new_row], ignore_index=True)

                # 점수 처리
                for j in top_5:
                    sub_df = df[df['설문명'] == j].reset_index(drop=True)
                    if sub_df.empty:
                        continue

                    # 역채점 점수가 있으면 사용
                    scores = []
                    for idx, row in sub_df.iterrows():
                        if row['역채점인 경우 역채점 점수'] != '-':
                            scores.append(float(row['역채점인 경우 역채점 점수']))
                        else:
                            val = row['점수']
                            scores.append('***' if pd.isna(val) else float(val))

                    # 컬럼명 생성 및 값 삽입
                    if j == '특성 불안 설문':
                        prefix = 'STAI_X2'
                    elif j == '한국형 회복탄력성 지수':
                        prefix = 'KRQ'
                    elif j == '한국어판 아침형-저녁형 설문지':
                        prefix = 'CSM'
                    elif j == '한글판 생물학적 리듬 평가 설문지':
                        prefix = 'BRIAN'
                    elif j == '한국형 기분장애 설문지':
                        prefix = 'MDQ'
                    elif j == '광장공포 인지 설문지':
                        prefix = 'ACQ'
                    elif j == '신체감각 설문지':
                        prefix = 'BSQ'
                    elif j == '한글판 범불안 장애':
                        prefix = 'GAD'
                    elif j == '한국어판 우울증 선별도구':
                        prefix = 'PHQ'

                    # 주제별 분리 처리 필요 설문
                    if j in ['유년기 외상 척도', '알바니 공황-공포 질문지']:
                        grouped = sub_df.copy()
                        grouped['real_score'] = scores
                        topic_order = {t: i+1 for i, t in enumerate(sorted(grouped['주제'].unique()))}
                        for topic, order in topic_order.items():
                            topic_df = grouped[grouped['주제'] == topic].reset_index(drop=True)
                            for qnum, sc in enumerate(topic_df['real_score'], start=1):
                                col_name = f"{('CTQ' if j=='유년기 외상 척도' else 'APPQ')}-{order}-{qnum}"
                                result.loc[
                                    (result['patient_code'] == patient_code) &
                                    (result['날짜'] == date_value),
                                    col_name
                                ] = sc
                    else:
                        for idx, sc in enumerate(scores, start=1):
                            col_name = f"{prefix}-{idx}"
                            result.loc[
                                (result['patient_code'] == patient_code) &
                                (result['날짜'] == date_value),
                                col_name
                            ] = sc

# 컬럼 순서 재배열: patient_code, date, 나머지
cols = ['patient_code', '날짜'] + [c for c in result.columns if c not in ['patient_code', '날짜']]
result = result[cols]

# 불필요 컬럼 삭제
result = result.drop(columns=['MDQ-14', 'MDQ-15', 'PHQ-10'], errors='ignore')

# 저장

output_path = os.path.join(output_folder, "questionnaire.csv")
os.makedirs(output_folder, exist_ok=True)
result.to_csv(output_path, index=False)


  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  result.loc[
  resu

In [5]:
# 사용할 접두어 목록
prefixes = ["PHQ", "STAI_X2", "CSM", "CTQ-1", "CTQ-2", "CTQ-3", "CTQ-4", "CTQ-5", "KRQ", "MDQ", "ACQ", "APPQ-1", "APPQ-2", "APPQ-3", "BSQ", "GAD", "BRIAN"]
# 결과를 저장할 데이터프레임
aggregated_df = result[['patient_code', '날짜']].copy()

for prefix in prefixes:
    # 해당 접두어로 시작하는 컬럼 찾기
    matched_cols = [col for col in result.columns if col.startswith(prefix)]
    
    # 값 합산해서 새로운 컬럼으로 추가
    aggregated_df[f'{prefix.rstrip("-")}'] = result[matched_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)

display(aggregated_df)


Unnamed: 0,patient_code,날짜,PHQ,STAI_X2,CSM,CTQ-1,CTQ-2,CTQ-3,CTQ-4,CTQ-5,KRQ,MDQ,ACQ,APPQ-1,APPQ-2,APPQ-3,BSQ,GAD,BRIAN
0,PXPN_10006,2024-11-04,0.0,32.0,31.0,11.0,13.0,17.0,28.0,12.0,219.0,1.0,21.0,0.0,10.0,2.0,25.0,2.0,25.0
1,PXPN_10007,2024-11-13,14.0,71.0,20.0,5.0,9.0,24.0,40.0,20.0,131.0,4.0,33.0,24.0,12.0,16.0,29.0,18.0,71.0
2,PXPN_10008,2024-11-04,2.0,54.0,24.0,5.0,9.0,7.0,28.0,5.0,165.0,4.0,35.0,29.0,28.0,31.0,49.0,7.0,46.0
3,PXPN_10009,2024-11-04,18.0,70.0,26.0,9.0,17.0,17.0,38.0,23.0,117.0,11.0,60.0,56.0,19.0,64.0,43.0,16.0,64.0
4,PXPN_10010,2024-11-06,15.0,67.0,20.0,5.0,6.0,5.0,15.0,7.0,163.0,4.0,43.0,32.0,32.0,58.0,57.0,13.0,64.0
5,PXPN_10011,2024-11-09,14.0,49.0,20.0,5.0,5.0,7.0,11.0,5.0,167.0,13.0,44.0,38.0,20.0,54.0,50.0,12.0,62.0
6,PXPN_10012,2024-11-11,0.0,60.0,20.0,5.0,6.0,5.0,26.0,14.0,153.0,0.0,37.0,26.0,7.0,23.0,42.0,9.0,0.0
7,PXPN_10013,2024-11-08,21.0,67.0,28.0,7.0,12.0,20.0,40.0,20.0,110.0,12.0,61.0,42.0,58.0,65.0,65.0,16.0,58.0
8,PXPN_10014,2024-11-18,13.0,71.0,14.0,15.0,5.0,16.0,28.0,9.0,123.0,2.0,33.0,31.0,13.0,30.0,46.0,18.0,70.0
9,PXPN_10015,2024-11-25,14.0,56.0,26.0,15.0,10.0,19.0,26.0,14.0,159.0,2.0,27.0,34.0,38.0,49.0,64.0,7.0,51.0


In [6]:
# 엑셀 파일 경로
file_path = enroll_path

# 엑셀 시트 읽기
df = pd.read_csv(file_path)

# 컬럼 이름 변경
df = df.rename(columns={'회원코드': 'patient_code', '2. 성별': 'gender'})

# 필요한 컬럼만 선택
df = df[['patient_code', '연구시작일', '연구종료일', 'gender']]

# 날짜 형식으로 변환
df['연구시작일'] = pd.to_datetime(df['연구시작일'], errors='coerce')
df['연구종료일'] = pd.to_datetime(df['연구종료일'], errors='coerce')

# 각 환자에 대해 날짜 생성
expanded_rows = []
for _, row in df.iterrows():
    if pd.notnull(row['연구시작일']) and pd.notnull(row['연구종료일']):
        date_range = pd.date_range(start=row['연구시작일'], end=row['연구종료일'])
        for date in date_range:
            expanded_rows.append({'patient_code': row['patient_code'], '날짜': date, 'gender': row['gender']})

# 결과 데이터프레임 생성
expanded_df = pd.DataFrame(expanded_rows)
expanded_df = expanded_df.replace({'남': '0', '여': '1'})
print(expanded_df)


    patient_code         날짜 gender
0     PXPN_10008 2024-11-04      0
1     PXPN_10008 2024-11-05      0
2     PXPN_10008 2024-11-06      0
3     PXPN_10008 2024-11-07      0
4     PXPN_10008 2024-11-08      0
..           ...        ...    ...
975   PXPN_10046 2025-07-02      0
976   PXPN_10046 2025-07-03      0
977   PXPN_10046 2025-07-04      0
978   PXPN_10046 2025-07-05      0
979   PXPN_10046 2025-07-06      0

[980 rows x 3 columns]


In [7]:
# 엑셀 파일 경로
file_path = enroll_path

# CSV 읽기
df = pd.read_csv(file_path)

# 컬럼 이름 변경 (생년월일 컬럼도 추가)
df = df.rename(columns={
    '회원코드': 'patient_code',
    '2. 성별': 'gender'  # 생년월일 컬럼 이름이 실제 다르면 이 부분 수정 필요
})

# 필요한 컬럼만 선택
df = df[['patient_code', '연구시작일', '연구종료일', 'gender', '3. 생년월일']]

# 날짜 형식으로 변환
df['연구시작일'] = pd.to_datetime(df['연구시작일'], errors='coerce')
df['연구종료일'] = pd.to_datetime(df['연구종료일'], errors='coerce')
df['3. 생년월일'] = pd.to_datetime(df['3. 생년월일'], errors='coerce')


# 각 환자에 대해 날짜 생성
expanded_rows = []
for _, row in df.iterrows():
    if pd.notnull(row['연구시작일']) and pd.notnull(row['연구종료일']):
        date_range = pd.date_range(start=row['연구시작일'], end=row['연구종료일'])
        for date in date_range:
            expanded_rows.append({
                'patient_code': row['patient_code'],
                '날짜': date,
                'gender': '0' if row['gender'] == '남' else '1'
            })

# 결과 데이터프레임 생성
expanded_df = pd.DataFrame(expanded_rows)

# 출력
print(expanded_df)

    patient_code         날짜 gender
0     PXPN_10008 2024-11-04      0
1     PXPN_10008 2024-11-05      0
2     PXPN_10008 2024-11-06      0
3     PXPN_10008 2024-11-07      0
4     PXPN_10008 2024-11-08      0
..           ...        ...    ...
975   PXPN_10046 2025-07-02      0
976   PXPN_10046 2025-07-03      0
977   PXPN_10046 2025-07-04      0
978   PXPN_10046 2025-07-05      0
979   PXPN_10046 2025-07-06      0

[980 rows x 3 columns]


In [8]:
expanded_df['날짜']    = pd.to_datetime(expanded_df['날짜'])
aggregated_df['날짜'] = pd.to_datetime(aggregated_df['날짜'])
expanded_answer = pd.merge(expanded_df, aggregated_df, on=['patient_code', '날짜'], how='outer')
expanded_answer = expanded_answer.rename(columns={'patient_code': 'ID', '날짜': 'date', 'GAD': 'GAD_7', 'CTQ-1': 'CTQ_1', 'CTQ-2': 'CTQ_2', 'CTQ-3': 'CTQ_3', 'CTQ-4': 'CTQ_4', 'CTQ-5': 'CTQ_5', 'APPQ-1': 'APPQ_1', 'APPQ-2': 'APPQ_2', 'APPQ-3': 'APPQ_3', 'PHQ': 'PHQ_9'})
output_path = os.path.join(output_folder, "questionnaire_test.csv")
expanded_answer.to_csv(output_path, index=False)

In [9]:
# 0. 날짜 형식 통일
expanded_answer['date'] = pd.to_datetime(expanded_answer['date'])

# 1. PXPN별 Panic 날짜 수집
PXPN_panic_dates = pd.DataFrame(columns=['ID', 'date'])
main_zip_path = zip_path  # 예: '/Users/.../ActiveData.zip'

with zipfile.ZipFile(main_zip_path, 'r') as outer_zip:
    for inner_name in outer_zip.namelist():
        if inner_name.startswith("ActiveData/") and inner_name.endswith('_ActiveData.zip'):
            pid = os.path.basename(inner_name).replace('_ActiveData.zip', '')

            with outer_zip.open(inner_name) as inner_file:
                data = inner_file.read()
                inner_bytes = BytesIO(data)

                if not zipfile.is_zipfile(inner_bytes):
                    print(f"❌ 내부 zip 아님 (무시됨): {inner_name}")
                    continue

                with zipfile.ZipFile(inner_bytes, 'r') as active_zip:
                    panic_csvs = [f for f in active_zip.namelist() if f.endswith('Panic.csv')]
                    if not panic_csvs:
                        print(f"⚠️ Panic.csv 없음: {inner_name}")
                        continue

                    with active_zip.open(panic_csvs[0]) as f:
                        df_panic = pd.read_csv(f)
                        if '작성일' not in df_panic.columns:
                            print(f"⚠️ '작성일' 없음: {inner_name}")
                            continue

                        for 작성일 in df_panic['작성일']:
                            PXPN_panic_dates = pd.concat([
                                PXPN_panic_dates,
                                pd.DataFrame({'ID': [pid], 'date': [작성일]})
                            ], ignore_index=True)

# 2. 날짜 및 panic 정리
PXPN_panic_dates['panic'] = 2
PXPN_panic_dates['date'] = pd.to_datetime(PXPN_panic_dates['date']).dt.strftime('%Y-%m-%d')
expanded_answer['date'] = pd.to_datetime(expanded_answer['date']).dt.strftime('%Y-%m-%d')

# 3. outer merge
merged = pd.merge(
    PXPN_panic_dates,
    expanded_answer,
    on=['ID', 'date'],
    how='outer'
)

# 4. 우선순위 panic 값 유지
merged = (
    merged
    .sort_values(['ID', 'date', 'panic'], ascending=[True, True, False])
    .drop_duplicates(subset=['ID', 'date'], keep='first')
)

# 5. 전날 panic=1 적용
df = merged.copy()
df['date'] = pd.to_datetime(df['date'])

for _, row in df[df['panic'] == 2].iterrows():
    pid = row['ID']
    curr_date = row['date']
    prev_date = curr_date - pd.Timedelta(days=1)

    # 연속된 2 제거
    while ((df['ID'] == pid) & (df['date'] == prev_date) & (df['panic'] == 2)).any():
        prev_date -= pd.Timedelta(days=1)

    mask = (df['ID'] == pid) & (df['date'] == prev_date)
    if mask.any():
        df.loc[mask & (df['panic'].fillna(0) < 1), 'panic'] = 1

df = df.sort_values(['ID', 'date']).reset_index(drop=True)
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
df['panic'] = df['panic'].fillna(0)

# 6. Enroll 병합 (age)
enroll_df = pd.read_csv(enroll_path, encoding='utf-8')
enroll_df = enroll_df.rename(columns={
    '회원코드': 'patient_code',
    '3. 생년월일': 'birthdate',
    '연구종료일': 'end_date'
})
enroll_df['birthdate'] = pd.to_datetime(enroll_df['birthdate'], errors='coerce')
enroll_df['end_date'] = pd.to_datetime(enroll_df['end_date'], errors='coerce')
enroll_df['age'] = enroll_df['end_date'].dt.year - enroll_df['birthdate'].dt.year

age_df = enroll_df[['patient_code', 'age']].drop_duplicates().rename(columns={'patient_code': 'ID'})
final_df = df.merge(age_df, on='ID', how='left')

# 7. 저장
output_path = os.path.join(output_folder, "questionnaire_and_panic_date.csv")
final_df.to_csv(output_path, index=False)

⚠️ Panic.csv 없음: ActiveData/PXPN_10010_ActiveData.zip
⚠️ Panic.csv 없음: ActiveData/PXPN_10012_ActiveData.zip
⚠️ Panic.csv 없음: ActiveData/PXPN_10023_ActiveData.zip
⚠️ Panic.csv 없음: ActiveData/PXPN_10034_ActiveData.zip
⚠️ Panic.csv 없음: ActiveData/PXPN_10039_ActiveData.zip
⚠️ Panic.csv 없음: ActiveData/PXPN_10040_ActiveData.zip


In [10]:
# 파일 경로
csv_path = output_path
zip_path = zip_path
# 데이터 불러오기
df = pd.read_csv(csv_path)
df = df[df["ID"].str.startswith("PXPN")].copy()
df["date"] = pd.to_datetime(df["date"])

# 컬럼 초기화
for col in ['marriage', 'job', 'alcohol', 'coffee', 'smoking', 'menstruation', 'exercise',
            'smkHx', 'drinkHx', 'suicideHx',
            'suicide_need']: 
    df[col] = np.nan

# ZIP 열기
with zipfile.ZipFile(zip_path, 'r') as outer_zip:
    for pid in df['ID'].unique():
        inner_zip_name = f"ActiveData/{pid}_ActiveData.zip"
        if inner_zip_name not in outer_zip.namelist():
            continue

        # 내부 ZIP 열기
        with outer_zip.open(inner_zip_name) as inner_zip_file:
            inner_zip_bytes = BytesIO(inner_zip_file.read())
            with zipfile.ZipFile(inner_zip_bytes) as inner_zip:

                # 1. Sociodemographic 처리
                soc_path = f"{pid}_Sociodemographic.csv"
                if soc_path in inner_zip.namelist():
                    soc = pd.read_csv(inner_zip.open(soc_path), header=None, index_col=0).T
                    if '결혼' in soc.columns:
                        df.loc[df['ID'] == pid, 'marriage'] = 1 if soc['결혼'].values[0] == '기혼' else 0
                    if '현재 직업 유무' in soc.columns:
                        df.loc[df['ID'] == pid, 'job'] = 1 if soc['현재 직업 유무'].values[0] == 'Y' else 0
                    if '과거 흡연 여부' in soc.columns:
                        df.loc[df['ID'] == pid, 'smkHx'] = 1 if soc['과거 흡연 여부'].values[0] == 'Y' else 0
                    if '지금까지 음주 여부' in soc.columns:
                        df.loc[df['ID'] == pid, 'drinkHx'] = 1 if soc['지금까지 음주 여부'].values[0] == 'Y' else 0
                    if '과거 자살 시도 여부' in soc.columns:
                        df.loc[df['ID'] == pid, 'suicideHx'] = 1 if soc['과거 자살 시도 여부'].values[0] == 'Y' else 0
                    if '지난 1달간 자살시도 여부' in soc.columns:
                        df.loc[df['ID'] == pid, 'suicide_need'] = 1 if soc['지난 1달간 자살시도 여부'].values[0] == 'Y' else 0


                # 2. Pattern 처리
                pat_path = f"{pid}_Pattern.csv"
                if pat_path in inner_zip.namelist():
                    pat = pd.read_csv(inner_zip.open(pat_path))
                    pat['작성일'] = pd.to_datetime(pat['작성일'], errors='coerce')

                    for idx, row in df[df["ID"] == pid].iterrows():
                        d = row["date"]
                        today_rows = pat[pat["작성일"] == d]
                        for _, r in today_rows.iterrows():
                            t = r.get('종류', '')
                            st = r.get('세부종류', '')
                            amount = r.get('양', None)  # '양' 컬럼 값
                            # 운동
                            if t == '운동':
                                # 양 값이 있으면 그 값을, 없으면 1 로 디폴트
                                df.at[idx, 'exercise'] = amount if pd.notna(amount) else 1
                            # 카페인
                            if t == '카페인':
                                df.at[idx, 'coffee'] = amount if pd.notna(amount) else 1
                            # 흡연
                            if t == '흡연':
                                df.at[idx, 'smoking'] = amount if pd.notna(amount) else 1
                            # 음주(양이 아닌 단순 여부만 원하면 기존처럼 1로)
                            if t == '음주':
                                df.at[idx, 'alcohol'] = amount if pd.notna(amount) else 1
                            # 생리
                            if t == '생리' and st == '생리중':
                                df.at[idx, 'menstruation'] = 1

output_path = os.path.join(output_folder, "questionnaire_and_panic_dates_and_demo.csv")
df.to_csv(output_path, index=False)

  df.at[idx, 'exercise'] = amount if pd.notna(amount) else 1
  df.at[idx, 'alcohol'] = amount if pd.notna(amount) else 1
  df.at[idx, 'coffee'] = amount if pd.notna(amount) else 1
  df.at[idx, 'smoking'] = amount if pd.notna(amount) else 1


In [11]:
# 1. 경로 설정
zip_path = zip_path
processed = df  

# 3. PXPN ID 목록 추출
pxpn_ids = processed[processed['ID'].astype(str).str.startswith('PXPN')]['ID'].unique()

# 4. 감정 관련 컬럼 초기화
emotion_cols = ['positive_feeling', 'negative', 'positive_E', 'negative_E', 'anxiety', 'annoying']
for col in emotion_cols:
    if col not in processed.columns:
        processed[col] = np.nan

# 5. 디버그용 카운터 및 정보
match_count = 0
no_date_match = 0
processed_ids = set()
debug_info = []

# 6. 외부 zip 열기
try:
    with zipfile.ZipFile(zip_path, 'r') as outer_zip:

        # 7. 모든 PXPN ID에 대해 반복
        for pid in pxpn_ids:
            pid = str(pid).strip()
            inner_zip_name = f"ActiveData/{pid}_ActiveData.zip"

            if inner_zip_name not in outer_zip.namelist():
                debug_info.append(f"❌ ID {pid}: 내부 ZIP 없음 → {inner_zip_name}")
                continue

            try:
                with outer_zip.open(inner_zip_name) as inner_zip_file:
                    inner_zip_bytes = BytesIO(inner_zip_file.read())

                    with zipfile.ZipFile(inner_zip_bytes) as inner_zip:
                        checkup_filename = f"{pid}_Checkup.csv"

                        if checkup_filename not in inner_zip.namelist():
                            debug_info.append(f"⚠️ ID {pid}: Checkup 파일 없음")
                            continue

                        # Checkup CSV 읽기
                        checkup = pd.read_csv(inner_zip.open(checkup_filename))

                        # 날짜 타입 변환
                        processed_pid = processed[processed['ID'] == pid].copy()
                        processed_pid['date'] = pd.to_datetime(processed_pid['date'], errors='coerce')
                        checkup['작성일'] = pd.to_datetime(checkup['작성일'], errors='coerce')

                        # 감정 카테고리별 처리
                        for category in ['기분', '에너지', '불안', '짜증']:
                            category_data = checkup[checkup['종류'] == category]

                            for _, row in category_data.iterrows():
                                checkup_date = row['작성일']
                                score = row['척도']

                                for idx, proc_row in processed_pid.iterrows():
                                    proc_date = proc_row['date']
                                    if (
                                        proc_date.year == checkup_date.year and
                                        proc_date.month == checkup_date.month and
                                        proc_date.day == checkup_date.day
                                    ):
                                        if category == '기분':
                                            if score > 0:
                                                processed.at[idx, 'positive_feeling'] = score
                                            elif score < 0:
                                                processed.at[idx, 'negative'] = score
                                        elif category == '에너지':
                                            if score > 0:
                                                processed.at[idx, 'positive_E'] = score
                                            elif score < 0:
                                                processed.at[idx, 'negative_E'] = score
                                        elif category == '불안':
                                            processed.at[idx, 'anxiety'] = score
                                        elif category == '짜증':
                                            processed.at[idx, 'annoying'] = score

                                        match_count += 1
                                        processed_ids.add(pid)
            except Exception as e:
                debug_info.append(f"❗ ID {pid} 처리 중 오류: {str(e)}")
except Exception as e:
    debug_info.append(f"ZIP 파일 처리 전체 실패: {str(e)}")


# 8. 기분 및 에너지 충돌 조정 (절대값 기준)
mask_mood = processed['positive_feeling'].notna() & processed['negative'].notna()
for idx in processed[mask_mood].index:
    pos = processed.at[idx, 'positive_feeling']
    neg = processed.at[idx, 'negative']
    if abs(pos) > abs(neg):
        processed.at[idx, 'negative'] = 0
    elif abs(pos) < abs(neg):
        processed.at[idx, 'positive_feeling'] = 0
    else:
        processed.at[idx, 'positive_feeling'] = 0  # 동일하면 긍정 제거, 부정 유지

mask_energy = processed['positive_E'].notna() & processed['negative_E'].notna()
for idx in processed[mask_energy].index:
    posE = processed.at[idx, 'positive_E']
    negE = processed.at[idx, 'negative_E']
    if abs(posE) > abs(negE):
        processed.at[idx, 'negative_E'] = 0
    elif abs(posE) < abs(negE):
        processed.at[idx, 'positive_E'] = 0
    else:
        processed.at[idx, 'positive_E'] = 0

# 9. 값이 한쪽만 있을 경우 다른 쪽을 0으로 설정
processed.loc[
    processed['positive_feeling'].notna() & processed['negative'].isna(),
    'negative'
] = 0
processed.loc[
    processed['negative'].notna() & processed['positive_feeling'].isna(),
    'positive_feeling'
] = 0

processed.loc[
    processed['positive_E'].notna() & processed['negative_E'].isna(),
    'negative_E'
] = 0
processed.loc[
    processed['negative_E'].notna() & processed['positive_E'].isna(),
    'positive_E'
] = 0

# 10. 디버그 출력 (최대 20개)
for info in debug_info[:20]:
    print(info)


output_path = os.path.join(output_folder, "questionnaire_panic_demo_mood.csv")
processed.to_csv(output_path, index=False)

In [12]:
from pandas.api.types import is_numeric_dtype, is_string_dtype, is_datetime64_any_dtype

# 전체 컬럼 순회하며 형식 통일
for col in processed.columns:
    if col == 'ID':
        processed[col] = processed[col].astype(str).str.strip()
    elif col == 'date':
        processed[col] = pd.to_datetime(processed[col], errors='coerce')  # 문자열 포함 시 자동 처리
    elif is_datetime64_any_dtype(processed[col]):
        processed[col] = pd.to_datetime(processed[col], errors='coerce')  # datetime이면 그대로
    elif is_string_dtype(processed[col]):
        processed[col] = processed[col].astype(str).str.strip()  # 문자열이면 정리
    else:
        # 예외적인 경우도 문자열로 통일
        processed[col] = processed[col].astype(str).str.strip()


output_path = os.path.join(output_folder, "processed.csv")
processed.to_csv(output_path, index=False)
print("모든 컬럼 형식 통일 완료 및 저장됨.")

모든 컬럼 형식 통일 완료 및 저장됨.


In [13]:
print(processed.columns)

Index(['ID', 'date', 'panic', 'gender', 'PHQ_9', 'STAI_X2', 'CSM', 'CTQ_1',
       'CTQ_2', 'CTQ_3', 'CTQ_4', 'CTQ_5', 'KRQ', 'MDQ', 'ACQ', 'APPQ_1',
       'APPQ_2', 'APPQ_3', 'BSQ', 'GAD_7', 'BRIAN', 'age', 'marriage', 'job',
       'alcohol', 'coffee', 'smoking', 'menstruation', 'exercise', 'smkHx',
       'drinkHx', 'suicideHx', 'suicide_need', 'positive_feeling', 'negative',
       'positive_E', 'negative_E', 'anxiety', 'annoying'],
      dtype='object')
