### Data Process

In [None]:
import pandas as pd
from pathlib import Path

EXCLUDE_SHEETS = ["500篇ID說明", "500篇ID處理說明", "工作表1"]
COLUMNS = [
    "AID", "PID", "Admissindate", "Sentence",
    "Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "TimeInfo",
    "Remission", "Response", "緩解時間", "Acute", "急性住院時間", "DayCare", "慢性住院時間", "Episode", "Episode時間"
]

data_dir = "./data/raw/"
data_dir_path = Path(data_dir)
df = pd.DataFrame()
for file in data_dir_path.iterdir():
    # Load data
    data = pd.read_excel(file, sheet_name=None, engine='openpyxl', dtype=str)
    # Access sheet name
    sheet = pd.ExcelFile(file, engine='openpyxl')
    sheet = [s for s in sheet.sheet_names if s not in EXCLUDE_SHEETS]
    assert len(sheet) == 1
    sheet_name = sheet[0]
    # Merge data
    df = pd.concat([df, data.get(sheet_name)], ignore_index=False)
df = df[COLUMNS]
df.head()

In [None]:
df.Admissindate = pd.to_datetime(df.Admissindate, format="%Y-%m-%d")
df.head()

In [39]:
TE_COLUMNS = ["Time_YMD", "Vague", "Age", "Ago_YMD"]
ED_COLUMNs = ["Remission", "Response", "Acute", "DayCare", "Episode"]

processed = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

duration_head = None
prev_sentence = None

# Event
for row in df.itertuples():
    # Target text
    target_text = []
    if not pd.isna(row.Remission) or not pd.isna(row.Response):
        target_text.append("Remission")
    if not pd.isna(row.Acute):
        target_text.append("Acute")
    if not pd.isna(row.DayCare):
        target_text.append("DayCare")
    if not pd.isna(row.Episode):
        target_text.append("Episode")
    if len(target_text) == 0:
        target_text = "None"
    else:
        target_text = ", ".join(target_text)
    # Merge to the previous example if the current sentence is the same as previous one
    if prev_sentence == row.Sentence:
        # Continue if no events and duplicated sentences
        if target_text == "None":
            continue
        else:
            processed["target_text"] = f"{processed['target_text']}, {target_text}"
    else:
        processed["aid"] = row.AID
        processed["pid"] = row.PID
        processed["prefix"] = "event detection"
        processed["input_text"] = f"{row.Sentence} options: Remission, Acute, DayCare, Episode."
        processed["target_text"] = target_text
    # Store sentence
    prev_sentence = row.Sentence

# Time
for row in df.itertuples():
    # Target text is None when TimeInfo column is NaN
    if pd.isna(row.TimeInfo):
        target_text = "None"
    else:
        # Target text is composed by two lines when duration is not NaN
        if not pd.isna(row.Duration):
            # Duration head
            if duration_head is None:
                duration_head = row.Time_YMD
                continue
            else:
                target_text = f"duration: {duration_head} to {row.Time_YMD}"
                duration_head = None
        else:
            target_text = []
            if not pd.isna(row.Time_YMD):
                target_text.append(f"time: {row.Time_YMD}.")
            if not pd.isna(row.Vague):
                target_text.append(f"vague: {row.Vague}.")
            if not pd.isna(row.Age):
                target_text.append(f"age: {row.Age}.")
            if not pd.isna(row.Ago_YMD):
                target_text.append(f"ago: {row.Ago_YMD}.")
            assert len(target_text) != 0
            target_text = " ".join(target_text)

    # Merge to the previous example if the current sentence is the same as previous one
    if row.Sentence == prev_sentence:
        if target_text != "None":
            processed["target_text"] = f"{processed['target_text']} {target_text}"
        else:
            continue
    else:
        processed["aid"] = row.AID
        processed["pid"] = row.PID
        processed["prefix"] = "time extraction"
        processed["input_text"] = f"{row.Sentence} admission date: {row.Admissindate}. options: time, vague, age, ago."
        processed["target_text"] = target_text
    # Store sentence
    prev_sentence = row.Sentence

# Save
processed_file = "./data/processed/data.xlsx"
processed_df = pd.DataFrame(processed)
processed_df.to_excel(processed_file)

AssertionError: 