### Data Process

In [15]:
import pandas as pd


def process_event(df: pd.DataFrame) -> pd.DataFrame:

    rtn = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

    prev_sentence = None

    for row in df.itertuples():
        # Target text
        target_text = []
        if not pd.isna(row.Remission) or not pd.isna(row.Response):
            target_text.append("Remission")
        if not pd.isna(row.Acute):
            target_text.append("Acute")
        if not pd.isna(row.DayCare):
            target_text.append("DayCare")
        if not pd.isna(row.Episode):
            target_text.append("Episode")
        if len(target_text) == 0:
            target_text = "None"
        else:
            target_text = ", ".join(target_text)
        # Merge to the previous example if the current sentence is the same as previous one
        if prev_sentence == row.Sentence:
            # Continue if no events and duplicated sentences
            if target_text == "None":
                continue
            else:
                if rtn["target_text"][-1] == "None":
                    rtn["target_text"][-1] = target_text
                elif target_text not in rtn["target_text"][-1]:
                    rtn["target_text"][-1] = f"{rtn['target_text'][-1]}, {target_text}"
        else:
            rtn["aid"].append(row.AID)
            rtn["pid"].append(row.PID)
            rtn["prefix"].append("event detection")
            rtn["input_text"].append(f"{row.Sentence} order: {row.Order}. options: Remission, Acute, DayCare, Episode.")
            rtn["target_text"].append(target_text)
        # Store sentence
        prev_sentence = row.Sentence

    return pd.DataFrame(rtn)


def process_time(df: pd.DataFrame) -> pd.DataFrame:

    rtn = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

    duration_head = None
    prev_sentence = None

    for row in df.itertuples():
        # Target text is None when TimeInfo column is NaN
        if pd.isna(row.TimeInfo):
            target_text = "None"
        else:
            # Target text is composed by two lines when duration is not NaN
            if not pd.isna(row.Duration):
                # Duration head
                if duration_head is None:
                    duration_head = row.Time_YMD
                    continue
                else:
                    target_text = f"duration: {duration_head} to {row.Time_YMD}"
                    duration_head = None
            else:
                target_text = []
                if not pd.isna(row.Time_YMD):
                    target_text.append(f"time: {row.Time_YMD}.")
                if not pd.isna(row.Vague):
                    target_text.append(f"vague: {row.Vague}.")
                if not pd.isna(row.Age):
                    target_text.append(f"age: {row.Age}.")
                if not pd.isna(row.Ago_YMD):
                    target_text.append(f"ago: {row.Ago_YMD}.")
                assert len(target_text) != 0, row.AID
                target_text = " ".join(target_text)

        # Merge to the previous example if the current sentence is the same as previous one
        if row.Sentence == prev_sentence:
            # Continue if no events and duplicated sentences
            if target_text == "None":
                continue
            else:
                if rtn["target_text"][-1] == "None":
                    rtn["target_text"][-1] = target_text
                else:
                    rtn["target_text"][-1] = f"{rtn['target_text'][-1]} {target_text}"
        else:
            rtn["aid"].append(row.AID)
            rtn["pid"].append(row.PID)
            rtn["prefix"].append("time extraction")
            rtn["input_text"].append(f"{row.Sentence} admission date: {row.Admissindate}. options: time, vague, age, ago.")
            rtn["target_text"].append(target_text)
        # Store sentence
        prev_sentence = row.Sentence

    return pd.DataFrame(rtn)


def process_data(df: pd.DataFrame) -> pd.DataFrame:

    rtn = pd.DataFrame()
    rtn = pd.concat([rtn, process_event(df)], ignore_index=False)
    rtn = pd.concat([rtn, process_time(df)], ignore_index=False)

    return rtn

In [None]:
import pandas as pd
from pathlib import Path

EXCLUDE_SHEETS = ["500篇ID說明", "500篇ID處理說明", "工作表1"]
COLUMNS = [
    "AID", "PID", "Admissindate", "Sentence",
    "Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "TimeInfo",
    "Remission", "Response", "緩解時間", "Acute", "急性住院時間", "DayCare", "慢性住院時間", "Episode", "Episode時間"
]

data_dir = "./data/raw/"
data_dir_path = Path(data_dir)
processed = pd.DataFrame()
for file in data_dir_path.iterdir():
    # Load data
    data = pd.read_excel(file, sheet_name=None, engine='openpyxl', dtype=str)
    # Access sheet name
    sheet = pd.ExcelFile(file, engine='openpyxl')
    sheet = [s for s in sheet.sheet_names if s not in EXCLUDE_SHEETS]
    assert len(sheet) == 1
    sheet_name = sheet[0]
    # Select sheet and set columns
    df = data.get(sheet_name)
    df = df[COLUMNS]
    df.Admissindate = pd.to_datetime(df.Admissindate, format="%Y-%m-%d")
    # Process
    processed = pd.concat([processed, process_data(df)], ignore_index=False)

# Save
processed_file = "./data/processed/data.xlsx"
processed.to_excel(processed_file, index=False)

In [1]:
import pandas as pd
import numpy as np

data = pd.read_excel("./data/processed/data.xlsx", engine='openpyxl', dtype=str)
train, validate, test = np.split(data.sample(frac=1, random_state=42), [int(.7*len(data)), int(.8*len(data))])
print(f"Number of train data: {len(train)}")
print(f"Number of validation data: {len(validate)}")
print(f"Number of test data: {len(test)}")
print(f"Number of total data: {len(data)}")

Number of train data: 19049
Number of validation data: 2721
Number of test data: 5443
Number of total data: 27213


### Binary Datasets

* all in one merged by prefix ex. prefix=acute, prefix=vague...
* Cannot identify which time does event belongs to => the same sentence may have None and Not None.

In [5]:
for i in range(0, 4, 2):
    print(i)

0
2


In [16]:
from typing import Dict
from pathlib import Path
import pandas as pd
from datetime import datetime

EVENT_NAME = ["Remission", "Acute", "DayCare", "Episode"]


def process_event(df: pd.DataFrame) -> pd.DataFrame:

    rtn = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

    for row in df.itertuples():
        for e_name in EVENT_NAME:
            if e_name == "Remission" and not pd.isna(row.Response):
                target_text == e_name
            else:
                target_text = "None" if pd.isna(getattr(row, e_name)) else e_name
            rtn["aid"].append(row.AID)
            rtn["pid"].append(row.PID)
            rtn["prefix"].append(e_name.lower())
            rtn["input_text"].append(row.Sentence)
            rtn["target_text"].append(target_text)

    df = pd.DataFrame(rtn)
    df.drop_duplicates(inplace=True)

    return df


TIME_NAME = ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]

def get_time_target_text(prev_info: Dict, t_name:str) -> str:
    # The target text is None when there is no values
    if len(prev_info["values"]) == 0:
        target_text = "None"
    else:
        # Pair the duration values
        if t_name == "Duration":
            target_text = []
            assert len(prev_info["values"]) % 2 == 0, prev_info
            for i in range(0, len(prev_info["values"]), 2):
                target_text.append(f"{prev_info['values'][i]} to {prev_info['values'][i+1]}")
            target_text = ". ".join(target_text) + "."
        # Merge the collected values
        else:
            target_text = ". ".join(prev_info["values"]) + "."
    return target_text

def process_time(df: pd.DataFrame) -> pd.DataFrame:

    rtn = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

    prev_info = {"aid": None, "pid": None, "sentence": None, "values": []}

    for t_name in TIME_NAME:
        for row in df.itertuples():
            # Merge data with the same sentence
            if row.Sentence == prev_info["sentence"]:
                if not pd.isna(getattr(row, t_name)) and not pd.isna(row.TimeInfo):
                    if t_name == "Duration":
                        if not pd.isna(row.Time):
                            prev_info["values"].append(row.Time)
                        elif not pd.isna(row.Age):
                            prev_info["values"].append(row.Age)
                        elif not pd.isna(row.Vague):
                            prev_info["values"].append(row.Vague)
                        elif not pd.isna(row.Ago):
                            prev_info["values"].append(row.Ago)
                    else:
                        prev_info["values"].append(getattr(row, t_name))
            # When encounter different sentence, store data and reset information collection
            else:
                if prev_info["aid"] is not None:
                    # Store data instance
                    rtn["aid"].append(prev_info["aid"])
                    rtn["pid"].append(prev_info["pid"])
                    rtn["prefix"].append(t_name.lower())
                    rtn["input_text"].append(prev_info["sentence"])
                    rtn["target_text"].append(get_time_target_text(prev_info, t_name))

                # Reset information collection
                if not pd.isna(getattr(row, t_name)) and not pd.isna(row.TimeInfo):
                    if t_name == "Duration":
                        if not pd.isna(row.Time):
                            value = [row.Time]
                        elif not pd.isna(row.Age):
                            value = [row.Age]
                        elif not pd.isna(row.Vague):
                            value = [row.Vague]
                        elif not pd.isna(row.Ago):
                            value = [row.Ago]
                        else:
                            print(f"{row.AID} {row.PID}")
                    else:
                        value = [getattr(row, t_name)]
                else:
                    value = []
                prev_info["aid"] = row.AID
                prev_info["pid"] = row.PID
                prev_info["sentence"] = row.Sentence
                prev_info["values"] = value

        # Store data instance
        rtn["aid"].append(prev_info["aid"])
        rtn["pid"].append(prev_info["pid"])
        rtn["prefix"].append(t_name.lower())
        rtn["input_text"].append(prev_info["sentence"])
        rtn["target_text"].append(get_time_target_text(prev_info, t_name))

    return pd.DataFrame(rtn)


def process_data(df: pd.DataFrame) -> pd.DataFrame:

    rtn = pd.DataFrame()
    rtn = pd.concat([rtn, process_event(df)], ignore_index=False)
    rtn = pd.concat([rtn, process_time(df)], ignore_index=False)

    return rtn

data = pd.read_excel("./data/raw/data.xlsx", sheet_name=None, engine='openpyxl', dtype=str)
sheet = pd.ExcelFile("./data/raw/data.xlsx", engine='openpyxl')
processed = pd.DataFrame()
for s_name in sheet.sheet_names:
    df = data.get(s_name)
    df.Admissindate = pd.to_datetime(df.Admissindate, format="%Y-%m-%d")
    processed = pd.concat([processed, process_data(df)], ignore_index=False)

# Save
processed.to_excel("./data/processed/data_binary.xlsx", index=False)

In [None]:
import pandas as pd
from transformers import AutoTokenizer

data = pd.read_excel("./data/processed/data_binary.xlsx")
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base", cache_dir="/nfs/nas-7.1/chchen/cache/huggingface/")
input_len = [len(tokenizer(text)["input_ids"]) for text in data.input_text]
target_len = [len(tokenizer(text)["input_ids"]) for text in data.target_text]
print(max(input_len), max(target_len))

In [32]:
import pandas as pd
from pathlib import Path

EXCLUDE_SHEETS = ["500篇ID說明", "500篇ID處理說明", "工作表1"]
COLUMNS = [
    "AID", "PID", "Admissindate", "Sentence",
    "Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "TimeInfo",
    "Remission", "Response", "緩解時間", "Acute", "急性住院時間", "DayCare", "慢性住院時間", "Episode", "Episode時間"
]

data_dir = "./data/raw/"
data_dir_path = Path(data_dir)
processed = pd.DataFrame()
for file in data_dir_path.iterdir():
    # Load data
    data = pd.read_excel(file, sheet_name=None, engine='openpyxl', dtype=str)
    # Access sheet name
    sheet = pd.ExcelFile(file, engine='openpyxl')
    sheet = [s for s in sheet.sheet_names if s not in EXCLUDE_SHEETS]
    assert len(sheet) == 1
    sheet_name = sheet[0]
    # Select sheet and set columns
    df = data.get(sheet_name)
    df = df[COLUMNS]
    df.Admissindate = pd.to_datetime(df.Admissindate, format="%Y-%m-%d")
    # Process
    processed = pd.concat([processed, process_data(df)], ignore_index=False)

# Save
processed_file = "./data/processed/data_binary.xlsx"
processed.to_excel(processed_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [None]:
def add_order(df: pd.DataFrame) -> pd.DataFrame:
    """Indicate the order of sentence."""

    aid = df["AID"].values.tolist()
    
    order = []
    cur_aid = None
    cur_order = 0
    for idx in aid:
        if cur_aid != idx:
            cur_aid = idx
            cur_order = 0
        order.append(cur_order)
        cur_order += 1

    df["Order"] = order

    return df

In [1]:
# TODO: use rule to inference event time

In [None]:
import pandas as pd

output_file = "./outputs/results.xlsx"
df = pd.read_excel(output_file)

In [3]:
from sklearn.metrics import classification_report
# Event
df_event = df[df["prefix"] == "event detection"]
y_true = []
y_pred = []
labels = {"Acute": 0, "DayCare":1 , "Episode": 2, "Remission": 3, "None": 4}
for y in df_event.target_text.values.tolist():
    vector = [0 for _ in range(len(labels))]
    for y_i in y.split(","):
        y_i = y_i.strip()
        vector[labels[y_i]] = 1
    y_true.append(vector)
for y in df_event.pred_text.values.tolist():
    vector = [0 for _ in range(len(labels))]
    for y_i in y.split(","):
        y_i = y_i.strip()
        vector[labels[y_i]] = 1
    y_pred.append(vector)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       220
           1       0.79      0.39      0.52        28
           2       0.52      0.36      0.42       192
           3       0.68      0.53      0.60       122
           4       0.92      0.95      0.93      2144

   micro avg       0.89      0.88      0.88      2706
   macro avg       0.76      0.63      0.68      2706
weighted avg       0.88      0.88      0.87      2706
 samples avg       0.89      0.88      0.88      2706



In [7]:
df_time = df[df["prefix"] == "time extraction"]
correct = sum([target == pred for target, pred in zip(df_time.target_text.values, df_time.pred_text.values)])
correct_rate = correct / len(df_time)
print(correct_rate)

0.880246020260492


In [11]:
df_time = df[df["prefix"] == "time extraction"]
correct = sum([target == pred for target, pred in zip(df_time.target_text.values, df_time.pred_text.values)if target != "None"])
total = sum([1 for target in df_time.target_text.values if target != "None"])
correct_rate = correct / total
print(correct_rate)

0.7480490523968785
