### Data Process

In [15]:
import pandas as pd


def process_event(df: pd.DataFrame) -> pd.DataFrame:
    """Build one "event detection" example per run of identical sentences.

    Args:
        df: Rows with the columns AID, PID, Sentence, Order, Remission,
            Response, Acute, DayCare and Episode. A non-NaN cell marks the
            event as present; Response is reported as "Remission".

    Returns:
        A DataFrame with the columns aid, pid, prefix, input_text and
        target_text. Consecutive rows sharing the same sentence are merged
        into a single example whose target lists every event seen, each
        label at most once, or "None" when no event was seen.
    """
    rtn = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

    prev_sentence = None

    for row in df.itertuples():
        # Labels carried by the current row
        labels = []
        if not pd.isna(row.Remission) or not pd.isna(row.Response):
            labels.append("Remission")
        if not pd.isna(row.Acute):
            labels.append("Acute")
        if not pd.isna(row.DayCare):
            labels.append("DayCare")
        if not pd.isna(row.Episode):
            labels.append("Episode")

        # A new sentence (or the very first row) starts a new example
        if not rtn["target_text"] or prev_sentence != row.Sentence:
            rtn["aid"].append(row.AID)
            rtn["pid"].append(row.PID)
            rtn["prefix"].append("event detection")
            rtn["input_text"].append(f"{row.Sentence} order: {row.Order}. options: Remission, Acute, DayCare, Episode.")
            rtn["target_text"].append(", ".join(labels) if labels else "None")
        elif labels:
            # Merge label by label: comparing whole strings (e.g. "Remission, Episode"
            # against "Remission, Acute") would append "Remission" a second time.
            existing = rtn["target_text"][-1]
            merged = [] if existing == "None" else existing.split(", ")
            merged += [label for label in labels if label not in merged]
            rtn["target_text"][-1] = ", ".join(merged)
        # Store sentence
        prev_sentence = row.Sentence

    return pd.DataFrame(rtn)


def process_time(df: pd.DataFrame) -> pd.DataFrame:
    """Build one "time extraction" example per run of identical sentences.

    Rows whose TimeInfo is NaN contribute the target "None". Rows with a
    non-NaN Duration come in pairs: the first row only remembers its
    Time_YMD, the second one emits "duration: <first> to <second>". All other
    rows emit the non-NaN fields among time / vague / age / ago.

    Returns:
        A DataFrame with the columns aid, pid, prefix, input_text and
        target_text.
    """
    rtn = {key: [] for key in ("aid", "pid", "prefix", "input_text", "target_text")}

    pending_start = None
    last_sentence = None

    for row in df.itertuples():
        if pd.isna(row.TimeInfo):
            label = "None"
        elif pd.isna(row.Duration):
            fields = (
                ("time", row.Time_YMD),
                ("vague", row.Vague),
                ("age", row.Age),
                ("ago", row.Ago_YMD),
            )
            pieces = [f"{key}: {value}." for key, value in fields if not pd.isna(value)]
            assert len(pieces) != 0, row.AID
            label = " ".join(pieces)
        elif pending_start is None:
            # First half of a duration pair: remember it and wait for the second half
            pending_start = row.Time_YMD
            continue
        else:
            label = f"duration: {pending_start} to {row.Time_YMD}"
            pending_start = None

        repeated = row.Sentence == last_sentence
        last_sentence = row.Sentence

        if not repeated:
            rtn["aid"].append(row.AID)
            rtn["pid"].append(row.PID)
            rtn["prefix"].append("time extraction")
            rtn["input_text"].append(f"{row.Sentence} admission date: {row.Admissindate}. options: time, vague, age, ago.")
            rtn["target_text"].append(label)
        elif label != "None":
            # Same sentence as the previous example: extend (or replace a "None") target
            targets = rtn["target_text"]
            targets[-1] = label if targets[-1] == "None" else f"{targets[-1]} {label}"

    return pd.DataFrame(rtn)


def process_data(df: pd.DataFrame) -> pd.DataFrame:
    """Stack the event examples and the time examples built from ``df``.

    The event examples come first; the original row labels of both parts are
    kept (``ignore_index=False``).
    """
    parts = [pd.DataFrame(), process_event(df), process_time(df)]
    return pd.concat(parts, ignore_index=False)

In [None]:
import pandas as pd
from pathlib import Path

# Sheets that hold explanations rather than annotated data
EXCLUDE_SHEETS = ["500篇ID說明", "500篇ID處理說明", "工作表1"]
COLUMNS = [
    "AID", "PID", "Admissindate", "Sentence",
    "Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "TimeInfo",
    "Remission", "Response", "緩解時間", "Acute", "急性住院時間", "DayCare", "慢性住院時間", "Episode", "Episode時間"
]

data_dir = "./data/raw/"
data_dir_path = Path(data_dir)
processed_parts = []
for file in data_dir_path.iterdir():
    # Load every sheet once; the dict keys are the sheet names in workbook order
    data = pd.read_excel(file, sheet_name=None, engine='openpyxl', dtype=str)
    sheet = [s for s in data if s not in EXCLUDE_SHEETS]
    assert len(sheet) == 1, f"{file}: expected exactly one data sheet, found {sheet}"
    sheet_name = sheet[0]
    # Select sheet and set columns; copy so the assignments below do not target a slice
    df = data[sheet_name][COLUMNS].copy()
    df["Admissindate"] = pd.to_datetime(df["Admissindate"], format="%Y-%m-%d")
    # process_event reads row.Order, which COLUMNS does not provide: number the rows
    # inside each consecutive run of the same AID, starting from 0.
    # NOTE(review): this mirrors the add_order helper defined further down — confirm
    # that this is the intended meaning of Order.
    run_id = (df["AID"] != df["AID"].shift()).cumsum()
    df["Order"] = df.groupby(run_id).cumcount()
    # Process
    processed_parts.append(process_data(df))

# Concatenate once instead of growing the frame inside the loop
processed = pd.concat(processed_parts, ignore_index=False) if processed_parts else pd.DataFrame()

# Save
processed_file = "./data/processed/data.xlsx"
processed.to_excel(processed_file, index=False)

In [1]:
import pandas as pd
import numpy as np

# keep_default_na=False: recent pandas versions list "None" among the default NA
# markers, which would turn the literal "None" targets into NaN on reload.
data = pd.read_excel("./data/processed/data.xlsx", engine='openpyxl', dtype=str, keep_default_na=False)

# Shuffle once with a fixed seed, then cut at 70% / 80% (train / validate / test).
# Positional slicing replaces np.split, whose use on a DataFrame is deprecated.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.7*len(data))
validate_end = int(.8*len(data))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

print(f"Number of train data: {len(train)}")
print(f"Number of validation data: {len(validate)}")
print(f"Number of test data: {len(test)}")
print(f"Number of total data: {len(data)}")

Number of train data: 19049
Number of validation data: 2721
Number of test data: 5443
Number of total data: 27213


### Binary Datasets

* All tasks are merged into one dataset and distinguished by prefix, e.g. prefix=acute, prefix=vague, ...
* It is not possible to identify which time expression an event belongs to, so the same sentence may have both None and non-None targets.

In [1]:
from typing import Dict
from pathlib import Path
import pandas as pd
from datetime import datetime

EVENT_NAME = ["Remission", "Acute", "DayCare", "Episode"]


def process_event(df: pd.DataFrame) -> pd.DataFrame:
    """Build one binary example per (row, event type).

    Args:
        df: Rows with the columns AID, PID, Sentence, Response and one column
            per name in EVENT_NAME. A non-NaN cell marks the event as present.

    Returns:
        A de-duplicated DataFrame with the columns aid, pid, prefix,
        input_text and target_text. The prefix is the lower-cased event name
        and the target is the event name, or "None" when the event is absent.
        A non-NaN Response also counts as "Remission".
    """
    rtn = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

    for row in df.itertuples():
        for e_name in EVENT_NAME:
            present = not pd.isna(getattr(row, e_name))
            # Response is folded into Remission. The original used `==` here, so the
            # label was never assigned (NameError or a stale value from the last pass).
            if e_name == "Remission" and not pd.isna(row.Response):
                present = True
            target_text = e_name if present else "None"
            rtn["aid"].append(row.AID)
            rtn["pid"].append(row.PID)
            rtn["prefix"].append(e_name.lower())
            rtn["input_text"].append(row.Sentence)
            rtn["target_text"].append(target_text)

    return pd.DataFrame(rtn).drop_duplicates()


TIME_NAME = ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]


def get_time_target_text(prev_info: Dict, t_name: str) -> str:
    """Render the values collected for one sentence as a target string.

    Returns "None" when no value was collected. For "Duration" the values are
    paired as "<start> to <end>"; an odd number of values fails the assertion.
    Entries are joined with ". " and the text ends with ".".
    """
    values = prev_info["values"]
    # The target text is None when there is no values
    if len(values) == 0:
        return "None"
    # Pair the duration values
    if t_name == "Duration":
        assert len(values) % 2 == 0, prev_info
        values = [f"{values[i]} to {values[i+1]}" for i in range(0, len(values), 2)]
    # Merge the collected values
    return ". ".join(values) + "."


def _time_values(row, t_name: str) -> list:
    """Return the values one row contributes to ``t_name`` (possibly empty)."""
    if pd.isna(getattr(row, t_name)) or pd.isna(row.TimeInfo):
        return []
    if t_name != "Duration":
        return [getattr(row, t_name)]
    # A duration row carries its endpoint in one of the other time columns
    for column in ("Time", "Age", "Vague", "Ago"):
        value = getattr(row, column)
        if not pd.isna(value):
            return [value]
    # Duration is flagged but no endpoint is available: report the row, add nothing
    print(f"{row.AID} {row.PID}")
    return []


def _store_time_instance(rtn: Dict, prev_info: Dict, t_name: str) -> None:
    """Append the example described by ``prev_info`` to the output columns."""
    rtn["aid"].append(prev_info["aid"])
    rtn["pid"].append(prev_info["pid"])
    rtn["prefix"].append(t_name.lower())
    rtn["input_text"].append(prev_info["sentence"])
    rtn["target_text"].append(get_time_target_text(prev_info, t_name))


def process_time(df: pd.DataFrame) -> pd.DataFrame:
    """Build one binary example per (run of identical sentences, time type).

    Args:
        df: Rows with the columns AID, PID, Sentence, TimeInfo and one column
            per name in TIME_NAME.

    Returns:
        A DataFrame with the columns aid, pid, prefix, input_text and
        target_text, where the prefix is the lower-cased time type.
    """
    rtn = {"aid": [], "pid": [], "prefix": [], "input_text": [], "target_text": []}

    for t_name in TIME_NAME:
        # Start every pass from a clean state. Carrying the last sentence of the
        # previous pass over would store it again under the new prefix, with the
        # values collected for the previous time type.
        prev_info = None
        for row in df.itertuples():
            values = _time_values(row, t_name)
            # Merge data with the same sentence
            if prev_info is not None and row.Sentence == prev_info["sentence"]:
                prev_info["values"].extend(values)
                continue
            # When encounter different sentence, store data and reset information collection
            if prev_info is not None:
                _store_time_instance(rtn, prev_info, t_name)
            prev_info = {"aid": row.AID, "pid": row.PID, "sentence": row.Sentence, "values": values}

        # Store the last data instance of this pass (nothing to store for an empty frame)
        if prev_info is not None:
            _store_time_instance(rtn, prev_info, t_name)

    return pd.DataFrame(rtn)


def process_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return the event examples followed by the time examples for ``df``.

    Row labels of both parts are kept as they are (``ignore_index=False``).
    """
    parts = [pd.DataFrame(), process_event(df), process_time(df)]
    return pd.concat(parts, ignore_index=False)

# sheet_name=None loads every sheet into a dict keyed by sheet name, so the
# workbook does not have to be opened a second time just to list its sheets.
data = pd.read_excel("./data/raw/data.xlsx", sheet_name=None, engine='openpyxl', dtype=str)
processed_parts = []
for s_name, df in data.items():
    df["Admissindate"] = pd.to_datetime(df["Admissindate"], format="%Y-%m-%d")
    processed_parts.append(process_data(df))
# Concatenate once instead of growing the frame sheet by sheet
processed = pd.concat(processed_parts, ignore_index=False) if processed_parts else pd.DataFrame()

# Save
processed.to_excel("./data/processed/data_binary.xlsx", index=False)

In [None]:
import pandas as pd
from transformers import AutoTokenizer

# keep_default_na=False keeps the literal "None" targets as strings; recent pandas
# versions would otherwise read them as NaN, which the tokenizer cannot encode.
data = pd.read_excel("./data/processed/data_binary.xlsx", keep_default_na=False)
# NOTE(review): cache_dir is a machine-specific absolute path.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base", cache_dir="/nfs/nas-7.1/chchen/cache/huggingface/")
# Longest tokenized input and target, to choose the maximum sequence lengths
input_len = [len(tokenizer(text)["input_ids"]) for text in data.input_text]
target_len = [len(tokenizer(text)["input_ids"]) for text in data.target_text]
print(max(input_len), max(target_len))

In [32]:
import pandas as pd
from pathlib import Path

# Sheets that hold explanations rather than annotated data
EXCLUDE_SHEETS = ["500篇ID說明", "500篇ID處理說明", "工作表1"]
COLUMNS = [
    "AID", "PID", "Admissindate", "Sentence",
    "Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "TimeInfo",
    "Remission", "Response", "緩解時間", "Acute", "急性住院時間", "DayCare", "慢性住院時間", "Episode", "Episode時間"
]

data_dir = "./data/raw/"
data_dir_path = Path(data_dir)
processed_parts = []
for file in data_dir_path.iterdir():
    # Load every sheet once; the dict keys are the sheet names in workbook order
    data = pd.read_excel(file, sheet_name=None, engine='openpyxl', dtype=str)
    sheet = [s for s in data if s not in EXCLUDE_SHEETS]
    assert len(sheet) == 1, f"{file}: expected exactly one data sheet, found {sheet}"
    sheet_name = sheet[0]
    # Select sheet and set columns; the copy avoids the SettingWithCopyWarning that
    # assigning onto the column selection produced.
    df = data[sheet_name][COLUMNS].copy()
    df["Admissindate"] = pd.to_datetime(df["Admissindate"], format="%Y-%m-%d")
    # Process
    processed_parts.append(process_data(df))

# Concatenate once instead of growing the frame inside the loop
processed = pd.concat(processed_parts, ignore_index=False) if processed_parts else pd.DataFrame()

# Save
processed_file = "./data/processed/data_binary.xlsx"
processed.to_excel(processed_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [None]:
def add_order(df: pd.DataFrame) -> pd.DataFrame:
    """Number the rows inside each consecutive run of the same AID.

    The counter restarts at 0 whenever the AID differs from the previous
    row's AID. The result is written to a new "Order" column of ``df``
    itself, and the same frame is returned.
    """
    order = []
    last_seen = None
    position = -1
    for article_id in df["AID"].values.tolist():
        if article_id != last_seen:
            last_seen = article_id
            position = 0
        else:
            position += 1
        order.append(position)

    df["Order"] = order

    return df

In [1]:
# TODO: use rules to infer the event time

In [None]:
import pandas as pd

output_file = "./outputs/results.xlsx"
# keep_default_na=False keeps the literal "None" labels as strings; recent pandas
# versions would otherwise read them as NaN and break the evaluation cells.
df = pd.read_excel(output_file, keep_default_na=False)

In [3]:
from sklearn.metrics import classification_report
# Event
df_event = df[df["prefix"] == "event detection"]
labels = {"Acute": 0, "DayCare":1 , "Episode": 2, "Remission": 3, "None": 4}


def _to_indicator(text):
    """Turn a comma-separated label string into a 0/1 vector indexed by ``labels``."""
    vector = [0] * len(labels)
    for name in text.split(","):
        # A label that is not a key of ``labels`` raises KeyError
        vector[labels[name.strip()]] = 1
    return vector


y_true = [_to_indicator(y) for y in df_event.target_text.values.tolist()]
y_pred = [_to_indicator(y) for y in df_event.pred_text.values.tolist()]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       220
           1       0.79      0.39      0.52        28
           2       0.52      0.36      0.42       192
           3       0.68      0.53      0.60       122
           4       0.92      0.95      0.93      2144

   micro avg       0.89      0.88      0.88      2706
   macro avg       0.76      0.63      0.68      2706
weighted avg       0.88      0.88      0.87      2706
 samples avg       0.89      0.88      0.88      2706



In [7]:
# Exact-match rate over all time-extraction examples
df_time = df[df["prefix"] == "time extraction"]
correct = sum(target == pred for target, pred in zip(df_time.target_text.values, df_time.pred_text.values))
correct_rate = correct / len(df_time)
print(correct_rate)

0.880246020260492


In [11]:
# Exact-match rate restricted to examples whose target is not "None"
df_time = df[df["prefix"] == "time extraction"]
pairs = [
    (target, pred)
    for target, pred in zip(df_time.target_text.values, df_time.pred_text.values)
    if target != "None"
]
correct = sum(target == pred for target, pred in pairs)
total = len(pairs)
correct_rate = correct / total
print(correct_rate)

0.7480490523968785


### Binary Evaluation

In [None]:
from typing import List, Dict
from pathlib import Path
import json
import pandas as pd
from sklearn.metrics import classification_report

def compute_event_metric(labels: List, preds: List) -> str:
    """Return sklearn's classification report for ``labels`` vs ``preds``, with 4 digits."""
    return classification_report(labels, preds, digits=4)

def compute_time_metric(labels: List, preds: List) -> Dict:
    """Count the outcomes of exact-match time extraction.

    Returns a dict with:
        tp: prediction equals the label and both are not "None"
        tn: prediction equals the label and both are "None"
        fp: label is "None" but the prediction is not
        fn: prediction is "None" but the label is not
        ff: both are not "None" but they differ
    """
    df = pd.DataFrame({"labels": labels, "preds": preds})
    agree = df.labels == df.preds
    label_none = df.labels == "None"
    pred_none = df.preds == "None"

    def count(mask) -> int:
        # Plain int so the result stays JSON-serialisable
        return int(mask.sum())

    return {
        "tp": count(agree & ~label_none & ~pred_none),
        "tn": count(agree & label_none & pred_none),
        "fp": count(~agree & label_none & ~pred_none),
        "fn": count(~agree & ~label_none & pred_none),
        "ff": count(~agree & ~label_none & ~pred_none),
    }

def load_result(path: str) -> pd.DataFrame:
    """Read the result workbook at ``path``.

    ``keep_default_na=False`` keeps the literal "None" labels as strings;
    recent pandas versions would otherwise read them as NaN.
    """
    return pd.read_excel(path, keep_default_na=False)


result_path = "./outputs/mt5-binary/results.xlsx"
output_dir = "./outputs/mt5-binary/"
data = load_result(result_path)
report_dir = Path(output_dir)

# Event: one classification report per event prefix
for event in ["remission", "acute", "daycare", "episode"]:
    df = data[data.prefix==event]
    targets = df.target_text.values.tolist()
    predictions = df.pred_text.values.tolist()
    metric = compute_event_metric(targets, predictions)
    output_path = report_dir / f"cr_{event}.txt"
    output_path.write_text(metric)

# Time: one outcome count (as JSON) per time prefix
for time in ["duration", "time", "vague", "age", "ago", "persistence"]:
    df = data[data.prefix==time]
    targets = df.target_text.values.tolist()
    predictions = df.pred_text.values.tolist()
    metric = compute_time_metric(targets, predictions)
    output_path = report_dir / f"cm_{time}.txt"
    output_path.write_text(json.dumps(metric, indent=2, ensure_ascii=False))

### Predict Event and Time Concurrently

Merge all data

In [7]:
import pandas as pd
import re

def extract_chinese(text):
    """Return every maximal run of characters in U+4E00..U+9FA5 found in ``str(text)``."""
    return re.findall("[\u4e00-\u9fa5]+", str(text))

data_path = "./data/raw/data.xlsx"
data = pd.read_excel(data_path)
text_columns = ["Sentence", "Duration", "Time", "Vague", "Age", "Ago", "Persistence"]
chinese_segments = []
for row in data.itertuples():
    for col in text_columns:
        cell = getattr(row, col)
        if not pd.isna(cell):
            # Extending with an empty result is a no-op
            chinese_segments.extend(extract_chinese(cell))
# De-duplicate before exporting the segments for translation
chieses_segments = list(set(chinese_segments))
pd.DataFrame({"text": chieses_segments}).to_excel("./data/processed/chinese_segments.xlsx", index=False)

In [8]:
# Number of distinct Chinese segments (the variable keeps its `chieses_segments` spelling)
len(chieses_segments)

974

In [20]:
import pandas as pd

chinese_segments_translated = pd.read_excel("./data/processed/chinese_segments_translated.xlsx")
# Chinese segment -> English translation
chinese_to_english = dict(zip(chinese_segments_translated.Chinese, chinese_segments_translated.English))

def replace_chinese_with_english(text):
    """Replace every Chinese segment of ``text`` with its translation.

    NaN is returned unchanged. A segment missing from ``chinese_to_english``
    raises KeyError.
    """
    if pd.isna(text):
        return text
    # Longest segments first
    for seg in sorted(extract_chinese(text), key=len, reverse=True):
        text = text.replace(seg, chinese_to_english[seg])
    return text

data_path = "./data/raw/data.xlsx"
data = pd.read_excel(data_path)
# Translate every text column in place, then save the translated copy
text_columns = ["Sentence", "Duration", "Time", "Vague", "Age", "Ago", "Persistence"]
for col in text_columns:
    translated = data[col].apply(replace_chinese_with_english)
    data[col] = translated

data.to_excel("./data/raw/data_translated.xlsx", index=False)

In [18]:
import pandas as pd
import re

def extract_chinese(text):
    """Return every maximal run of characters in U+4E00..U+9FA5 found in ``str(text)``."""
    return re.findall("[\u4e00-\u9fa5]+", str(text))

data_path = "./data/raw/data_translated.xlsx"
data = pd.read_excel(data_path)
text_columns = ["Sentence", "Duration", "Time", "Vague", "Age", "Ago", "Persistence"]
# Collect whatever Chinese text is still present in the translated file
chinese_segments = []
for row in data.itertuples():
    for col in text_columns:
        cell = getattr(row, col)
        if not pd.isna(cell):
            # Extending with an empty result is a no-op
            chinese_segments.extend(extract_chinese(cell))
chinese_segments = list(set(chinese_segments))

In [9]:
import pandas as pd

TIME_NAME = ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]
EVENT_NAME = ["Remission", "Response", "Acute", "DayCare", "Episode"]

# Stack every sheet of the workbook into one frame
data = pd.read_excel("./data/raw/data.xlsx", sheet_name=None, engine='openpyxl', dtype=str)
sheet = pd.ExcelFile("./data/raw/data.xlsx", engine='openpyxl')
concat_df = pd.concat([data.get(s_name) for s_name in sheet.sheet_names])

# Time columns become strings ("" when missing); event flags and TimeInfo become ints (0 when missing)
for t_name in TIME_NAME:
    concat_df[t_name] = concat_df[t_name].fillna("").astype(str)
for e_name in EVENT_NAME+["TimeInfo"]:
    concat_df[e_name] = concat_df[e_name].fillna(0).astype(int)

# Drop the rows whose Vague value is "Fact" and keep only the columns used below
concat_df = concat_df.loc[
    concat_df.Vague != "Fact",
    ["AID", "PID", "Sentence", "TimeInfo"]+TIME_NAME+EVENT_NAME,
]

def clear_no_time(row):
    """Blank every TIME_NAME field of a row whose TimeInfo is 0; return the row."""
    if row.TimeInfo != 0:
        return row
    for t_name in TIME_NAME:
        setattr(row, t_name, "")
    return row

concat_df = concat_df.apply(clear_no_time, axis=1)

def extract_duration(row):
    """Move the first non-empty time value of a duration row into Duration.

    Rows with an empty Duration are returned untouched. Otherwise the first
    non-empty field among TIME_NAME[1:] replaces Duration and is blanked.
    """
    if row.Duration == "":
        return row
    source = next((t_name for t_name in TIME_NAME[1:] if getattr(row, t_name) != ""), None)
    if source is not None:
        row.Duration = getattr(row, source)
        setattr(row, source, "")
    return row

concat_df = concat_df.apply(extract_duration, axis=1)

# https://stackoverflow.com/questions/33279940/how-to-combine-multiple-rows-of-strings-into-one-using-pandas
def _join_non_empty(values):
    """Join the non-empty strings of one group with ', '."""
    return ', '.join([value for value in values if value != ""])


def _group_total(values):
    """Add up the integer flags of one group."""
    return sum(values)


# One aggregation per column: time columns are joined, event flags are summed
aggregations = {t_name: _join_non_empty for t_name in ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]}
aggregations.update({e_name: _group_total for e_name in ["Remission", "Response", "Acute", "DayCare", "Episode"]})
concat_df = concat_df.groupby(["AID", "PID", "Sentence", "TimeInfo"]).agg(aggregations)

# Convert Index to Column
concat_df = concat_df.reset_index()

In [10]:
import pandas as pd
import re

def extract_chinese(text):
    """Return every maximal run of characters in U+4E00..U+9FA5 found in ``str(text)``."""
    return re.findall("[\u4e00-\u9fa5]+", str(text))

text_columns = ["Sentence", "Duration", "Time", "Vague", "Age", "Ago", "Persistence"]
chinese_segments = []
for row in concat_df.itertuples():
    for col in text_columns:
        cell = getattr(row, col)
        if not pd.isna(cell):
            # Extending with an empty result is a no-op
            chinese_segments.extend(extract_chinese(cell))
# De-duplicate before exporting the segments for translation
chieses_segments = list(set(chinese_segments))
pd.DataFrame({"text": chieses_segments}).to_excel("./data/processed/chinese_segments.xlsx", index=False)

In [11]:
import pandas as pd

chinese_segments_translated = pd.read_excel("./data/processed/chinese_segments_translated.xlsx")
# Chinese segment -> English translation
chinese_to_english = dict(zip(chinese_segments_translated.Chinese, chinese_segments_translated.English))

def replace_chinese_with_english(text):
    """Replace every Chinese segment of ``text`` with its translation.

    NaN is returned unchanged. A segment missing from ``chinese_to_english``
    raises KeyError.
    """
    if pd.isna(text):
        return text
    # Longest segments first
    for seg in sorted(extract_chinese(text), key=len, reverse=True):
        text = text.replace(seg, chinese_to_english[seg])
    return text

# Translate every text column of the aggregated frame
text_columns = ["Sentence", "Duration", "Time", "Vague", "Age", "Ago", "Persistence"]
for col in text_columns:
    concat_df[col] = concat_df[col].apply(replace_chinese_with_english)

In [12]:
def concat_duration(x):
    """Pair up a ", "-separated list of duration endpoints.

    "a, b, c, d" becomes "a to b, c to d"; the empty string is returned
    unchanged. An odd number of endpoints raises IndexError.
    """
    if x == "":
        return x
    endpoints = x.split(", ")
    pairs = [f"{endpoints[i]} to {endpoints[i+1]}" for i in range(0, len(endpoints), 2)]
    return ", ".join(pairs)

# Rewrite the joined duration endpoints as "<start> to <end>" pairs
concat_df["Duration"] = concat_df["Duration"].apply(concat_duration)

In [13]:
TIME_NAME = ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]
EVENT_NAME = ["Remission", "Response", "Acute", "DayCare", "Episode"]

# Every aggregated sentence yields two examples: time extraction, then event detection
parsed_data = []
for row in concat_df.itertuples():
    time_list = [
        f"{t_name}: {getattr(row, t_name)}"
        for t_name in TIME_NAME
        if getattr(row, t_name) != ""
    ]
    time_text = ". ".join(time_list) if len(time_list) != 0 else "None"
    parsed_data.append([row.AID, row.PID, "time extraction", row.Sentence, time_text])

    event_list = []
    for e_name in EVENT_NAME:
        if getattr(row, e_name) > 0:
            if e_name != "Response":
                event_list.append(e_name)
            elif "Remission" not in event_list:
                # Response is reported as Remission, at most once
                event_list.append("Remission")
    event_text = ", ".join(event_list) if len(event_list) != 0 else "None"
    parsed_data.append([row.AID, row.PID, "event detection", row.Sentence, event_text])

parsed_df = pd.DataFrame(
    parsed_data,
    columns=["aid", "pid", "prefix", "input_text", "target_text"]
)
parsed_df.to_excel("./data/processed/data_translated_parsed.xlsx", index=False)

In [91]:
import pandas as pd
from transformers import AutoTokenizer

# keep_default_na=False keeps the literal "None" targets as strings; recent pandas
# versions would otherwise read them as NaN, which the tokenizer cannot encode.
data = pd.read_excel("./data/processed/data_translated_parsed.xlsx", keep_default_na=False)
# NOTE(review): cache_dir is a machine-specific absolute path.
tokenizer = AutoTokenizer.from_pretrained("t5-base", cache_dir="/nfs/nas-7.1/chchen/cache/huggingface/")
# Longest tokenized input and target, to choose the maximum sequence lengths
input_len = [len(tokenizer(text)["input_ids"]) for text in data.input_text]
target_len = [len(tokenizer(text)["input_ids"]) for text in data.target_text]
print(max(input_len), max(target_len))

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors


526 78


Event Evaluation

In [1]:
import pandas as pd

# keep_default_na=False keeps the literal "None" labels as strings; recent pandas
# versions would otherwise read them as NaN and break the evaluation cells.
df = pd.read_excel("./outputs/t5-translated/outputs.xlsx", keep_default_na=False)

In [3]:
from sklearn.metrics import classification_report

df_event = df[df["prefix"] == "event detection"]
labels = {"Acute": 0, "DayCare":1 , "Episode": 2, "Remission": 3, "None": 4}


def _to_indicator(text):
    """Turn a comma-separated label string into a 0/1 vector indexed by ``labels``.

    "Response" is counted as "Remission". A label that is not a key of
    ``labels`` raises KeyError.
    """
    vector = [0] * len(labels)
    for name in text.split(","):
        name = name.strip()
        if name == "Response":
            name = "Remission"
        vector[labels[name]] = 1
    return vector


y_true = [_to_indicator(y) for y in df_event.target_text.values.tolist()]
y_pred = [_to_indicator(y) for y in df_event.pred_text.values.tolist()]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.85      0.88       234
           1       0.58      0.41      0.48        17
           2       0.80      0.84      0.82      1010
           3       0.72      0.71      0.71       184
           4       0.86      0.82      0.84      1223

   micro avg       0.83      0.82      0.82      2668
   macro avg       0.77      0.73      0.75      2668
weighted avg       0.83      0.82      0.82      2668
 samples avg       0.83      0.83      0.82      2668



In [4]:
# Exact-match rate over all time-extraction examples
df_time = df[df["prefix"] == "time extraction"]
correct = sum(target == pred for target, pred in zip(df_time.target_text.values, df_time.pred_text.values))
correct_rate = correct / len(df_time)
print(correct_rate)

0.8875286916602907


In [8]:
TIME_NAME = ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]

df_time = df[df["prefix"] == "time extraction"]


def _extract_time_field(text, t_name):
    """Return the part of ``text`` from ``t_name`` up to the next ".", or "None".

    "None" is returned when ``t_name`` does not occur in ``text``; when no "."
    follows, the rest of the text is returned.
    """
    start = text.find(t_name)
    if start == -1:
        return "None"
    end = text.find(".", start)
    return text[start:] if end == -1 else text[start:end]


for t_name in TIME_NAME:
    correct = 0
    total = 0
    for row in df_time.itertuples():
        target_text = _extract_time_field(row.target_text, t_name)
        pred_text = _extract_time_field(row.pred_text, t_name)

        # Examples where the field is absent on both sides are not counted
        if target_text == pred_text == "None":
            continue

        if target_text == pred_text:
            correct += 1

        total += 1

    # Guard against a field that never occurs (total == 0) instead of dividing by zero
    rate = round(correct/total, 4) if total else float("nan")
    print(f"{t_name}: {correct}/{total}={rate}")

Duration: 35/57=0.614
Time: 470/548=0.8577
Vague: 45/145=0.3103
Age: 68/86=0.7907
Ago: 148/216=0.6852
Persistence: 4/43=0.093


### Process data

In [2]:
import re
import pandas as pd
import numpy as np

Collect Chinese segments

In [6]:
def extract_chinese(text):
    """Return every maximal run of characters in U+4E00..U+9FA5 found in ``str(text)``."""
    return re.findall("[\u4e00-\u9fa5]+", str(text))

In [8]:
data = pd.read_excel("data/raw/w6/data_230406.xlsx")
text_columns = ["Sentence", "Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "Persistence"]
chinese_segments = []
for i, row in data.iterrows():
    for col in text_columns:
        cell = row[col]
        if not pd.isna(cell):
            # Extending with an empty result is a no-op
            chinese_segments.extend(extract_chinese(cell))
# De-duplicate before exporting the segments for translation
chieses_segments = list(set(chinese_segments))
pd.DataFrame({"text": chieses_segments}).to_excel("data/raw/w6/chinese_segments.xlsx", index=False)

After using Google translation, replace Chinese with English

In [9]:
translation_table = pd.read_excel("data/raw/w6/chinese_to_english.xlsx")
# Chinese segment -> English translation
chinese_to_english = dict(zip(translation_table.Chinese, translation_table.English))

def replace_chinese_with_english(text):
    """Replace every Chinese segment of ``text`` with its space-padded translation.

    NaN is returned unchanged. A segment missing from ``chinese_to_english``
    raises KeyError.
    """
    if pd.isna(text):
        return text
    # Longest match first
    for seg in sorted(extract_chinese(text), key=len, reverse=True):
        text = text.replace(seg, f" {chinese_to_english[seg]} ")
    return text

In [10]:
# Translate the Chinese segments of every text column
for col in ["Sentence", "Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "Persistence"]:
    data[col] = data[col].apply(replace_chinese_with_english)

Add sentence id

In [32]:
# Number the distinct consecutive sentences of each article, starting at 1
sentence_ids = []
cur_sent_id = 0
cur_aid = ""
cur_sentence = ""
for row in data.itertuples():
    # A new article restarts the numbering
    if row.AID != cur_aid:
        cur_aid, cur_sent_id, cur_sentence = row.AID, 0, ""
    # A new sentence gets the next id; a repeated sentence keeps the current one
    if row.Sentence != cur_sentence:
        cur_sentence = row.Sentence
        cur_sent_id += 1
    sentence_ids.append(cur_sent_id)
data["Sentence_id"] = sentence_ids

In [33]:
# Save the translated data (now carrying Sentence_id) without the index column
data.to_excel("data/raw/w6/data_230406_translated.xlsx", index=False)

In [22]:
# Reload the translated file written above
data = pd.read_excel("data/raw/w6/data_230406_translated.xlsx")

Set the data type

In [23]:
TIME_NAME = ["Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "Persistence"]
EVENT_NAME = ["Remission", "Acute", "DayCare", "Episode"]

# Time columns become strings ("" when missing)
for t_name in TIME_NAME:
    data[t_name] = data[t_name].fillna("").astype(str)
# Event flags and TimeInfo become ints (0 when missing)
for e_name in EVENT_NAME+["TimeInfo"]:
    data[e_name] = data[e_name].fillna(0).astype(int)

Filter out Vague is Fact

In [24]:
# Keep only the rows whose Vague value is not "Fact"
data = data[data.Vague != "Fact"]

Clear out the time expression when TimeInfo is None

In [25]:
def clear_no_time(row):
    """Blank every TIME_NAME field of a row whose TimeInfo is 0; return the row."""
    if row.TimeInfo != 0:
        return row
    for t_name in TIME_NAME:
        setattr(row, t_name, "")
    return row
data = data.apply(clear_no_time, axis=1)

Assign duration

In [26]:
def assign_duration(row):
    """Move a row's first non-empty time value into its Duration column.

    Rows whose Duration is "" are returned untouched. Otherwise the columns
    Time_YMD, Vague, Age, Ago_YMD and Persistence are scanned in that order;
    the first non-empty value overwrites Duration and its source column is
    blanked. If none of them holds a value, Duration keeps its original content.
    The row is modified in place and returned.
    """
    if row.Duration == "":
        return row
    candidates = ("Time_YMD", "Vague", "Age", "Ago_YMD", "Persistence")
    source = next((name for name in candidates if row[name] != ""), None)
    if source is not None:
        # Right-hand side is evaluated first, so the value is read before it is blanked.
        row["Duration"], row[source] = row[source], ""
    return row
# Run assign_duration over every row (axis=1 hands each row to the function as a Series).
data = data.apply(assign_duration, axis=1)

Aggregate the rows by AID, PID, Sentence_id, Sentence, and TimeInfo

In [27]:
# https://stackoverflow.com/questions/33279940/how-to-combine-multiple-rows-of-strings-into-one-using-pandas
def _join_non_empty(values):
    """Join the non-empty strings of one group with ", "."""
    return ", ".join(value for value in values if value != "")


def _join_ages(values):
    """Same as _join_non_empty, but every ".0" substring is removed from each value first."""
    return ", ".join(value.replace(".0", "") for value in values if value != "")


def _total(values):
    """Add up the integer event flags of one group."""
    return sum(values)


# One output row per (AID, PID, Sentence_id, Sentence, TimeInfo): time columns
# are concatenated, event flags are summed.
data = (
    data.groupby(["AID", "PID", "Sentence_id", "Sentence", "TimeInfo"])
    .agg({
        "Duration": _join_non_empty,
        "Time_YMD": _join_non_empty,
        "Vague": _join_non_empty,
        "Age": _join_ages,
        "Ago_YMD": _join_non_empty,
        "Persistence": _join_non_empty,
        "Remission": _total,
        "Acute": _total,
        "DayCare": _total,
        "Episode": _total,
    })
    # Convert Index to Column
    .reset_index()
)

Concatenate duration

In [None]:
# Check duration errors: list the rows whose Duration holds an odd number of
# comma-separated endpoints, which cannot be paired into "start to end" ranges.
# .copy() makes data_d an independent frame, so adding the "check" column does
# not trigger SettingWithCopyWarning on a slice of `data`.
data_d = data[data["Duration"] != ""].copy()
data_d["check"] = data_d["Duration"].apply(lambda x: len(x.split(",")) % 2 == 0)
data_d[~data_d["check"]]

In [28]:
def concat_duration(x):
    """Pair up ", "-separated duration endpoints into "start to end" ranges.

    Args:
        x: "" or a string of endpoints joined by ", " (e.g. "a, b, c, d").

    Returns:
        "" for empty input, otherwise the ranges joined by ", "
        (e.g. "a to b, c to d").

    Raises:
        ValueError: if the number of endpoints is odd, so the last one has no
            partner. The offending string is included in the message so the
            source row can be located.
    """
    if x == "":
        return x
    endpoints = x.split(", ")
    if len(endpoints) % 2 != 0:
        raise ValueError(
            f"Duration has an odd number of endpoints and cannot be paired: {x!r}"
        )
    # Even positions are range starts, odd positions the matching ends.
    return ", ".join(
        f"{start} to {end}" for start, end in zip(endpoints[::2], endpoints[1::2])
    )
# Rewrite each aggregated Duration string as "start to end" ranges.
data["Duration"] = data["Duration"].apply(concat_duration)

Remove DayCare

In [29]:
# Discard the DayCare event column; `data` is rebound to the reduced frame.
data = data.drop(columns=["DayCare"])

Parse data to simpletransformers format

In [32]:
# Emit two examples per aggregated row: a "time extraction" example whose
# target lists the non-empty time columns as "Name: value" joined by ". ",
# and an "event detection" example whose target lists the events with a
# positive count joined by ", ". An empty list becomes the literal "None".
examples = []
for row in data.itertuples():
    time_parts = [
        f"{t_name}: {getattr(row, t_name)}"
        for t_name in TIME_NAME
        if getattr(row, t_name) != ""
    ]
    time_text = ". ".join(time_parts) if time_parts else "None"
    examples.append([row.AID, row.PID, row.Sentence_id, "time extraction", row.Sentence, time_text])

    # DayCare is skipped by name before its attribute is ever looked up.
    event_hits = [
        e_name
        for e_name in EVENT_NAME
        if e_name != "DayCare" and getattr(row, e_name) > 0
    ]
    event_text = ", ".join(event_hits) if event_hits else "None"
    examples.append([row.AID, row.PID, row.Sentence_id, "event detection", row.Sentence, event_text])

parsed_data = pd.DataFrame(
    examples,
    columns=["AID", "PID", "Sentence_id", "prefix", "input_text", "target_text"],
)
parsed_data.to_excel("./data/processed/data_230406.xlsx", index=False)

Split dataset by AID

In [33]:
SEED = 1309
TRAIN_RATIO = 0.7
EVAL_RATIO = 0.1

# Shuffle the unique AIDs with a fixed seed, then cut the shuffled array at the
# train and train+eval boundaries so that all rows of one AID land in one split.
aid = data.AID.unique()
np.random.seed(SEED)
np.random.shuffle(aid)
train_end = int(TRAIN_RATIO * len(aid))
eval_end = int((TRAIN_RATIO + EVAL_RATIO) * len(aid))
train_aid, eval_aid, test_aid = np.split(aid, [train_end, eval_end])

train_data, eval_data, test_data = (
    parsed_data[parsed_data.AID.isin(split_aid)]
    for split_aid in (train_aid, eval_aid, test_aid)
)
print(f"Train: {len(train_data)}, Eval: {len(eval_data)}, Test: {len(test_data)}")

Train: 18240, Eval: 2420, Test: 5248


In [16]:
# Write each split to its own workbook, without the index column.
train_data.to_excel("./data/processed/data_230406_train.xlsx", index=False)
eval_data.to_excel("./data/processed/data_230406_eval.xlsx", index=False)
test_data.to_excel("./data/processed/data_230406_test.xlsx", index=False)

### Evaluation

In [17]:
import pandas as pd
from sklearn.metrics import classification_report

In [19]:
# Load the flan-t5-base outputs; the cells below read its prefix, target_text
# and pred_text columns.
df = pd.read_excel("outputs/flan-t5-base/outputs.xlsx")

Event evaluation

In [20]:
df_event = df[df["prefix"] == "event detection"]
labels = {"Acute": 0, "DayCare":1 , "Episode": 2, "Remission": 3, "None": 4}


def _to_multi_hot(label_text, label_index):
    """Encode a comma-separated label string as a 0/1 list indexed via ``label_index``.

    A label that is not a key of ``label_index`` raises ``KeyError``.
    """
    vector = [0] * len(label_index)
    for name in label_text.split(","):
        vector[label_index[name.strip()]] = 1
    return vector


# Multi-label evaluation: one indicator vector per example for gold and prediction.
y_true = [_to_multi_hot(text, labels) for text in df_event.target_text.tolist()]
y_pred = [_to_multi_hot(text, labels) for text in df_event.pred_text.tolist()]
print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9248    0.5146    0.6613       239
           1     0.0000    0.0000    0.0000        17
           2     0.7987    0.7980    0.7983      1089
           3     0.6942    0.6590    0.6761       217
           4     0.7725    0.8328    0.8015      1154

   micro avg     0.7847    0.7717    0.7782      2716
   macro avg     0.6380    0.5609    0.5875      2716
weighted avg     0.7853    0.7717    0.7729      2716
 samples avg     0.7873    0.7795    0.7805      2716



  _warn_prf(average, modifier, msg_start, len(result))


Time evaluation

In [21]:
# Field-name prefixes searched for inside the serialized target/prediction
# strings ("Time" and "Ago" are substring matches, so they also hit
# "Time_YMD" / "Ago_YMD").
TIME_NAME = ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]

df_time = df[df["prefix"] == "time extraction"]


def _extract_field(text, field_name):
    """Return the slice of ``text`` from ``field_name`` up to the next ".".

    Returns the literal "None" when ``field_name`` does not occur in ``text``.
    When no "." follows the field name, the slice runs to the end of ``text``.
    """
    start = text.find(field_name)
    if start == -1:
        return "None"
    end = text.find(".", start)
    return text[start:] if end == -1 else text[start:end]


# Exact-match accuracy per time field, counted only over the examples where the
# field appears in the target or in the prediction.
for t_name in TIME_NAME:
    correct = 0
    total = 0
    for row in df_time.itertuples():
        target_text = _extract_field(row.target_text, t_name)
        pred_text = _extract_field(row.pred_text, t_name)

        # Field absent on both sides: not counted at all.
        if target_text == pred_text == "None":
            continue

        if target_text == pred_text:
            correct += 1

        total += 1

    # A field that never occurs leaves total at 0; report NaN instead of
    # crashing with ZeroDivisionError.
    accuracy = round(correct / total, 4) if total else float("nan")
    print(f"{t_name}: {correct}/{total}={accuracy}")

Duration: 18/64=0.2812
Time: 423/520=0.8135
Vague: 42/157=0.2675
Age: 62/102=0.6078
Ago: 115/199=0.5779
Persistence: 4/39=0.1026


In [2]:
import pandas as pd
from sklearn.metrics import classification_report

df = pd.read_excel("outputs/t5-base/outputs.xlsx")


def _to_multi_hot(label_text, label_index):
    """Encode a comma-separated label string as a 0/1 list indexed via ``label_index``.

    A label that is not a key of ``label_index`` raises ``KeyError``.
    """
    vector = [0] * len(label_index)
    for name in label_text.split(","):
        vector[label_index[name.strip()]] = 1
    return vector


def _extract_field(text, field_name):
    """Return the slice of ``text`` from ``field_name`` up to the next ".".

    Returns the literal "None" when ``field_name`` does not occur in ``text``.
    When no "." follows the field name, the slice runs to the end of ``text``.
    """
    start = text.find(field_name)
    if start == -1:
        return "None"
    end = text.find(".", start)
    return text[start:] if end == -1 else text[start:end]


# --- Event detection: multi-label classification report ---
df_event = df[df["prefix"] == "event detection"]
labels = {"Acute": 0, "Episode": 1, "Remission": 2, "None": 3}
y_true = [_to_multi_hot(text, labels) for text in df_event.target_text.tolist()]
y_pred = [_to_multi_hot(text, labels) for text in df_event.pred_text.tolist()]
print(classification_report(y_true, y_pred, digits=4, target_names=list(labels.keys())))

# --- Time extraction: exact-match accuracy per field ---
# Field-name prefixes are substring matches ("Time" also hits "Time_YMD").
TIME_NAME = ["Duration", "Time", "Vague", "Age", "Ago", "Persistence"]

df_time = df[df["prefix"] == "time extraction"]

for t_name in TIME_NAME:
    correct = 0
    total = 0
    for row in df_time.itertuples():
        target_text = _extract_field(row.target_text, t_name)
        pred_text = _extract_field(row.pred_text, t_name)

        # Field absent on both sides: not counted at all.
        if target_text == pred_text == "None":
            continue

        if target_text == pred_text:
            correct += 1

        total += 1

    # A field that never occurs leaves total at 0; report NaN instead of
    # crashing with ZeroDivisionError.
    accuracy = round(correct / total, 4) if total else float("nan")
    print(f"{t_name}: {correct}/{total}={accuracy}")

              precision    recall  f1-score   support

       Acute     0.9307    0.7866    0.8526       239
     Episode     0.8163    0.7998    0.8080      1089
   Remission     0.7744    0.5853    0.6667       217
        None     0.7713    0.8634    0.8148      1164

   micro avg     0.8008    0.8088    0.8048      2709
   macro avg     0.8232    0.7588    0.7855      2709
weighted avg     0.8037    0.8088    0.8035      2709
 samples avg     0.8055    0.8100    0.8051      2709

Duration: 28/60=0.4667
Time: 429/514=0.8346
Vague: 60/149=0.4027
Age: 79/96=0.8229
Ago: 140/201=0.6965
Persistence: 4/43=0.093


Output data with TimeInfo not set
* `ni`: with no TimeInfo

In [5]:
import pandas as pd
import numpy as np

data = pd.read_excel("data/raw/w6/data_230406_translated.xlsx")

TIME_NAME = ["Duration", "Time_YMD", "Vague", "Age", "Ago_YMD", "Persistence"]
EVENT_NAME = ["Remission", "Acute", "DayCare", "Episode"]

# Time columns become plain strings; missing cells turn into "".
for t_name in TIME_NAME:
    data[t_name] = data[t_name].fillna("").astype(str)
# Event flags and TimeInfo become integers; missing cells turn into 0.
for e_name in [*EVENT_NAME, "TimeInfo"]:
    data[e_name] = data[e_name].fillna(0).astype(int)

def assign_duration(row):
    """Move a row's first non-empty time value into its Duration column.

    Rows whose Duration is "" are returned untouched. Otherwise the columns
    Time_YMD, Vague, Age, Ago_YMD and Persistence are scanned in that order;
    the first non-empty value overwrites Duration and its source column is
    blanked. If none of them holds a value, Duration keeps its original content.
    The row is modified in place and returned.
    """
    if row.Duration == "":
        return row
    candidates = ("Time_YMD", "Vague", "Age", "Ago_YMD", "Persistence")
    source = next((name for name in candidates if row[name] != ""), None)
    if source is not None:
        # Right-hand side is evaluated first, so the value is read before it is blanked.
        row["Duration"], row[source] = row[source], ""
    return row
# Run assign_duration over every row (axis=1 hands each row to the function as a Series).
data = data.apply(assign_duration, axis=1)


def _join_non_empty(values):
    """Join the non-empty strings of one group with ", "."""
    return ", ".join(value for value in values if value != "")


def _join_ages(values):
    """Same as _join_non_empty, but every ".0" substring is removed from each value first."""
    return ", ".join(value.replace(".0", "") for value in values if value != "")


def _total(values):
    """Add up the integer event flags of one group."""
    return sum(values)


# One output row per (AID, PID, Sentence_id, Sentence, TimeInfo): time columns
# are concatenated, event flags are summed; the group keys are then turned
# back into ordinary columns.
data = (
    data.groupby(["AID", "PID", "Sentence_id", "Sentence", "TimeInfo"])
    .agg({
        "Duration": _join_non_empty,
        "Time_YMD": _join_non_empty,
        "Vague": _join_non_empty,
        "Age": _join_ages,
        "Ago_YMD": _join_non_empty,
        "Persistence": _join_non_empty,
        "Remission": _total,
        "Acute": _total,
        "DayCare": _total,
        "Episode": _total,
    })
    .reset_index()
)

def concat_duration(x):
    """Pair up ", "-separated duration endpoints into "start to end" ranges.

    Args:
        x: "" or a string of endpoints joined by ", " (e.g. "a, b, c, d").

    Returns:
        "" for empty input, otherwise the ranges joined by ", "
        (e.g. "a to b, c to d").

    Raises:
        ValueError: if the number of endpoints is odd, so the last one has no
            partner. The offending string is included in the message so the
            source row can be located.
    """
    if x == "":
        return x
    endpoints = x.split(", ")
    if len(endpoints) % 2 != 0:
        raise ValueError(
            f"Duration has an odd number of endpoints and cannot be paired: {x!r}"
        )
    # Even positions are range starts, odd positions the matching ends.
    return ", ".join(
        f"{start} to {end}" for start, end in zip(endpoints[::2], endpoints[1::2])
    )
# Rewrite each aggregated Duration string as "start to end" ranges.
data["Duration"] = data["Duration"].apply(concat_duration)

# Discard the DayCare event column (modifies `data` in place).
data.drop(columns=["DayCare"], inplace=True)

# Save the aggregated table of the "ni" variant, without the index column.
data.to_excel("data/raw/w6/data_230406_ni.xlsx", index=False)

# Emit two examples per aggregated row, each carrying the row's TimeInfo: a
# "time extraction" example whose target lists the non-empty time columns as
# "Name: value" joined by ". ", and an "event detection" example whose target
# lists the events with a positive count joined by ", ". An empty list becomes
# the literal "None".
examples = []
for row in data.itertuples():
    time_parts = [
        f"{t_name}: {getattr(row, t_name)}"
        for t_name in TIME_NAME
        if getattr(row, t_name) != ""
    ]
    time_text = ". ".join(time_parts) if time_parts else "None"
    examples.append([row.AID, row.PID, row.Sentence_id, row.TimeInfo, "time extraction", row.Sentence, time_text])

    # DayCare is skipped by name before its attribute is ever looked up.
    event_hits = [
        e_name
        for e_name in EVENT_NAME
        if e_name != "DayCare" and getattr(row, e_name) > 0
    ]
    event_text = ", ".join(event_hits) if event_hits else "None"
    examples.append([row.AID, row.PID, row.Sentence_id, row.TimeInfo, "event detection", row.Sentence, event_text])

parsed_data = pd.DataFrame(
    examples,
    columns=["AID", "PID", "Sentence_id", "TimeInfo", "prefix", "input_text", "target_text"],
)
parsed_data.to_excel("./data/processed/data_230406_ni.xlsx", index=False)

SEED = 1309
TRAIN_RATIO = 0.7
EVAL_RATIO = 0.1

# Shuffle the unique AIDs with a fixed seed and cut at the train and
# train+eval boundaries; only the test portion is written out here.
aid = data.AID.unique()
np.random.seed(SEED)
np.random.shuffle(aid)
train_end = int(TRAIN_RATIO * len(aid))
eval_end = int((TRAIN_RATIO + EVAL_RATIO) * len(aid))
train_aid, eval_aid, test_aid = np.split(aid, [train_end, eval_end])

in_test_split = parsed_data.AID.isin(test_aid)
test_data = parsed_data[in_test_split]
test_data.to_excel("./data/processed/data_230406_test_ni.xlsx", index=False)

Use rules to fill in the rows with no TimeInfo
* Duration => last time (Duration: 2008-07-23 to 2008-09-12 => 2008-09-12)

* Multiple times in the same category => last time (Time_YMD: 2010-06, 2010-06-17 => 2010-06-17)

* Multiple times in different categories => all (Time_YMD: 2005-07. Ago_YMD: 4Y. Persistence: 23D => Time_YMD: 2005-07. Ago_YMD: 4Y. Persistence: 23D)

* Multiple events => all (Ago_YMD: 4M -> Acute, Episode => Ago_YMD: 4M -> Acute, Ago_YMD: 4M -> Episode)