In [1]:
import pandas as pd
import random

In [2]:
def to_df(data_dir, column_names=None, trim_col=None):
    """Load a tab-separated token file into a DataFrame of strings.

    Every line of the file becomes one row, obtained by splitting the line on
    tab characters. No type conversion is performed, and the line terminator
    stays attached to the last field of each line unless that column is
    listed in ``trim_col``. Lines with fewer fields than ``column_names``
    (for example blank separator lines) are padded with missing values by
    pandas.

    Parameters
    ----------
    data_dir : str or path-like
        Path of the file to read (despite the name, a file, not a directory).
    column_names : list of str, optional
        Column labels for the resulting frame. Defaults to
        ``['token', 'begin', 'end', 'section', 'filename', 'concept', 'label']``.
    trim_col : str or iterable of str, optional
        Column(s) from which trailing newline characters are stripped.
        A single column name may be given as a plain string.

    Returns
    -------
    pandas.DataFrame
        One row per input line, one column per entry of ``column_names``.
    """
    if not column_names:
        column_names = ['token', 'begin', 'end', 'section', 'filename', 'concept', 'label']

    # Avoid a mutable default argument, and do not iterate over the
    # characters of a bare string such as trim_col="label".
    if trim_col is None:
        trim_col = []
    elif isinstance(trim_col, str):
        trim_col = [trim_col]

    with open(data_dir, "r") as f:
        rows = [line.split('\t') for line in f]

    df = pd.DataFrame(rows, columns=column_names)
    for col in trim_col:
        df[col] = df[col].str.rstrip("\n")
    return df

In [124]:
def write(df):
    """Serialise a DataFrame to tab-separated text, one line per row.

    Each row is rendered by converting its values to strings and joining them
    with tabs. A row whose second column is missing (``None``, NaN or
    ``pd.NA``) is rendered as an empty line instead. Lines are joined with
    ``"\\n"`` and no trailing newline is appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise. Must have at least two columns unless it is empty.

    Returns
    -------
    str
        The serialised text; ``""`` for an empty frame.
    """
    # apply(axis=1) on an empty frame does not return a Series of strings,
    # so handle that case up front.
    if df.empty:
        return ""

    def concat_row_values(row):
        # pd.isna also recognises NaN / pd.NA, which pandas may use instead
        # of None for missing cells depending on dtype and version.
        if pd.isna(row.iloc[1]):
            return ""
        return '\t'.join(row.values.astype(str))

    # Build one string per row, then join the rows with new lines.
    return '\n'.join(df.apply(concat_row_values, axis=1))

In [127]:
# Split each partition of the dataset into two complementary views that keep
# every token row but blank out annotations:
#   * high_subset keeps concept/label only inside the `highly_negated` sections
#   * low_subset keeps concept/label only outside of them
# Blanked-out rows get concept "O" and label "N/A".

# Common prefix of every input and output path used below.
DATA_ROOT = "/Users/chenkx/git/clinical-negation/emnlp2017-bilstm-cnn-crf/data"

# Loop-invariant: sections whose annotations are kept in high_subset.
highly_negated = ["Physical examination/Status", "Review of systems", "Allergies", "Complications"]
# For reference only; the code uses the complement of `highly_negated`.
# NOTE(review): this list also names "Allergies", which is in `highly_negated`.
# lowly_negated = ["Patient information/Demographics", "Present illness", "Hospital course", "Social history",
#                  "Family history", "Addendum", "Radiology", "Unknown/Unclassified", "Problems",
#                  "Reasons/Indications",
#                  "Procedures/Surgery", "Chief complaint", "Nutrition", "Past history", "Assessment", "Diagnoses",
#                  "Laboratory tests", "Follow-up/Instructions", "Assessment/Plan", "Allergies", "Medications",
#                  "Investigations/Results"]

for which_set in ["dev", "train", "test"]:
    DATA_DIR = "%s/i2b2_2010/real/%s.txt" % (DATA_ROOT, which_set)

    raw = to_df(DATA_DIR, trim_col=["label"])
    raw["row_id"] = raw.index.to_list()
    # Rows without a section value inherit the section of the row above.
    # Series.ffill() replaces the deprecated fillna(method='ffill').
    raw['section'] = raw['section'].ffill()

    # Compute the section mask once; both subsets are derived from it.
    is_highly_negated = raw.section.isin(highly_negated)

    low_subset = raw.copy()
    low_subset.loc[is_highly_negated, "concept"] = "O"
    low_subset.loc[is_highly_negated, "label"] = "N/A"

    high_subset = raw.copy()
    high_subset.loc[~is_highly_negated, "concept"] = "O"
    high_subset.loc[~is_highly_negated, "label"] = "N/A"

    # Count rows tagged "B" (presumably one per annotated concept - confirm
    # against the tagging scheme) in the high subset and in the full set.
    n_high = int((high_subset.concept == "B").sum())
    n_all = int((raw.concept == "B").sum())
    # Avoid dividing by zero when a partition has no "B" rows at all.
    ratio = n_high / n_all if n_all else float("nan")

    print(which_set)
    print("Ratio of the number of concepts in the high subset to the full set: %.4f (%d/%d)" %
          (ratio, n_high, n_all))

    # open(..., "w") does not create missing directories: both output
    # folders must already exist.
    # write to file: the subset of highly-negated sections
    with open("%s/i2b2_2010-showing-highly_negated/%s.txt" % (DATA_ROOT, which_set), "w") as f:
        f.write(write(high_subset))
    # write to file: the subset of the rest of the dataset
    with open("%s/i2b2_2010-showing-lowly_negated/%s.txt" % (DATA_ROOT, which_set), "w") as f:
        f.write(write(low_subset))

dev
Ratio of the number of concepts in the high subset to the full set: 0.1232 (276/2241)
train
Ratio of the number of concepts in the high subset to the full set: 0.1426 (1387/9726)
test
Ratio of the number of concepts in the high subset to the full set: 0.1300 (2411/18550)


Appendix

In [59]:
# Flag the rows of `raw` whose row_id also appears in the
# "downsample-lowly_negated" file of the current partition.
# NOTE(review): relies on `raw` and `which_set` left behind by the split
# loop - confirm which partition they refer to before trusting the flag.
downsample_ln = to_df(
    "/Users/chenkx/git/clinical-negation/emnlp2017-bilstm-cnn-crf/data/i2b2_2010_downsample-lowly_negated/%s.txt" % which_set,
    column_names=['token', 'begin', 'end', 'section', 'filename', 'concept', 'label', "row_id", "keep"],
)
# to_df returns every field as text, so row_id is converted to nullable
# integers before matching. The column name spelling ("dowsample") is kept
# unchanged because later cells and outputs refer to it.
raw["dowsample-lowly_negated"] = raw.row_id.isin(
    pd.to_numeric(downsample_ln.row_id).astype("Int64")
)

In [60]:
# Flag the rows of `raw` whose row_id also appears in the "downsample" file
# of the current partition.
# NOTE(review): relies on `raw` and `which_set` left behind by the split
# loop - confirm which partition they refer to before trusting the flag.
downsample = to_df(
    "/Users/chenkx/git/clinical-negation/emnlp2017-bilstm-cnn-crf/data/i2b2_2010_downsample/%s.txt" % which_set,
    column_names=['token', 'begin', 'end', 'section', 'filename', 'concept', 'label', "row_id", "keep"],
)
# to_df returns every field as text, so row_id is converted to nullable
# integers before matching. The column name spelling ("dowsample") is kept
# unchanged because later cells and outputs refer to it.
raw["dowsample"] = raw.row_id.isin(
    pd.to_numeric(downsample.row_id).astype("Int64")
)

In [121]:
# Preview the first five rows of `raw`, including the two flag columns
# added by the appendix cells.
raw.head(n=5)

Unnamed: 0,token,begin,end,section,filename,concept,label,row_id,dowsample-lowly_negated,dowsample
0,223159990,0,9,Unknown/Unclassified,223159990.txt,O,,0,False,False
1,PUOMC,10,15,Unknown/Unclassified,223159990.txt,O,,1,False,False
2,3546292,16,23,Unknown/Unclassified,223159990.txt,O,,2,False,False
3,181083,24,30,Unknown/Unclassified,223159990.txt,O,,3,False,False
4,366324,31,37,Unknown/Unclassified,223159990.txt,O,,4,False,False
