In [1]:
#!/usr/bin/env python
# coding=utf-8
"""Fine-tune models (e.g. BERT, RoBERTa, LEGAL-BERT) on the MIMIC-III-50 dataset."""
import csv
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
from datasets import load_dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score,precision_score
# NOTE(review): `trainer` looks like a project-local module (not a published
# package); it presumably has to be importable from the notebook's working
# directory — confirm.
from trainer import MultilabelTrainer
from scipy.special import expit
from torch import nn
import glob
import shutil

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
# NOTE(review): `hierbert` and `deberta` also appear to be project-local
# modules; this `DebertaForSequenceClassification` comes from the local
# `deberta` module, not from `transformers` — confirm that is intended.
from hierbert import HierarchicalBert
from deberta import DebertaForSequenceClassification


# Fail fast if the installed Transformers is older than the minimum version
# this code expects. Remove at your own risk.
check_min_version("4.9.0")

# Same kind of guard for the `datasets` package; the second argument is the
# hint text that accompanies the requirement.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

# Logger named after the current module.
logger = logging.getLogger(__name__)



In [16]:
# Directory holding the MIMIC-III top-50 CSV splits. It defaults to the
# location this notebook was originally run from; set the MIMIC3_DATA_DIR
# environment variable to run the notebook on another machine without
# editing the code.
MIMIC3_DATA_DIR = os.environ.get(
    "MIMIC3_DATA_DIR", "/home/ghan/caml-mimic-fixed-/mimicdata/mimic3"
)

# No `split=` argument is passed, so each call returns a DatasetDict whose
# only split is named "train" — this is also true for the dev and test files.
# Index with ["train"] to get the underlying Dataset.
train_dataset1 = load_dataset("csv", data_files=os.path.join(MIMIC3_DATA_DIR, "train_50.csv"))
eval_dataset = load_dataset("csv", data_files=os.path.join(MIMIC3_DATA_DIR, "dev_50.csv"))
predict_dataset = load_dataset("csv", data_files=os.path.join(MIMIC3_DATA_DIR, "test_50.csv"))


Using custom data configuration default-2828371e2f74ebe3
Found cached dataset csv (/home/ghan/.cache/huggingface/datasets/csv/default-2828371e2f74ebe3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-6249d4195201014c


Downloading and preparing dataset csv/default to /home/ghan/.cache/huggingface/datasets/csv/default-6249d4195201014c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/ghan/.cache/huggingface/datasets/csv/default-6249d4195201014c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-b097882698f56793


Downloading and preparing dataset csv/default to /home/ghan/.cache/huggingface/datasets/csv/default-b097882698f56793/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/ghan/.cache/huggingface/datasets/csv/default-b097882698f56793/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
# Inspect the training data: a DatasetDict with a single "train" split.
train_dataset1

DatasetDict({
    train: Dataset({
        features: ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'],
        num_rows: 8066
    })
})

In [18]:
# Inspect the dev data; its rows also live under the "train" key of the DatasetDict.
eval_dataset

DatasetDict({
    train: Dataset({
        features: ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'],
        num_rows: 1573
    })
})

In [19]:
# Inspect the test data; its rows also live under the "train" key of the DatasetDict.
predict_dataset

DatasetDict({
    train: Dataset({
        features: ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'],
        num_rows: 1729
    })
})

In [14]:
# Pull the Dataset itself out of the DatasetDict wrapper.
train_dataset1['train']

Dataset({
    features: ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'],
    num_rows: 8066
})

In [15]:
# Number of training examples (the Dataset's row count).
len(train_dataset1['train'])

8066

In [9]:
# Load the "train" split of the LexGLUE `ecthr_a` configuration. Because
# `split=` is given, this is a Dataset directly (columns: text, labels),
# not a DatasetDict like the CSV loads.
# NOTE(review): this is unrelated to the MIMIC-III CSVs and looks like an
# exploratory comparison; `train_dataset` is easy to confuse with
# `train_dataset1` — confirm which one downstream code is meant to use.
train_dataset = load_dataset("lex_glue", name='ecthr_a', split="train")

Downloading and preparing dataset lex_glue/ecthr_a to /home/ghan/.cache/huggingface/datasets/lex_glue/ecthr_a/1.0.0/8a66420941bf6e77a7ddd4da4d3bfb7ba88ef48c1d55302a568ac650a095ca3a...


Generating train split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset lex_glue downloaded and prepared to /home/ghan/.cache/huggingface/datasets/lex_glue/ecthr_a/1.0.0/8a66420941bf6e77a7ddd4da4d3bfb7ba88ef48c1d55302a568ac650a095ca3a. Subsequent calls will reuse this data.


In [11]:
# Inspect the LexGLUE training split (a plain Dataset, not a DatasetDict).
train_dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 9000
})