### Framing Checkbox Classification as an NLI Task

In [15]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

from util.utilities import (
    temperature_scaled_mixing, 
    get_dataset,
    get_clf_data,
    one_hot_encode,
    explode_onehot_df,
    onehot_label_counts,
    min_descriptions,
    duplicates_1,
    duplicates_2,
    word_replace_dict,
    bad_labels,
    contact_labels
)

import torch
from torch.optim import Adam
from torch import nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import (
    DistilBertTokenizerFast, 
    DistilBertModel , 
    PreTrainedTokenizer,
    DistilBertForSequenceClassification,
)
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.metrics import hamming_loss, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### Part 1. Setting up entailment data creation

#### Entailment Data Setup

Let's set up the entailment dataset. The simplest way is to 
- use the case note as sentence 1
- the label text as sentence 2
- With the gold label set to "entailment."

However, we can get some additional training data.
- "No Show" and "Attempted client contact" imply that no other contact type is possible.
  - We can duplicate all the other casenote-label pairs with the gold label set to "contradiction"
- Inversely, any type of contact or service implies that "No show" is not possible
- "HMIS Release of Information - Agreed" implies that "HMIS - Declined" is a contradiction, and vice versa
- "Client assigned DACC CSR" contradicts "Client not assigned DACC CSR due to court order"
- "Direct Contact" contradicts "Collateral Contact"
- Lastly, all other labels not involved with the casenote can be added with the neutral gold label

We can add more to the list as we find more.

I also think that some labels need to be augmented/turned into sentences:
- "Direct Contact" -> "Direct Contact with Client by Case Manager/This Writer"
- etc.

Using the upsampling heuristic from the classification upsampling
- Review: For each case note, there's a set of checkboxes checked.
- I'm adding a "MIN_DESC" column which contains the checkbox that is checked the least number of times across the dataset
And my upsampling algorithm up/down-samples as if each case note only has one label, the MIN_DESC value
- However, I don't want to cause extreme overfitting, so rather than using a large T (which controls how much to upsample the labels that occur the least), I am using T=1 for basic examples-proportional mixing, while applying a high K which sets an artifical limit on the label set sizes. 



General Purpose Method for making an entailment set from this kind of data. (Initially wrote several cells writing and testing this code, then transformed it into a method)

In [16]:
ID = "id"
ENTAILMENT = "entailment"
NOT_ENTAILMENT = "not_entailment"
CONTRADICTION = "contradiction"
NEUTRAL = "neutral"
PREMISE = "premise"
HYPOTHESIS = "hypothesis"

GOLD_2 = "gold_2"
GOLD_3 = "gold_3"
GOLD_2_IDX = GOLD_2 + "_idx"
GOLD_3_IDX = GOLD_3 + "_idx"

two_label_set = [ENTAILMENT, NOT_ENTAILMENT]
two_label_dict = {val: i for i, val in enumerate(two_label_set)}

three_label_set = [ENTAILMENT, NEUTRAL, CONTRADICTION]
three_label_dict = {val: i for i, val in enumerate(three_label_set)}

def set_contradiction(row, contra_set):
    label = row[HYPOTHESIS]
    return CONTRADICTION if label in contra_set else row[GOLD_3]

def make_entailment_set(df: pd.DataFrame, premise_col, hypothesis_col, id_col, labels_list=None,
                        contra_dict=None, aug_dict=None, add_two_label_set=False):
    """
    Creates an entailment dataset out of a multilabel text-label pair dataset where each 
    label for a text is represented with a duplicated entry with a different value in the 
    label column. 

    premise_col: column containing the text. Gets renamed to "premise" in output
    hypothesis_col: column containing the label. Gets renamed to "hypothesis" in output
    id_col: the column which contains an id that's duplicated for each text entry. Gets renamed to "id" in output
    contra_dict: a dictionary containing contradiction information from domain knowledge about the dataset
    aug_dict: a dictionary containing mappings from labels to augmented label text. 
    add_two_label_set: By default, this returns a three label entailment set. Set this to true to add a column with a two_label entailment labels.

    output df structure:
    columns: id, premise, hypothesis, gold_3, gold_3_idx, (optional: gold_2, gold_2_idx), ... other columns in original df
    """

    # Restructure
    df = df.rename(columns={id_col: ID, premise_col: PREMISE, hypothesis_col: HYPOTHESIS})
    df = df.reset_index(drop=True)

    # Step 1: Basic set
    df[GOLD_3] = ENTAILMENT
    df_cols = df.columns
    new_cols = [HYPOTHESIS, GOLD_3] # Cols which will have new values for the new entries 
    old_cols = [col for col in df.columns if col not in new_cols] # Cols which will have duplicated vals for the new entries
    basic_set = df

    # Step 2: Add every casenote-label pair possible with "contradiction" or "neutral" as appropriate
    if labels_list is None:
        labels_list = df[HYPOTHESIS].unique()
    grouped = basic_set.groupby(ID)
    df_list = [basic_set]
    for name, group in grouped:
        existing_labels = group[HYPOTHESIS].tolist()
        remaining_labels = [label for label in labels_list 
                            if label not in existing_labels]

        new_df = pd.DataFrame(columns=df_cols)

        new_df[HYPOTHESIS] = pd.Series(remaining_labels)
        new_df[GOLD_3] = NEUTRAL

        # Preserve values of old cols in new rows
        # We already know that the values of these old cols will be the same for each note
        for col in old_cols:
            new_df[col] = group[col].iloc[0]

        if contra_dict:
            contradictions_set = set()
            for ex_lab in existing_labels:
                contradictions_set.update(contra_dict.get(ex_lab, [ex_lab]))
            new_df[GOLD_3] = new_df.apply(lambda x: set_contradiction(x, contradictions_set), axis=1)

        df_list.append(new_df)

    full_set = pd.concat(df_list)

    # I fixed the error in my code causing duplicates, but leaving this here just in case.
    full_set = full_set.drop_duplicates()

    # Add label indicies 
    full_set[GOLD_3_IDX] = full_set[GOLD_3].apply(lambda x: three_label_dict[x])

    if aug_dict:
        full_set[HYPOTHESIS] = full_set[HYPOTHESIS].apply(lambda x: aug_dict.get(x, x)) 

    if add_two_label_set:
        full_set[GOLD_2] = full_set[GOLD_3].apply(lambda x: NOT_ENTAILMENT if x != ENTAILMENT else ENTAILMENT)
        full_set[GOLD_2_IDX] = full_set[GOLD_2].apply(lambda x: two_label_dict[x])


    return full_set 

#### Testing method for turning a dataset with entailment predictions into a classification report

In [17]:
PREDS = "preds"
ONEHOT_GOLDS = "onehot_golds"
ONEHOT_PREDS = "onehot_preds"

# Testing method for turning a dataset with entailment predictions into a classification report
def entailment_clf_report(test_df: pd.DataFrame, model_outputs, classes, aug_rev_dict=None):
    """
    For an entailment-classification set made using make_entailment_set,
    Given the test set and the models final outputs, print the classification report.

    test_df: df containing test set. Test set should contain
    num_notes * num_labels entries, such that when you group by the id_col,
    there are num_labels entries for each note. Make sure id_col (or any df column)
    is not the current index. 
    
    model_outputs: An array of len(test_df.index) such that the i-th row in model_outputs
    corresponds to the i-th row in test_df. Make sure to NOT shuffle the testing data
    loader to ensure this is the case. 

    id_col: id column that's duplicated for each note-label pair.
    aug_rev_dict: if provided, will map the hypothesis according to the mappings in the dict.
    """

    test_df = test_df.reset_index(drop=True) # reset index to 0 -> len(test_df.index)
    test_df[PREDS] = pd.Series(model_outputs) # so this works as intended

    if aug_rev_dict:
        test_df[HYPOTHESIS] = test_df[HYPOTHESIS].apply(lambda x: aug_rev_dict.get(x, x))

    # entailment index is always 0
    # Either 2 or 3 labelset works.
    
    # The actual labels checked (anything that's not entailment was not part of note labels)
    golds_df = test_df[test_df[GOLD_3_IDX] == 0].reset_index(drop=True)

    # The predictions (anything that model predicts as entailment is a label prediction)
    preds_df = test_df[test_df[PREDS] == 0].reset_index(drop=True)

    # It's possible that preds has fewer notes (unique ids) because some notes never had an entailment prediction
    
    # Need to provide classes because its possible that certain classes were never predicted as entailment by model
    onehot_df = one_hot_encode(golds_df, HYPOTHESIS, ONEHOT_GOLDS, ID, classes=classes)[0]
    empty_preds = pd.Series(dict(zip(onehot_df[ID].tolist(), np.zeros(shape=(len(onehot_df.index), len(classes))))))
    onehot_df = onehot_df.set_index(ID).merge(empty_preds.rename(ONEHOT_PREDS), left_index=True, right_index=True)
    
    onehot_preds_df = one_hot_encode(preds_df, HYPOTHESIS, ONEHOT_PREDS, ID, classes=classes)[0].set_index(ID)
    onehot_df[ONEHOT_PREDS].update(onehot_preds_df[ONEHOT_PREDS])

    y_true = onehot_df[ONEHOT_GOLDS].tolist()
    y_pred = onehot_df[ONEHOT_PREDS].tolist()

    print(classification_report(y_true, y_pred, target_names=classes, zero_division=0))

#### Experimental Setup
- For each note in our original dataset, there are 20 (len(label_list)) entries
- For each noteid,
  - There are 20 entries
  - Each entry contains the same SENTENCE1 (premise), which is the case note  
  - Each contains one of the 20 labels
  - If a label was checked for the case note, the gold is "entailment"
  - For the remaining labels which contradict the checked labels, the gold is "contradiction"
  - The remaining golds are "neutral"
  - There is an "ALT_GOLD" column which contains only "entailment" and "not_entailment"

#### Train-Test Split

There's two ways to do this
1. Just split up `full_set` by just sampling 20% of the dataset to be the test set.
   1. This would could split up a casenote with some entries in the train set and some in the test set
   2. This isn't that big of a deal because this is not actually multi-label classification 
2. Keep case notes together 

I think it makes more sense to keep the casenotes together.  I can sample X number of noteids, split them into test and train, then populate the test and train with the entries that correspond to those noteids

In [18]:
FULL = "full"
CONTRA = "contra"
BASIC = "basic"

def get_basic_set(df):
    # For the provided entailment df, only keep the 
    # entailment entries. (identical to simply adding a 
    # column containing "entailment" for each entry in original non-entailment dataset)
    return df.loc[df[GOLD_3] == ENTAILMENT]

def get_contradictions_set(df):
    # For the provided entailment df, only keep the
    # entailment and contradction entries (not neutrals)

    # If using the two_label_set for training, this teaches the model
    # that only contradictions are not_entailment, and there's not
    # data about the neutrals.
    return df.loc[df[GOLD_3] != NEUTRAL]

def split_entailment_set(df: pd.DataFrame, frac=1, test_size=0.2, random_state=42, mix_train_params=None):
    """
    Regardless of train set, the test set needs to contain every possible note-label pair
    for each note in the test set. 

    df: entailment set obtained with make_entailment_set
    mix_train_params: If not none, temperature mix training set with provided (T, K, label_col) params.
    T: temperature, K: artifical size limit for labels of one class, label_col: the col containing the labels to upsample using. 

    test_size: fraction of unique notes in test set from full set. 
    frac: what fraction of unique notes to retain in train and test set (after splitting and mixing, if applicable)
    random_state: iykyk
    """

    # percentage of casenotes to sample
    # num_samples = int(len(unique_ids) * frac)
    # sampled_noteids = np.random.RandomState(seed=random_state).choice(unique_ids, num_samples)
    
    unique_ids_df = df.drop_duplicates(ID)
    train_notes, test_notes = train_test_split(unique_ids_df, test_size=test_size, random_state=random_state)

    if mix_train_params:
        T, K, label_col = mix_train_params
        train_notes = temperature_scaled_mixing(train_notes, label_col, T, K, frac)
    else: 
        train_notes = train_notes.sample(frac=frac, random_state=random_state)

    train_notes = train_notes[ID].tolist()
    test_notes = test_notes.sample(frac=frac, random_state=random_state)[ID].tolist()

    # Train notes can have duplicate ids
    grouped = df.groupby(ID)
    train_dfs = []
    for name, group in grouped:
        count = train_notes.count(name)
        for _ in range(count):
            train_dfs.append(group)
    train_set = pd.concat(train_dfs)

    test_set = df.loc[df[ID].isin(test_notes)]

    print("Total notes: %d" % len(unique_ids_df.index))
    print("Num Notes in Train Set: %d, Test Set: %d" % (len(train_notes), len(test_notes)))
    print("Total Entries: %d" % (len(train_set.index) + len(test_set.index)))
    print("Num Entries in Train: %d, Test: %d" % (len(train_set.index), len(test_set.index)))

    return train_set.reset_index(drop=True), test_set.reset_index(drop=True)

#### Model
Initial Plan: Since I'm going to use the the RobertaForSequenceClassification made by the authors of DocNLI later on, I will use it here as well. It adds a custom ClassificationHead on top of the RobertaModel provided by the transformers library.

It turns out that roberta-large is way too big for my GPU to handle, so I'm going to use DistilBert. I can use the same model I used for classification, but need a new data loader and a runner. But I think writing a loader shouldn't be too hard.

Copying code from my classifier notebook

In [19]:
default_distilbert_dim = 768
distilbert_dropout = 0.1

class DistilBertForClassification(nn.Module):

    def __init__(self, num_classes):
        super(DistilBertForClassification, self).__init__()
        self.db = DistilBertModel.from_pretrained('distilbert-base-uncased')

        self.dense_layer = nn.Linear(default_distilbert_dim, default_distilbert_dim)
        self.dropout = nn.Dropout(distilbert_dropout)
        self.non_lin = nn.Tanh()
        self.classifier = nn.Linear(default_distilbert_dim, num_classes)
    
    def forward(self, input_ids, input_mask):
        db_output = self.db(input_ids=input_ids, attention_mask=input_mask)

        # DistilBert outputs a tuple where the first element is the hidden states
        # at the output of the model's last layer. It's of size:
        # batch_size, seq_len, hidden_size=768
        hidden_states = db_output[0]

        # Now we want the encoding of the [CLS] token. If you input a sequence 
        # of n words, the output will be a sequence of n tensors. Every sequence
        # begins with the [CLS] so we just need the first tensor. So we "pool"
        # the model by taking the hidden state corresponding to the first tensor.
        cls_tensor = hidden_states[:, 0]

        output = self.dense_layer(cls_tensor)
        output = self.non_lin(output)
        output = self.dropout(output)

        result = self.classifier(output)

        # Because I want to use my classifier and the transformers SequenceClassifier interchangeably,
        # I'll return a SequenceClassifierOutput here

        return SequenceClassifierOutput(
            logits=result
        )


#### Data Loader
Using my classifier data loader as a starting point

In [20]:
# I deleted a lot of code from my experimenting, but noting key takeaways:
# 1. Roberta used two SEP tokens to separate sentence

# The dataset can be agnostic of the entailment label type
class NLIDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, df, 
                 premise_col, hypothesis_col, gold_col, max_len=None):
        """
        if max_len is undefined or None, tokenizer uses predefined model max length
        """

        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len

        self.data = self.load_data(df[premise_col], df[hypothesis_col], df[gold_col])
    
    # Num notes
    def __len__(self):
        return len(self.df.index)

    def load_data(self, premise_list, hypothesis_list, gold_list):
        token_ids = []
        mask_ids = []
        labels = []

        # DistilBert doesn't take token_type_ids as input. 
        # Sequences should just be separated using the special tokens

        # longest_first truncates token by token, from longest sequence in pair
        # or only_first, truncates only from first seq. in pair
        for premise, hypothesis, gold in tqdm(zip(premise_list, hypothesis_list, gold_list)):
            inputs = self.tokenizer(
                text=premise,
                text_pair=hypothesis,
                add_special_tokens=True,
                padding="max_length",
                truncation="longest_first",
                max_length=self.max_len,
                return_token_type_ids=False, 
                return_attention_mask=True,
                return_tensors="pt"
            )

            ids = inputs["input_ids"]
            mask = inputs["attention_mask"]

            token_ids.append(ids)
            mask_ids.append(mask)
            labels.append(gold)
        
        token_ids = torch.cat(token_ids)
        mask_ids = torch.cat(mask_ids)
        labels = torch.tensor(labels)
        
        dataset = TensorDataset(token_ids, mask_ids, labels)

        # print("Dataset Length: ", len(dataset))
        return dataset

    def get_data_loader(self, batch_size=4, shuffle=True):
        data_loader = DataLoader(
            self.data,
            shuffle=shuffle,
            batch_size=batch_size
        )

        return data_loader

In [21]:
# Just a note
# MNLI Labels: o: entailment, 1: neutral, 2: contradiction

class DistilBertNLIClassifier():
    def __init__(self, premise_col, hypothesis_col, gold_col, label_names, 
                 train_data, test_data, zero_shot=False, max_len=128,
                 train_batch_size=4, train_shuffle=True,
                 valid_batch_size=4, valid_shuffle=True,
                 epochs=1, learning_rate=1e-05, tokenizer=None, model=None):

        self.max_len = max_len
        self.train_batch_size = train_batch_size
        self.valid_batch_size = valid_batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        
        self.tokenizer = tokenizer
        if not tokenizer:
            self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
    
        self.model = model
        if not model:
            self.model = DistilBertForClassification(len(label_names))
        self.model.to(device)

        self.zero_shot = zero_shot

        # Creating the dataset and dataloader
        # The data loader chunks the data into smaller batches for the NN since you can't
        # feed all the data at the same time into the network

        if not zero_shot:
            print("TRAIN Dataset: {}".format(train_data.shape))
            self.training_set = NLIDataset(tokenizer=self.tokenizer, df=train_data, 
                                    premise_col=premise_col, hypothesis_col=hypothesis_col, 
                                    gold_col=gold_col, max_len=self.max_len)
            self.training_loader = self.training_set.get_data_loader(batch_size=train_batch_size, shuffle=train_shuffle)


        print("TEST Dataset: {}".format(test_data.shape))
        self.testing_set = NLIDataset(tokenizer=self.tokenizer, df=test_data, 
                                         premise_col=premise_col, hypothesis_col=hypothesis_col, 
                                         gold_col=gold_col, max_len=self.max_len)
        self.testing_loader = self.testing_set.get_data_loader(batch_size=valid_batch_size, shuffle=valid_shuffle)

        self.label_names = label_names

        # For Multi Label, I used BCE. Here, only one label is possible per exampe. 
        # So I'll use CrossEntropy

        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=learning_rate)
    
    def loss_function(self, outputs, targets):
        return nn.CrossEntropyLoss()(outputs, targets)
    
    def train(self):
        model = self.model
        training_loader = self.training_loader
        optimizer = self.optimizer

        model.train()
        for epoch in range(self.epochs):
            
            for batch_idx, (input_ids, input_mask, labels) in tqdm(enumerate(training_loader)):
                
                optimizer.zero_grad()

                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)

                # CrossEntropyLoss requires that the targets are LongTensors
                labels = labels.type(torch.LongTensor).to(device)

                outputs = model(input_ids, input_mask).logits
                loss = self.loss_function(outputs, labels)

                # Print loss every 5000 examples
                if batch_idx % 5000 == 0:
                    print("Epoch: %d, Loss: %f" % (epoch, loss.item()))
                
                loss.backward()
                optimizer.step()
    
    def test_model(self, test_set, return_predictions=False):
        model = self.model
        testing_loader = test_set

        final_targets = []
        final_outputs = []

        model.eval()
        with torch.no_grad():
            for batch_idx, (input_ids, input_mask, labels) in tqdm(enumerate(testing_loader)):
                
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)

                outputs = model(input_ids, input_mask).logits

                curr_targets = labels.numpy().tolist()
                curr_outputs = torch.log_softmax(outputs, dim=0).cpu().detach().numpy().tolist()
        
                final_targets.extend(curr_targets)
                final_outputs.extend(curr_outputs)
        
        if return_predictions:
            # A probability above .5 signifies that the label belongs to the note
            final_outputs = [np.argmax(probs) for probs in final_outputs]

        return final_outputs, final_targets

    # The validation here needs to be more complex. Right now the report will
    # just tell me details about how well the model classified pairs into the 3 entailment categories.
    # I need a validation metric/report which which gives me results about the 3 categories for 
    # each checkbox label.
    def validation_report(self, outputs, targets):
        hamm_loss = hamming_loss(outputs, targets)

        print("Hamming Loss: %f" % hamm_loss)

        clf_report = classification_report(targets, outputs, target_names=self.label_names, zero_division=0, output_dict=True)
        clf_report = pd.DataFrame(clf_report).transpose()
        display(clf_report)

        return hamm_loss, clf_report

    def validate(self, ret=False):
        final_outputs, final_targets = self.test_model(self.testing_loader, return_predictions=True)
        self.validation_report(final_outputs, final_targets)

        if ret:
            return final_outputs, final_targets

    def __str__(self):
        return "Hyperparams:\n max_tokens_per_example: %d\n train_batch_size: %d\n valid_batch_size: %d\n epochs: %d\n learning rate: %f\n train_size: %d\n test_size: %d\n" % (
            self.max_len,
            self.train_batch_size,
            self.valid_batch_size,
            self.epochs,
            self.learning_rate,
            len(self.training_set),
            len(self.testing_set)
        )

### Data setup

In [22]:
# Jumping to Dataset 2 because its known to yield better results
all_labels_to_ignore = contact_labels + bad_labels

df1 = get_clf_data(duplicate_labels_dict=duplicates_2, drop_global_duplicates=True,
                   word_replace_dict=word_replace_dict, labels_to_ignore=all_labels_to_ignore)
df1["DESCRIPTION"].value_counts().sort_index()

--- LOADING DATA... ---


  df = pd.read_csv(strs[name], **args)


Total initial entries: 108704
Total initial notes: 64385
- Removed irrelevant columns for checkbox classification, kept: Index(['NOTEID', 'NOTE', 'DESCRIPTION', 'GROUPNAME'], dtype='object')
- Dropped empty notes
- Dropped interventions checkboxes
- Consolidated duplicate labels using the provided mappings.
{'Direct Contact': ['Client contact in office', 'Direct Contact through Outreach', 'Client contact', 'Client Contact out of office'], 'Attempted client contact': ['Attempted client contact through Outreach', 'Attempt to locate client'], 'Client Assessment Conducted': ['New Client assessment completed'], 'Coordinated Assessment Completed at DACC': ['Coordinated Assessment Completed through HOST contact', 'Coordinated Assessment Scheduled at DACC'], 'Release of information forms signed for all appropriate vendors and agencies': ['Client declined to sign release of information']}
- Dropped entries containing the provided labels
- Substituted occurences of words in text with provided wo

Client Assessment Conducted                                                      303
Client Birth certificate                                                         687
Client DPS report                                                                 80
Client Texas State ID                                                            769
Client assigned DACC CSR                                                          20
Coordinated Assessment Completed at DACC                                         400
HMIS Release of Information - Agreed                                             101
HMIS profile created/updated                                                     363
Legal Issues                                                                    1834
Release of information forms signed for all appropriate vendors and agencies     390
Treatment Plan completed and signed                                              129
Name: DESCRIPTION, dtype: int64

In [None]:
df1 = min_descriptions(df1, "DESCRIPTION", "MIN_DESC", "NOTEID")
oh_df1, classes = one_hot_encode(df=df1, label_col="DESCRIPTION", new_col="LABELS", duplicate_id_col="NOTEID")

oh_df1.head(1)

In [24]:
# I updated this list based on label analysis. 
# Collateral contact doesn't involve client. 
# ALso some of the "Agreed/Declined" pairs in the data often occur together. It's
# probably a mistake in data collection, so we still don't want to teach the model
# specifically that those things are contradictory (bc training set is often incorrect)
no_client_contact = ["No Show", "Attempted client contact", "Collateral Contact"]
contact = [label for label in classes if label not in no_client_contact]

contradictions = {
    "No Show": contact,
    "Attempted client contact": contact,
}

In [25]:
# Not automating to allow individual control of each mapping
# Moving the subject to the beginning. Reworked these after label analysis

"""
For reference while writing augmentations
word_replace_dict = {
    "birth certificate": ["birth cert", "birth certificate", "bc"],
    "texas id": ["texas id", "tx id", "state id", "texas state id", "tx state id"],
    "_CM_": ["case manager", "casemanager", "writer", "cm"],
    "case manager": ["_CM_", "this _CM_"],
    "coordinated assessment": ["coordinated assessment"],
    "social security card": ["ss", "ssc"],
    "ROI": ["release of information", "rois"],
    "client": ["cl", "client", "clt", "peer"],
}
"""

augmentations = {
    'Asked client to complete a DACC Customer survey': 'The client completed the survey',
    'Attempted client contact': 'The case manager attempted client contact',
    'Client Assessment Conducted': "The case manager conducted Client Assessment", 
    # These ID Docs are tricky. Using "... was mentioned"
    'Client Birth certificate': "The client's birth certificate was mentioned",
    'Client DPS report': "The client's DPS report was run",
    'Client Texas State ID': "The client's Texas State ID was mentioned",
    'Client assigned DACC CSR': "The client was assigned CSR",
    'Client declined to sign release of information': 'The client declined to sign release of information',
    'Client not assigned DACC CSR due to court order': 'The client was not assigned DACC CSR due to court order',
    # Collateral contact is with parties other than the client. So many choices for word information: "news, communication, information, etc."
    'Collateral Contact': "The case manager sent or received information about the client",
    'Coordinated Assessment Completed at DACC': 'A coordinated assessment was completed',
    'Coordinated Assessment Scheduled at DACC': "A coordinated assessment was scheduled",
    'Direct Contact': "The case manager met or spoke with the client", # Having an "or" in an entailment set seems worrying
    'HMIS Release of Information - Agreed': "The client agreed to sign HMIS ROI",
    'HMIS Release of Information - Declined': "The client declined to sign HMIS ROI",
    'HMIS profile created/updated': "The case manager created or updated the client's HMIS profile",
    'Legal Issues': "There were legal issues with the client",
    'No Show': "The Client did not show up",
    'Release of information forms signed for all appropriate vendors and agencies': 'The client signed ROIs',
    'Treatment Plan completed and signed': 'Plan of action signed and completed by the client.'
}

# For mapping back to OG label
augmentations_rev = {v: k for k, v in augmentations.items()}

In [26]:
e_df1 = make_entailment_set(df1, premise_col="NOTE", hypothesis_col="DESCRIPTION", id_col="NOTEID", aug_dict=augmentations, add_two_label_set=True)

print("Notes in original df * num labels =", df1.groupby("NOTEID").ngroups * len(classes))
print("Entries in entailment set:", len(e_df1.index))
e_df1.head(1)

Notes in original df * num labels = 45155
Entries in entailment set: 45155


Unnamed: 0,id,premise,hypothesis,MIN_DESC,gold_3,gold_3_idx,gold_2,gold_2_idx
0,1485602,"client walked in for on-call case manager services. case manager assisted client in contacting the FL DMV to work on clearing CLs unpaid tickets. FL DMV rep. reports that client must complete course at the DUI school. FL DMV rep. reports to complete this course out of state client needs to contact Celeste Havis (1800-832-9623, 512-834-6628 ext. 2910). Once client completes the course client must mail certification, letter stating why client completed out of state with CLs signature, copy of texas id (proof of residency- ex. utility bill), schools phone number, agency license #, course information, state course was taken in. client must also complete Advanced driver improvement school online (www.flhsmv.gov, search ADI, ticket # 12984py) to address Habitual Traffic Offender. client must also mail a $280 money order/check made out to Motorist Services to BMC PO Box 5775 Tallahassee, FL 32314-5775. client can also use a courier service to expedite process Back Track 850-878-54-37, 850-222-2666).",The client's Texas State ID was mentioned,Client Texas State ID,entailment,0,entailment,0


In [27]:
# Visual inspect. Looks good!

# grouped = e_df1.groupby(ID)

# t = 2
# for name, group in grouped:
#     display(group)
#     t -= 1

#     if t == 0:
#         break

# I already see some errors after my label analysis
# Collateral contact is contact with someone other than the client ABOUT the client, so it's not a contradiction of no show!
# Also, I can't call it "The CM had collateral contact with the client." for the same reason above.

# As for my actual method, its working great!

### 1. Pre-Trained DistilBert not fine tuned on any NLI Dataset

#### Fine-Tuned on Checkbox Data
I was thinking of passing the basic set, but then this model would have no knowledge about what constitutes neturals and contradictions. I am rerunning this model after learning about how the initial contradictions set is not really a good idea. However, I am rerunning it anyway to display the results and show why I decided to pursue the label analysis.

Dataset 2, unmixed, full-set

In [28]:
# No mixing, using FULL set. BUT there's way too much data to train on it in a reasonable time. 
# So keeping only half of original data. 
train_set, test_set = split_entailment_set(e_df1, frac=1, test_size=0.2, random_state=42)

Total notes: 4105
Num Notes in Train Set: 3284, Test Set: 821
Total Entries: 45155
Num Entries in Train: 36124, Test: 9031


In [29]:
# clf1: full_train, full_test, MNLI Labels, 10% of full casenote entailment dataset, only pretrained bert
# Average # words in notes is ~92, so setting max_len to 256. 
clf1 = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, train_data=train_set, test_data=test_set,
                              train_batch_size=16, valid_batch_size=16, valid_shuffle=False, max_len=256)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TRAIN Dataset: (36124, 8)


36124it [00:23, 1551.60it/s]


TEST Dataset: (9031, 8)


9031it [00:06, 1493.78it/s]


In [30]:
# At 4 iterations a second, it will take
# (((train_size / train_batch_size) / 4) / 60) minutes
# So for the full data set with alllll ~64000 notes, it'll be 200 minutes

print(str(clf1))
print("---TRAINING---")
clf1.train()
print("---VALIDATION---")
outputs, targets = clf1.validate(ret=True)

Hyperparams:
 max_tokens_per_example: 256
 train_batch_size: 16
 valid_batch_size: 16
 epochs: 1
 learning rate: 0.000010
 train_size: 36124
 test_size: 9031

---TRAINING---


0it [00:00, ?it/s]

Epoch: 0, Loss: 1.153531


2258it [08:38,  4.35it/s]


---VALIDATION---


565it [00:42, 13.31it/s]


Hamming Loss: 0.580224


Unnamed: 0,precision,recall,f1-score,support
entailment,0.145066,0.434483,0.217509,1015.0
neutral,0.909091,0.417914,0.572601,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.419776,0.419776,0.419776,0.419776
macro avg,0.351386,0.284132,0.26337,9031.0
weighted avg,0.823222,0.419776,0.532692,9031.0


In [31]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.08      0.24      0.12        63
                                                    Client Birth certificate       0.15      0.54      0.23       129
                                                           Client DPS report       0.02      0.40      0.04        15
                                                       Client Texas State ID       0.17      0.55      0.25       155
                                                    Client assigned DACC CSR       0.00      0.00      0.00         7
                                    Coordinated Assessment Completed at DACC       0.07      0.33      0.12        81
                                        HMIS Release of Information - Agreed       0.01      0.04      0.02        25
                                                HMIS pr

**Some Reminders** 
- Precision = True Positives / (True Positives + False Positives)
  - What proportion of positive identifications was actually correct?
- Recall = True Positives / (True Positives + False Negatives)
  - What proportion of actual positives was identified correctly? 
- Accuracy = (True Positives + True Negatives) / Total

The entailment row tells us about the checkboxes. 
- Of the casenote-label pairs it predicted correctly, the model got 89.345% of them right
- But it did a very poor job of actually predicting the checkboxes and had a recall of 22.4%
- It's a similar story with the contradictions
- I think because of the huge number of neutrals, the dataset is hugely overfitting on them. All casenotes contain some similar patterns, and the model is being told that most casenote-label pairs are neutral. I think I should remove the neutrals from the dataset creation process.

The train set contains WAY too many note groups (the 20 entries for each casenote-pair) that only have an "entailment" label for direct contact. I think some data mixing is probably in order. 

OK Now what does this actually mean. I have to interpret these results. One idea for seeing model understanding is make a small set of examples with very simple premises that are common patterns in the casenotes. Ex. For Direct Contact
- "The CM met with the Client." -> "The CM had Direct Contact with the client"

And then seeing how the model handles this. 

Dataset 2, unmixed, contra-set

In [32]:
# Try again, this time with entailments and not_entailments, but only keep the not_entailments
# that are actually contradictions.
contra_set = get_contradictions_set(train_set)
# Test set still needs to contain all labels for each note.

In [33]:
# Trying just contradictions, with the 2 label task
# contra_train, full_test, docnli_labels, only pretrained distilbert
clf2 = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_2_IDX,
                              label_names=two_label_set, valid_shuffle=False,
                              train_data=contra_set, test_data=test_set)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TRAIN Dataset: (4061, 8)


4061it [00:02, 1607.29it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1599.97it/s]


In [34]:
print(str(clf2))

print("---TRAINING---")
clf2.train()
print("---VALIDATION---")
outputs, targets = clf2.validate(ret=True)

Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 4061
 test_size: 9031

---TRAINING---


4it [00:00, 11.31it/s]

Epoch: 0, Loss: 0.671351


1016it [00:43, 23.39it/s]


---VALIDATION---


2258it [00:23, 95.86it/s]

Hamming Loss: 0.521537





Unnamed: 0,precision,recall,f1-score,support
entailment,0.121026,0.581281,0.20034,1015.0
not_entailment,0.897738,0.465444,0.613046,8016.0
accuracy,0.478463,0.478463,0.478463,0.478463
macro avg,0.509382,0.523362,0.406693,9031.0
weighted avg,0.810443,0.478463,0.566662,9031.0


In [35]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.09      0.63      0.15        63
                                                    Client Birth certificate       0.12      0.50      0.19       129
                                                           Client DPS report       0.01      0.27      0.01        15
                                                       Client Texas State ID       0.20      0.63      0.30       155
                                                    Client assigned DACC CSR       0.01      0.57      0.02         7
                                    Coordinated Assessment Completed at DACC       0.19      0.49      0.27        81
                                        HMIS Release of Information - Agreed       0.04      0.68      0.07        25
                                                HMIS pr

Dataset 2, t=4, full-set

In [36]:
# No mixing, using FULL set. BUT there's way too much data to train on it in a reasonable time. 
# So keeping only half of original data. 
import math

largeK = math.pow(2, 21)
train_set, test_set = split_entailment_set(e_df1, frac=1, test_size=0.2, random_state=42,
                                           mix_train_params=(4, largeK, "MIN_DESC"))

Total notes: 4105
Num Notes in Train Set: 3284, Test Set: 821
Total Entries: 45155
Num Entries in Train: 36124, Test: 9031


In [37]:
# clf1: full_train, full_test, MNLI Labels, 10% of full casenote entailment dataset, only pretrained bert
# Average # words in notes is ~92, so setting max_len to 256. 
clf1 = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, train_data=train_set, test_data=test_set,
                              train_batch_size=16, valid_batch_size=16, valid_shuffle=False, max_len=256)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TRAIN Dataset: (36124, 8)


36124it [00:22, 1635.10it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1715.44it/s]


In [38]:
# At 4 iterations a second, it will take
# (((train_size / train_batch_size) / 4) / 60) minutes
# So for the full data set with alllll ~64000 notes, it'll be 200 minutes

print(str(clf1))
print("---TRAINING---")
clf1.train()
print("---VALIDATION---")
outputs, targets = clf1.validate(ret=True)

Hyperparams:
 max_tokens_per_example: 256
 train_batch_size: 16
 valid_batch_size: 16
 epochs: 1
 learning rate: 0.000010
 train_size: 36124
 test_size: 9031

---TRAINING---


1it [00:00,  2.00it/s]

Epoch: 0, Loss: 0.991936


2258it [08:08,  4.62it/s]


---VALIDATION---


565it [00:40, 13.99it/s]

Hamming Loss: 0.543351





Unnamed: 0,precision,recall,f1-score,support
entailment,0.147693,0.391133,0.214421,1015.0
neutral,0.931052,0.464945,0.620185,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.456649,0.456649,0.456649,0.456649
macro avg,0.359582,0.285359,0.278202,9031.0
weighted avg,0.84301,0.456649,0.574581,9031.0


In [39]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.14      0.51      0.22        63
                                                    Client Birth certificate       0.32      0.67      0.43       129
                                                           Client DPS report       0.11      0.47      0.17        15
                                                       Client Texas State ID       0.23      0.65      0.34       155
                                                    Client assigned DACC CSR       0.00      0.00      0.00         7
                                    Coordinated Assessment Completed at DACC       0.07      0.31      0.11        81
                                        HMIS Release of Information - Agreed       0.05      0.16      0.08        25
                                                HMIS pr

Dataset 2, t=4, contra_set

In [40]:
# Try again, this time with entailments and not_entailments, but only keep the not_entailments
# that are actually contradictions.
contra_set = get_contradictions_set(train_set)
# Test set still needs to contain all labels for each note.

In [41]:
# Trying just contradictions, with the 2 label task
# contra_train, full_test, docnli_labels, only pretrained distilbert
clf2 = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_2_IDX,
                              label_names=two_label_set, valid_shuffle=False,
                              train_data=contra_set, test_data=test_set)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TRAIN Dataset: (5287, 8)


5287it [00:03, 1649.02it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1667.22it/s]


In [42]:
print(str(clf2))

print("---TRAINING---")
clf2.train()
print("---VALIDATION---")
outputs, targets = clf2.validate(ret=True)

Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 5287
 test_size: 9031

---TRAINING---


1it [00:00,  8.62it/s]

Epoch: 0, Loss: 0.562019


1322it [00:53, 24.60it/s]


---VALIDATION---


2258it [00:22, 102.16it/s]


Hamming Loss: 0.510021


Unnamed: 0,precision,recall,f1-score,support
entailment,0.124451,0.586207,0.205314,1015.0
not_entailment,0.901176,0.477794,0.62449,8016.0
accuracy,0.489979,0.489979,0.489979,0.489979
macro avg,0.512814,0.532001,0.414902,9031.0
weighted avg,0.81388,0.489979,0.577379,9031.0


In [43]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.07      0.71      0.13        63
                                                    Client Birth certificate       0.15      0.47      0.23       129
                                                           Client DPS report       0.01      0.20      0.01        15
                                                       Client Texas State ID       0.24      0.63      0.35       155
                                                    Client assigned DACC CSR       0.01      0.71      0.03         7
                                    Coordinated Assessment Completed at DACC       0.20      0.62      0.30        81
                                        HMIS Release of Information - Agreed       0.04      0.76      0.07        25
                                                HMIS pr

I didn't shuffle the validation set so I could reconstruct label lists for each note, but looking at the results here tells me that this set up is not adequate -- Only a 9.3% recall. A very small number of actual positives identified here. Instead of working to understand what this model is learning right now, let me try the next step: Training on MNLI then using my basic set (just the entailment labels) to fine tune. 

### 2. MNLI DistilBert

The MNLI dataset contains fairly short premise-hypothesis pairs. The premises for the case data are much longer. Thus, I would have ideally wanted to use DocNLI which has paragraph-sentence pairs, but the authors provided Roberta for their pretrained model which is too large for my machine to handle. (Kept getting cuda out of memory errors even with batch_size=1). I could train on the DocNLI dataset and I want to, but in the interest of time I will use MNLI since there is a pretrained model available on huggingface.

#### Zero Shot

In [44]:
# Zero Shot
tokenizer = DistilBertTokenizerFast.from_pretrained("typeform/distilbert-base-uncased-mnli", truncation=True, do_lower_case=True);
model = DistilBertForSequenceClassification.from_pretrained('typeform/distilbert-base-uncased-mnli');

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [45]:
# Test on full_test from above
# full_test, MNLI labels, pretrained distilbert + fine tuned on MNLI, NO fine tuning on casenotes
clf = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, valid_shuffle=False, zero_shot=True,
                              train_data=None, test_data=test_set, tokenizer=tokenizer, model=model)

TEST Dataset: (9031, 8)


9031it [00:05, 1788.93it/s]


In [46]:
outputs, targets = clf.validate(ret=True)
# test data explanation above

2258it [00:22, 101.79it/s]

Hamming Loss: 0.598605





Unnamed: 0,precision,recall,f1-score,support
entailment,0.130518,0.337931,0.188306,1015.0
neutral,0.896232,0.409431,0.562083,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.401395,0.401395,0.401395,0.401395
macro avg,0.34225,0.249121,0.25013,9031.0
weighted avg,0.810172,0.401395,0.520074,9031.0


In [47]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.07      0.22      0.11        63
                                                    Client Birth certificate       0.34      0.33      0.33       129
                                                           Client DPS report       0.04      0.27      0.07        15
                                                       Client Texas State ID       0.33      0.35      0.34       155
                                                    Client assigned DACC CSR       0.00      0.00      0.00         7
                                    Coordinated Assessment Completed at DACC       0.13      0.43      0.20        81
                                        HMIS Release of Information - Agreed       0.02      0.16      0.03        25
                                                HMIS pr

As expected, this does really poorly. But one point to note is that classifer fine-tuned on the full set has a better recall by several percentage points, so it is possible that model is learning something useful. Let's try fine tuning the MNLI distilbert. Here I want to fine tune using my basic set (only entailment examples)

#### Fine Tuned, Dataset 2, unmixed, basic

In [48]:
# No mixing, using FULL set. BUT there's way too much data to train on it in a reasonable time. 
# So keeping only half of original data. 
train_set, test_set = split_entailment_set(e_df1, frac=1, test_size=0.2, random_state=42)

Total notes: 4105
Num Notes in Train Set: 3284, Test Set: 821
Total Entries: 45155
Num Entries in Train: 36124, Test: 9031


In [49]:
basic_set = get_basic_set(train_set)

In [50]:
# Test on full_test from above
clf = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, valid_shuffle=False,
                              train_data=basic_set, test_data=test_set, tokenizer=tokenizer, model=model)

TRAIN Dataset: (4061, 8)


4061it [00:02, 1658.06it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1802.39it/s]


In [51]:
print(str(clf))

print("---TRAINING---")
clf.train()

print("---VALIDATION---")
outputs, targets = clf.validate(ret=True)

Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 4061
 test_size: 9031

---TRAINING---


1it [00:00,  3.51it/s]

Epoch: 0, Loss: 6.370784


1016it [00:41, 24.44it/s]


---VALIDATION---


2258it [00:22, 102.17it/s]


Hamming Loss: 0.657956


Unnamed: 0,precision,recall,f1-score,support
entailment,0.096056,0.242365,0.137584,1015.0
neutral,0.887883,0.354666,0.506864,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.342044,0.342044,0.342044,0.342044
macro avg,0.32798,0.19901,0.214816,9031.0
weighted avg,0.798889,0.342044,0.46536,9031.0


In [52]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.11      0.30      0.16        63
                                                    Client Birth certificate       0.49      0.13      0.21       129
                                                           Client DPS report       0.00      0.07      0.01        15
                                                       Client Texas State ID       0.19      0.24      0.21       155
                                                    Client assigned DACC CSR       0.00      0.00      0.00         7
                                    Coordinated Assessment Completed at DACC       0.05      0.15      0.07        81
                                        HMIS Release of Information - Agreed       0.03      0.08      0.04        25
                                                HMIS pr

#### Fine Tuned, Dataset 2, unmixed, full

In [53]:
tokenizer = DistilBertTokenizerFast.from_pretrained("typeform/distilbert-base-uncased-mnli", truncation=True, do_lower_case=True);
model = DistilBertForSequenceClassification.from_pretrained('typeform/distilbert-base-uncased-mnli');

clf = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, valid_shuffle=False,
                              train_data=train_set, test_data=test_set, tokenizer=tokenizer, model=model)

print(str(clf))

print("---TRAINING---")
clf.train()

print("---VALIDATION---")
outputs, targets = clf.validate(ret=True)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


TRAIN Dataset: (36124, 8)


36124it [00:20, 1794.04it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1748.39it/s]


Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 36124
 test_size: 9031

---TRAINING---


1it [00:00,  6.48it/s]

Epoch: 0, Loss: 1.183082


5003it [03:23, 24.58it/s]

Epoch: 0, Loss: 0.017626


9031it [06:07, 24.58it/s]


---VALIDATION---


2258it [00:22, 101.69it/s]

Hamming Loss: 0.584431





Unnamed: 0,precision,recall,f1-score,support
entailment,0.160931,0.469951,0.239759,1015.0
neutral,0.920483,0.408683,0.566048,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.415569,0.415569,0.415569,0.415569
macro avg,0.360471,0.292878,0.268602,9031.0
weighted avg,0.835117,0.415569,0.529376,9031.0


In [54]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.08      0.56      0.14        63
                                                    Client Birth certificate       0.18      0.36      0.24       129
                                                           Client DPS report       0.03      0.07      0.04        15
                                                       Client Texas State ID       0.15      0.37      0.21       155
                                                    Client assigned DACC CSR       0.00      0.00      0.00         7
                                    Coordinated Assessment Completed at DACC       0.11      0.69      0.19        81
                                        HMIS Release of Information - Agreed       0.02      0.08      0.03        25
                                                HMIS pr

#### Fine Tuned, Dataset 2, unmixed, contra

In [55]:
tokenizer = DistilBertTokenizerFast.from_pretrained("typeform/distilbert-base-uncased-mnli", truncation=True, do_lower_case=True);
model = DistilBertForSequenceClassification.from_pretrained('typeform/distilbert-base-uncased-mnli');

contra_set = get_contradictions_set(train_set)

clf = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, valid_shuffle=False,
                              train_data=contra_set, test_data=test_set, tokenizer=tokenizer, model=model)

print(str(clf))

print("---TRAINING---")
clf.train()

print("---VALIDATION---")
outputs, targets = clf.validate(ret=True)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


TRAIN Dataset: (4061, 8)


4061it [00:02, 1702.54it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1737.21it/s]


Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 4061
 test_size: 9031

---TRAINING---


1it [00:00,  6.65it/s]

Epoch: 0, Loss: 5.686823


1016it [00:41, 24.51it/s]


---VALIDATION---


2258it [00:22, 102.26it/s]


Hamming Loss: 0.664932


Unnamed: 0,precision,recall,f1-score,support
entailment,0.09309,0.256158,0.136555,1015.0
neutral,0.888247,0.34506,0.497035,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.335068,0.335068,0.335068,0.335068
macro avg,0.327112,0.200406,0.211197,9031.0
weighted avg,0.798878,0.335068,0.45652,9031.0


In [56]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.09      0.40      0.15        63
                                                    Client Birth certificate       0.43      0.12      0.19       129
                                                           Client DPS report       0.00      0.07      0.01        15
                                                       Client Texas State ID       0.19      0.20      0.19       155
                                                    Client assigned DACC CSR       0.01      0.14      0.02         7
                                    Coordinated Assessment Completed at DACC       0.07      0.15      0.10        81
                                        HMIS Release of Information - Agreed       0.00      0.04      0.01        25
                                                HMIS pr

#### Fine Tuned, Dataset 2, T=4, basic

In [57]:
# No mixing, using FULL set. BUT there's way too much data to train on it in a reasonable time. 
# So keeping only half of original data. 
train_set, test_set = split_entailment_set(e_df1, frac=1, test_size=0.2, random_state=42,
                                           mix_train_params=(4, largeK, "MIN_DESC"))

Total notes: 4105
Num Notes in Train Set: 3284, Test Set: 821
Total Entries: 45155
Num Entries in Train: 36124, Test: 9031


In [58]:
basic_set = get_basic_set(train_set)

In [59]:
# Test on full_test from above
clf = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, valid_shuffle=False,
                              train_data=basic_set, test_data=test_set, tokenizer=tokenizer, model=model)

TRAIN Dataset: (5198, 8)


5198it [00:03, 1503.57it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1708.68it/s]


In [60]:
print(str(clf))

print("---TRAINING---")
clf.train()

print("---VALIDATION---")
outputs, targets = clf.validate(ret=True)

Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 5198
 test_size: 9031

---TRAINING---


1it [00:00,  6.72it/s]

Epoch: 0, Loss: 0.000113


1300it [00:53, 24.50it/s]


---VALIDATION---


2258it [00:22, 102.24it/s]


Hamming Loss: 0.626066


Unnamed: 0,precision,recall,f1-score,support
entailment,0.134641,0.202956,0.161886,1015.0
neutral,0.892988,0.395584,0.548284,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.373934,0.373934,0.373934,0.373934
macro avg,0.342543,0.199513,0.236723,9031.0
weighted avg,0.807757,0.373934,0.504856,9031.0


In [61]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.11      0.29      0.16        63
                                                    Client Birth certificate       0.41      0.22      0.28       129
                                                           Client DPS report       0.00      0.00      0.00        15
                                                       Client Texas State ID       0.17      0.21      0.19       155
                                                    Client assigned DACC CSR       0.02      0.29      0.04         7
                                    Coordinated Assessment Completed at DACC       0.29      0.06      0.10        81
                                        HMIS Release of Information - Agreed       0.03      0.24      0.05        25
                                                HMIS pr

#### Fine Tuned, Dataset 2, unmixed, full

In [62]:
tokenizer = DistilBertTokenizerFast.from_pretrained("typeform/distilbert-base-uncased-mnli", truncation=True, do_lower_case=True);
model = DistilBertForSequenceClassification.from_pretrained('typeform/distilbert-base-uncased-mnli');

clf = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, valid_shuffle=False,
                              train_data=train_set, test_data=test_set, tokenizer=tokenizer, model=model)

print(str(clf))

print("---TRAINING---")
clf.train()

print("---VALIDATION---")
outputs, targets = clf.validate(ret=True)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


TRAIN Dataset: (36124, 8)


36124it [00:22, 1629.47it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1702.61it/s]


Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 36124
 test_size: 9031

---TRAINING---


1it [00:00,  6.57it/s]

Epoch: 0, Loss: 0.008286


5003it [03:22, 24.58it/s]

Epoch: 0, Loss: 0.144652


9031it [06:06, 24.61it/s]


---VALIDATION---


2258it [00:22, 100.78it/s]


Hamming Loss: 0.601595


Unnamed: 0,precision,recall,f1-score,support
entailment,0.151777,0.450246,0.227024,1015.0
neutral,0.917616,0.391841,0.549174,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.398405,0.398405,0.398405,0.398405
macro avg,0.356464,0.280696,0.258733,9031.0
weighted avg,0.831543,0.398405,0.512967,9031.0


In [63]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.13      0.51      0.21        63
                                                    Client Birth certificate       0.16      0.34      0.22       129
                                                           Client DPS report       0.03      0.60      0.07        15
                                                       Client Texas State ID       0.21      0.39      0.27       155
                                                    Client assigned DACC CSR       0.01      0.14      0.01         7
                                    Coordinated Assessment Completed at DACC       0.15      0.54      0.24        81
                                        HMIS Release of Information - Agreed       0.00      0.00      0.00        25
                                                HMIS pr

#### Fine Tuned, Dataset 2, unmixed, contra

In [64]:
tokenizer = DistilBertTokenizerFast.from_pretrained("typeform/distilbert-base-uncased-mnli", truncation=True, do_lower_case=True);
model = DistilBertForSequenceClassification.from_pretrained('typeform/distilbert-base-uncased-mnli');

contra_set = get_contradictions_set(train_set)

clf = DistilBertNLIClassifier(premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX,
                              label_names=three_label_set, valid_shuffle=False,
                              train_data=contra_set, test_data=test_set, tokenizer=tokenizer, model=model)

print(str(clf))

print("---TRAINING---")
clf.train()

print("---VALIDATION---")
outputs, targets = clf.validate(ret=True)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


TRAIN Dataset: (5198, 8)


5198it [00:03, 1429.92it/s]


TEST Dataset: (9031, 8)


9031it [00:05, 1604.06it/s]


Hyperparams:
 max_tokens_per_example: 128
 train_batch_size: 4
 valid_batch_size: 4
 epochs: 1
 learning rate: 0.000010
 train_size: 5198
 test_size: 9031

---TRAINING---


1it [00:00,  6.81it/s]

Epoch: 0, Loss: 4.730008


1300it [00:52, 24.54it/s]


---VALIDATION---


2258it [00:22, 102.17it/s]


Hamming Loss: 0.659063


Unnamed: 0,precision,recall,f1-score,support
entailment,0.098918,0.252217,0.142104,1015.0
neutral,0.887178,0.352171,0.504197,8016.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.340937,0.340937,0.340937,0.340937
macro avg,0.328699,0.201462,0.215434,9031.0
weighted avg,0.798585,0.340937,0.463501,9031.0


In [65]:
entailment_clf_report(test_set, outputs, classes, augmentations_rev)

                                                                              precision    recall  f1-score   support

                                                 Client Assessment Conducted       0.07      0.19      0.11        63
                                                    Client Birth certificate       0.37      0.11      0.17       129
                                                           Client DPS report       0.00      0.13      0.01        15
                                                       Client Texas State ID       0.27      0.28      0.27       155
                                                    Client assigned DACC CSR       0.01      0.14      0.02         7
                                    Coordinated Assessment Completed at DACC       0.07      0.27      0.11        81
                                        HMIS Release of Information - Agreed       0.04      0.16      0.07        25
                                                HMIS pr

#### Making a few shot set from my understanding of the domain

In [66]:
examples = {
    "The CM met with the Client.": "Direct Contact",
    "The CM spoke with the client on the phone.": "Direct Contact",
    "The CM received an email about the client": "Collateral Contact",
    "The Client did not show up.": "No Show",
    "This CM tried to locate the client": "Attempted client contact",
    "The Client met with judge at court regarding a case.": "Legal Issues",
    "The client's case in court is still indicted.": "Legal Issues",
    "The client spoke with the attorney.": "Legal Issues",
    "The CM ordered a birth certificate for the client": "Client Birth certificate",
    "The CM ordered a Texas State ID for the client": "Client Texas State ID",
    "The CM completed ROI for several organizations": "Release of information forms signed for all appropriate vendors and agencies",
    "The CM completed Release of Information forms.": "Release of information forms signed for all appropriate vendors and agencies",
    'The client updated contact info in HMIS.': "HMIS profile created/updated",
    "The CM met with the client and completed DACC CM Intake Paperwork": "Client Assessment Conducted",
    "The client arrived for his intake with this CM.": "Client Assessment Conducted",
    "The client completed CA at DACC.": "Coordinated Assessment Completed at DACC",
    "The client scored 16 on CA": "Coordinated Assessment Completed at DACC",
    "The client completed Coordinated Assessment at DACC": "Coordinated Assessment Completed at DACC",
    "The CM scheduled a CA with client.": "Coordinated Assessment Scheduled at DACC",
    "The client agreed to completed Coordinated Assessment.": "Coordinated Assessment Scheduled at DACC",
    # Treatment Plan generally doesn't have explicit text implying Treatment Plan. Instead, it usually coincides with "intake"
    "The CM met with client and completed intake paperwork": "Treatment Plan completed and signed",
    # HMIS Release - Agreed: Same here -- usually coincides with completing coordinated Assessment
    # Same with declined. Looks like this is info that's just recorded in the checkboxes, not in notes.
    # Same with "Client declined to sign release of information"
    "The CM ran client's DPS report": "Client DPS report",
    "The client received the client's FBI background check": "Client DPS report",
    # Few notes explicitly state "CSR", and I think CSR and Customer Survey are duplicates
    # and I think HMIS ROI - Declined and "Client declined to sign ROI" are also duplicates
    "The CM asked client to complete CSR": "Client assigned DACC CSR",
    # Only two items in "Client not assigned DACC CSR due to court order", both without context
    
}

notes = examples.keys()
labels = examples.values()

exdf = pd.DataFrame({PREMISE: notes, HYPOTHESIS: labels}).reset_index(drop=False)
classes = exdf[HYPOTHESIS].unique()
exdf = make_entailment_set(exdf, PREMISE, HYPOTHESIS, "index", classes, aug_dict=augmentations)
exdf.head()

Unnamed: 0,id,premise,hypothesis,gold_3,gold_3_idx
0,0,The CM met with the Client.,The case manager met or spoke with the client,entailment,0
1,1,The CM spoke with the client on the phone.,The case manager met or spoke with the client,entailment,0
2,2,The CM received an email about the client,The case manager sent or received information about the client,entailment,0
3,3,The Client did not show up.,The Client did not show up,entailment,0
4,4,This CM tried to locate the client,The case manager attempted client contact,entailment,0


In [67]:
small_test = NLIDataset(clf.tokenizer, df=exdf, premise_col=PREMISE, hypothesis_col=HYPOTHESIS, gold_col=GOLD_3_IDX)

outputs, targets = clf.test_model(small_test.get_data_loader(shuffle=False), return_predictions=True)
clf.validation_report(outputs, targets)

exdf["PREDS"] = pd.Series(outputs)
exdf["PREDS"] = exdf["PREDS"].apply(lambda x: three_label_set[x])
exdf.drop(GOLD_3_IDX, axis=1, inplace=True)

grouped = exdf.groupby(ID)
for name, group in grouped:
    print(name)
    display(group)

360it [00:00, 2333.80it/s]
90it [00:03, 23.10it/s]

Hamming Loss: 0.711111





Unnamed: 0,precision,recall,f1-score,support
entailment,0.069767,0.5,0.122449,24.0
neutral,0.929293,0.27381,0.422989,336.0
contradiction,0.0,0.0,0.0,0.0
accuracy,0.288889,0.288889,0.288889,0.288889
macro avg,0.33302,0.257937,0.181812,360.0
weighted avg,0.871991,0.288889,0.402953,360.0


0


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
0,0,The CM met with the Client.,The case manager met or spoke with the client,entailment,neutral
0,0,The CM met with the Client.,The case manager sent or received information about the client,neutral,neutral
1,0,The CM met with the Client.,The Client did not show up,neutral,entailment
2,0,The CM met with the Client.,The case manager attempted client contact,neutral,entailment
3,0,The CM met with the Client.,There were legal issues with the client,neutral,contradiction
4,0,The CM met with the Client.,The client's birth certificate was mentioned,neutral,contradiction
5,0,The CM met with the Client.,The client's Texas State ID was mentioned,neutral,entailment
6,0,The CM met with the Client.,The client signed ROIs,neutral,entailment
7,0,The CM met with the Client.,The case manager created or updated the client's HMIS profile,neutral,neutral
8,0,The CM met with the Client.,The case manager conducted Client Assessment,neutral,neutral


1


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
1,1,The CM spoke with the client on the phone.,The case manager met or spoke with the client,entailment,entailment
0,1,The CM spoke with the client on the phone.,The case manager sent or received information about the client,neutral,neutral
1,1,The CM spoke with the client on the phone.,The Client did not show up,neutral,entailment
2,1,The CM spoke with the client on the phone.,The case manager attempted client contact,neutral,entailment
3,1,The CM spoke with the client on the phone.,There were legal issues with the client,neutral,contradiction
4,1,The CM spoke with the client on the phone.,The client's birth certificate was mentioned,neutral,contradiction
5,1,The CM spoke with the client on the phone.,The client's Texas State ID was mentioned,neutral,entailment
6,1,The CM spoke with the client on the phone.,The client signed ROIs,neutral,entailment
7,1,The CM spoke with the client on the phone.,The case manager created or updated the client's HMIS profile,neutral,neutral
8,1,The CM spoke with the client on the phone.,The case manager conducted Client Assessment,neutral,neutral


2


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
2,2,The CM received an email about the client,The case manager sent or received information about the client,entailment,entailment
0,2,The CM received an email about the client,The case manager met or spoke with the client,neutral,neutral
1,2,The CM received an email about the client,The Client did not show up,neutral,entailment
2,2,The CM received an email about the client,The case manager attempted client contact,neutral,entailment
3,2,The CM received an email about the client,There were legal issues with the client,neutral,contradiction
4,2,The CM received an email about the client,The client's birth certificate was mentioned,neutral,contradiction
5,2,The CM received an email about the client,The client's Texas State ID was mentioned,neutral,entailment
6,2,The CM received an email about the client,The client signed ROIs,neutral,entailment
7,2,The CM received an email about the client,The case manager created or updated the client's HMIS profile,neutral,neutral
8,2,The CM received an email about the client,The case manager conducted Client Assessment,neutral,neutral


3


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
3,3,The Client did not show up.,The Client did not show up,entailment,contradiction
0,3,The Client did not show up.,The case manager met or spoke with the client,neutral,neutral
1,3,The Client did not show up.,The case manager sent or received information about the client,neutral,entailment
2,3,The Client did not show up.,The case manager attempted client contact,neutral,entailment
3,3,The Client did not show up.,There were legal issues with the client,neutral,contradiction
4,3,The Client did not show up.,The client's birth certificate was mentioned,neutral,contradiction
5,3,The Client did not show up.,The client's Texas State ID was mentioned,neutral,entailment
6,3,The Client did not show up.,The client signed ROIs,neutral,entailment
7,3,The Client did not show up.,The case manager created or updated the client's HMIS profile,neutral,neutral
8,3,The Client did not show up.,The case manager conducted Client Assessment,neutral,neutral


4


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
4,4,This CM tried to locate the client,The case manager attempted client contact,entailment,contradiction
0,4,This CM tried to locate the client,The case manager met or spoke with the client,neutral,neutral
1,4,This CM tried to locate the client,The case manager sent or received information about the client,neutral,entailment
2,4,This CM tried to locate the client,The Client did not show up,neutral,entailment
3,4,This CM tried to locate the client,There were legal issues with the client,neutral,contradiction
4,4,This CM tried to locate the client,The client's birth certificate was mentioned,neutral,contradiction
5,4,This CM tried to locate the client,The client's Texas State ID was mentioned,neutral,entailment
6,4,This CM tried to locate the client,The client signed ROIs,neutral,entailment
7,4,This CM tried to locate the client,The case manager created or updated the client's HMIS profile,neutral,neutral
8,4,This CM tried to locate the client,The case manager conducted Client Assessment,neutral,neutral


5


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
5,5,The Client met with judge at court regarding a case.,There were legal issues with the client,entailment,entailment
0,5,The Client met with judge at court regarding a case.,The case manager met or spoke with the client,neutral,neutral
1,5,The Client met with judge at court regarding a case.,The case manager sent or received information about the client,neutral,entailment
2,5,The Client met with judge at court regarding a case.,The Client did not show up,neutral,entailment
3,5,The Client met with judge at court regarding a case.,The case manager attempted client contact,neutral,contradiction
4,5,The Client met with judge at court regarding a case.,The client's birth certificate was mentioned,neutral,contradiction
5,5,The Client met with judge at court regarding a case.,The client's Texas State ID was mentioned,neutral,entailment
6,5,The Client met with judge at court regarding a case.,The client signed ROIs,neutral,entailment
7,5,The Client met with judge at court regarding a case.,The case manager created or updated the client's HMIS profile,neutral,neutral
8,5,The Client met with judge at court regarding a case.,The case manager conducted Client Assessment,neutral,neutral


6


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
6,6,The client's case in court is still indicted.,There were legal issues with the client,entailment,entailment
0,6,The client's case in court is still indicted.,The case manager met or spoke with the client,neutral,neutral
1,6,The client's case in court is still indicted.,The case manager sent or received information about the client,neutral,entailment
2,6,The client's case in court is still indicted.,The Client did not show up,neutral,entailment
3,6,The client's case in court is still indicted.,The case manager attempted client contact,neutral,contradiction
4,6,The client's case in court is still indicted.,The client's birth certificate was mentioned,neutral,contradiction
5,6,The client's case in court is still indicted.,The client's Texas State ID was mentioned,neutral,entailment
6,6,The client's case in court is still indicted.,The client signed ROIs,neutral,entailment
7,6,The client's case in court is still indicted.,The case manager created or updated the client's HMIS profile,neutral,neutral
8,6,The client's case in court is still indicted.,The case manager conducted Client Assessment,neutral,neutral


7


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
7,7,The client spoke with the attorney.,There were legal issues with the client,entailment,neutral
0,7,The client spoke with the attorney.,The case manager met or spoke with the client,neutral,neutral
1,7,The client spoke with the attorney.,The case manager sent or received information about the client,neutral,entailment
2,7,The client spoke with the attorney.,The Client did not show up,neutral,entailment
3,7,The client spoke with the attorney.,The case manager attempted client contact,neutral,contradiction
4,7,The client spoke with the attorney.,The client's birth certificate was mentioned,neutral,contradiction
5,7,The client spoke with the attorney.,The client's Texas State ID was mentioned,neutral,entailment
6,7,The client spoke with the attorney.,The client signed ROIs,neutral,entailment
7,7,The client spoke with the attorney.,The case manager created or updated the client's HMIS profile,neutral,neutral
8,7,The client spoke with the attorney.,The case manager conducted Client Assessment,neutral,neutral


8


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
8,8,The CM ordered a birth certificate for the client,The client's birth certificate was mentioned,entailment,neutral
0,8,The CM ordered a birth certificate for the client,The case manager met or spoke with the client,neutral,neutral
1,8,The CM ordered a birth certificate for the client,The case manager sent or received information about the client,neutral,entailment
2,8,The CM ordered a birth certificate for the client,The Client did not show up,neutral,entailment
3,8,The CM ordered a birth certificate for the client,The case manager attempted client contact,neutral,contradiction
4,8,The CM ordered a birth certificate for the client,There were legal issues with the client,neutral,contradiction
5,8,The CM ordered a birth certificate for the client,The client's Texas State ID was mentioned,neutral,entailment
6,8,The CM ordered a birth certificate for the client,The client signed ROIs,neutral,entailment
7,8,The CM ordered a birth certificate for the client,The case manager created or updated the client's HMIS profile,neutral,neutral
8,8,The CM ordered a birth certificate for the client,The case manager conducted Client Assessment,neutral,neutral


9


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
9,9,The CM ordered a Texas State ID for the client,The client's Texas State ID was mentioned,entailment,entailment
0,9,The CM ordered a Texas State ID for the client,The case manager met or spoke with the client,neutral,neutral
1,9,The CM ordered a Texas State ID for the client,The case manager sent or received information about the client,neutral,entailment
2,9,The CM ordered a Texas State ID for the client,The Client did not show up,neutral,entailment
3,9,The CM ordered a Texas State ID for the client,The case manager attempted client contact,neutral,contradiction
4,9,The CM ordered a Texas State ID for the client,There were legal issues with the client,neutral,contradiction
5,9,The CM ordered a Texas State ID for the client,The client's birth certificate was mentioned,neutral,entailment
6,9,The CM ordered a Texas State ID for the client,The client signed ROIs,neutral,entailment
7,9,The CM ordered a Texas State ID for the client,The case manager created or updated the client's HMIS profile,neutral,neutral
8,9,The CM ordered a Texas State ID for the client,The case manager conducted Client Assessment,neutral,neutral


10


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
10,10,The CM completed ROI for several organizations,The client signed ROIs,entailment,contradiction
0,10,The CM completed ROI for several organizations,The case manager met or spoke with the client,neutral,neutral
1,10,The CM completed ROI for several organizations,The case manager sent or received information about the client,neutral,entailment
2,10,The CM completed ROI for several organizations,The Client did not show up,neutral,entailment
3,10,The CM completed ROI for several organizations,The case manager attempted client contact,neutral,contradiction
4,10,The CM completed ROI for several organizations,There were legal issues with the client,neutral,contradiction
5,10,The CM completed ROI for several organizations,The client's birth certificate was mentioned,neutral,entailment
6,10,The CM completed ROI for several organizations,The client's Texas State ID was mentioned,neutral,entailment
7,10,The CM completed ROI for several organizations,The case manager created or updated the client's HMIS profile,neutral,neutral
8,10,The CM completed ROI for several organizations,The case manager conducted Client Assessment,neutral,neutral


11


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
11,11,The CM completed Release of Information forms.,The client signed ROIs,entailment,entailment
0,11,The CM completed Release of Information forms.,The case manager met or spoke with the client,neutral,neutral
1,11,The CM completed Release of Information forms.,The case manager sent or received information about the client,neutral,entailment
2,11,The CM completed Release of Information forms.,The Client did not show up,neutral,entailment
3,11,The CM completed Release of Information forms.,The case manager attempted client contact,neutral,contradiction
4,11,The CM completed Release of Information forms.,There were legal issues with the client,neutral,contradiction
5,11,The CM completed Release of Information forms.,The client's birth certificate was mentioned,neutral,entailment
6,11,The CM completed Release of Information forms.,The client's Texas State ID was mentioned,neutral,entailment
7,11,The CM completed Release of Information forms.,The case manager created or updated the client's HMIS profile,neutral,neutral
8,11,The CM completed Release of Information forms.,The case manager conducted Client Assessment,neutral,neutral


12


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
12,12,The client updated contact info in HMIS.,The case manager created or updated the client's HMIS profile,entailment,neutral
0,12,The client updated contact info in HMIS.,The case manager met or spoke with the client,neutral,neutral
1,12,The client updated contact info in HMIS.,The case manager sent or received information about the client,neutral,entailment
2,12,The client updated contact info in HMIS.,The Client did not show up,neutral,entailment
3,12,The client updated contact info in HMIS.,The case manager attempted client contact,neutral,contradiction
4,12,The client updated contact info in HMIS.,There were legal issues with the client,neutral,contradiction
5,12,The client updated contact info in HMIS.,The client's birth certificate was mentioned,neutral,entailment
6,12,The client updated contact info in HMIS.,The client's Texas State ID was mentioned,neutral,entailment
7,12,The client updated contact info in HMIS.,The client signed ROIs,neutral,neutral
8,12,The client updated contact info in HMIS.,The case manager conducted Client Assessment,neutral,neutral


13


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
13,13,The CM met with the client and completed DACC CM Intake Paperwork,The case manager conducted Client Assessment,entailment,entailment
0,13,The CM met with the client and completed DACC CM Intake Paperwork,The case manager met or spoke with the client,neutral,neutral
1,13,The CM met with the client and completed DACC CM Intake Paperwork,The case manager sent or received information about the client,neutral,entailment
2,13,The CM met with the client and completed DACC CM Intake Paperwork,The Client did not show up,neutral,entailment
3,13,The CM met with the client and completed DACC CM Intake Paperwork,The case manager attempted client contact,neutral,contradiction
4,13,The CM met with the client and completed DACC CM Intake Paperwork,There were legal issues with the client,neutral,contradiction
5,13,The CM met with the client and completed DACC CM Intake Paperwork,The client's birth certificate was mentioned,neutral,entailment
6,13,The CM met with the client and completed DACC CM Intake Paperwork,The client's Texas State ID was mentioned,neutral,entailment
7,13,The CM met with the client and completed DACC CM Intake Paperwork,The client signed ROIs,neutral,neutral
8,13,The CM met with the client and completed DACC CM Intake Paperwork,The case manager created or updated the client's HMIS profile,neutral,neutral


14


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
14,14,The client arrived for his intake with this CM.,The case manager conducted Client Assessment,entailment,neutral
0,14,The client arrived for his intake with this CM.,The case manager met or spoke with the client,neutral,neutral
1,14,The client arrived for his intake with this CM.,The case manager sent or received information about the client,neutral,entailment
2,14,The client arrived for his intake with this CM.,The Client did not show up,neutral,entailment
3,14,The client arrived for his intake with this CM.,The case manager attempted client contact,neutral,contradiction
4,14,The client arrived for his intake with this CM.,There were legal issues with the client,neutral,contradiction
5,14,The client arrived for his intake with this CM.,The client's birth certificate was mentioned,neutral,entailment
6,14,The client arrived for his intake with this CM.,The client's Texas State ID was mentioned,neutral,entailment
7,14,The client arrived for his intake with this CM.,The client signed ROIs,neutral,neutral
8,14,The client arrived for his intake with this CM.,The case manager created or updated the client's HMIS profile,neutral,neutral


15


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
15,15,The client completed CA at DACC.,A coordinated assessment was completed,entailment,entailment
0,15,The client completed CA at DACC.,The case manager met or spoke with the client,neutral,neutral
1,15,The client completed CA at DACC.,The case manager sent or received information about the client,neutral,entailment
2,15,The client completed CA at DACC.,The Client did not show up,neutral,entailment
3,15,The client completed CA at DACC.,The case manager attempted client contact,neutral,contradiction
4,15,The client completed CA at DACC.,There were legal issues with the client,neutral,contradiction
5,15,The client completed CA at DACC.,The client's birth certificate was mentioned,neutral,entailment
6,15,The client completed CA at DACC.,The client's Texas State ID was mentioned,neutral,entailment
7,15,The client completed CA at DACC.,The client signed ROIs,neutral,neutral
8,15,The client completed CA at DACC.,The case manager created or updated the client's HMIS profile,neutral,neutral


16


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
16,16,The client scored 16 on CA,A coordinated assessment was completed,entailment,neutral
0,16,The client scored 16 on CA,The case manager met or spoke with the client,neutral,neutral
1,16,The client scored 16 on CA,The case manager sent or received information about the client,neutral,entailment
2,16,The client scored 16 on CA,The Client did not show up,neutral,entailment
3,16,The client scored 16 on CA,The case manager attempted client contact,neutral,contradiction
4,16,The client scored 16 on CA,There were legal issues with the client,neutral,contradiction
5,16,The client scored 16 on CA,The client's birth certificate was mentioned,neutral,entailment
6,16,The client scored 16 on CA,The client's Texas State ID was mentioned,neutral,entailment
7,16,The client scored 16 on CA,The client signed ROIs,neutral,neutral
8,16,The client scored 16 on CA,The case manager created or updated the client's HMIS profile,neutral,neutral


17


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
17,17,The client completed Coordinated Assessment at DACC,A coordinated assessment was completed,entailment,contradiction
0,17,The client completed Coordinated Assessment at DACC,The case manager met or spoke with the client,neutral,neutral
1,17,The client completed Coordinated Assessment at DACC,The case manager sent or received information about the client,neutral,entailment
2,17,The client completed Coordinated Assessment at DACC,The Client did not show up,neutral,entailment
3,17,The client completed Coordinated Assessment at DACC,The case manager attempted client contact,neutral,contradiction
4,17,The client completed Coordinated Assessment at DACC,There were legal issues with the client,neutral,contradiction
5,17,The client completed Coordinated Assessment at DACC,The client's birth certificate was mentioned,neutral,entailment
6,17,The client completed Coordinated Assessment at DACC,The client's Texas State ID was mentioned,neutral,entailment
7,17,The client completed Coordinated Assessment at DACC,The client signed ROIs,neutral,neutral
8,17,The client completed Coordinated Assessment at DACC,The case manager created or updated the client's HMIS profile,neutral,neutral


18


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
18,18,The CM scheduled a CA with client.,A coordinated assessment was scheduled,entailment,entailment
0,18,The CM scheduled a CA with client.,The case manager met or spoke with the client,neutral,neutral
1,18,The CM scheduled a CA with client.,The case manager sent or received information about the client,neutral,entailment
2,18,The CM scheduled a CA with client.,The Client did not show up,neutral,entailment
3,18,The CM scheduled a CA with client.,The case manager attempted client contact,neutral,contradiction
4,18,The CM scheduled a CA with client.,There were legal issues with the client,neutral,contradiction
5,18,The CM scheduled a CA with client.,The client's birth certificate was mentioned,neutral,entailment
6,18,The CM scheduled a CA with client.,The client's Texas State ID was mentioned,neutral,entailment
7,18,The CM scheduled a CA with client.,The client signed ROIs,neutral,neutral
8,18,The CM scheduled a CA with client.,The case manager created or updated the client's HMIS profile,neutral,neutral


19


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
19,19,The client agreed to completed Coordinated Assessment.,A coordinated assessment was scheduled,entailment,entailment
0,19,The client agreed to completed Coordinated Assessment.,The case manager met or spoke with the client,neutral,neutral
1,19,The client agreed to completed Coordinated Assessment.,The case manager sent or received information about the client,neutral,entailment
2,19,The client agreed to completed Coordinated Assessment.,The Client did not show up,neutral,entailment
3,19,The client agreed to completed Coordinated Assessment.,The case manager attempted client contact,neutral,contradiction
4,19,The client agreed to completed Coordinated Assessment.,There were legal issues with the client,neutral,contradiction
5,19,The client agreed to completed Coordinated Assessment.,The client's birth certificate was mentioned,neutral,entailment
6,19,The client agreed to completed Coordinated Assessment.,The client's Texas State ID was mentioned,neutral,entailment
7,19,The client agreed to completed Coordinated Assessment.,The client signed ROIs,neutral,neutral
8,19,The client agreed to completed Coordinated Assessment.,The case manager created or updated the client's HMIS profile,neutral,neutral


20


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
20,20,The CM met with client and completed intake paperwork,Plan of action signed and completed by the client.,entailment,entailment
0,20,The CM met with client and completed intake paperwork,The case manager met or spoke with the client,neutral,neutral
1,20,The CM met with client and completed intake paperwork,The case manager sent or received information about the client,neutral,entailment
2,20,The CM met with client and completed intake paperwork,The Client did not show up,neutral,entailment
3,20,The CM met with client and completed intake paperwork,The case manager attempted client contact,neutral,contradiction
4,20,The CM met with client and completed intake paperwork,There were legal issues with the client,neutral,contradiction
5,20,The CM met with client and completed intake paperwork,The client's birth certificate was mentioned,neutral,entailment
6,20,The CM met with client and completed intake paperwork,The client's Texas State ID was mentioned,neutral,entailment
7,20,The CM met with client and completed intake paperwork,The client signed ROIs,neutral,neutral
8,20,The CM met with client and completed intake paperwork,The case manager created or updated the client's HMIS profile,neutral,neutral


21


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
21,21,The CM ran client's DPS report,The client's DPS report was run,entailment,contradiction
0,21,The CM ran client's DPS report,The case manager met or spoke with the client,neutral,neutral
1,21,The CM ran client's DPS report,The case manager sent or received information about the client,neutral,entailment
2,21,The CM ran client's DPS report,The Client did not show up,neutral,entailment
3,21,The CM ran client's DPS report,The case manager attempted client contact,neutral,contradiction
4,21,The CM ran client's DPS report,There were legal issues with the client,neutral,contradiction
5,21,The CM ran client's DPS report,The client's birth certificate was mentioned,neutral,entailment
6,21,The CM ran client's DPS report,The client's Texas State ID was mentioned,neutral,entailment
7,21,The CM ran client's DPS report,The client signed ROIs,neutral,neutral
8,21,The CM ran client's DPS report,The case manager created or updated the client's HMIS profile,neutral,neutral


22


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
22,22,The client received the client's FBI background check,The client's DPS report was run,entailment,entailment
0,22,The client received the client's FBI background check,The case manager met or spoke with the client,neutral,neutral
1,22,The client received the client's FBI background check,The case manager sent or received information about the client,neutral,entailment
2,22,The client received the client's FBI background check,The Client did not show up,neutral,entailment
3,22,The client received the client's FBI background check,The case manager attempted client contact,neutral,contradiction
4,22,The client received the client's FBI background check,There were legal issues with the client,neutral,contradiction
5,22,The client received the client's FBI background check,The client's birth certificate was mentioned,neutral,entailment
6,22,The client received the client's FBI background check,The client's Texas State ID was mentioned,neutral,entailment
7,22,The client received the client's FBI background check,The client signed ROIs,neutral,neutral
8,22,The client received the client's FBI background check,The case manager created or updated the client's HMIS profile,neutral,neutral


23


Unnamed: 0,id,premise,hypothesis,gold_3,PREDS
23,23,The CM asked client to complete CSR,The client was assigned CSR,entailment,neutral
0,23,The CM asked client to complete CSR,The case manager met or spoke with the client,neutral,neutral
1,23,The CM asked client to complete CSR,The case manager sent or received information about the client,neutral,entailment
2,23,The CM asked client to complete CSR,The Client did not show up,neutral,entailment
3,23,The CM asked client to complete CSR,The case manager attempted client contact,neutral,contradiction
4,23,The CM asked client to complete CSR,There were legal issues with the client,neutral,contradiction
5,23,The CM asked client to complete CSR,The client's birth certificate was mentioned,neutral,entailment
6,23,The CM asked client to complete CSR,The client's Texas State ID was mentioned,neutral,entailment
7,23,The CM asked client to complete CSR,The client signed ROIs,neutral,neutral
8,23,The CM asked client to complete CSR,The case manager created or updated the client's HMIS profile,neutral,neutral
