# Import 

In [1]:
import faiss

In [23]:
import matplotlib.pyplot as plt
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from spacy.lang.en import English
from medspacy_io.reader.ehost_reader import EhostDocReader, EhostDirReader
from spacy import displacy
import medspacy
from pathlib import Path
import joblib
from spacy.tokens import Doc
from typing import List
import random

In [3]:
pwd

'Q:\\ORD_Bress_202112038D\\NLP\\notebooks_vb_exp'

## Read all the eHOST docs

In [4]:
nlp=English()
nlp.add_pipe('medspacy_pyrush')

<PyRuSH.PyRuSHSentencizer.PyRuSHSentencizer at 0x24918fc9180>

In [11]:
merged_dir=r'..\data\merged_data1_8'
dir_reader = EhostDirReader(nlp=nlp, recursive=False, use_adjudication=False,
                            schema_file=str(Path(merged_dir, 'config', 'projectschema.xml')), support_overlap=True)

ALL ATTRIBUTES FROM SCHEMA: {'ANNOT_Worsening'}


In [12]:
docs=dir_reader.read(txt_dir=str(Path(merged_dir, 'corpus')))



In [13]:
len(docs)

772

In [19]:
joblib.dump(docs, '../data/merged_spacy_docs_1_8.joblib')

['../data/merged_spacy_docs_1_8.joblib']

## load pickled docs

In [7]:
docs=joblib.load('../data/merged_spacy_docs_1_8.joblib')

# Define sampling functions

In [8]:
rounds =10
seed= 14
train_total_docs, test_docs=train_test_split(docs, test_size=.33, random_state=seed)

In [11]:
len(train_total_docs), len(test_docs)

(517, 255)

In [100]:
def spans_to_bio(doc:Doc, anno_types:List[str], abbr:bool=False)->str:
  """
  Converts spans in a spaCy Doc object to a BIO-formatted string, with an option
  to abbreviate the entity labels. It adds an empty line between sentences to improve
  readability.

  Parameters:
  - doc (Doc): The spaCy Doc object containing the text and its annotations, including
                entities and sentence boundaries.
  - anno_types (List[str]): A list of annotation types to include in the output. These
                            types should correspond to the keys in `doc.spans`.
  - abbr (bool, optional): If True, entity labels are abbreviated to their initials.
                            Defaults to True.

  Returns:
  - str: A string where each token is followed by its BIO tag (with the entity label if applicable),
          formatted as "token B-entity" or "token I-entity" for tokens within entities, and
          "token O" for tokens outside any entities. Sentences are separated by an empty line.
  """
  # Initialize a dictionary to hold BIO tags for each token index
  bio_tags = {token.i: 'O' for token in doc}  # Default to 'O' for outside any entity

  # Preprocess spans to assign BIO tags
  for anno_type, spans in doc.spans.items():
    if anno_type not in anno_types:
        continue
    if len(spans)==0:
        continue
    for span in spans:
        if span:  # Check if span is not empty
          label=span.label_
          if label not in anno_types:
            continue
          if abbr:
            label=''.join([w[0] for w in label.split('_')])
          bio_tags[span.start] = f"B-{label}"  # Begin tag for the first token in the span
          for token in span[1:]:  # Inside tags for the rest of the tokens in the span
            bio_tags[token.i] = f"I-{label}"

  # Generate BIO format string
  bio_text = []
  bio_data={'sentence_id':[],'token':[],'label':[]}
  for s,sent in enumerate(doc.sents):
    for i,token in enumerate(sent):
      # trim the whitespaces on both sides of a sentence
      if (i==0 or i==len(sent)-1) and str(token).strip()=='':
        bio_text.append('')
        continue
      elif str(token).strip()=='':
        # clean up extra whitespaces within a sentence.
        bio_text.append(f' \t{bio_tags[token.i]}')
        bio_data['label'].append(bio_tags[token.i])
      else:
        bio_text.append(f"{token.text} {bio_tags[token.i]}")
        bio_data['label'].append(bio_tags[token.i])
      bio_data['token'].append(token)
      bio_data['sentence_id'].append(s)
    bio_text.append('')  # Empty line between sentences
  return '\n'.join(bio_text), pd.DataFrame(bio_data)

# We will focus on two types of concepts here
def convert_docs(docs:List[Doc], anno_types=['FAM_COLON_CA','COLON_CA']):
  all_conll=[]
  offset=0
  dfs=[]
  for d in docs:
    data, df=spans_to_bio(d, anno_types=anno_types)
    all_conll.append(data)
    df['sentence_id']+=offset
    offset+=df.shape[0]
    dfs.append(df)
  return '\n\n'.join(all_conll), pd.concat(dfs)
def word2features(sent, i):
    word = sent[i]
    postag = word.pos_
    word=str(word)

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1]
        postag1 = word1.pos_
        word1=str(word1)
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1]
        postag1 = word1.pos_
        word1=str(word1)
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [101]:
def compute_metrics_and_averages(y_true, y_pred):
    def extract_entities(sentence_tags, row_id):
        entities = []
        current_entity = None
        for i, tag in enumerate(sentence_tags):
            if tag.startswith('B-'):
                if current_entity:
                    entities.append(current_entity)
                current_entity = {'type': tag[2:], 'start': i, 'end': i, 'row_id': row_id}
            elif tag.startswith('I-') and current_entity and current_entity['type'] == tag[2:]:
                current_entity['end'] = i
            else:
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        if current_entity:
            entities.append(current_entity)
        return entities

    # Initialize containers
    metrics = {}
    FP_ids = {}
    FN_ids = {}

    for row_id, (true_tags, pred_tags) in enumerate(zip(y_true, y_pred)):
        true_entities = extract_entities(true_tags, row_id)
        pred_entities = extract_entities(pred_tags, row_id)

        for entity in true_entities + pred_entities:
            entity_type = entity['type']
            if entity_type not in metrics:
                metrics[entity_type] = {'TP': 0, 'FP': 0, 'FN': 0}
                FP_ids[entity_type] = []
                FN_ids[entity_type] = []

        for pred_entity in pred_entities:
            matched = False
            for true_entity in true_entities:
                if pred_entity['type'] == true_entity['type'] and not (pred_entity['end'] < true_entity['start'] or pred_entity['start'] > true_entity['end']):
                    metrics[pred_entity['type']]['TP'] += 1
                    matched = True
                    true_entities.remove(true_entity)
                    break
            if not matched:
                metrics[pred_entity['type']]['FP'] += 1
                FP_ids[pred_entity['type']].append(pred_entity['row_id'])

        for true_entity in true_entities:
            metrics[true_entity['type']]['FN'] += 1
            FN_ids[true_entity['type']].append(true_entity['row_id'])

    # Calculate micro and macro averages
    total_TP = sum(metrics[etype]['TP'] for etype in metrics)
    total_FP = sum(metrics[etype]['FP'] for etype in metrics)
    total_FN = sum(metrics[etype]['FN'] for etype in metrics)

    micro_precision = total_TP / (total_TP + total_FP) if total_TP + total_FP > 0 else 0
    micro_recall = total_TP / (total_TP + total_FN) if total_TP + total_FN > 0 else 0
    micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall) if micro_precision + micro_recall > 0 else 0

    precisions = [metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FP']) if metrics[etype]['TP'] + metrics[etype]['FP'] > 0 else 0 for etype in metrics]
    recalls = [metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FN']) if metrics[etype]['TP'] + metrics[etype]['FN'] > 0 else 0 for etype in metrics]
    macro_precision = sum(precisions) / len(metrics) if metrics else 0
    macro_recall = sum(recalls) / len(metrics) if metrics else 0
    macro_f1 = 2 * macro_precision * macro_recall / (macro_precision + macro_recall) if macro_precision + macro_recall > 0 else 0

    # Prepare DataFrame
    data = {
        'Entity Type': list(metrics.keys()) + ['Micro Average', 'Macro Average'],
        'Precision': [metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FP']) if metrics[etype]['TP'] + metrics[etype]['FP'] > 0 else 0 for etype in metrics] + [micro_precision, macro_precision],
        'Recall': [metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FN']) if metrics[etype]['TP'] + metrics[etype]['FN'] > 0 else 0 for etype in metrics] + [micro_recall, macro_recall],
        'F1': [2 * (metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FP']) * metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FN'])) / ((metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FP'])) + (metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FN']))) if (metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FP'])) + (metrics[etype]['TP'] / (metrics[etype]['TP'] + metrics[etype]['FN'])) > 0 else 0 for etype in metrics] + [micro_f1, macro_f1]
    }

    results_df = pd.DataFrame(data)
    return results_df, FP_ids, FN_ids


### Define CRF Wrapper

In [102]:
## Get all annotation types: 
annos=set()
for d in train_total_docs:
    for anno in d.spans.keys():
        annos.add(anno)
annos.remove('PRE')   
print(annos)

{'PsychDisorder', 'CFI', 'PsychHospitalization', 'Dementia', 'OtherCause', 'Intoxication', 'Delirium', 'Interference'}


In [103]:
class CRFWrapper(object):
    def __init__(self, anno_types=[]):
        self.crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True)
        self.anno_types=anno_types

    def __df2features(self, df:pd.DataFrame):
        X=[sent2features(list(sdf['token'])) for id,sdf in df.groupby('sentence_id')]
        y=[list(sdf['label']) for id,sdf in df.groupby('sentence_id')]
        return X,y
        
    def fit(self, docs: List[Doc]):
        _, train_df=convert_docs(docs, anno_types=self.anno_types)
        X_train, y_train=self.__df2features(train_df)
        self.crf.fit(X_train, y_train)

    def transform(self, docs:List[Doc], anno_types=[]):
        _, test_df=convert_docs(docs, anno_types=self.anno_types)
        X_test, y_test=self.__df2features(test_df)
        y_pred = self.crf.predict(X_test)
        return y_test, y_pred 

    def eval(self, docs:List[Doc]):
        y_test, y_pred =self.transform(docs)
        results_df, FP_ids, FN_ids=compute_metrics_and_averages(y_test, y_pred)
        return results_df, FP_ids, FN_ids
        

In [104]:
crf_model=CRFWrapper(annos)

In [105]:
crf_model.fit(round0)

In [99]:
results_df, FP_ids, FN_ids=crf_model.eval(test_docs)
results_df

ZeroDivisionError: division by zero

In [90]:
_, train_df=convert_docs(train_total_docs, anno_types=annos)

In [91]:
labs=train_df[train_df['label']!='O']

In [92]:
labs

Unnamed: 0,sentence_id,token,label
123,5174,Overestimates,B-C
124,5174,/,I-C
125,5174,forgets,I-C
126,5174,limit[x]15,I-C
492,5211,dementia,B-D
...,...,...,...
1323,413805,Alteration,B-C
1324,413805,in,I-C
1325,413805,mental,I-C
1326,413805,status,I-C


In [93]:
results_df, FP_ids, FN_ids=crf_model.eval(test_docs)

In [94]:
results_df

Unnamed: 0,Entity Type,Precision,Recall,F1
0,O,0.615385,0.377953,0.468293
1,C,0.642857,0.305085,0.413793
2,I,0.28125,0.214286,0.243243
3,D,0.866667,0.639344,0.735849
4,P,1.0,0.52381,0.6875
5,Micro Average,0.644144,0.387534,0.483926
6,Macro Average,0.681232,0.412095,0.513538


### Define completely random sampling

In [35]:
def rand_sample(data, num=10, seed=14):
    random.seed(seed)  
    sampled_indices=random.sample(range(len(data)), num);
    sampeld_sublist=[data[i] for i in sampled_indices]
    sampled_indices=set(sampled_indices)
    remaining_sublist = [d for i,d in enumerate(data) if i not in sampled_indices]
    return sampeld_sublist, remaining_sublist

In [36]:
round0, remain=rand_sample(train_total_docs, num=51)

In [37]:
crf_model=CRFWrapper()

In [38]:
crf_model.fit(round0)

In [None]:
def incremental_uncertainty_sampling(model, X_unlabeled, y_unlabeled, n_instances=1):
    uncertainties = []
    for xseq in X_unlabeled:
        marginals = model.predict_marginals_single(xseq)
        seq_uncertainty = np.mean([1 - max(token.values()) for token in marginals])
        uncertainties.append(seq_uncertainty)
    # Get indices of the n_instances most uncertain samples
    selected_indices = np.argsort(uncertainties)[-n_instances:]
    return selected_indices

In [None]:
# Initialize the labeled and unlabeled pools
initial_subset_size = len(data_df) // 10
labeled_df = data_df.sample(n=initial_subset_size)
unlabeled_df = data_df.drop(labeled_df.index)

# For storing evaluation results
f1_scores = []

# Perform 10 iterations of sampling and training
for i in range(1, 11):
    print(f"Iteration: {i}")
    
    # Preprocess and train
    train_sents = df_to_sentences(labeled_df)
    X_train = sentences_to_features(train_sents)
    y_train = sentences_to_labels(train_sents)
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
    crf.fit(X_train, y_train)
    
    # Evaluate
    eval_sents = df_to_sentences(eval_df)
    X_eval = sentences_to_features(eval_sents)
    y_eval = sentences_to_labels(eval_sents)
    y_pred = crf.predict(X_eval)
    f1 = flat_f1_score(y_eval, y_pred, average='weighted')
    f1_scores.append(f1)
    
    if len(unlabeled_df) == 0:
        break
    
    # Incrementally sample from the unlabeled pool
    unlabeled_sents = df_to_sentences(unlabeled_df)
    X_unlabeled = sentences_to_features(unlabeled_sents)
    y_unlabeled_dummy = sentences_to_labels(unlabeled_sents)  # Placeholder, not used for sampling
    selected_indices = incremental_uncertainty_sampling(crf, X_unlabeled, y_unlabeled_dummy, n_instances=initial_subset_size)
    
    # Update the labeled and unlabeled pools
    selected_df = unlabeled_df.iloc[selected_indices]
    labeled_df = pd.concat([labeled_df, selected_df])
    unlabeled_df = unlabeled_df.drop(selected_df.index)

# Plotting the F1 scores
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(f1_scores) + 1), f1_scores, marker='o', linestyle='-')
plt.xlabel('Iteration')
plt.ylabel('Weighted F1 Score')
plt.title('F1 Score over Iterations of Uncertainty Sampling')
plt.xticks(range(1, len(f1_scores) + 1))
plt.grid(True)
plt.show()