In [None]:
from cava_nlp import CaVaLang
import pandas as pd
from pathlib import Path

from spacy.language import Language
# NOTE(review): ClinicalNormalizer, RuleEngine, Language and dataclass are not
# referenced by name in the visible cells. Presumably the cava_nlp imports are
# kept for their side effect of registering the "clinical_normalizer" and
# "rule_engine" pipeline factories used by add_pipe below — confirm before
# removing any of them.
from cava_nlp.normalisation.normaliser import ClinicalNormalizer
from dataclasses import dataclass
from cava_nlp.rule_engine import RuleEngine
# Directory holding the input fixtures; a relative path, so it is resolved
# against the notebook's current working directory.
data_dir = Path('./data')

In [None]:
def get_scores(doc):
    """Collect the ``_.value`` of every span in the doc's 'ecog' span group.

    Args:
        doc: A processed document exposing a dict-like ``spans`` mapping whose
            'ecog' entries carry a ``_.value`` extension attribute.

    Returns:
        list: The ``_.value`` of each 'ecog' span, in span-group order. Empty
        when the document has no 'ecog' span group.
    """
    # Fall back to an empty tuple: a doc can hold other span groups without
    # having an 'ecog' one, in which case a bare .get('ecog') returns None and
    # iterating it would raise TypeError.
    return [span._.value for span in doc.spans.get('ecog', ())]

def get_snips(doc, window=5):
    """Return a text snippet around every entity labelled 'ECOG'.

    Args:
        doc: A processed document exposing ``ents``, ``len()`` and slicing
            that yields an object with a ``.text`` attribute.
        window: Number of positions of context to include on each side of the
            entity. Defaults to 5, the previously hard-coded width.

    Returns:
        list: One snippet string per 'ECOG' entity, in ``doc.ents`` order.
    """
    doc_len = len(doc)
    return [
        # Clamp both ends so the slice never runs outside the document.
        doc[max(0, ent.start - window):min(ent.end + window, doc_len)].text
        for ent in doc.ents
        if ent.label_ == 'ECOG'
    ]

def get_offsets(doc):
    """Return the ``.start`` offset of each entity labelled 'ECOG'.

    Offsets are listed in the order the entities appear in ``doc.ents``;
    entities with any other label are ignored.
    """
    return [entity.start for entity in doc.ents if entity.label_ == 'ECOG']

In [None]:
# Assemble the pipeline under test: the normaliser is placed first, then one
# rule-engine pipe per component being validated.
n = CaVaLang()
n.add_pipe("clinical_normalizer", first=True)

# todo: iterate csv and pull in all in-scope rule engines to validate
# engine_config_path=None: loading and testing default rule engine only
ecog_engine_config = {
    "engine_config_path": None,
    "component_name": "ecog_status",
}
variants_engine_config = {
    "engine_config_path": None,
    "component_name": "variants_of_interest",
}

n.add_pipe("rule_engine", name="ecog_value", config=ecog_engine_config)
n.add_pipe("rule_engine", name="variants_of_interest", config=variants_engine_config)


In [None]:
# The fixture is tab-separated even though it carries a .csv extension.
text_samples = pd.read_csv(data_dir.joinpath('simple_egs.csv'), sep='\t')

In [None]:
# Run the pipeline object `n` on every 'InputData' value and store each result
# in a new 'doc' column for the extraction cells that follow.
# NOTE(review): assumes every 'InputData' value is a string the pipeline can
# accept (e.g. no missing cells read in as NaN) — confirm against the fixture.
text_samples['doc'] = text_samples['InputData'].apply(n)

In [None]:
# Derive one list-valued column per extractor from the processed docs.
for column, extractor in (
    ('ecog_scores', get_scores),
    ('ecog_snips', get_snips),
    ('ecog_offsets', get_offsets),
):
    text_samples[column] = text_samples['doc'].map(extractor)

In [None]:
def _first_or_blank(value):
    """Return the first item of a list, '' for an empty list, else the value unchanged."""
    if isinstance(value, list):
        return value[0] if value else ''
    # Not a list: the cell has already been run on this column. Leaving the
    # value untouched keeps the cell idempotent; re-collapsing would truncate
    # snippet strings to their first character and call len() on scalars.
    return value


# Keep only the first ECOG hit per row ('' when there was none).
for column in ('ecog_scores', 'ecog_offsets', 'ecog_snips'):
    text_samples[column] = text_samples[column].apply(_first_or_blank)

In [None]:
# Preview the first five rows, including the derived ECOG columns.
text_samples.head(5)