In [None]:
from cava_nlp import CaVaLang, CaVaRetokenizer
import pandas as pd
from pathlib import Path

# Directory the input files are read from; a relative path, so it resolves
# against the notebook's current working directory.
data_dir = Path('./data')

In [None]:
def get_scores(doc):
    """Collect the ECOG status value of every ``ECOG_STATUS`` entity in ``doc``.

    For each entity in ``doc.ents`` whose ``label_`` is ``'ECOG_STATUS'``, the
    ``ecog_status_value`` extension attribute is read from the entity's first
    token (``doc[ent.start]``).

    Args:
        doc: Processed document exposing ``ents`` and token indexing
            (presumably a spaCy ``Doc`` -- confirm against the pipeline).

    Returns:
        list: One value per matching entity, in ``doc.ents`` order. Empty when
        no entity carries the label.
    """
    return [
        doc[ent.start]._.ecog_status_value
        for ent in doc.ents
        if ent.label_ == 'ECOG_STATUS'
    ]

def get_snips(doc, window=5):
    """Return a text snippet of context around every ``ECOG_STATUS`` entity.

    For each entity in ``doc.ents`` whose ``label_`` is ``'ECOG_STATUS'``, the
    snippet is the ``.text`` of the token slice that extends ``window`` tokens
    before the entity start and ``window`` tokens past the entity end, clamped
    to the bounds of ``doc``.

    Args:
        doc: Processed document exposing ``ents``, ``len()`` and token slicing
            (presumably a spaCy ``Doc`` -- confirm against the pipeline).
        window: Number of context tokens to include on each side of the
            entity. Defaults to 5, the width that was previously hard-coded,
            so existing single-argument calls behave as before.

    Returns:
        list: One snippet string per matching entity, in ``doc.ents`` order.

    Raises:
        ValueError: If ``window`` is negative.
    """
    if window < 0:
        raise ValueError('window must be non-negative')

    # The document length does not change while iterating; compute it once.
    n_tokens = len(doc)
    return [
        doc[max(0, ent.start - window):min(ent.end + window, n_tokens)].text
        for ent in doc.ents
        if ent.label_ == 'ECOG_STATUS'
    ]

def get_offsets(doc):
    """List the start index of every ``ECOG_STATUS`` entity in ``doc``.

    Args:
        doc: Processed document exposing ``ents``
            (presumably a spaCy ``Doc`` -- confirm against the pipeline).

    Returns:
        list: The ``start`` attribute of each entity whose ``label_`` is
        ``'ECOG_STATUS'``, in ``doc.ents`` order.
    """
    return [ent.start for ent in doc.ents if ent.label_ == 'ECOG_STATUS']


def char_offset(doc, offset):
    """Return the ``(start, end)`` character span of the token at ``offset``.

    Args:
        doc: Indexable document; ``doc[offset]`` must yield an object with an
            ``idx`` attribute and a length.
        offset: Index of the token within ``doc``.

    Returns:
        tuple: ``(token.idx, token.idx + len(token))``.
    """
    token = doc[offset]
    return (token.idx, token.idx + len(token))
    
    
def get_value_from_span(span, value_label):
    """Return the first extension value in ``span`` that is not ``-1``.

    Each token's ``_.get(value_label)`` is consulted in order; the first
    result that differs from ``-1`` is returned.

    Args:
        span: Iterable of tokens whose ``_`` attribute offers ``get(name)``.
        value_label: Name of the extension attribute to look up.

    Returns:
        The first value not equal to ``-1``, or ``None`` when the span is
        empty or every token yields ``-1``.
    """
    candidates = (token._.get(value_label) for token in span)
    return next((value for value in candidates if value != -1), None)

In [None]:
# Build the processing pipeline: create a CaVaLang instance, register the
# 'ecog_status' component with first=True, then replace the pipeline's
# tokenizer with a CaVaRetokenizer bound to the same instance.
# NOTE(review): presumably CaVaLang follows spaCy's Language API, where
# first=True places the component at the front of the pipeline -- confirm.
nlp = CaVaLang()
nlp.add_pipe('ecog_status', first=True)
nlp.tokenizer = CaVaRetokenizer(nlp)

In [None]:
# Load the example texts. Despite the .csv extension, the file is parsed as
# tab-separated (delimiter='\t').
text_samples = pd.read_csv(data_dir / 'simple_egs.csv', delimiter='\t')

In [None]:
# Run the pipeline on every 'InputData' value, one row at a time, and keep the
# resulting object in a new 'doc' column for the extraction helpers.
text_samples['doc'] = text_samples['InputData'].apply(nlp)

In [None]:
# Derive one list-valued column per extraction helper from the processed docs.
for column, extractor in (
    ('ecog_scores', get_scores),
    ('ecog_snips', get_snips),
    ('ecog_offsets', get_offsets),
):
    text_samples[column] = text_samples['doc'].map(extractor)

In [None]:
def _first_or_blank(values):
    """Return the first element of a list/tuple, or '' when it is empty.

    Anything that is not a list or tuple is returned unchanged. This makes the
    cell safe to re-run: once a column has been collapsed its cells hold
    scalars, and calling ``len()`` / ``[0]`` on those would either raise
    ``TypeError`` (numbers) or silently truncate a string to its first
    character.
    """
    if not isinstance(values, (list, tuple)):
        return values
    return values[0] if len(values) > 0 else ''


# Keep only the first ECOG match per row ('' when the row has none).
for _column in ('ecog_scores', 'ecog_offsets', 'ecog_snips'):
    text_samples[_column] = text_samples[_column].apply(_first_or_blank)

In [None]:
# Preview the first rows, including the extracted ECOG columns.
text_samples.head()