## Import necessary libraries

In [12]:
import numpy
import pandas as pd
import sklearn
import spacy
from collections import Counter
import sklearn_crfsuite
from sklearn_crfsuite import metrics

print("NumPy:", numpy.__version__)
print("Pandas:", pd.__version__)
print("Scikit-learn:", sklearn.__version__)
print("spacy:", spacy.__version__)

NumPy: 1.26.4
Pandas: 2.2.3
Scikit-learn: 1.7.0
spacy: 3.8.6


In [33]:
# Custom function to combine words to make sentence
def words_to_sentence(file_path_word, file_path_label):
    current_sentence = []
    all_sentence = []
    sentence_list = []
    label_list = []
    current_sentence = []
    current_label = []
    with open(file_path_word, mode='r', encoding='utf-8') as word_file, open(file_path_label, mode='r', encoding='utf-8') as label_file:
        for word_line, label_line in zip(word_file, label_file):
            word = word_line.strip()
            label = label_line.strip()
            
            # When empty line entered i.e end of sentence.
            if (word == '' and label == ''):
                if(current_sentence and current_label):
                    sentence_list.append(current_sentence)
                    label_list.append(current_label)
                    current_sentence = []
                    current_label = []
            else:
                current_sentence.append(word)
                current_label.append(label)

        # For the last sentence where no empty line after is present
        if(current_sentence and current_label):
            sentence_list.append(current_sentence)
            label_list.append(current_label)

    return sentence_list, label_list

def isValid(token):
    if(len(token.text) > 2 and token.pos_ in ['NOUN','PROPN'] and not token.is_stop and not token.is_punct):
        return True
    else:
        return False

def create_tagged_data(sentences, labels):
    tagged_sentences = []
    for tokens, tags in zip(sentences, labels):
        doc = nlp(" ".join(tokens))
        pos_tags = [token.pos_ for token in doc]
        tagged = list(zip(tokens, pos_tags, tags))
        tagged_sentences.append(tagged)
    return tagged_sentences


# Data preprocessing

In [41]:
train_sentences, train_labels = words_to_sentence('train_sent', 'train_label')
test_sentences, test_labels = words_to_sentence('test_sent', 'test_label')

# Print first 5 sentences and labels
for i in range(5):
    print(f"{' '.join(train_sentences[i])} - {' '.join(train_labels[i])}")
    print(f"{' '.join(test_sentences[i])} - {' '.join(test_labels[i])}")
    
# Count no. of train and test data
print(f"no. of train sentence: {len(train_sentences)}")
print(f"no. of train label: {len(train_labels)}")
print(f"no. of test sentence: {len(test_sentences)}")
print(f"no. of test label: {len(test_labels)}")

All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status ) - O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 ) - O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 ) - O O O O O O O O O O O O O O O O O O O O O O O O O
As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration - O O O O O O O O O O O O O O O O O O O
Abnormal presentation was the most common indication ( 25.6 % , 88 of

# Concept identification

## Print frequency of NOUN and PROPN and 25 most used NOUN and PROPN

In [50]:
nlp = spacy.load('en_core_web_sm')
all_sentences = train_sentences + test_sentences
noun_counter = Counter()

for sentence in all_sentences:
    tokenized_sent = nlp(" ".join(sentence))
    for token in tokenized_sent:
        if isValid(token):
            noun_counter[token.text.lower()] += 1
            
# Print nouns and their frequency
# for word, count in noun_counter.items():
#     print(f"{word}-- {count}")

# Print 25 most common nouns
print(f"\n 25 most common used nouns")
for word, count in noun_counter.most_common(25):
    print(f"{word} -- {count}")


 25 most common used nouns
patients -- 507
treatment -- 304
cancer -- 211
therapy -- 177
study -- 174
disease -- 149
cell -- 142
lung -- 118
results -- 116
group -- 111
effects -- 99
gene -- 91
chemotherapy -- 91
use -- 87
effect -- 82
women -- 81
analysis -- 76
risk -- 74
surgery -- 73
cases -- 72
rate -- 68
survival -- 67
response -- 66
children -- 66
dose -- 65


## Defining the features for CRF

In [61]:
def getFeaturesForOneWord(sentence, index):
  word = sentence[index]

  features = [
    'word.lower=' + word.lower_, # serves as word id
    'word.pos=' + word.pos_
  ]

  if(index > 0):
    prev_word = sentence[index-1]
    features.extend([
    'prev_word.lower=' + prev_word.lower_,
    'prev_word.pos=' + prev_word.pos_
  ])
  else:
    features.append('BEG') # feature to track begin of sentence 

  if(index == len(sentence)-1):
    features.append('END') # feature to track end of sentence

  return features

## Get features and labels

In [66]:
# Define a function to get features for a sentence 
# using the 'getFeaturesForOneWord' function.
def getFeaturesForOneSentence(sentence):
  sentence_list = nlp(sentence)
  return [getFeaturesForOneWord(sentence_list, pos) for pos in range(len(sentence_list))]

# Define a function to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
  return labels    

In [67]:
X_train = [getFeaturesForOneSentence("".join(sentence)) for sentence in train_sentences]
Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]

X_test = [getFeaturesForOneSentence("".join(sentence)) for sentence in test_sentences]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]


In [95]:
print(len(X_test), len(Y_test)) 
filtered = [(x, y) for x, y in zip(X_train, Y_train) if len(x) == len(y)]
filtered_test = [(x, y) for x, y in zip(X_test, Y_test) if len(x) == len(y)]
X_train, Y_train = map(list, zip(*filtered))
X_test, Y_test = map(list, zip(*filtered_test))

31 31


## Build the model

In [102]:
crf = sklearn_crfsuite.CRF(max_iterations=100)
crf.fit(X_train, Y_train)


y_pred = crf.predict(X_test)
metrics.flat_f1_score(Y_test, y_pred, average='weighted')

1.0

In [None]:
# Let's verify the label distribution
flat_labels = [label for sent in y_train_filtered for label in sent]
label_counts = Counter(flat_labels)

print("Count of labels:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

In [101]:
id = 20
print("Sentence:", test_sentences[id])
print("Sentence length:", len(test_sentences[id]))
print("Orig Labels:", Y_test[id])
print("Pred Labels:", y_pred[id])

Sentence: ['Sequelae', 'include', 'severe', 'developmental', 'delay', 'and', 'asymmetric', 'double', 'hemiplegia']
Sentence length: 9
Orig Labels: ['O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O']


In [106]:
from collections import defaultdict

# Initialize dictionary for mapping diseases to their treatments
disease_to_treatments = defaultdict(list)

for tokens, preds in zip(test_sentences, y_pred):
    # Find all diseases in this sentence
    for idx, (token, label) in enumerate(zip(tokens, preds)):
        if label == 'D':
            disease = token
            # Find all treatments in the same sentence
            treatments = [tokens[j] for j, l in enumerate(preds) if l == 'T']
            # You may want to avoid duplicates for each disease
            for treatment in treatments:
                if treatment not in disease_to_treatments[disease]:
                    disease_to_treatments[disease].append(treatment)

# Convert to normal dict for display
disease_to_treatments = dict(disease_to_treatments)
print(disease_to_treatments)
for d, tlist in disease_to_treatments.items():
    print(f"{d}: {', '.join(tlist)}")


{}
