## Import necessary libraries

In [130]:
import numpy
import pandas as pd
import sklearn
import spacy
from collections import Counter
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from collections import defaultdict

print("NumPy:", numpy.__version__)
print("Pandas:", pd.__version__)
print("Scikit-learn:", sklearn.__version__)
print("spacy:", spacy.__version__)

NumPy: 1.26.4
Pandas: 2.2.3
Scikit-learn: 1.7.0
spacy: 3.8.6


In [131]:
# Custom function to combine words to make sentence

def words_to_sentence(file_path_word, file_path_label):
    sentence_list = []
    label_list = []
    current_sentence = []
    current_label = []
    with open(file_path_word, mode='r', encoding='utf-8') as word_file, \
         open(file_path_label, mode='r', encoding='utf-8') as label_file:
        for word_line, label_line in zip(word_file, label_file):
            word = word_line.strip()
            label = label_line.strip()
            # When empty line entered i.e end of sentence.
            if (word == '' and label == ''):
                if current_sentence and current_label:
                    sentence_list.append(current_sentence)
                    label_list.append(current_label)
                    current_sentence = []
                    current_label = []
            else:
                current_sentence.append(word)
                current_label.append(label)
        # For the last sentence (if no blank line after it)
        if current_sentence and current_label:
            sentence_list.append(current_sentence)
            label_list.append(current_label)
    return sentence_list, label_list

def isValid(token):
    if(len(token.text) > 2 and token.pos_ in ['NOUN','PROPN'] and not token.is_stop and not token.is_punct):
        return True
    else:
        return False

# Data preprocessing

In [132]:
train_sentences, train_labels = words_to_sentence('train_sent', 'train_label')
test_sentences, test_labels = words_to_sentence('test_sent', 'test_label')

# Print first 5 sentences and labels
for i in range(5):
    print(f"{' '.join(train_sentences[i])} - {' '.join(train_labels[i])}")
    print(f"{' '.join(test_sentences[i])} - {' '.join(test_labels[i])}")
    
# Count no. of train and test data
print(f"no. of train sentence: {len(train_sentences)}")
print(f"no. of train label: {len(train_labels)}")
print(f"no. of test sentence: {len(test_sentences)}")
print(f"no. of test label: {len(test_labels)}")

All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status ) - O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 ) - O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 ) - O O O O O O O O O O O O O O O O O O O O O O O O O
As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration - O O O O O O O O O O O O O O O O O O O
Abnormal presentation was the most common indication ( 25.6 % , 88 of

# Concept identification

## Print frequency of NOUN and PROPN and 25 most used NOUN and PROPN

In [133]:
nlp = spacy.load('en_core_web_sm')
all_sentences = train_sentences + test_sentences
noun_counter = Counter()

for sentence in all_sentences:
    tokenized_sent = nlp(" ".join(sentence))
    for token in tokenized_sent:
        if isValid(token):
            noun_counter[token.text.lower()] += 1
            
# Print nouns and their frequency
for word, count in noun_counter.items():
    print(f"{word}-- {count}")

# Print 25 most common nouns
print(f"\n 25 most common used nouns")
for word, count in noun_counter.most_common(25):
    print(f"{word} -- {count}")

births-- 2
weeks-- 37
university-- 7
vermont-- 1
delivery-- 22
route-- 2
indication-- 5
age-- 56
parity-- 4
practice-- 36
group-- 111
risk-- 74
status-- 17
rate-- 68
presentation-- 7
medicine-- 42
patients-- 507
arrest-- 6
dilation-- 4
subgroups-- 4
rates-- 35
care-- 56
hospitals-- 10
community-- 16
groups-- 34
trimester-- 4
fluid-- 8
index-- 14
afi-- 6
temperature-- 10
increases-- 6
decrease-- 6
june-- 2
august-- 6
period-- 28
heat-- 5
women-- 81
singleton-- 1
pregnancies-- 7
gestation-- 4
testing-- 26
determinations-- 2
area-- 13
day-- 36
test-- 22
date-- 3
spearman-- 1
rank-- 2
correlation-- 19
relationship-- 14
account-- 4
measure-- 4
study-- 174
population-- 31
diabetes-- 34
screening-- 16
clinic-- 6
hadassah-- 1
medical-- 6
center-- 10
year-- 47
treatment-- 304
week-- 11
protocol-- 6
control-- 57
characteristics-- 20
differences-- 24
mode-- 3
apgar-- 1
scores-- 8
infants-- 25
cases-- 72
aim-- 9
contribution-- 4
markers-- 22
diagnosis-- 52
fetuses-- 9
years-- 58
syndrome-- 58
edwa

## Defining the features for CRF

In [134]:
import spacy
nlp = spacy.load("en_core_web_sm")

def getFeaturesForOneWord(doc, index):
    token = doc[index]
    features = [
        'word.lower=' + token.lower_,
        'word[-3:]=' + token.text[-3:],
        'word[-2:]=' + token.text[-2:],
        'word.isupper=' + str(token.is_upper),
        'word.postag=' + token.pos_,
    ]
    if index > 0:
        prev_token = doc[index - 1]
        features.extend([
            'prevword.lower=' + prev_token.lower_,
            'prevword.isupper=' + str(prev_token.is_upper),
            'prevword.postag=' + prev_token.pos_
        ])
    else:
        features.append('BEG')
    if index == len(doc) - 1:
        features.append('END')
    return features

# here tokens: List of strings e.g ['abc', 'xyz']
def getFeaturesForOneSentence(tokens):
    doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
    nlp.get_pipe("tagger")(doc)
    return [getFeaturesForOneWord(doc, i) for i in range(len(doc))]


## Get features and labels

In [135]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]
Y_train = [labels for labels in train_labels]

X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]
Y_test = [labels for labels in test_labels]


In [136]:
print(len(X_test[20]), len(Y_test[20])) 

# Check for any mismatch in X_test and Y_test
for idx, (tokens, labels) in enumerate(zip(X_test, Y_test)):
    if len(tokens) != len(labels):
        print(f"Mismatch at index {idx}:")
        print(f"  Tokens ({len(tokens)}): {tokens}")
        print(f"  Labels ({len(labels)}): {labels}")
mismatch_count = sum(1 for tokens, labels in zip(test_sentences, test_labels) if len(tokens) != len(labels))
print(f"Number of mismatches in test: {mismatch_count}")

mismatch_count_train = sum(1 for tokens, labels in zip(train_sentences, train_labels) if len(tokens) != len(labels))
print(f"Number of mismatches in train: {mismatch_count_train}")

index = 20
print(X_test[index])
print(len(X_test[index]))
print(Y_test[index])


9 9
Number of mismatches in test: 0
Number of mismatches in train: 0
[['word.lower=sequelae', 'word[-3:]=lae', 'word[-2:]=ae', 'word.isupper=False', 'word.postag=', 'BEG'], ['word.lower=include', 'word[-3:]=ude', 'word[-2:]=de', 'word.isupper=False', 'word.postag=', 'prevword.lower=sequelae', 'prevword.isupper=False', 'prevword.postag='], ['word.lower=severe', 'word[-3:]=ere', 'word[-2:]=re', 'word.isupper=False', 'word.postag=', 'prevword.lower=include', 'prevword.isupper=False', 'prevword.postag='], ['word.lower=developmental', 'word[-3:]=tal', 'word[-2:]=al', 'word.isupper=False', 'word.postag=', 'prevword.lower=severe', 'prevword.isupper=False', 'prevword.postag='], ['word.lower=delay', 'word[-3:]=lay', 'word[-2:]=ay', 'word.isupper=False', 'word.postag=', 'prevword.lower=developmental', 'prevword.isupper=False', 'prevword.postag='], ['word.lower=and', 'word[-3:]=and', 'word[-2:]=nd', 'word.isupper=False', 'word.postag=', 'prevword.lower=delay', 'prevword.isupper=False', 'prevword.

## Build the model

In [137]:
crf = sklearn_crfsuite.CRF(max_iterations=100)
crf.fit(X_train, Y_train)

y_pred = crf.predict(X_test)
metrics.flat_f1_score(Y_test, y_pred, average='weighted')

0.9040232893560951

In [138]:
# Let's verify the label distribution
# flat_labels = [label for sent in y_train_filtered for label in sent]
# label_counts = Counter(flat_labels)

# print("Count of labels:")
# for label, count in label_counts.items():
#     print(f"{label}: {count}")

In [139]:
# verify for one sample
id = 20
print("Sentence:", test_sentences[id])
print("Labels:", test_labels[id])
print("Sentence length:", len(test_sentences[id]))
print("Orig Labels:", Y_test[id])
print("Pred Labels:", y_pred[id])


Sentence: ['Sequelae', 'include', 'severe', 'developmental', 'delay', 'and', 'asymmetric', 'double', 'hemiplegia']
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'D']
Sentence length: 9
Orig Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'D']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'D', 'D', 'D']


# Prepare a key value table of diseaase and treatment. Identify treatment for a disease

In [140]:
disease_treatments = defaultdict(list)

for tokens, preds in zip(test_sentences, y_pred):
    diseases = [tokens[i] for i, label in enumerate(preds) if label == 'D']
    treatments = [tokens[i] for i, label in enumerate(preds) if label == 'T']

    for disease in diseases:
        for treatment in treatments:
            if treatment not in disease_treatments[disease]:
                disease_treatments[disease].append(treatment)

# Print as a table
print(f"{'KEY':<20} {'VALUE'}")
for disease, treatments in disease_treatments.items():
    print(f"{disease:} {', '.join(treatments)}")

# Check for the disease and ts treatment
disease_name = "hereditary retinoblastoma"
disease_name_split = disease_name.split(" ")

treatments = disease_treatments.get(disease_name_split[0])
if treatments:
    print(f"Predicted treatments for '{disease_name}': {', '.join(treatments)}")
else:
    print(f"No predicted treatments found for '{disease_name}'.")

KEY                  VALUE
unstable roxithromycin
angina roxithromycin
or roxithromycin, thrombolytic, therapy
non-Q-wave roxithromycin
myocardial roxithromycin, Thrombolytic, therapy
infarction roxithromycin, Thrombolytic, therapy
coronary-artery Antichlamydial, antibiotics
disease Antichlamydial, antibiotics, Ropinirole, monotherapy, Microelectrode-guided, posteroventral, pallidotomy, platinum, dose, (, cisplatin, plus, carboplatin, ), in, combination, chemotherapy, therapy, with, alone
pulmonary fenfluramines, Thrombolytic, treatment, right-side, hemodynamics, chemotherapy
hypertension fenfluramines, intrauterine, insemination, with, donor, sperm, versus
( fenfluramines, intrauterine, insemination, with, donor, sperm, versus, got, surgical, treatment, radiotherapy, chemotherapy, and, ), cisplatin, a, combination, bleomycin, methotrexate, weekly, ,, doxorubicin, every, antibiotic, oral, budesonide
PPH fenfluramines
) fenfluramines, intrauterine, insemination, with, donor, sperm, vers