In [1]:
import json

from langchain.embeddings import OpenAIEmbeddings
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
# Shared embedding client; every description in this notebook is embedded through it.
# Constructed with no arguments, so all settings are the library defaults.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm.
embeddings_model = OpenAIEmbeddings()

  embeddings_model = OpenAIEmbeddings()


#### Dataset Preparation

In [4]:
import pandas as pd

# Number of leading rows of each class used for training; the rest is held out.
TRAIN_ROWS_PER_CLASS = 200


def _stack_column(first_frame, second_frame, column):
    """Return `column` of `first_frame` followed by `column` of `second_frame` as one list."""
    return first_frame[column].tolist() + second_frame[column].tolist()


df = pd.read_csv('../../data/disease_ontology.csv')

# Rows whose `label` equals True are the positive pairs, False the negative pairs.
df_positive = df.loc[df['label'].eq(True)]
df_negative = df.loc[df['label'].eq(False)]

# Positional split per class: leading rows train, trailing rows test.
df_train_pos = df_positive.iloc[:TRAIN_ROWS_PER_CLASS]
df_pos = df_positive.iloc[TRAIN_ROWS_PER_CLASS:]

df_train_neg = df_negative.iloc[:TRAIN_ROWS_PER_CLASS]
df_neg = df_negative.iloc[TRAIN_ROWS_PER_CLASS:]

# Training inputs: candidate/focal descriptions, positives first, then negatives,
# with labels (1 = positive, 0 = negative) laid out in the same order.
train_x_1 = _stack_column(df_train_pos, df_train_neg, 'description_of_candidate_entity')
train_x_2 = _stack_column(df_train_pos, df_train_neg, 'description_of_focal_entity')
train_y = [1] * len(df_train_pos) + [0] * len(df_train_neg)
print(len(train_x_1))
print(len(train_x_2))
print(len(train_y))

# Test inputs follow the same layout, built from the held-out rows.
test_x_1 = _stack_column(df_pos, df_neg, 'description_of_candidate_entity')
test_x_2 = _stack_column(df_pos, df_neg, 'description_of_focal_entity')
test_y = [1] * len(df_pos) + [0] * len(df_neg)
print(len(test_x_1))
print(len(test_x_2))
print(len(test_y))

400
400
400
200
200
200


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [5]:
# Entity names for the held-out pairs, positives first and negatives after,
# i.e. the same ordering as test_x_1 / test_x_2 / test_y.
test_names_1, test_names_2 = (
    df_pos[name_column].tolist() + df_neg[name_column].tolist()
    for name_column in ('candidate_entity', 'focal_entity')
)

# One (candidate_entity, focal_entity) tuple per test pair.
test_names = [(candidate, focal) for candidate, focal in zip(test_names_1, test_names_2)]

#### Construction of Embeddings derived from disease descriptions

In [6]:
# Embed each description column with a single batched call instead of one
# request per description: embed_documents takes a list of texts and returns
# one embedding per text, in input order.
train_candidate_vectors = embeddings_model.embed_documents(train_x_1)
train_focal_vectors = embeddings_model.embed_documents(train_x_2)

# Feature vector for a pair = candidate embedding followed by focal embedding.
# train_x_1 and train_x_2 are built from the same rows, so the two lists align.
train_embeddings = [
    np.concatenate((candidate_vector, focal_vector))
    for candidate_vector, focal_vector in zip(train_candidate_vectors, train_focal_vectors)
]

In [7]:
# Sanity check: there should be one concatenated embedding per training pair.
print(len(train_embeddings))

400


In [8]:
# Embed each held-out description column with a single batched call instead of
# one request per description: embed_documents takes a list of texts and
# returns one embedding per text, in input order.
test_candidate_vectors = embeddings_model.embed_documents(test_x_1)
test_focal_vectors = embeddings_model.embed_documents(test_x_2)

# Feature vector for a pair = candidate embedding followed by focal embedding.
# NOTE: despite the name, `text_embeddings` holds the *test* pair embeddings;
# the name is kept because later cells read it.
text_embeddings = [
    np.concatenate((candidate_vector, focal_vector))
    for candidate_vector, focal_vector in zip(test_candidate_vectors, test_focal_vectors)
]

In [9]:
# Sanity check: there should be one concatenated embedding per test pair
# (`text_embeddings` is built from test_x_1 / test_x_2).
print(len(text_embeddings))

200


In [10]:
from sklearn.utils import shuffle
# train_embeddings / train_y are ordered as all positives followed by all negatives,
# so shuffle them before fitting. Both sequences go through one shuffle call
# (sklearn applies the same permutation to each, keeping features and labels paired);
# random_state is fixed so the order is repeatable.
X_train, y_train = shuffle(train_embeddings, train_y, random_state=42)

In [11]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### EVALUATION WITH DIFFERENT CLASSIFIERS AND THEIR RESULTS

In [13]:
# Fit a default-configured logistic regression on the shuffled training pairs
# (fit returns the estimator itself), then label every held-out pair embedding.
logistic_classifier = LogisticRegression().fit(X_train, y_train)
logistic_predictions = logistic_classifier.predict(text_embeddings)

# Per-class precision / recall / F1 on the test pairs.
print("Logistic Regression:")
logistic_report = classification_report(test_y, logistic_predictions)
print(logistic_report)

Logistic Regression:
              precision    recall  f1-score   support

           0       0.71      0.65      0.68       100
           1       0.68      0.73      0.70       100

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.69       200
weighted avg       0.69      0.69      0.69       200



In [21]:
# Confusion matrix of true test labels vs. logistic-regression predictions.
cm = confusion_matrix(test_y, logistic_predictions)

In [23]:

# Summary metrics for the logistic-regression predictions on the test pairs.
accuracy = accuracy_score(test_y, logistic_predictions)
precision = precision_score(test_y, logistic_predictions)
recall = recall_score(test_y, logistic_predictions)
f1 = f1_score(test_y, logistic_predictions)

# Pin the label order so the matrix is always 2x2 (rows/columns = 0, 1). Without
# `labels`, a class absent from both test_y and the predictions would shrink
# the matrix and break the four-way unpack below.
cm = confusion_matrix(test_y, logistic_predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Specificity = TN / (TN + FP). Fall back to 0 when there are no negative
# examples, the same guard the MLP metrics use.
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# Print in desired order
print("Logistic Regression Results")
print(f"Accuracy:   {accuracy:.2f}")
print(f"Precision:  {precision:.2f}")
print(f"Recall:     {recall:.2f}")
print(f"Specificity:{specificity:.2f}")
print(f"F1 Score:   {f1:.2f}")

Logistic Regression Results
Accuracy:   0.69
Precision:  0.68
Recall:     0.73
Specificity:0.65
F1 Score:   0.70


In [16]:
# Gaussian Naive Bayes baseline: fit on the shuffled training pairs
# (fit returns the estimator itself) and label the held-out pair embeddings.
naive_bayes_classifier = GaussianNB().fit(X_train, y_train)
naive_bayes_predictions = naive_bayes_classifier.predict(text_embeddings)

# Per-class precision / recall / F1 on the test pairs.
print("\nNaive Bayes:")
naive_bayes_report = classification_report(test_y, naive_bayes_predictions)
print(naive_bayes_report)


Naive Bayes:
              precision    recall  f1-score   support

           0       0.71      0.66      0.68       100
           1       0.68      0.73      0.71       100

    accuracy                           0.69       200
   macro avg       0.70      0.70      0.69       200
weighted avg       0.70      0.69      0.69       200



In [24]:
# Summary metrics for the Gaussian Naive Bayes predictions on the test pairs.
accuracy = accuracy_score(test_y, naive_bayes_predictions)
precision = precision_score(test_y, naive_bayes_predictions)
recall = recall_score(test_y, naive_bayes_predictions)
f1 = f1_score(test_y, naive_bayes_predictions)

# Pin the label order so the matrix is always 2x2 (rows/columns = 0, 1). Without
# `labels`, a class absent from both test_y and the predictions would shrink
# the matrix and break the four-way unpack below.
cm = confusion_matrix(test_y, naive_bayes_predictions, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Specificity = TN / (TN + FP). Fall back to 0 when there are no negative
# examples, the same guard the MLP metrics use.
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# Print in desired order
print("GaussianNB Prediction RESULTS")
print(f"Accuracy:   {accuracy:.2f}")
print(f"Precision:  {precision:.2f}")
print(f"Recall:     {recall:.2f}")
print(f"Specificity:{specificity:.2f}")
print(f"F1 Score:   {f1:.2f}")

GaussianNB Prediction RESULTS
Accuracy:   0.69
Precision:  0.68
Recall:     0.73
Specificity:0.66
F1 Score:   0.71


In [18]:
# Multi-layer perceptron with two hidden layers (512 and 128 units), capped at
# 500 training iterations and seeded for repeatable weight initialisation.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(512, 128),
    max_iter=500,
    random_state=42,
).fit(X_train, y_train)
mlp_predictions = mlp_classifier.predict(text_embeddings)

# Per-class precision / recall / F1 on the test pairs.
print("\nMLP Classifier:")
mlp_report = classification_report(test_y, mlp_predictions)
print(mlp_report)


MLP Classifier:
              precision    recall  f1-score   support

           0       0.69      0.68      0.68       100
           1       0.68      0.69      0.69       100

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.68       200
weighted avg       0.69      0.69      0.68       200



In [25]:
def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    if denominator > 0:
        return numerator / denominator
    return 0


# Binary confusion matrix flattened to (TN, FP, FN, TP).
cm = confusion_matrix(test_y, mlp_predictions)
tn, fp, fn, tp = cm.ravel()

# Metrics derived by hand from the four confusion-matrix counts.
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = _ratio_or_zero(tp, tp + fp)
recall = _ratio_or_zero(tp, tp + fn)
specificity = _ratio_or_zero(tn, tn + fp)
f1 = _ratio_or_zero(2 * (precision * recall), precision + recall)

# Report in the same order as the other classifiers.
print("MLP Classifier RESULTS")
print(f"Accuracy:    {accuracy:.2f}")
print(f"Precision:   {precision:.2f}")
print(f"Recall:      {recall:.2f}")
print(f"Specificity: {specificity:.2f}")
print(f"F1 Score:    {f1:.2f}")

MLP Classifier RESULTS
Accuracy:    0.69
Precision:   0.68
Recall:      0.69
Specificity: 0.68
F1 Score:    0.69


In [19]:
# Compare the three classifiers: which test pairs did every one of them get wrong?
print("Common Mistakes:")


def _all_models_wrong(idx):
    """True when the MLP, Naive Bayes and logistic predictions all miss test_y[idx]."""
    truth = test_y[idx]
    return not (
        truth == mlp_predictions[idx]
        or truth == naive_bayes_predictions[idx]
        or truth == logistic_predictions[idx]
    )


def _mistake_count(predictions):
    """Number of test pairs whose prediction differs from the true label."""
    return sum(truth != guess for truth, guess in zip(test_y, predictions))


# Test-set indices misclassified by all three models, plus the entity-name
# tuples for those indices (with and without the true label attached).
common_mistakes = [idx for idx in range(len(test_y)) if _all_models_wrong(idx)]
common_mistake_tuples_with_label = [(test_names[idx], test_y[idx]) for idx in common_mistakes]
common_mistake_tuples = [test_names[idx] for idx in common_mistakes]

print(len(common_mistakes))
print(common_mistake_tuples)
print()

# Share of each model's own errors that are also errors of the other two models.
print('ratio for common mistakes to mlp mistakes')
print(len(common_mistakes) / _mistake_count(mlp_predictions))

print('ratio for common mistakes to naive bayes mistakes')
print(len(common_mistakes) / _mistake_count(naive_bayes_predictions))

print('ratio for common mistakes to logistic prediction mistakes')
print(len(common_mistakes) / _mistake_count(logistic_predictions))


Common Mistakes:
42
[('ovarian primitive germ cell tumor', 'polyembryoma of the ovary'), ('ovary epithelial cancer', 'malignant ovarian surface epithelial-stromal neoplasm'), ('schistosomiasis', 'cercarial dermatitis'), ('articular cartilage disease', 'chondromalacia'), ('dyskeratosis congenita', 'Revesz syndrome'), ('histidine metabolism disease', 'histidinemia'), ('lysosomal acid lipase deficiency', 'Wolman disease'), ('bone resorption disease', 'osteoporosis'), ('autosomal genetic disease', 'familial adenomatous polyposis'), ('ovary adenocarcinoma', 'ovarian cystadenocarcinoma'), ('epidermolysis bullosa dystrophica', 'transient bullous dermolysis of the newborn'), ('legionellosis', 'Pontiac fever'), ('language disorder', 'aphasia'), ('nonphotosensitive trichothiodystrophy', 'Sabinas brittle hair syndrome'), ('pertussis', 'Bordetella parapertussis whooping cough'), ('Kyasanur forest disease', 'Alkhurma hemorrhagic fever'), ('hydrophthalmos', 'buphthalmos'), ('writing disorder', 'agra