In [2]:
import numpy as np
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [3]:
# ----- Step 1: Load Positive and Negative Pairs -----
# Each CSV holds one class of paper pairs; both are read from the working directory.
df_positive = pd.read_csv(filepath_or_buffer="positive_pairs.csv")
df_negative = pd.read_csv(filepath_or_buffer="negative_pairs.csv")

# Stack the two frames (positives first) and renumber the rows from 0
# so the combined frame has a single continuous index.
df_pairs = pd.concat(
    objs=[df_positive, df_negative],
    ignore_index=True,
)

In [4]:
# ----- Step 2: Create a Text Representation for Each Paper -----
# Title and abstract are joined into a single "definition" string per paper:
# "<title>. <abstract>".
#
# Concatenating a string with a missing value yields NaN for the whole result,
# so a single absent title or abstract would turn the paper's text into NaN and
# that NaN would be handed to the embedding step. Missing parts are therefore
# replaced with an empty string first; rows without missing values are unchanged.
df_pairs["paper1_text"] = (
    df_pairs["title_1"].fillna("") + ". " + df_pairs["abstract_1"].fillna("")
)
df_pairs["paper2_text"] = (
    df_pairs["title_2"].fillna("") + ". " + df_pairs["abstract_2"].fillna("")
)

In [5]:
# ----- Step 3: Prepare Data for Embedding Generation -----
# Pull the two text columns and the label column out of the frame as plain
# Python lists, in row order.
X1, X2, y = (
    df_pairs[column].tolist()
    for column in ("paper1_text", "paper2_text", "label")
)

In [6]:
# Class-balance check: count how many pairs carry each label value.
num_ones, num_zeros = (y.count(label_value) for label_value in (1, 0))

print(f"Number of 1s: {num_ones}")
print(f"Number of 0s: {num_zeros}")

Number of 1s: 500
Number of 0s: 500


In [7]:
# ----- Step 4: Generate Embeddings Using OpenAIEmbeddings -----
# The client is built with its default settings.
embeddings_model = OpenAIEmbeddings()

# Embed the first and second paper of every pair. The two calls are kept as
# separate statements so that a failure on the second one still leaves
# `embeddings1` available. This may take a while if there are many pairs.
embeddings1 = embeddings_model.embed_documents(texts=X1)
embeddings2 = embeddings_model.embed_documents(texts=X2)

In [8]:
# ----- Step 5: Combine Embeddings into a Single Feature Vector per Pair -----
# For every pair, the first paper's embedding is followed by the second paper's
# embedding in one flat vector; the per-pair vectors are then stacked into a
# single array with one row per pair.
features = np.array(
    [np.hstack(embedding_pair) for embedding_pair in zip(embeddings1, embeddings2)]
)

In [9]:
# ----- Step 6: Train/Test Split -----
# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split
# identical from run to run.
train_X, test_X, train_y, test_y = train_test_split(
    features,
    y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# ----- Step 7: Define and Train Classifiers -----
# Three baseline models are fitted on the same training split and scored on the
# same held-out split.
#
# MLPClassifier draws random numbers (e.g. for weight initialization), so it is
# given a fixed seed here; without it the reported scores change on every run.
# The seed value matches the one used for the train/test split.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "GaussianNB": GaussianNB(),
    "MLPClassifier": MLPClassifier(max_iter=1000, random_state=42),
}

# Train and evaluate each classifier in turn.
for clf_name, clf in classifiers.items():
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)

    print(f"=== {clf_name} ===")

    y_true = test_y
    y_pred = pred_y

    # Headline metrics. Label 1 is treated as the positive class; adjust
    # pos_label if the labelling convention changes. zero_division=0 reports 0
    # instead of warning when a metric's denominator is empty.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=1, zero_division=0)
    recall = recall_score(y_true, y_pred, pos_label=1, zero_division=0)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0)

    # Specificity (true-negative rate) is derived from the confusion matrix.
    # The label order is pinned to [0, 1] so ravel() always yields
    # (tn, fp, fn, tp), even if one class is absent from this split.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    # Guard against a test split that contains no negative examples.
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Per-class precision/recall/F1 table.
    report = classification_report(y_true, y_pred, zero_division=0)

    # Print the results.
    print("Classification Report:\n", report)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print()

=== Logistic Regression ===
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.55      0.61       104
           1       0.60      0.74      0.66        96

    accuracy                           0.64       200
   macro avg       0.65      0.64      0.64       200
weighted avg       0.65      0.64      0.64       200

Accuracy: 0.6400
Precision: 0.6017
Recall (Sensitivity): 0.7396
Specificity: 0.5481
F1 Score: 0.6636

=== GaussianNB ===
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.54      0.60       104
           1       0.59      0.73      0.65        96

    accuracy                           0.63       200
   macro avg       0.64      0.63      0.63       200
weighted avg       0.64      0.63      0.63       200

Accuracy: 0.6300
Precision: 0.5932
Recall (Sensitivity): 0.7292
Specificity: 0.5385
F1 Score: 0.6542

=== MLPClassifier ===
Classification Report:
  

In [None]:
# Sanity check: sizes of the train/test features and labels, one per line,
# in the order train_X, test_X, train_y, test_y.
print(*map(len, (train_X, test_X, train_y, test_y)), sep="\n")

800
200
800
200
