In [18]:
import numpy as np
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [19]:
# ----- Step 1: Load Positive and Negative Pairs -----
df = pd.read_csv('arxiv_research_areas.csv')

# Select rows by label value. Rows whose label is neither True nor False
# (for example a missing value) match neither mask and are left out.
label_column = df['label']
df_positive = df.loc[label_column.eq(True)]
df_negative = df.loc[label_column.eq(False)]

# Stack positives ahead of negatives and renumber the index from 0.
df_pairs = pd.concat((df_positive, df_negative), ignore_index=True)

In [20]:
# ----- Step 2: Create a Text Representation for Each Paper -----
# Each side of a pair becomes one string: the entity column, then ". ",
# then that entity's description column.
def _entity_text(frame, name_col, description_col):
    """Return `frame[name_col]` and `frame[description_col]` joined by ". "."""
    return frame[name_col] + ". " + frame[description_col]


df_pairs["paper1_text"] = _entity_text(
    df_pairs, "focal_entity", "description_of_focal_entity"
)
df_pairs["paper2_text"] = _entity_text(
    df_pairs, "candidate_entity", "description_of_candidate_entity"
)

In [21]:
# ----- Step 3: Prepare Data for Embedding Generation -----
# Pull the two text columns and the label column out as plain Python lists,
# all in the row order of df_pairs.
X1, X2, y = (
    df_pairs[column].tolist()
    for column in ("paper1_text", "paper2_text", "label")
)

In [None]:
# ----- Step 4: Generate Embeddings Using OpenAIEmbeddings -----
embeddings_model = OpenAIEmbeddings()

# Embed the first-side texts, then the second-side texts (one call each).
# This may take a while if you have many pairs.
embeddings1, embeddings2 = (
    embeddings_model.embed_documents(texts) for texts in (X1, X2)
)

In [None]:
# ----- Step 5: Combine Embeddings into a Single Feature Vector per Pair -----
# For every pair, place the second embedding directly after the first one.
pair_vectors = [
    np.concatenate((np.asarray(first_vec), np.asarray(second_vec)))
    for first_vec, second_vec in zip(embeddings1, embeddings2)
]

# Stack the per-pair vectors into one array, one row per pair.
features = np.asarray(pair_vectors)

In [None]:
# ----- Step 6: Train/Test Split -----
# Hold out 20% of the pairs for evaluation; the fixed seed makes the
# shuffle, and therefore the split, repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    features,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# ----- Step 7: Train a Logistic Regression Classifier -----
# max_iter is set to 1000 iterations for the solver.
logreg_options = {"max_iter": 1000}
clf = LogisticRegression(**logreg_options)
clf.fit(X=train_X, y=train_y)

In [None]:
# ----- Step 8: Evaluate the Classifier -----
# Predict on the held-out pairs and summarise per-class precision/recall/F1.
pred_y = clf.predict(test_X)
report = classification_report(y_true=test_y, y_pred=pred_y)
print("Classification Report:\n", report)

In [17]:
# ----- Step 9: Define, Train, and Compare Classifiers -----
# Classifiers to compare on the same train/test split.
# MLPClassifier is seeded so that its scores are repeatable between runs.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "GaussianNB": GaussianNB(),
    "MLPClassifier": MLPClassifier(max_iter=1000, random_state=42),
}


def _binary_metrics(cm):
    """Compute summary metrics from a binary confusion matrix.

    Parameters
    ----------
    cm : array-like with exactly four entries
        Confusion matrix whose flattened order is (tn, fp, fn, tp).

    Returns
    -------
    dict
        Maps the display name of each metric to its value, in print order:
        Accuracy, Precision, Recall, Specificity, F1 Score. A ratio whose
        denominator is zero is reported as 0.

    Raises
    ------
    ValueError
        If `cm` does not flatten to exactly four values (i.e. the problem
        is not binary, or only one class was seen).
    """
    tn, fp, fn, tp = cm.ravel()  # only valid for binary classification

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "Specificity": specificity,
        "F1 Score": f1,
    }


# Train and evaluate each classifier.
# The loop variable is deliberately not named `clf`: that name is bound to
# the logistic regression model fitted earlier in the notebook, and reusing
# it here would leave `clf` pointing at the last classifier in the dict.
for clf_name, model in classifiers.items():
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    print(f"=== {clf_name} ===")
    metrics = _binary_metrics(confusion_matrix(test_y, pred_y))

    # Labels are padded to a common width so the values line up in a column.
    for metric_name, value in metrics.items():
        print(f"{metric_name + ':':<13}{value:.2f}")



=== Logistic Regression ===
Accuracy:    0.68
Precision:   0.64
Recall:      0.77
Specificity: 0.60
F1 Score:    0.70
=== GaussianNB ===
Accuracy:    0.69
Precision:   0.64
Recall:      0.80
Specificity: 0.58
F1 Score:    0.71
=== MLPClassifier ===
Accuracy:    0.59
Precision:   0.57
Recall:      0.58
Specificity: 0.60
F1 Score:    0.58
