In [1]:
import numpy as np
import pandas as pd
from langchain_openai import OpenAIEmbeddings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [4]:
# ----- Step 1: Load Data -----
df_pairs = pd.read_csv("../../data/products_substitution.csv")

In [6]:
# ----- Step 2: Create a Text Representation for Each Entity -----
df_pairs["product1"] = df_pairs["focal_entity"]
df_pairs["product2"] = df_pairs["candidate_entity"]

In [7]:
# ----- Step 3: Prepare Data for Embedding Generation -----
X1 = df_pairs['product1'].tolist()
X2 = df_pairs['product2'].tolist()
y = df_pairs['label'].tolist()

In [8]:
num_ones = y.count(1)
num_zeros = y.count(0)

print(f"Number of 1s: {num_ones}")
print(f"Number of 0s: {num_zeros}")

Number of 1s: 170
Number of 0s: 237


In [9]:
# ----- Step 4: Generate Embeddings Using OpenAIEmbeddings -----
embeddings_model = OpenAIEmbeddings()

# Generate embeddings for each entity text
embeddings1 = embeddings_model.embed_documents(X1)
embeddings2 = embeddings_model.embed_documents(X2)

In [10]:
# ----- Step 5: Combine Embeddings into a Single Feature Vector per Pair -----
features = [np.concatenate([np.array(e1), np.array(e2)]) for e1, e2 in zip(embeddings1, embeddings2)]
features = np.array(features)

In [11]:
# ----- Step 6: Train/Test Split -----
train_X, test_X, train_y, test_y = train_test_split(features, y, test_size=0.2, random_state=42)

In [12]:
# ----- Step 7: Define and Train Classifiers -----
# Initialize classifiers.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "GaussianNB": GaussianNB(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
}

# Train and evaluate each classifier.
for clf_name, clf in classifiers.items():
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)

    print(f"=== {clf_name} ===")

    y_true = test_y
    y_pred = pred_y

    # Compute accuracy, precision, recall, and F1 score
    # We assume 1 is the positive class. Adjust pos_label if needed.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=1, zero_division=0)
    recall = recall_score(y_true, y_pred, pos_label=1, zero_division=0)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0)

    # Compute the confusion matrix to extract specificity.
    # Specify the order of labels to ensure consistency.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Generate the full classification report
    report = classification_report(y_true, y_pred, zero_division=0)

    # Print the results
    print("Classification Report:\n", report)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print()

=== Logistic Regression ===
Classification Report:
               precision    recall  f1-score   support

       False       0.54      0.83      0.65        46
        True       0.27      0.08      0.13        36

    accuracy                           0.50        82
   macro avg       0.40      0.45      0.39        82
weighted avg       0.42      0.50      0.42        82

Accuracy: 0.5000
Precision: 0.2727
Recall (Sensitivity): 0.0833
Specificity: 0.8261
F1 Score: 0.1277

=== GaussianNB ===
Classification Report:
               precision    recall  f1-score   support

       False       0.49      0.46      0.47        46
        True       0.36      0.39      0.37        36

    accuracy                           0.43        82
   macro avg       0.42      0.42      0.42        82
weighted avg       0.43      0.43      0.43        82

Accuracy: 0.4268
Precision: 0.3590
Recall (Sensitivity): 0.3889
Specificity: 0.4565
F1 Score: 0.3733

=== MLPClassifier ===
Classification Report:
  

In [13]:
print(len(train_X))
print(len(test_X))
print(len(train_y))
print(len(test_y))

325
82
325
82
