# Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# For fine-tuning w/ HuggingFace API
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

import torch
import torch.nn as nn

KeyboardInterrupt: 

# Import Dataset

In [None]:
# Read the labelled table from disk and preview its first rows.
# Later cells read the `TAB` (text) and `label` columns of this frame.
DATA_PATH = "TAB_binaryLabel.csv"
df = pd.read_csv(DATA_PATH)
df.head()

# Encoding Using SPECTER

"...specifically use citations as a naturally occurring, inter-document incidental supervision signal indicating which documents are most related and formulate the signal into a triplet-loss pretraining objective. Unlike many prior works, at inference time, our model does not require any citation information."
* https://papertohtml.org/paper?id=a3e4ceb42cbcd2c807d53aff90a8cb1f5ee3f031
* Overall, SPECTER is more recent than SciBERT and is reported (in the paper linked above) to outperform it





In [None]:
# Load SPECTER; it is used here only as an encoder (text -> embedding).
from transformers import AutoModel

model_name = 'allenai/specter'
# .eval() switches the module to inference mode and returns the module itself,
# so loading and mode switching can be chained.
model = AutoModel.from_pretrained(model_name).eval()
model

In [None]:
# Change a random subset of the pandas dataframe to a Hugging Face dataset.
# - A fixed seed makes the subset identical after "Restart & Run All".
# - Capping the size at len(df) avoids the ValueError pandas raises when asked
#   for more rows than the frame contains.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42
HF_df = Dataset.from_pandas(
    df.sample(n = min(SAMPLE_SIZE, len(df)), random_state = SAMPLE_SEED)
)

In [None]:
# Embed TAB
# Tokenizer loaded from the same checkpoint name (`model_name`) as the encoder;
# `encode_tab` below reads this module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode_tab(example):
    """
    Embed the "TAB" text of ONE dataset example with the module-level encoder.

    input:: a single example (dict-like) with a "TAB" string, as passed by
            `Dataset.map(..., batched = False)`
    return:: {"embedding": numpy array} holding the hidden state of the first
             ([CLS]) token, ex) [0.123, -0.456, 0.789, ..., 0.001]  # Shape: (768,) for SPECTER
    NOTE: the manual CLS extraction is needed because SPECTER is an embedding-only
          model, unlike BERT variants loaded end-to-end with a classification head
    """
    # Tokenize the text; anything beyond 512 tokens is truncated.
    encoded = tokenizer(
        example["TAB"],
        return_tensors = "pt",
        truncation = True,
        max_length = 512,
    )

    # Inference only: no gradients need to be tracked or stored.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state  # (batch, tokens, hidden)
        cls_vector = hidden_states[:, 0]  # first token of every sequence

    # Drop the singleton batch axis and hand back a plain numpy vector.
    return {"embedding": cls_vector.squeeze().numpy()}

In [None]:
from datasets import Features, Sequence, Value

# Output schema: every existing column plus 'embedding' as a sequence of float32.
features = Features({**HF_df.features, "embedding": Sequence(Value("float32"))})

# encode_tab handles one example per call, so map row by row (not batched).
tokenized_df = HF_df.map(encode_tab, batched = False, features = features)

In [None]:
# Example code to better see how the encoding works:
# one TAB string -> tokens -> encoder output -> CLS embedding.
EXAMPLE_ROW = 6 # arbitrary example row
ex = HF_df['TAB'][EXAMPLE_ROW] # raw text (str)

# The tokenizer output is dict-like; each value is a tensor.
ex_token = tokenizer(ex, return_tensors = "pt", truncation = True, max_length = 512)
print(type(ex_token).__name__, list(ex_token.keys()))
print("input_ids:", ex_token['input_ids'].dtype, tuple(ex_token['input_ids'].shape))

# Run the encoder without gradient tracking, exactly as encode_tab does, so the
# result can be converted to a numpy array.
with torch.no_grad():
    ex_out = model(**ex_token) # model output object; hidden states live in .last_hidden_state
print(type(ex_out).__name__, tuple(ex_out.last_hidden_state.shape)) # (1, n_tokens, hidden)

ex_embedding = ex_out.last_hidden_state[:, 0, :] # first ([CLS]) token of the only sequence
ex_embedding.squeeze().shape # torch.Size([768]) for SPECTER

# Feature Analysis

Try to gauge the effective dimensionality of the dataset from its SPECTER embeddings, which are 768-dimensional.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit an unrestricted PCA on the embeddings and plot how the explained variance
# accumulates as more components are kept.
pca = PCA().fit(tokenized_df["embedding"])
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

In [None]:
import plotly.graph_objs as go # makes it interactive

# Project the embeddings onto their first three principal components.
pca = PCA(n_components = 3)
vis_dims = pca.fit_transform(np.asarray(tokenized_df["embedding"]))

# Pair every projected point with the label of the SAME sampled row.
# The embeddings exist only for the shuffled sample held in tokenized_df, so
# they must not be written back onto the full, unshuffled `df`: the lengths
# can differ and, even when they match, the row order does not.
pca_df = pd.DataFrame({
    "label": list(tokenized_df["label"]),
    "PC1": vis_dims[:, 0],
    "PC2": vis_dims[:, 1],
    "PC3": vis_dims[:, 2],
})

fig = go.Figure()

# One trace per distinct label (not one per row), so the legend has one entry
# per class and no point is drawn more than once.
unique_labels = sorted(pca_df["label"].unique())
for i, label in enumerate(unique_labels):
    points = pca_df[pca_df["label"] == label]

    fig.add_trace(
        go.Scatter3d(
            x = points["PC1"],
            y = points["PC2"],
            z = points["PC3"],
            mode = "markers",
            marker = dict(
                size = 5,
                # A numeric array plus cmin/cmax is what gets mapped onto the
                # colorscale; every point of a class shares the class index.
                color = [i] * len(points),
                cmin = 0,
                cmax = max(len(unique_labels) - 1, 1),
                colorscale = "Viridis",
                opacity = 0.8,
            ),
            name = str(label), # legend entries are strings
        )
    )

fig.update_layout(
    autosize = False,
    title = "3D Scatter Plot of Categories",
    width = 800,
    height = 500,
    margin = dict(l = 50, r = 50, b = 100, t = 100, pad = 10),
    scene = dict(
        xaxis = dict(title = "x"),
        yaxis = dict(title = "y"),
        zaxis = dict(title = "z"),
    )
)

fig.show()

# Linear Model

A very simple model may better mimic the decision-making process of scientists, e.g. by breaking the decision down into whether or not certain keywords are present.

# Neural Networks

Neural networks may better capture the higher-dimensional structure of the data. A more qualitative description of the review process would be useful here, so that it can be reflected in the layers, for example:


1.   Broadest overview: check for one particular quality
2.   Then examine that quality in greater detail
* The idea is to mimic how successive layers in CNNs reflect the stages of human vision, from V1 to IT

Presume an embedder that returns the knowledge of a scientific reader: the information in the title and abstract has now been captured, so now it's time to start splitting things up!


## Wendi's Code...play around to add more layers between embedding and classification DUH

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
# Split the embedded sample into train (80%), val (10%), and test (10%).
#
# The frame is built from tokenized_df itself so that every embedding stays on
# the row (and therefore the label) it was computed from. tokenized_df holds a
# shuffled sample of df, so assigning its 'embedding' column onto df would
# either fail on length or silently pair embeddings with the wrong labels.
embedded_df = tokenized_df.to_pandas()

# Stratify both splits so each subset keeps the label proportions.
train_df, tmp_df = train_test_split(
    embedded_df, test_size=0.20, random_state=42, stratify=embedded_df['label']
)
val_df, test_df = train_test_split(
    tmp_df, test_size=0.50, random_state=42, stratify=tmp_df['label']
)
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

In [None]:
# Calculate class weights from the training labels only
from sklearn.utils import class_weight

train_labels = train_df['label']
class_weights_array = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
# Keras expects {class index: weight}; the position in the array is the index.
class_weights = dict(enumerate(class_weights_array))

print("Calculated Class Weights to counter imbalance:")
print(class_weights)

In [None]:
BATCH_SIZE = 32
AUTOTUNE   = tf.data.AUTOTUNE

def df_to_dataset(dataframe, shuffle=True, batch_size=None, seed=None):
    """
    Turn a dataframe with 'embedding' and 'label' columns into a batched tf.data pipeline.

    dataframe:: frame whose 'embedding' cells are equal-length vectors and whose
                'label' column holds the targets
    shuffle:: shuffle the examples (buffer spans the whole frame) before batching
    batch_size:: examples per batch; None (default) uses the module-level BATCH_SIZE
    seed:: optional seed forwarded to `Dataset.shuffle` for a reproducible order
    return:: tf.data.Dataset yielding (embeddings, labels) batches, prefetched
    """
    # Convert the column of per-row vectors into a single 2D array. float32
    # matches the dtype declared on the model's embedding input.
    embeddings = np.stack(dataframe['embedding'].values).astype(np.float32)
    labels = dataframe['label'].values

    ds = tf.data.Dataset.from_tensor_slices((embeddings, labels))

    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe), seed=seed)

    effective_batch_size = BATCH_SIZE if batch_size is None else batch_size
    return ds.batch(effective_batch_size).prefetch(AUTOTUNE)

# Only the training data is shuffled; val/test keep the dataframe row order so
# predictions line up with val_df / test_df labels.
train_ds = df_to_dataset(train_df, shuffle=True)
val_ds   = df_to_dataset(val_df,   shuffle=False)
test_ds  = df_to_dataset(test_df,  shuffle=False)

In [None]:
from tensorflow.keras import layers, Input, Model

# NOTE(review): this cell rebinds `model`, the name that held the SPECTER
# encoder earlier in the notebook. encode_tab reads `model` at call time, so
# re-running the embedding cells after this one requires reloading the encoder.

# Input: one precomputed SPECTER embedding per example
embedding_dim = 768
embedding_input = Input(shape=(embedding_dim,), dtype=tf.float32, name='embedding_input')

# TODO: PLAY AROUND HERE
# The idea is that the layers, and how much each one narrows the representation,
# reflect some portion of the scientific review process.
hidden_stack = [
    layers.Dropout(0.2),                   # regularize the raw embedding
    layers.Dense(256, activation='relu'),  # first hidden layer
    layers.BatchNormalization(),
    layers.Dense(64, activation='relu'),   # second, narrower hidden layer
]
# End of the experimental section.

x = embedding_input
for layer in hidden_stack:
    x = layer(x)

# Output layer for binary classification. Despite the variable name, the sigmoid
# activation means this tensor holds probabilities, not raw logits.
logits = layers.Dense(1, activation='sigmoid', name='classifier')(x)

model = Model(inputs=embedding_input, outputs=logits)

# The metric names below define the 'val_precision' / 'val_recall' keys that
# training callbacks can monitor.
binary_metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]
model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-5),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=binary_metrics,
)

model.summary()

In [None]:
# EarlyStopping halts training once the monitored validation metric stops
# improving and restores the weights of the best epoch.
# NOTE(review): this monitors val_precision while the checkpoint below monitors
# val_recall; decide which metric should drive both and make them agree.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_precision',
    mode='max',
    patience=2, # Wait 2 epochs for improvement; 1 may be enough since it rarely recovers once it starts overfitting
    verbose=1,
    restore_best_weights=True
)

# Save the full model (not just weights) whenever val_recall reaches a new best.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='checkpoint_model.keras',
    monitor='val_recall',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1
)

print("\nStarting model training...")
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs= 10, # Upper bound; EarlyStopping ends training sooner if the metric stalls
    # Added class weights so recall is higher
    class_weight=class_weights,
    # The callbacks have no effect unless they are handed to fit().
    callbacks=[early_stop, checkpoint_cb],
    verbose=1
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt

# 0.5 eval
print("\n--- Evaluation on Test Set (Standard 0.5 Threshold) ---")
# Order matches model.compile: loss first, then the metrics as listed there.
loss, acc, precision, recall = model.evaluate(test_ds)
print(f"Test Recall (at 0.5 threshold): {recall:.4f}")


# val set threshold finding (the threshold is tuned on validation data only,
# so the test set stays untouched until the final evaluation)
print("\n--- Finding Optimal Threshold to Boost Recall ---")
y_val_pred_probs = model.predict(val_ds).ravel()
y_val_true = val_df['label'].values # val_ds is not shuffled, so the order matches the predictions

precisions, recalls, thresholds = precision_recall_curve(y_val_true, y_val_pred_probs)

# Find the threshold that gives us at least 95% recall.
# precision_recall_curve returns thresholds in increasing order with recall
# falling as the threshold rises, plus one trailing recall entry (0) that has no
# threshold. The smallest qualifying index is therefore always the lowest
# threshold (recall 1.0, everything predicted positive); the wanted one is the
# LARGEST index that still meets the target, i.e. the strictest threshold that
# keeps recall high while giving up as little precision as possible.
target_recall = 0.95
candidate_recalls = recalls[:-1] # aligned one-to-one with `thresholds`
meets_target = np.flatnonzero(candidate_recalls >= target_recall)
if meets_target.size > 0:
    idx = meets_target[-1]
    high_recall_threshold = thresholds[idx]
    print(f"Threshold found for >={target_recall*100}% recall: {high_recall_threshold:.4f}")
else:
    # No threshold reaches the target: fall back to the one with the best recall.
    best_recall_idx = np.argmax(candidate_recalls)
    high_recall_threshold = thresholds[best_recall_idx]
    print(f"Could not reach {target_recall*100}% recall. Using best possible recall threshold: {high_recall_threshold:.4f}")


import seaborn as sns

# Score the held-out test set with the threshold selected on validation data.
print(f"\n--- Final Evaluation on Test Set (Threshold: {high_recall_threshold:.4f}) ---")
y_test_pred_probs = model.predict(test_ds).ravel()
y_test_true = test_df['label'].values
# A probability at or above the threshold counts as the positive class (1).
y_test_pred_final = (y_test_pred_probs >= high_recall_threshold).astype(int)

# Per-class precision / recall / F1
print("\nClassification Report (Test Set):")
report = classification_report(
    y_test_true,
    y_test_pred_final,
    target_names=['Irrelevant (0)', 'Relevant (1)'],
)
print(report)

# Confusion matrix drawn as an annotated heatmap
print("\nConfusion Matrix (Test Set):")
cm = confusion_matrix(y_test_true, y_test_pred_final)
tick_names = ['Irrelevant', 'Relevant']
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_names, yticklabels=tick_names, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()

# Save the trained Keras classifier as a single .keras file.
# NOTE(review): the file name still says "bert_128tokens ... unweighted", while
# this notebook trains on SPECTER embeddings with class weights. It is left
# unchanged in case other code loads this path; consider renaming it.
OUTPUT_DIR = 'bert_128tokens_recall_unweighted_reg.keras'
model.save(OUTPUT_DIR)
# Only the Keras model is written here; no tokenizer is saved by this cell.
print(f"\nModel saved to {OUTPUT_DIR}")