In [1]:
%pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 3.3 MB/s eta 0:00:01
[?25hCollecting transformers<5.0.0,>=4.32.0
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 77 kB/s  eta 0:00:01
[?25hCollecting scipy
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 3.1 MB/s eta 0:00:01    |██████▎                         | 6.8 MB 3.0 MB/s eta 0:00:10
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 769 kB/s eta 0:00:01
[?25hCollecting numpy
  Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[K     |████████████████████████████████| 17.3 MB 1.7 MB/s eta 0:00:01
[?25h

In [9]:
# Colab only: mount Google Drive at /content/drive so its files are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Switch the working directory to the mounted MyDrive folder; the relative paths used later
# (e.g. 'scicite/train.jsonl', 'output/...') are resolved against it.
import os
os.chdir("/content/drive/MyDrive")

In [2]:
import math
import random
import numpy as np
import json
import re
import torch
from torch import nn
from collections import defaultdict
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from sentence_transformers.evaluation import TripletEvaluator
from sklearn.metrics import f1_score

In [3]:
def select_device():
    """Pick the torch device to run on: CUDA if available, otherwise Apple MPS, otherwise CPU.

    The decision is a single chain so the CPU fallback only applies when neither
    accelerator is available.

    Returns:
        torch.device: the selected device. The choice is also printed.
    """
    if torch.cuda.is_available():
        name = 'cuda'
    else:
        # torch.backends.mps does not exist on older torch builds, so look it up defensively.
        mps_backend = getattr(torch.backends, 'mps', None)
        if mps_backend is not None and mps_backend.is_available():
            name = 'mps'
        else:
            name = 'cpu'
    print(f"Device: {name}")
    return torch.device(name)


device = select_device()

Device: cpu


In [4]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: the parsed documents in file order; blank lines are skipped.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]


file_path_train = 'scicite/train.jsonl'
file_path_dev = 'scicite/dev.jsonl'
file_path_test = 'scicite/test.jsonl'
train_data = load_jsonl(file_path_train)
dev_data = load_jsonl(file_path_dev)
test_data = load_jsonl(file_path_test)

print("Sample data points:")
for sample in train_data[:3]:
    print(sample)
print("Keys:", list(train_data[0].keys()))
print("Number of training data points:", len(train_data))

# Count every label in one pass over the full training set, so labels that do not
# occur among the first records are still reported.
label_counts = defaultdict(int)
for record in train_data:
    label_counts[record['label']] += 1
print("Label distribution:", dict(label_counts))

Sample data points:
{'source': 'explicit', 'citeEnd': 175, 'sectionName': 'Introduction', 'citeStart': 168, 'string': 'However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).', 'label': 'background', 'label_confidence': 1.0, 'citingPaperId': '1872080baa7d30ec8fb87be9a65358cd3a7fb649', 'citedPaperId': '894be9b4ea46a5c422e81ef3c241072d4c73fdc0', 'isKeyCitation': True, 'id': '1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4ea46a5c422e81ef3c241072d4c73fdc0', 'unique_id': '1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4ea46a5c422e81ef3c241072d4c73fdc0_11', 'excerpt_index': 11}
{'source': 'explicit', 'citeStart': 16, 'sectionName': 'Novel Quantitative Trait Loci for Seminal Root Traits in Barley', 'string': 'In the study by Hickey et al. (2012), spikes were sampled from the field at the point of physiological\nrobinson et al.

In [5]:
# Preview the citation text, intent label and label confidence of the first ten training records.
for data_point in train_data[:10]:
    for caption, key in (("Citation Text:", 'string'),
                         ("Label:", 'label'),
                         ("Label Confidence:", 'label_confidence')):
        print(caption, data_point[key])

Citation Text: However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).
Label: background
Label Confidence: 1.0
Citation Text: In the study by Hickey et al. (2012), spikes were sampled from the field at the point of physiological
robinson et al.: genomic regions influencing root traits in barley 11 of 13
maturity, dried, grain threshed by hand, and stored at −20C to preserve grain dormancy before germination testing.
Label: background
Label Confidence: 1.0
Citation Text: The drug also reduces catecholamine secretion, thereby reducing stress and leading to a modest (10-20%) reduction in heart rate and blood pressure, which may be particularly beneficial in patients with cardiovascular disease.(7) Unlike midazolam, dexmedetomidine does not affect the ventilatory response to carbon dioxide.
Label: background
Label Confidence: 

In [6]:
# SciBERT is the token-level encoder that turns text into word embeddings.
model_name = 'allenai/scibert_scivocab_uncased'
word_embedding_model = models.Transformer(model_name)

# Collapse the token embeddings into a single fixed-size sentence vector by mean pooling
# (CLS-token and max pooling are switched off).
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Encoder followed by pooling, placed on the selected device.
model = SentenceTransformer(device=device, modules=[word_embedding_model, pooling_model])

In [7]:
# Distinct raw section names that occur in the dev split.
section_names = list({record['sectionName'] for record in dev_data})

In [8]:
# Compiled once; clean_section_name is called for every section name / record.
_SECTION_NUMBER_PREFIX = re.compile(r'^\d+(\.\d+)*\.*\s*')   # e.g. "2.", "1.2"
_SECTION_LETTER_PREFIX = re.compile(r'^\w+\.(\w+\.)*\.*\s')  # e.g. "A.", "II."
_NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9\s]')            # raw string: `\s` is a regex class, not a str escape


def clean_section_name(section_name):
    """Normalise a section heading for comparison.

    Steps, in order: lower-case the text, strip a leading section number, strip a
    leading dotted letter/word prefix followed by whitespace, then drop every
    character that is not an ASCII letter, digit or whitespace.

    Args:
        section_name: the raw heading. Non-string values (e.g. a missing section
            name) are mapped to the empty string instead of raising.

    Returns:
        str: the cleaned heading. Inner whitespace is left untouched.
    """
    if not isinstance(section_name, str):
        return ''
    cleaned = section_name.lower()
    cleaned = _SECTION_NUMBER_PREFIX.sub('', cleaned)
    cleaned = _SECTION_LETTER_PREFIX.sub('', cleaned)
    return _NON_ALPHANUMERIC.sub('', cleaned)

def preprocess_section_names(section_names):
    """Return the cleaned form of every section name, in input order."""
    return [clean_section_name(raw_name) for raw_name in section_names]

In [9]:
def compute_cossim(section_names):
    """Compute the pairwise cosine similarity between the embeddings of `section_names`.

    The names are embedded with the notebook-level `model`, and every embedding is
    compared with every other one by broadcasting.

    Args:
        section_names: list of section-name strings to embed.

    Returns:
        torch.Tensor: square matrix whose entry [i][j] is the cosine similarity
        between section_names[i] and section_names[j]. Its shape is printed.
    """
    # Encode the names that were passed in, not a notebook global.
    section_embeddings = torch.tensor(model.encode(section_names))

    # (n, 1, d) against (1, n, d) broadcasts to an (n, n) similarity matrix.
    cos_sim = nn.functional.cosine_similarity(
        section_embeddings.unsqueeze(1), section_embeddings.unsqueeze(0), dim=-1
    )
    print(f'Shape of cossim: {cos_sim.shape}')
    return cos_sim

# Deduplicate after cleaning (different raw names can clean to the same string) and sort,
# so that row i of the similarity matrix refers to the same section on every run;
# the iteration order of a set of strings is not stable across interpreter sessions.
cleaned_section_names = sorted(set(preprocess_section_names(section_names)))
cossim_matrix = compute_cossim(cleaned_section_names)

Shape of cossim: torch.Size([150, 150])


In [10]:
def get_most_and_least_similar_sections(cos_sim, section_names=None):
    """For every section, find its most and its least similar *other* section.

    Args:
        cos_sim: square similarity tensor; cos_sim[i][j] compares section i with
            section j. The tensor is not modified.
        section_names: names labelling the rows/columns of `cos_sim`. Defaults to
            the notebook-level `cleaned_section_names`.

    Returns:
        tuple(list, list): `(most_similar_sections, least_similar_sections)`, both
        index-aligned with `section_names`.

    Raises:
        ValueError: if the number of names differs from the number of rows.
    """
    if section_names is None:
        section_names = cleaned_section_names
    if len(section_names) != len(cos_sim):
        raise ValueError(
            f"cos_sim has {len(cos_sim)} rows but {len(section_names)} section names were given"
        )

    # Work on a copy so the caller's matrix keeps its real diagonal.
    scratch = cos_sim.clone()
    most_similar_sections = []
    least_similar_sections = []
    for i in range(len(section_names)):
        row = scratch[i]

        # Exclude self-similarity from the maximum by masking the diagonal with -inf.
        row[i] = float("-inf")
        most_similar_sections.append(section_names[torch.argmax(row).item()])

        # Exclude self-similarity from the minimum by masking the diagonal with +inf.
        row[i] = float("inf")
        least_similar_sections.append(section_names[torch.argmin(row).item()])

    return most_similar_sections, least_similar_sections

# Per-section lookup lists, index-aligned with cleaned_section_names; the triplet builder
# below reads them through cleaned_section_names.index(...).
most_similar_sections, least_similar_sections = get_most_and_least_similar_sections(cossim_matrix)

In [11]:
class CitationsDataset:
    """Indexable view over citation records that yields labelled single-text `InputExample`s.

    Every record must provide a 'string' (the citation sentence) and a 'label'
    that is one of the keys of `label_to_id`.
    """

    # Intent label -> integer class id.
    label_to_id = {'background': 0, 'method': 1, 'result': 2}

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        record = self.data[item]
        texts = [record['string']]
        label_id = CitationsDataset.label_to_id[record['label']]
        return InputExample(guid=item, texts=texts, label=label_id)

In [12]:
class CitationsDataset_with_section_name:
    """Indexable view over citation records that also carries the section name.

    Each item is an `InputExample` whose `texts` is `[citation_sentence, section_name]`
    and whose `label` is the integer id of the record's intent label. Every record
    must provide 'string', 'sectionName' and 'label'.
    """

    # Intent label -> integer class id.
    label_to_id = {'background': 0, 'method': 1, 'result': 2}

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        record = self.data[item]
        texts = [record['string'], record['sectionName']]
        # Use this class's own mapping so the dataset does not depend on another class being defined.
        label_id = self.label_to_id[record['label']]
        return InputExample(guid=item, texts=texts, label=label_id)

In [13]:
# Parameters
train_batch_size = 16                # batch size of the training DataLoader
dev_batch_size = train_batch_size    # dev and test loaders reuse the training batch size
test_batch_size = train_batch_size
num_epochs = 5                       # number of epochs passed to model.fit and used for the warm-up step count

In [14]:
# Labelled single-sentence examples for the triplet-loss pre-training run (shuffle=True).
train_dataset = CitationsDataset(train_data)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

In [15]:
# def triplets_from_labeled_dataset(input_examples):
#     # Create triplets for a [(label, sentence), (label, sentence)...] dataset
#     # by using each example as an anchor and selecting randomly a
#     # positive instance with the same label and a negative instance with a different label
#     triplets = []
#     label2sentence = defaultdict(list)
#     for inp_example in input_examples:
#         print(inp_example)
#         label2sentence[inp_example.label].append(inp_example)

#     for inp_example in input_examples:
#         anchor = inp_example

#         if len(label2sentence[inp_example.label]) < 2:  # We need at least 2 examples per label to create a triplet
#             continue

#         positive = None
#         while positive is None or positive.guid == anchor.guid:
#             positive = random.choice(label2sentence[inp_example.label])

#         negative = None
#         while negative is None or negative.label == anchor.label:
#             negative = random.choice(input_examples)

#         triplets.append(InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

#     return triplets

In [16]:
def triplets_from_labeled_dataset_section_name(input_examples):
    """Build (anchor, positive, negative) sentence triplets driven by section-name similarity.

    Every example is used as an anchor once. Its positive is another example from
    the same section or, when the anchor is the only example of its section, a
    random example from the most similar section. Its negative is a random example
    from the least similar section.

    Relies on the notebook-level `cleaned_section_names`, `most_similar_sections`
    and `least_similar_sections`.

    Args:
        input_examples: iterable of examples whose `texts` is
            `[sentence, section_name]`; section names must already be cleaned so
            that they occur in `cleaned_section_names`.

    Returns:
        list: `InputExample`s with `texts == [anchor, positive, negative]` sentences.
        Anchors for which no positive or no negative candidate exists are skipped.

    Raises:
        ValueError: if an anchor's section name is not in `cleaned_section_names`.
    """
    # Materialise once: indexing a dataset builds a fresh example on every access.
    examples = list(input_examples)

    # Group examples by section and remember each example's slot inside its group,
    # so "another example of the same section" can be drawn without a retry loop.
    section2sentence = defaultdict(list)
    position_in_section = []
    for example in examples:
        bucket = section2sentence[example.texts[1]]
        position_in_section.append(len(bucket))
        bucket.append(example)

    # Section name -> row index, built once instead of a list scan per anchor.
    # setdefault keeps the first occurrence, matching list.index.
    section_index = {}
    for idx, name in enumerate(cleaned_section_names):
        section_index.setdefault(name, idx)

    triplets = []
    for anchor, own_position in zip(examples, position_in_section):
        section_name_anchor = anchor.texts[1]
        if section_name_anchor not in section_index:
            raise ValueError(f"{section_name_anchor!r} is not in cleaned_section_names")
        row = section_index[section_name_anchor]

        same_section = section2sentence[section_name_anchor]
        if len(same_section) >= 2:
            # Uniform draw over the other members of the section: skip the anchor's own slot.
            pick = random.randrange(len(same_section) - 1)
            if pick >= own_position:
                pick += 1
            positive = same_section[pick]
        else:
            # Anchor's section name is unique: fall back to the most similar section.
            positive_pool = section2sentence.get(most_similar_sections[row])
            if not positive_pool:
                continue
            positive = random.choice(positive_pool)

        negative_pool = section2sentence.get(least_similar_sections[row])
        if not negative_pool:
            continue
        negative = random.choice(negative_pool)

        triplets.append(InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

    return triplets

In [17]:
# Overwrite each dev record's section name with its cleaned form so it matches the entries of
# cleaned_section_names, which the triplet builder looks up by exact string.
# NOTE: this mutates dev_data in place; the raw section names are no longer available afterwards.
for data in dev_data:
    data['sectionName'] = clean_section_name(data['sectionName'])

In [18]:
# Dev examples carrying [sentence, section_name], and the section-similarity triplets mined from them.
dev_dataset = CitationsDataset_with_section_name(dev_data)
dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=dev_batch_size)
dev_triplets = triplets_from_labeled_dataset_section_name(dev_dataset)

In [19]:
# Sanity check: compare the number of triplets with the number of dev examples, then show the
# [anchor, positive, negative] sentences of the first triplet.
print(len(dev_triplets), len(dev_dataset))
print(dev_triplets[0].texts)

916 916
['These results are in contrast with the findings of Santos et al.(16), who reported a significant association between low sedentary time and healthy CVF among Portuguese', 'Two cohort studies that included only women showed robust risk estimates (Puett et al., 2008; Miller et al., 2007), whereas one cohort including only men showed no effects (Puett et al., 2011).', '…by offering a consistent way to differentiate two types of results that provide evidence of statistical mediation, drawing on earlier work (Baron & Kenny, 1986) and more recent advances in meditational modeling (Hayes, 2013; MacKinnon et al., 2007; Preacher, 2015; Preacher & Hayes, 2004).']


In [20]:
# Triplet loss applied to the labelled training batches.
train_loss = losses.BatchAllTripletLoss(model=model)
# Evaluator built from the section-name triplets mined from the dev split.
dev_evaluator = TripletEvaluator.from_input_examples(dev_triplets, name="scibert-dev-triplets")
# 10% of the total number of training steps (batches per epoch * epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

In [None]:
# Pre-train the sentence encoder with the triplet loss, evaluating with dev_evaluator every
# 1000 steps; output_path is the directory that later cells load the model from.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path='output/pre_trained_scibert-scicite-section-name'
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/516 [00:00<?, ?it/s]

In [2]:
# Sanity check pretraining results: reload the saved model from disk and embed a few sentences.
# NOTE: this rebinds `model` to the reloaded SentenceTransformer.
model = SentenceTransformer('output/pre_trained_scibert-scicite-section-name')

sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of strings.",
    "The quick brown fox jumps over the lazy dog.",
]

embeddings = model.encode(sentences)

# Print every sentence next to its full embedding vector.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [ 1.17864096e+00 -5.84065497e-01 -6.77036941e-02  4.84504759e-01
  2.65688390e-01 -1.75641465e+00  4.85001914e-02 -2.02495754e-02
 -3.39153230e-01 -3.82293016e-01  3.26851815e-01  3.73302072e-01
 -3.38200361e-01  5.47951341e-01 -6.95361078e-01 -7.47061014e-01
  2.99089905e-02  3.50935012e-01  5.34586072e-01  7.46623456e-01
  1.51853219e-01 -9.24881458e-01 -6.83541179e-01 -1.07481077e-01
  1.60978508e+00  6.61506832e-01  1.53244331e-01  1.34893203e+00
 -7.67008424e-01  7.77025521e-01 -2.47616738e-01 -9.11269844e-01
 -5.56402326e-01 -5.28302491e-01  2.41396688e-02  4.79721457e-01
  1.01778552e-01 -1.15958810e+00 -1.33165038e+00 -4.47587878e-01
 -8.20036530e-01  2.12848619e-01  1.18425739e+00 -6.51592672e-01
  7.03265369e-01  9.06864822e-01  5.81360459e-01  5.44916876e-02
 -9.39678192e-01 -4.85538095e-01  9.51498687e-01 -1.33218265e+00
 -3.10098886e-01  2.20246151e-01 -4.07119453e-01  4.68866646e-01
 -8.68563

In [3]:
class CitationIntentClassifier(nn.Module):
    """Linear citation-intent classifier on top of a SentenceTransformer encoder.

    Args:
        model_path: name or path handed to `SentenceTransformer(...)`.
        num_labels: number of output classes.
        fine_tune_encoder: when False (default, the previous behaviour) sentence
            embeddings come from `encode()`. That is an inference helper, so no
            gradient reaches the encoder and only the linear head is trained, even
            if the encoder's parameters are given to the optimizer. When True the
            texts are tokenised and sent through the encoder's regular forward
            pass, so the encoder is trained together with the head.
    """

    def __init__(self, model_path, num_labels, fine_tune_encoder=False):
        super().__init__()
        self.sentence_transformer = SentenceTransformer(model_path)
        self.classifier = nn.Linear(self.sentence_transformer.get_sentence_embedding_dimension(), num_labels)
        self.fine_tune_encoder = fine_tune_encoder

    def forward(self, input_texts):
        """Return the class logits for a batch of raw texts."""
        head_device = self.classifier.weight.device
        if self.fine_tune_encoder:
            texts = [input_texts] if isinstance(input_texts, str) else list(input_texts)
            features = self.sentence_transformer.tokenize(texts)
            features = {
                key: (value.to(head_device) if isinstance(value, torch.Tensor) else value)
                for key, value in features.items()
            }
            embeddings = self.sentence_transformer(features)['sentence_embedding']
        else:
            embeddings = self.sentence_transformer.encode(input_texts, convert_to_tensor=True)
        # Make sure the embeddings sit on the same device as the head before projecting.
        return self.classifier(embeddings.to(head_device))

In [8]:
# Build the classifier on top of the pre-trained encoder saved by the fit run and move it to `device`.
model_path = 'output/pre_trained_scibert-scicite-section-name'
num_labels = len(CitationsDataset.label_to_id)  # one output per intent label
citation_intent_classifier = CitationIntentClassifier(model_path, num_labels).to(device)

In [16]:
# Parameters
learning_rate = 2e-5   # learning rate handed to the Adam optimizer
num_epochs = 5         # epochs of the classifier training loop (rebinds the earlier num_epochs)

In [10]:
# Adam over every parameter registered on the classifier module (encoder and linear head).
optimizer = torch.optim.Adam(citation_intent_classifier.parameters(), lr=learning_rate)
# Cross-entropy over the class logits returned by the classifier.
loss_func = torch.nn.CrossEntropyLoss()

In [11]:
class CitationsDatasetWithoutInputExample():
    """Indexable view over citation records that yields plain `(text, label_id)` pairs.

    Every record must provide a 'string' and a 'label' that is a key of `label_to_id`.
    """

    # Intent label -> integer class id.
    label_to_id = {'background': 0, 'method': 1, 'result': 2}

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        record = self.data[item]
        text = record['string']
        label_id = CitationsDatasetWithoutInputExample.label_to_id[record['label']]
        return text, label_id

In [17]:
# Rebinds train_dataset / train_dataloader: the classifier consumes plain (text, label_id) pairs.
train_dataset = CitationsDatasetWithoutInputExample(train_data)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

In [18]:
# Rebinds dev_dataset / dev_dataloader to plain (text, label_id) pairs, kept in file order.
dev_dataset = CitationsDatasetWithoutInputExample(dev_data)
dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=dev_batch_size)

In [19]:
def train_epoch(model, dataloader, loss_func, optimizer):
    """Run one optimisation pass over `dataloader` and report the mean batch loss.

    Args:
        model: module called as `model(inputs)`; put into training mode here.
        dataloader: iterable of `(inputs, labels)` batches, `labels` being a tensor.
        loss_func: callable `loss_func(output, labels)` returning a scalar tensor.
        optimizer: optimizer stepped once per batch.

    Returns:
        float: mean of the per-batch losses, or nan if no batch was produced.
        The value is also printed.
    """
    model.train()
    # Labels follow the model's own device instead of relying on a notebook-level `device`.
    first_param = next(model.parameters(), None)
    total_loss = 0.0
    num_batches = 0
    for input_texts, labels in dataloader:
        if first_param is not None:
            labels = labels.to(first_param.device)
        optimizer.zero_grad()
        output = model(input_texts)
        loss = loss_func(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Training loss: {mean_loss}")
    return mean_loss

def evaluate(model, dataloader, loss_func):
    """Compute the mean batch loss and the accuracy of `model` over `dataloader`.

    Args:
        model: module called as `model(inputs)`; put into eval mode here.
        dataloader: iterable of `(inputs, labels)` batches, `labels` being a tensor.
        loss_func: callable `loss_func(output, labels)` returning a scalar tensor.

    Returns:
        tuple(float, float): `(mean_loss, accuracy)`; both are nan if no batch was
        produced. Both values are also printed.
    """
    model.eval()
    # Labels follow the model's own device instead of relying on a notebook-level `device`.
    first_param = next(model.parameters(), None)
    total_loss = 0.0
    total_correct = 0
    num_batches = 0
    num_samples = 0
    with torch.no_grad():
        for input_texts, labels in dataloader:
            if first_param is not None:
                labels = labels.to(first_param.device)
            output = model(input_texts)
            loss = loss_func(output, labels)
            total_loss += loss.item()
            total_correct += (output.argmax(1) == labels).sum().item()
            num_batches += 1
            # Count samples from the batches themselves so any iterable of batches works.
            num_samples += labels.size(0)
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    accuracy = total_correct / num_samples if num_samples else float('nan')
    print(f"Evaluation loss: {mean_loss}")
    print(f"Evaluation accuracy: {accuracy}")
    return mean_loss, accuracy

# Alternate one training pass over the train split with one evaluation pass over the dev split.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_epoch(citation_intent_classifier, train_dataloader, loss_func, optimizer)
    evaluate(citation_intent_classifier, dev_dataloader, loss_func)

Epoch 1/5
Training loss: 0.557525815774304
Evaluation loss: 0.4598917110726751
Evaluation accuracy: 0.8537117903930131
Epoch 2/5
Training loss: 0.22717828784397867
Evaluation loss: 0.4354725860316178
Evaluation accuracy: 0.8569868995633187
Epoch 3/5
Training loss: 0.18021156243229097
Evaluation loss: 0.45153859925681145
Evaluation accuracy: 0.8558951965065502
Epoch 4/5
Training loss: 0.16739635488610397
Evaluation loss: 0.468103614154047
Evaluation accuracy: 0.8548034934497817
Epoch 5/5
Training loss: 0.1626283915590697
Evaluation loss: 0.4811465257842993
Evaluation accuracy: 0.8548034934497817


In [20]:
# Test split as plain (text, label_id) pairs, kept in file order.
test_dataset = CitationsDatasetWithoutInputExample(test_data)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=test_batch_size)

In [21]:
def test(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for input_texts, labels in dataloader:
            labels = labels.to(device)
            output = model(input_texts)
            _, predicted_labels = torch.max(output, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return predictions, true_labels

# Predicted and true class ids for the whole test split.
predictions, true_labels = test(citation_intent_classifier, test_dataloader, device)

In [22]:
# Macro-averaged F1 over the test predictions.
f1 = f1_score(true_labels, predictions, average='macro')
print(f"F1 Score: {f1}")

F1 Score: 0.8581489305686997
