Project set-up


In [2]:
!pip install torch transformers sentence-transformers spacy pdfplumber scikit-learn nltk faiss-cpu
!python -m spacy download en_core_web_sm

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

Upload files

In [3]:
from google.colab import files
uploaded = files.upload()

Saving aerospace-11-00122.pdf to aerospace-11-00122.pdf
Saving aerospace-12-00674-v2.pdf to aerospace-12-00674-v2.pdf


Pdf extraction

In [4]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text + "\n"
    return full_text

paper_a = extract_text_from_pdf("aerospace-11-00122.pdf")
paper_b = extract_text_from_pdf("aerospace-12-00674-v2.pdf")

print("Paper A length:", len(paper_a))
print("Paper B length:", len(paper_b))

Paper A length: 57362
Paper B length: 53062


Cleaning the files

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

def split_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 20]

sentences_a = split_sentences(paper_a)
sentences_b = split_sentences(paper_b)

print("Sentences A:", len(sentences_a))
print("Sentences B:", len(sentences_b))

Sentences A: 393
Sentences B: 357


In [6]:
claim_keywords = [
    "we propose", "we present", "we show", "we demonstrate",
    "our results", "significant", "improves", "outperforms",
    "novel", "first", "contrary", "however", "we conclude"
]

def extract_claims(sentences):
    claims = []
    for s in sentences:
        s_lower = s.lower()
        if any(keyword in s_lower for keyword in claim_keywords):
            claims.append(s)
    return claims

claims_a = extract_claims(sentences_a)
claims_b = extract_claims(sentences_b)

print("Claims A:", len(claims_a))
print("Claims B:", len(claims_b))

Claims A: 19
Claims B: 22


Embedding model

In [None]:
from sentence_transformers import SentenceTransformer
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

emb_a = embed_model.encode(claims_a)
emb_b = embed_model.encode(claims_b)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
print("Claims A:", len(claims_a))
print("Claims B:", len(claims_b))


Claims A: 19
Claims B: 22


Novelty score


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

similarity_matrix = cosine_similarity(emb_a, emb_b)

max_similarities = similarity_matrix.max(axis=1)
novelty_score = np.mean(1 - max_similarities)

print("Novelty Score:", round(float(novelty_score), 3))


Novelty Score: 0.533


contradiction detection

In [9]:
from transformers import pipeline

nli = pipeline("text-classification", model="facebook/bart-large-mnli", device=0)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [17]:
pairs = []
for a in claims_a:
    for b in claims_b:
        pairs.append({"text": a, "text_pair": b})

results = nli(pairs, batch_size=16)

contradictions = []

for i, res in enumerate(results):
    if res['label'] == 'CONTRADICTION' and res['score'] > 0.5:

        a = claims_a[i // len(claims_b)]
        b = claims_b[i % len(claims_b)]
        contradictions.append((a, b, res['score']))

print("Number of contradictions:", len(contradictions))


Number of contradictions: 0


In [19]:
disagreements = []

for idx, res in enumerate(results):
    i, j = index_map[idx]
    similarity = similarity_matrix[i][j]

    if similarity > 0.6 and res['label'] != 'ENTAILMENT':
        disagreements.append((claims_a[i], claims_b[j], res['label'], res['score']))

print("Number of Disagreements:", len(disagreements))


Number of Disagreements: 0


In [20]:
if len(pairs) > 0:
    disagreement_score = len(disagreements) / len(pairs)
else:
    disagreement_score = 0

print("Disagreement Score:", round(disagreement_score, 3))


Disagreement Score: 0


In [18]:
pairs = []
index_map = []

# only compare semantically similar claims
for i, a in enumerate(claims_a):
    for j, b in enumerate(claims_b):
        if similarity_matrix[i][j] > 0.6:   # similarity filter
            pairs.append({"text": a, "text_pair": b})
            index_map.append((i, j))

# run NLI only on filtered pairs
results = nli(pairs, batch_size=16)

contradictions = []

for idx, res in enumerate(results):
    if res['label'] == 'CONTRADICTION' and res['score'] > 0.5:
        i, j = index_map[idx]
        a = claims_a[i]
        b = claims_b[j]
        contradictions.append((a, b, res['score']))

print("Number of contradictions:", len(contradictions))


Number of contradictions: 0


Contradiction score

In [11]:
total_comparisons = len(claims_a) * len(claims_b)
contradiction_score = len(contradictions) / total_comparisons
print("Contradiction Score:", round(contradiction_score, 3))


Contradiction Score: 0.0


Top contradictions

In [13]:
for c in contradictions[:5]:
    print("\n---")
    print("Paper A:", c[0])
    print("Paper B:", c[1])
    print("Score:", round(c[2],3))

In [14]:
from collections import Counter

labels = [r['label'] for r in results]
print(Counter(labels))


Counter({'neutral': 290, 'contradiction': 112, 'entailment': 16})


In [16]:
if res['label'] == 'CONTRADICTION' and res['score'] > 0.5:


SyntaxError: incomplete input (ipython-input-1680964515.py, line 1)