In [1]:
# Dependencies

import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

from encode import encode
from retrieve import get_top_k
from rerank import climate_rerank
from classify import classify_evidence
from majority_vote import majority_vote

  from .autonotebook import tqdm as notebook_tqdm


/home/jonas/Documents/Uni/AdvancedInformationRetrieval/air25/src/a2
Loading model labels: {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
/home/jonas/Documents/Uni/AdvancedInformationRetrieval/air25/src/a2
Loading model labels: {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}


In [2]:
# Load Data
# Read the three preprocessed CSV tables in a fixed order and bind them to
# `evidences`, `claims` and `mappings`. Paths are relative to the directory
# the notebook is run from.
evidences, claims, mappings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/processed/evidences.csv",
        "../../data/processed/claims.csv",
        "../../data/processed/mappings.csv",
    )
)

In [3]:
# Sentence Transformer
# Identifier of the pretrained checkpoint handed to SentenceTransformer.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Build the embedding model once; the Encode cell passes `transformer` to
# `encode` for both evidences and claims.
transformer = SentenceTransformer(model_name_or_path=MODEL)

In [4]:
# Encode
# Convert each text column to a plain Python list and pass it to `encode`
# together with the shared transformer. Evidences are processed first, then
# claims.
evidence_texts = evidences["evidence"].tolist()
vec_e = encode(evidence_texts, transformer)

claim_texts = claims["claim"].tolist()
vec_c = encode(claim_texts, transformer)

Batches: 100%|██████████| 161/161 [00:06<00:00, 24.86it/s]
Batches: 100%|██████████| 48/48 [00:01<00:00, 35.33it/s]


In [5]:
# Save Embeddings
# np.save does not create missing parent directories, so make sure the target
# folder exists first; without it the cell raises FileNotFoundError on a
# checkout where data/embeddings has not been created yet.
os.makedirs("../../data/embeddings", exist_ok=True)

# Persist both embedding sets so they can be reloaded without re-encoding.
np.save("../../data/embeddings/evidences_embeddings.npy", vec_e)
np.save("../../data/embeddings/claims_embeddings.npy", vec_c)

In [11]:
# Retrieve Test
# Spot-check retrieval for a single claim: print the claim, then the five
# evidences returned by get_top_k as "<rank> - <score> - <text>".
test_i = 9  # positional row of the claim to inspect

test_claim = claims["claim"].iloc[test_i]
print(test_claim)

top_indices, top_scores = get_top_k(vec_c[test_i], vec_e, k=5)

# Ranks are printed 1-based; `idx` is used as a positional row into `evidences`.
for rank, (idx, score) in enumerate(zip(top_indices, top_scores), start=1):
    evidence_text = evidences["evidence"].iloc[idx]
    print(f"{rank} - {score:.4f} - {evidence_text}")


Sea level rise has been slow and a constant, pre-dating industrialization
1 - 0.7672 - Sea-Level Rise from the Late 19th to the Early 21st Century.
2 - 0.7672 - The rate of rise started to slow down about 8,200 years before present; the sea level was almost constant in the last 2,500 years, before the recent rising trend that started at the end of the 19th century or in the beginning of the 20th.
3 - 0.7544 - Sea level rise will continue over many centuries.
4 - 0.7479 - 20th century acceleration in global sea-level rise.
5 - 0.7177 - One of the most striking trends - over a century of global-average sea level change.


In [12]:
# Retrieve All
# For every claim row, call get_top_k on its embedding and store the returned
# (evidence index, score) pairs in `all_retrievals`, keyed by the claim's id.
k = 10
all_retrievals = {}

for i in range(len(claims)):
    claim_id = claims["claim_id"].iloc[i]
    top_indices, top_scores = get_top_k(vec_c[i], vec_e, k=k)

    # Pair each retrieved index with its score, preserving get_top_k's order.
    all_retrievals[claim_id] = list(zip(top_indices, top_scores))



In [13]:
# Rerank cross encoder
# Re-score the candidates retrieved for the test claim with climate_rerank and
# print each result's score and the first 100 characters of its text, in the
# order climate_rerank returns them.

test_claim_id = claims.iloc[test_i]['claim_id']
test_claim_text = claims.iloc[test_i]['claim']

retrieved_tuples = all_retrievals[test_claim_id]

# Only the evidence row positions are needed here; retrieval scores are dropped.
candidate_texts = [evidences.iloc[idx]['evidence'] for idx, _ in retrieved_tuples]

# Use the claim text looked up in this cell instead of `test_claim` from the
# Retrieve Test cell, so the claim id, claim text and candidates are all
# derived from the same `test_i`.
reranked_results = climate_rerank(test_claim_text, candidate_texts)

print(f"Claim: {test_claim_text}\n")
for rank, (text, score) in enumerate(reranked_results, start=1):
    print(f"Rerank {rank}: Score {score:.4f} | {text[:100]}...")

Claim: Sea level rise has been slow and a constant, pre-dating industrialization

Rerank 1: Score 0.5233 | A number of later studies have concluded that a global sea level rise of 200 to 270 cm 6.6 to 8.9 ft...
Rerank 2: Score 0.4787 | For at least the last 100 years, sea level has been rising at an average rate of about 1.8 mm 0.07 i...
Rerank 3: Score 0.4604 | For at least the last 100 years, sea level has been rising at an average rate of about 1.8 millimetr...
Rerank 4: Score 0.3843 | Sea level rise will continue over many centuries....
Rerank 5: Score 0.3787 | 20th century acceleration in global sea-level rise....
Rerank 6: Score 0.3758 | Sea-Level Rise from the Late 19th to the Early 21st Century....
Rerank 7: Score 0.3752 | Sea level rise since 1990 was underestimated in older models, but now agrees well with observations....
Rerank 8: Score 0.3156 | One of the most striking trends - over a century of global-average sea level change....
Rerank 9: Score 0.3090 | This could mean r

In [14]:
# Majority Vote
# Pass the texts of the first k reranked results to majority_vote and print
# the claim, the returned verdict and the vote distribution.
# NOTE(review): this rebinds the global `k` that the Retrieve All cell also
# sets; both are 10 here, so the slice below keeps every reranked result.
k = 10
top_k_texts = [text for text, _ in reranked_results[:k]]
verdict, details = majority_vote(test_claim_text, top_k_texts)

print(
    f"Claim: {test_claim_text}",
    f"Final Decision: {verdict}",
    f"Vote Distribution: {dict(details)}",
    sep="\n",
)

Claim: Sea level rise has been slow and a constant, pre-dating industrialization
Final Decision: REFUTES
Vote Distribution: {'REFUTES': 7, 'NOT_ENOUGH_INFO': 3}
