In [6]:
# Dependencies

import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

from encode import encode
from retrieve import get_top_k

In [7]:
# Load Data
#
# Read the three processed CSV tables. The paths are relative, so they are
# resolved against the kernel's current working directory.

# Evidence passages; the 'evidence' text column is what gets embedded and retrieved below.
evidences = pd.read_csv("../../data/processed/evidences.csv")
# Claims; this notebook reads the 'claim_id' and 'claim' columns.
claims = pd.read_csv("../../data/processed/claims.csv")
# NOTE(review): presumably the claim-to-evidence links; not used in the cells visible here — confirm.
mappings = pd.read_csv("../../data/processed/mappings.csv")

In [8]:
# Sentence Transformer
#
# Build the embedding model once; the same instance is passed to encode()
# for both the evidences and the claims.

# Model identifier handed to SentenceTransformer.
MODEL = "sentence-transformers/all-MiniLM-L6-v2"

transformer = SentenceTransformer(MODEL)

In [9]:
# Encode
#
# Embed the evidence texts and the claim texts with the shared transformer.
# Later cells index vec_c by claim row position and look up evidences with
# the positions returned from retrieval, so encode() is assumed to return one
# embedding per input text, in input order — TODO confirm against encode.py.

evidence_texts = evidences['evidence'].tolist()
vec_e = encode(evidence_texts, transformer)

claim_texts = claims['claim'].tolist()
vec_c = encode(claim_texts, transformer)

Batches: 100%|██████████| 161/161 [00:07<00:00, 20.24it/s]
Batches: 100%|██████████| 48/48 [00:01<00:00, 29.02it/s]


In [10]:
# Save Embeddings
#
# Persist the evidence and claim embeddings as .npy files so they can be
# reloaded without re-encoding.

EMBEDDINGS_DIR = "../../data/embeddings"

# np.save does not create missing parent directories, so make sure the target
# folder exists first (a no-op when it is already there).
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

np.save(f"{EMBEDDINGS_DIR}/evidences_embeddings.npy", vec_e)
np.save(f"{EMBEDDINGS_DIR}/claims_embeddings.npy", vec_c)

In [14]:
# Retrieve Test
#
# Sanity check: print one claim followed by the five results get_top_k
# returns for it, one per line as "rank - score - evidence text".
test_i = 0
test_claim = claims['claim'].iloc[test_i]
print(test_claim)

top_indices, top_scores = get_top_k(vec_c[test_i], vec_e, k=5)

# Each returned index is used as a row position into `evidences`;
# the rank is displayed 1-based.
hits = zip(top_indices, top_scores)
for rank, (idx, score) in enumerate(hits):
    evidence_text = evidences['evidence'].iloc[idx]
    print(f"{rank + 1} - {score:.4f} - {evidence_text}")


Global warming is driving polar bears toward extinction
1 - 0.7538 - Rising global temperatures, caused by the greenhouse effect, contribute to habitat destruction, endangering various species, such as the polar bear.
2 - 0.7427 - Steven Amstrup and other U.S. Geological Survey scientists have predicted two-thirds of the worlds polar bears may disappear by 2050, based on moderate projections for the shrinking of summer sea ice caused by climate change, though the validity of this study has been debated.
3 - 0.7105 - Bear hunting caught in global warming debate.
4 - 0.6742 - Global warming is a major threat to global biodiversity.
5 - 0.6705 - The extinction risk of global warming is the risk of species becoming extinct due to the effects of global warming.


In [16]:
# Retrieve All
#
# Run retrieval for every claim and collect the results in a dict:
#   claim_id -> list of (evidence index, score) tuples,
# in the order get_top_k returns them.

k = 10
claim_ids = claims['claim_id']
all_retrievals = {}

for i in range(len(claims)):
    claim_id = claim_ids.iloc[i]

    # vec_c[i] is taken to be the embedding of the claim in row i.
    top_indices, top_scores = get_top_k(vec_c[i], vec_e, k=k)

    all_retrievals[claim_id] = list(zip(top_indices, top_scores))



In [None]:
# Rerank cross encoder

In [None]:
# BERT Classifier