In [1]:
import random
import pandas as pd
from tqdm import tqdm

In [9]:
# Load qrels
root = "/gallery_louvre/dayoon.ko/research/sds/src/datasets/"
qrels_pth = root + "nq-train/qrels/train.tsv"
qrels = pd.read_csv(qrels_pth, sep="\t")

In [10]:
import random
qrels_sample = qrels.sample(n=1000, random_state=42)
qrels_sample

Unnamed: 0,query-id,corpus-id,score
21564,train21564,doc2962294,1
24011,train24011,doc3299462,1
30983,train30983,doc4244915,1
63770,train63770,doc8679304,1
119508,train119508,doc16274486,1
...,...,...,...
71268,train71268,doc9706714,1
126276,train126276,doc17189325,1
78607,train78607,doc10683762,1
117954,train117954,doc16055865,1


In [11]:
qrels_sample.to_csv("/gallery_louvre/dayoon.ko/research/sds/eval_retrieval/retrieval/results/bge-large-en-v1.5/nq.csv")

### Check corpus

In [12]:
# Load qrels
import json
root = "/gallery_louvre/dayoon.ko/research/sds/src/datasets/"
corpus_train = root + "nq-train/corpus.jsonl"
corpus_test = root + "nq/corpus.jsonl"
with open(corpus_train) as f:
    corpus_train = [json.loads(i) for i in f.readlines()]
with open(corpus_test) as f:
    corpus_test = [json.loads(i) for i in f.readlines()]

In [13]:
corpus_train_dict = {i["_id"]: i for i in tqdm(corpus_train)}
#corpus_test = [i for i in corpus_test if i["_id"] in qrels["corpus-id"]]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18060996/18060996 [00:13<00:00, 1361816.31it/s]


In [14]:
selected_corpus_train = [corpus_train_dict[cid] for cid in set(qrels["corpus-id"].unique())]

In [15]:
selected_corpus_train_pth = root + "nq-train/corpus_selected.jsonl"
with open(selected_corpus_train_pth, "w") as f:
    for i in selected_corpus_train:
        f.write(json.dumps(i) + "\n")

In [16]:
len(selected_corpus_train)

132803

## Check fever and climate-fever

In [3]:
# Load qrels
import json
root = "/gallery_louvre/dayoon.ko/research/sds/src/datasets/"
corpus_fever = root + "fever/corpus.jsonl"
corpus_climate_fever = root + "climate-fever/corpus.jsonl"
with open(corpus_fever) as f:
    corpus_fever = [json.loads(i)["_id"] for i in tqdm(f.readlines())]
with open(corpus_climate_fever) as f:
    corpus_climate_fever = [json.loads(i)["_id"] for i in tqdm(f.readlines())]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5416568/5416568 [00:22<00:00, 237803.40it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5416593/5416593 [00:22<00:00, 241846.06it/s]


In [4]:
set(corpus_climate_fever) - set(corpus_fever)

{'2006_Texas_gubernatorial_election',
 '2007_Kangaroo_Island_bushfires',
 '2010_Northern_Hemisphere_heat_waves',
 '2010_United_Kingdom_general_election',
 '2015_United_Kingdom_general_election',
 '2017_United_Kingdom_general_election',
 '2018_British_Isles_heat_wave',
 '2019_United_Kingdom_general_election',
 '2019_heat_wave_in_India_and_Pakistan',
 '2019_in_science',
 '2019–20_Australian_bushfire_season',
 'A_Scientific_Dissent_from_Darwinism',
 'African_humid_period',
 'Alcohol_(drug)',
 'Angstrom',
 'Art_Robinson',
 'Avoiding_Dangerous_Climate_Change_(2005_conference)',
 'Bernie_Sanders_2016_presidential_campaign',
 'Climate_change_(general_concept)',
 'Climate_change_in_Tuvalu',
 'Climate_system',
 'Climate_variability',
 'Coral_in_non-tropical_regions',
 'Donald_Trump_2016_presidential_campaign',
 'Earth_shelter',
 'Energy_subsidy',
 'Explosive',
 'Financial_crisis_of_2007–08',
 'GRACE_and_GRACE-FO',
 'Geysers_on_Mars',
 'Global_Energy_and_Water_Exchanges',
 'Global_temperature_re

In [5]:
# Load qrels
import pandas as pd
root = "/gallery_louvre/dayoon.ko/research/sds/src/datasets/"
qrels_pth = root + "climate-fever/qrels/test.tsv"
qrels = pd.read_csv(qrels_pth, sep="\t")

In [6]:
select_ids = set(qrels["corpus-id"].tolist()) - set(corpus_fever)

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import DirectoryLoader, DirectoryLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.utils import DistanceStrategy

import os
import fire
import torch
from tqdm import tqdm
from glob import glob


# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    del record["text"]
    metadata.update(record)
    return metadata

In [10]:
dataset_name = "climate-fever"
glob_dir:str = "corpus.jsonl"
repo: str = "BAAI" #"facebook" #"thenlper" #"intfloat"
model_name: str = "bge-large-en-v1.5" #"contriever" #"gte-base" #"multilingual-e5-large"
data_dir: str = '/gallery_louvre/dayoon.ko/research/sds/src/datasets'
db_faiss_dir: str = f"../vectorstore/{model_name}/{dataset_name}"
batch_size: int = 256

In [11]:
# Document
loader = JSONLoader(
            f"{data_dir}/{dataset_name}/{glob_dir}", 
            jq_schema=".",  
            content_key="text",
            json_lines=True,
            metadata_func=metadata_func
        )
documents = loader.load()

In [12]:
documents = [i for i in tqdm(documents) if i.metadata["_id"] in select_ids]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5416593/5416593 [00:02<00:00, 1925383.48it/s]


In [14]:
len(documents), model_name

(57, 'bge-large-en-v1.5')

In [15]:
print(f'Document count: {len(documents)}')

# Split document
embeddings = HuggingFaceEmbeddings(
                model_name=f"{repo}/{model_name}",
                model_kwargs={
                    'device': 'cpu',
                },
                encode_kwargs={
                    'batch_size': batch_size,
                    'device': 'cpu',
                }
            )    # Make a DB

print(f'Extract db from documents {db_faiss_dir}')
#db = FAISS.from_documents(
#        documents, 
#        embeddings,
#        normalize_L2 = True,
#        distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT
#    )
db_org = FAISS.load_local(f"../vectorstore/{model_name}/fever", embeddings=embeddings, allow_dangerous_deserialization=True) 
db_org.add_documents(documents)
print(f'Saving embeddings to {db_faiss_dir}')
db_org.save_local(f'{db_faiss_dir}')
print('Saved')

Document count: 57


  warn_deprecated(


Extract db from documents ../vectorstore/bge-large-en-v1.5/climate-fever


RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char*) at /project/faiss/faiss/impl/io.cpp:68: Error: 'f' failed: could not open ../vectorstore/bge-large-en-v1.5/fever/index.faiss for reading: No such file or directory