To start the pipeline, a tab-separated file with the following columns must exist at claims_unprocessed_path:<br>
* id - Claim's unique numerical identifier<br>
* claim - Text with the claim to be verified<br>
* class - Optional. The expected class, in case you want to evaluate the result of the classifier.

Each step can be run in a separate execution, as long as the steps are run in order. That is, if the kernel needs to be restarted, the previous cells do not have to be re-run, except for the cell that defines the file paths.

Trained models must be placed in the models folder and can be obtained here:
https://drive.google.com/drive/folders/1wZj_GJJ1O9goAPuYrTNahL2sE-jTk4Mm?usp=share_link

In [None]:
from strategies import ClassifierStrategy, EvidenceSelectStrategy

class FactCheckerParameters:
    """Strategy choices plus every file path used by the pipeline steps.

    All paths are derived from ``base_path + claims_file`` by swapping the
    trailing ``.tsv`` extension for a step-specific suffix. Only the final
    extension is swapped, so a ``.tsv`` appearing earlier in the path (for
    example in a directory name) is left untouched.
    """

    # Extension that every derived path is built from.
    _TSV_SUFFIX = ".tsv"

    def __init__(self, document_search_method, evidence_selection_method, classifier_method, base_path, url_categories_path, claims_file):
        """Store the settings, derive all paths and print them.

        Args:
            document_search_method: String label embedded in file names.
            evidence_selection_method: Object exposing a ``name`` attribute,
                embedded in file names.
            classifier_method: Object exposing a ``name`` attribute,
                embedded in file names.
            base_path: Prefix concatenated directly with ``claims_file``;
                it must already end with a path separator.
            url_categories_path: Stored unchanged.
            claims_file: Name of the claims file; must end with ``.tsv``.

        Raises:
            ValueError: If the resulting claims path does not end with
                ``.tsv``. Without the extension every derived path would be
                identical to ``claims_path``, so steps would overwrite each
                other's files.
        """
        self.document_search_method = document_search_method
        self.evidence_selection_method = evidence_selection_method
        self.classifier_method = classifier_method
        self.base_path = base_path
        self.url_categories_path = url_categories_path
        self.claims_path = base_path + claims_file
        if not self.claims_path.endswith(self._TSV_SUFFIX):
            raise ValueError(
                f"claims_file must end with '{self._TSV_SUFFIX}', got {claims_file!r}"
            )

        derive = self._replace_tsv_suffix
        self.claims_unprocessed_path = derive(self.claims_path, "_raw.tsv")
        self.claims_sbert_path = derive(self.claims_path, "_sbert1.pt")
        self.evidences_path = derive(self.claims_path, "_ev" + self.evInfo() + ".tsv")
        self.evidences_classified_path = derive(self.evidences_path, "_" + classifier_method.name + "_classified.tsv")
        self.claims_classified_path = derive(self.claims_path, self.classifierInfo() + "_classified.tsv")
        self.documents_path = derive(self.claims_path, self.docInfo() + "_docret.tsv")
        self.urls_directory = self.getUrlsDir(self.documents_path)
        self.claims_classified_same_doc_path = derive(self.claims_path, self.classifierInfo() + "_classifiedSameDoc.tsv")

        # Echo every configured/derived path so the notebook output shows
        # which files each step will read and write.
        for attribute in (
            "base_path",
            "url_categories_path",
            "claims_path",
            "claims_unprocessed_path",
            "claims_sbert_path",
            "evidences_path",
            "evidences_classified_path",
            "claims_classified_path",
            "documents_path",
            "urls_directory",
            "claims_classified_same_doc_path",
        ):
            print(attribute + ":", getattr(self, attribute))

    @staticmethod
    def _replace_tsv_suffix(path, replacement):
        """Return ``path`` with its trailing ``.tsv`` swapped for ``replacement``.

        A path that does not end with ``.tsv`` is returned unchanged.
        """
        suffix = FactCheckerParameters._TSV_SUFFIX
        if path.endswith(suffix):
            return path[:-len(suffix)] + replacement
        return path

    def getUrlsDir(self, doc_path):
        """Return the URLs directory path derived from ``doc_path``."""
        return self._replace_tsv_suffix(doc_path, "_urls/")

    def docInfo(self):
        """Return the file-name fragment for the document search method."""
        return "_" + self.document_search_method

    def evInfo(self):
        """Return ``docInfo()`` extended with the evidence selection name."""
        return self.docInfo() + "_" + self.evidence_selection_method.name

    def classifierInfo(self):
        """Return ``evInfo()`` extended with the classifier name."""
        return self.evInfo() + "_" + self.classifier_method.name
# Settings shared by every step below; all file paths are derived from them.
params = FactCheckerParameters(
    document_search_method="current",
    evidence_selection_method=EvidenceSelectStrategy.Context5,
    classifier_method=ClassifierStrategy.Bert3,
    base_path="bases/base3/",
    url_categories_path="urls_categories.txt",
    claims_file="base3_test.tsv",
)

## Pre-processing of claims

In [None]:
import dataset_loader
import preprocessing
from pathlib import Path
import csv

# Load the raw claims and store the output of tokenize_and_join for each
# claim text under the 'claim_clean' key.
claims = dataset_loader.loadClaims(params.claims_unprocessed_path)
for entry in claims:
    entry['claim_clean'] = preprocessing.tokenize_and_join(entry['claim'])

# Write a tab-separated file containing only the listed columns; any other
# keys present on a claim are dropped (extrasaction='ignore').
output_columns = ['id', 'claim_clean', 'class']
with Path(params.claims_path).open('w', encoding="utf-8", newline='') as claims_out:
    writer = csv.DictWriter(
        claims_out,
        fieldnames=output_columns,
        delimiter='\t',
        extrasaction='ignore',
    )
    writer.writeheader()
    writer.writerows(claims)

## Document retrieval

### Google search

In [None]:
import document_retrieval
import dataset_loader

# Load the pre-processed claims and hand them to the document search step
# together with the documents file path.
claims = dataset_loader.loadClaims(params.claims_path)
document_retrieval.documentRetrieval(
    claims,
    params.documents_path,
)

### Extract documents

In [None]:
import document_retrieval
# Run document extraction using the documents file, the URLs directory and
# the URL categories file configured in params.
document_retrieval.extractDocuments(
    params.documents_path,
    params.urls_directory,
    params.url_categories_path,
)

## Evidence Selection

### Saving embeddings

In [None]:
#Saving claims' embeddings
from select_evidences import SbertEncoder
import dataset_loader

# Load the pre-processed claims, then let the encoder store their SBERT
# embeddings at params.claims_sbert_path.
claims = dataset_loader.loadClaims(params.claims_path)
encoder = SbertEncoder(params.urls_directory)
encoder.save_sbert_claims(
    claims,
    params.claims_sbert_path,
)

In [None]:
#Saving documents' embeddings

# dataset_loader is imported here as well so this cell can run on its own
# after a kernel restart (only the parameters cell has to be re-run first).
import dataset_loader
from select_evidences import SbertEncoder

# Load the retrieved documents and let the encoder save their embeddings.
documents = dataset_loader.loadDocuments(params.documents_path, params.url_categories_path)
encoder = SbertEncoder(params.urls_directory)
encoder.save_sbert_documents(documents)

### Evidence Selection

In [None]:
from select_evidences import get_evidence_selector
from select_evidences import EvidenceSelectStrategy
import dataset_loader

# Load the claims and the retrieved documents, then run the configured
# evidence selector over them.
claims = dataset_loader.loadClaims(params.claims_path)
documents = dataset_loader.loadDocuments(params.documents_path, params.url_categories_path)
selector = get_evidence_selector(params.evidence_selection_method, params.urls_directory)
selector.selectEvidences(
    claims,
    documents,
    params.claims_sbert_path,
    params.evidences_path,
)

## Classification

In [1]:
import dataset_loader
import classification

def classify(params):
    """Load claims and evidences, then run the configured classifier.

    Prints the evidences path and the classifier description before
    delegating to ``classification.classify_claims``.

    Args:
        params: Object providing ``claims_path``, ``evidences_path``,
            ``evidences_classified_path``, ``claims_classified_path``,
            ``classifier_method`` and ``classifierInfo()``.
    """
    loaded_claims = dataset_loader.loadClaims(params.claims_path)

    print('Evidences:', params.evidences_path)
    loaded_evidences = dataset_loader.loadEvidences(params.evidences_path)

    print("Classification:", params.classifierInfo())

    classification.classify_claims(
        loaded_claims,
        loaded_evidences,
        params.evidences_classified_path,
        params.claims_classified_path,
        params.classifier_method,
    )

# Run the classification step with the module-level params object.
classify(params)