In [1]:
import dataset_loader
import re

def normalize_url(url):
    """Return `url` in the normalised form used when comparing document URLs.

    Steps, in order: strip surrounding whitespace, drop a single trailing
    slash, and rewrite the "lupa.uol.com.br/jornalismo" prefix to
    "piaui.folha.uol.com.br/lupa".
    """
    trimmed = url.strip()
    # Only one trailing slash is removed ("a//" becomes "a/").
    if trimmed.endswith('/'):
        trimmed = trimmed[:-1]
    return trimmed.replace("lupa.uol.com.br/jornalismo", "piaui.folha.uol.com.br/lupa")

# Location of the claims dataset loaded by this cell.
base_path = "bases/base3/"
claims_file = "base3_test.tsv"

# Load the claims, then rewrite each claim's 'document' URL in place so it is
# stored in the form produced by normalize_url.
claims = dataset_loader.loadClaims(f"{base_path}{claims_file}")
for claim in claims:
    claim['document'] = normalize_url(claim['document'])

def search_claim(claim_id, claims_list=None):
    """Return the first claim whose 'id' equals `claim_id`.

    Parameters:
        claim_id: value compared with each claim's 'id' entry using `==`.
        claims_list: optional iterable of claims to search. When omitted, the
            module-level `claims` is searched, looked up at call time, so the
            result follows whatever `claims` is currently bound to.

    Returns:
        The matching claim object, or 0 when nothing matches. A
        "<claim_id> not found" line is printed in that case. The 0 sentinel is
        kept for backward compatibility with existing callers.
    """
    candidates = claims if claims_list is None else claims_list
    for claim in candidates:
        if claim['id'] == claim_id:
            return claim
    print(claim_id, 'not found')
    return 0

In [None]:
import csv
import sys
from collections import defaultdict

from strategies import ClassifierStrategy, EvidenceSelectStrategy

class FactCheckerParameters:
    """Strategy choices for one fact-checking run and the file paths derived from them.

    Every derived path is built from `claims_path` (`base_path + claims_file`)
    by swapping its trailing ".tsv" for a suffix that encodes the strategies.

    Parameters:
        document_search_method: string naming the document search method; it
            is concatenated into file names.
        evidence_selection_method: object whose `.name` is used in file names.
        classifier_method: object whose `.name` is used in file names.
        base_path: directory prefix, concatenated as-is with `claims_file`
            (so it must already end with a path separator).
        url_categories_path: stored unchanged.
        claims_file: file name of the claims TSV inside `base_path`.

    Attributes set besides the constructor arguments:
        claims_path, claims_unprocessed_path, claims_sbert_path,
        evidences_path, evidences_classified_path, claims_classified_path,
        documents_path, urls_directory, claims_classified_same_doc_path.
    """

    def __init__(self, document_search_method, evidence_selection_method, classifier_method, base_path, url_categories_path, claims_file):
        self.document_search_method = document_search_method
        self.evidence_selection_method = evidence_selection_method
        self.classifier_method = classifier_method
        self.base_path = base_path
        self.url_categories_path = url_categories_path
        self.claims_path = base_path + claims_file

        swap = self._swap_extension
        self.claims_unprocessed_path = swap(self.claims_path, "_raw.tsv")
        self.claims_sbert_path = swap(self.claims_path, "_sbert1.pt")
        self.evidences_path = swap(self.claims_path, "_ev" + self.evInfo() + ".tsv")
        self.evidences_classified_path = swap(self.evidences_path, "_" + classifier_method.name + "_classified.tsv")
        self.claims_classified_path = swap(self.claims_path, self.classifierInfo() + "_classified.tsv")
        self.documents_path = swap(self.claims_path, self.docInfo() + "_docret.tsv")
        self.urls_directory = self.getUrlsDir(self.documents_path)
        self.claims_classified_same_doc_path = swap(self.claims_path, self.classifierInfo() + "_classifiedSameDoc.tsv")

    @staticmethod
    def _swap_extension(path, replacement, extension=".tsv"):
        """Return `path` with its trailing `extension` replaced by `replacement`.

        Only the end of the path is touched, so an `extension` substring in a
        directory name is left alone. When `path` does not end with
        `extension`, `replacement` is appended instead, so the result never
        equals the input path (derived output files cannot alias the source).
        """
        stem = path[:-len(extension)] if path.endswith(extension) else path
        return stem + replacement

    def getUrlsDir(self, doc_path):
        """Return the URLs directory for `doc_path`: trailing ".tsv" becomes "_urls/"."""
        return self._swap_extension(doc_path, "_urls/")

    def docInfo(self):
        """Return the file-name fragment for the document search method."""
        return "_" + self.document_search_method

    def evInfo(self):
        """Return `docInfo()` followed by "_" and the evidence selection method's name."""
        return self.docInfo() + "_" + self.evidence_selection_method.name

    def classifierInfo(self):
        """Return `evInfo()` followed by "_" and the classifier method's name."""
        return self.evInfo() + "_" + self.classifier_method.name

In [None]:
#EVALUATION OF THE WHOLE FACT-CHECKING PROCESS WITH DIFFERENT STRATEGIES FOR SELECTING THE EVIDENCE

evStrategies = [
  EvidenceSelectStrategy.Sentence1,
  EvidenceSelectStrategy.Sentence1NoQuotes,
  EvidenceSelectStrategy.TitleSentence1,
  EvidenceSelectStrategy.Context5,
  EvidenceSelectStrategy.Context5NoQuotes,
  EvidenceSelectStrategy.TitleContext5,
  EvidenceSelectStrategy.TitleContext5NoQuotes,
  EvidenceSelectStrategy.OnlyTitle,
]

classStrategies = [
  ClassifierStrategy.Bert3
]
docStrategies = ["acurrent"]


def _ratio(numerator, denominator):
  """Return numerator/denominator, or 0.0 when the denominator is zero."""
  return numerator/denominator if denominator else 0.0


def _f1(precision, recall):
  """Return the harmonic mean of precision and recall, or 0.0 when either is zero."""
  if not precision or not recall:
    return 0.0
  return 2/(precision**(-1)+recall**(-1))


# One tab-separated header line, then one row per strategy combination whose
# classified-claims file could be read.
print('method', 'true_supports','true_refutes', 'true_NEI', 'false_supports','false_refutes', 'false_NEI', 'ptrue_supports', 'ptrue_refutes', 'ptrue_NEI', 'pfalse_supports', 'pfalse_refutes', 'pfalse_NEI', 'precision_refutes', 'recall_false', 'f1_refutes', 'precision_supports', 'recall_true', 'f1_supports', 'accuracy', 'avg_precision', 'avg_recall', 'avg_f1', sep="\t")
for docStrategy in docStrategies:
  for classStrategy in classStrategies:
    for evStrategy in evStrategies:
      params = FactCheckerParameters(
          document_search_method = docStrategy,
          evidence_selection_method = evStrategy,
          classifier_method = classStrategy,
          base_path = "bases/base3/",
          url_categories_path = "urls_categories.txt",
          claims_file = "base3_test.tsv",
      )
      samples = []
      # (claim_class, predicted_label) -> number of classified claims.
      dict_count = defaultdict(int)
      path = params.claims_classified_path
      #path = params.evidences_classified_path
      total = 0
      try:
        # newline='' is what the csv module expects for files it parses.
        with open(path, 'r', encoding='utf-8', newline='') as f:
          for sample in csv.DictReader(f, delimiter="\t", skipinitialspace=True):
            sample['claim_id'] = int(sample['claim_id'])
            dict_count[(sample['claim_class'], sample['predicted_label'])] += 1
            total += 1
            samples.append(sample)
      except (OSError, csv.Error, KeyError, TypeError, ValueError) as error:
        # A missing or malformed result file skips this combination only; the
        # reason goes to stderr so the TSV on stdout stays clean.
        print('skipping', path, repr(error), sep='\t', file=sys.stderr)
        continue
      if total == 0:
        print('skipping', path, 'no classified claims', sep='\t', file=sys.stderr)
        continue

      true_supports = dict_count[('VERDADEIRO', 'SUPORTA')]
      true_refutes = dict_count[('VERDADEIRO', 'REFUTA')]
      true_NEI = dict_count[('VERDADEIRO', 'INSUFICIENTE')]
      false_supports = dict_count[('FALSO', 'SUPORTA')]
      false_refutes = dict_count[('FALSO', 'REFUTA')]
      false_NEI = dict_count[('FALSO', 'INSUFICIENTE')]
      true_total = true_supports + true_refutes + true_NEI
      false_total = false_supports + false_refutes + false_NEI

      # Undefined ratios (zero denominator) are reported as 0.0 so that a weak
      # strategy still produces a row instead of being dropped from the table.
      accuracy = (true_supports + false_refutes)/total
      recall_true = _ratio(true_supports, true_total)
      recall_false = _ratio(false_refutes, false_total)
      precision_supports = _ratio(true_supports, true_supports + false_supports)
      precision_refutes = _ratio(false_refutes, true_refutes + false_refutes)
      f1_supports = _f1(precision_supports, recall_true)
      f1_refutes = _f1(precision_refutes, recall_false)
      avg_precision = (precision_supports + precision_refutes)/2
      avg_recall = (recall_true + recall_false)/2
      avg_f1 = (f1_supports + f1_refutes)/2
      # Distribution of predicted labels within each gold class.
      ptrue_supports = _ratio(true_supports, true_total)
      ptrue_refutes = _ratio(true_refutes, true_total)
      ptrue_NEI = _ratio(true_NEI, true_total)
      pfalse_supports = _ratio(false_supports, false_total)
      pfalse_refutes = _ratio(false_refutes, false_total)
      pfalse_NEI = _ratio(false_NEI, false_total)

      print(path.replace(params.base_path, ""), true_supports,true_refutes, true_NEI, false_supports,false_refutes, false_NEI, ptrue_supports,ptrue_refutes, ptrue_NEI, pfalse_supports,pfalse_refutes, pfalse_NEI, precision_refutes, recall_false, f1_refutes, precision_supports, recall_true, f1_supports, accuracy, avg_precision, avg_recall, avg_f1, sep="\t")

In [None]:
#EVALUATION OF PERCENTAGE OF CLAIMS CORRECTLY CLASSIFIED WITH AT LEAST ONE PIECE OF EVIDENCE CORRECT FROM THE ANNOTATED DOCUMENT

import csv

def readDictFile(filepath, delimiter = '\t'):
  """Read a delimited UTF-8 text file with a header row into a list of row dicts.

  Parameters:
      filepath: path of the file to read.
      delimiter: field separator, tab by default.

  Returns:
      A list with one mapping per data row, keyed by the header names.
      Whitespace directly after a delimiter is dropped (skipinitialspace).

  Raises:
      OSError (for example FileNotFoundError) when the file cannot be opened.
  """
  # newline='' lets the csv module do its own line-ending handling, so a
  # quoted field containing a line break is returned unchanged.
  with open(filepath, 'r', encoding='utf-8', newline='') as f:
    return list(csv.DictReader(f, delimiter=delimiter, skipinitialspace=True))



# Claims are loaded on the first loop iteration (the path comes from `params`)
# and reused for every later strategy.
claims = []
# claim id -> normalised annotated document URL; the first claim with an id wins.
claim_documents = {}

for classStrategy in classStrategies:
  for evStrategy in evStrategies:
    params = FactCheckerParameters(
        document_search_method = "acurrent",
        evidence_selection_method = evStrategy,
        classifier_method = classStrategy,
        base_path = "bases/base3/",
        url_categories_path = "urls_categories.txt",
        claims_file = "base3_test.tsv",
    )
    evidences_classified = readDictFile(params.evidences_classified_path)
    claims_classified = readDictFile(params.claims_classified_path)

    for e in evidences_classified:
      e['url'] = normalize_url(e['url'])

    if not claims:
      claims = dataset_loader.loadClaims(params.claims_path)
      claim_documents = {}
      for c in claims:
        c['document'] = normalize_url(c['document'])
        claim_documents.setdefault(c['id'], c['document'])

    # (claim_id, predicted_label, url) of every classified evidence, so the
    # same-document check below is a set lookup instead of a scan per claim.
    evidence_keys = {(e['claim_id'], e['predicted_label'], e['url']) for e in evidences_classified}

    count_true = 0
    count_false = 0
    count_same_document = 0
    count_same_document_true = 0
    count_same_document_false = 0

    with open(params.claims_classified_same_doc_path, 'w',  encoding="utf-8", newline='') as f:
      fieldnames = ['claim_id', 'claim_class', 'predicted_label', 'correct', 'correct_same_document', 'document']
      writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t', extrasaction='ignore')
      writer.writeheader()

      for c in claims_classified:
        correct = 0
        same_document = 0
        # '' when the classified claim id is not present in the claims dataset.
        claim_document = claim_documents.get(int(c['claim_id']), '')
        is_false_claim = c['claim_class'] == 'FALSO'
        # 'FALSO' claims count as false and everything else as true, matching
        # how the same-document counters are split.
        if is_false_claim:
          count_false += 1
        else:
          count_true += 1
        if (is_false_claim and c['predicted_label'] == 'REFUTA') or (c['claim_class'] == 'VERDADEIRO' and c['predicted_label'] == 'SUPORTA'):
          correct = 1
          # Correct AND backed by at least one evidence with the same label
          # taken from the claim's annotated document.
          if (c['claim_id'], c['predicted_label'], claim_document) in evidence_keys:
            same_document = 1
            count_same_document += 1
            if is_false_claim:
              count_same_document_false += 1
            else:
              count_same_document_true += 1
        writer.writerow({
          'claim_id': c['claim_id'],
          'claim_class': c['claim_class'],
          'predicted_label': c['predicted_label'],
          'correct': correct,
          'correct_same_document': same_document,
          'document' : claim_document
          })
    print('***',evStrategy.name, len(claims_classified), count_same_document, count_true, count_same_document_true,count_false,count_same_document_false, sep='\t')