## Preprocess lupa

In [None]:
#Lupa

import os
import csv
import preprocessing
from pathlib import Path

# Read the treated Lupa excerpts and add cleaned versions of the claim and
# evidence columns, produced by the project's preprocessing.tokenize_and_join.
claims = []
with open('lupa/bases/lupa_trechos_tratados.tsv', 'r', encoding='utf-8') as f:
    read_claims = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    for claim in read_claims:
        claim['claim_clean'] = preprocessing.tokenize_and_join(claim['claim'])
        claim['evidence_clean'] = preprocessing.tokenize_and_join(claim['evidence'])
        claims.append(claim)

# Write the enriched rows. Keys that are not listed in fieldnames are dropped
# because the writer is created with extrasaction='ignore'.
with Path('lupa/bases/lupa_trechos_limpos.tsv').open('w', encoding="utf-8", newline='') as f2:
    fieldnames = ['id', 'row', 'trecho', 'qtd_trechos', 'source', 'claim', 'claim_clean', 'metadata', 'class', 'evidence', 'evidence_clean']
    print(fieldnames)
    writer = csv.DictWriter(f2, fieldnames=fieldnames, delimiter='\t', extrasaction='ignore')
    writer.writeheader()
    writer.writerows(claims)

## Join datasets

In [None]:
import os
import csv
from pathlib import Path
import re
from datetime import datetime

def standardizeDate(date):
    """Normalize a day-first date string to ``YYYY/MM/DD``.

    Dashes are replaced by slashes first. The result is then parsed as
    ``dd/mm/yy`` and, if that fails, as ``dd/mm/YYYY``.

    Args:
        date: Date text to normalize.

    Returns:
        The date formatted as ``YYYY/MM/DD`` when one of the two formats
        matches; otherwise the input with only the dash replacement applied.
    """
    date = date.replace("-", "/")
    for date_format in ('%d/%m/%y', '%d/%m/%Y'):
        try:
            return datetime.strptime(date, date_format).strftime("%Y/%m/%d")
        except ValueError:
            # The text does not match this format; try the next one.
            continue
    return date


def validItemFakeRecogna(item):
    """Decide whether a FakeRecogna row has a title usable as a claim.

    The title (``item['Titulo']``) is lower-cased and rejected when it
    contains any of the blocked fragments below, or when its first
    space-separated word is 'como'.

    Returns:
        True when the title passes both checks, False otherwise.
    """
    lowered_title = item['Titulo'].lower()
    blocked_fragments = (
        '?', 'confira', 'entenda', 'veja', 'leia', 'últimas notícias', 'as notícias', ' y ',
        'coisas que você precisa saber',
        'casos de coronavírus no brasil',
    )
    if any(fragment in lowered_title for fragment in blocked_fragments):
        return False
    first_word = lowered_title.partition(' ')[0]
    return first_word != 'como'

# Accumulator for the joined dataset and the running id given to accepted rows.
claims = []
global_row = 0

# Lupa: keep only the rows labelled VERDADEIRO or FALSO.
with open('lupa/bases/lupa_trechos_limpos.tsv', 'r', encoding='utf-8') as f:
    read_claims = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    row = 0
    for item in read_claims:
        if item['class'] in ('VERDADEIRO', 'FALSO'):
            source_url = item['source']
            claims.append({
                'id': global_row,
                'id_base': item['id'],
                'base': 'lupa',
                # The first 10 characters left after dropping the site prefix
                # from the source URL are stored as the publication date.
                'date_published': source_url.replace("https://piaui.folha.uol.com.br/lupa/", "")[0:10],
                'domain': 'piaui.folha.uol.com.br/lupa',
                'document': source_url,
                'class': item['class'],
                'claim': item['claim'],
                'evidence': item['evidence'],
            })
            global_row += 1
        row += 1

# FACTCK.BR: keep FALSO/VERDADEIRO rows whose review body is present and is
# not the literal placeholder 'Empty'.
with open('bases/FACTCKBR.tsv', 'r', encoding='utf-8') as f:
    read = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    for row, item in enumerate(read):
        itemClass = item['alternativeName'].upper()
        if itemClass not in ('FALSO', 'VERDADEIRO'):
            continue
        if not item['reviewBody'] or item['reviewBody'] == 'Empty':
            continue
        url = item['URL']
        claims.append({
            'id': global_row,
            'id_base': row,
            'base': 'factckbr',
            'date_published': item['datePublished'],
            'document': url,
            # Host part of the URL: scheme removed, text before the first '/'.
            'domain': url.replace("http://", "").replace("https://", "").split("/")[0],
            'class': itemClass,
            'claim': item['claimReviewed'],
            'evidence': item['reviewBody'],
        })
        global_row += 1

# Patterns removed from fakepedia claim text, applied in this order. Each one
# is used with re.S, so '.*' also consumes the rest of the text across lines.
claim_cleanup_patterns = (
    r'Confira dica para não cair, nunca mais, em boatos.*',
    r'Ps.: Esse artigo é uma sugestão .*',
    r'PS: esse artigo foi uma sugestão .*',
    r'S: esse artigo foi uma sugestão .*',
    r'Este texto foi escrito pelo leitor .*',
    r'P.S.: Esse artigo é uma sugestão de leitores .*',
    r'Ps.: Esse artigo foi uma sugestão .*',
    r'Os 10 boatos que mais bombaram.*',
    r'<span .*</span>',
    r'Lista de fake news das eleições .*',
    r'Confira a lista de todas as fake news .*',
    r'Clique nos links “bit.ly” para acessar nossos perfis:.*',
    r'– Siga-nos no Facebook.*',
)

# fakepedia (boatos): every row with status 'OK' is a FALSO claim.
with open('bases/fakepedia_boatos.tsv', 'r', encoding='utf-8') as f:
    read = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    for row, item in enumerate(read):
        if item['status'] != 'OK':
            continue

        claim_text = item['original_news']
        for pattern in claim_cleanup_patterns:
            claim_text = re.sub(pattern, "", claim_text, flags=re.S)
        claim_text = claim_text.strip()

        url = item['url']
        claim = {
            'id': global_row,
            'id_base': row,
            'base': 'fakepedia',
            'date_published': item['published'],
            'document': url,
            'domain': url.replace("http://", "").replace("https://", "").split("/")[0],
            'class': 'FALSO',
            'claim': claim_text,
            # Only the <span ...</span> stretch is removed from the evidence.
            'evidence': re.sub(r'<span .*</span>', "", item['content'], flags=re.S),
        }

        # Rows whose claim is empty after the cleanup are skipped and do not
        # consume an id.
        if claim['claim']:
            claims.append(claim)
            global_row += 1

# FakeRecogna: rows with Classe '1' and an acceptable title are added as
# VERDADEIRO claims, with the title as claim and the news body as evidence.
with open('bases/fakeRecogna.tsv', 'r', encoding='utf-8') as f:
    read = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    for row, item in enumerate(read):
        if item['Classe'] != '1' or not validItemFakeRecogna(item):
            continue
        url = item['URL']
        claims.append({
            'id': global_row,
            'id_base': row,
            'base': 'fakeRecogna',
            'date_published': item['Data'],
            'document': url,
            'domain': url.replace("http://", "").replace("https://", "").split("/")[0],
            'class': 'VERDADEIRO',
            'claim': item['Titulo'],
            'evidence': item['Noticia'],
        })
        global_row += 1

# Put in this file urls that are not good to include in the dataset
with open('documentos_ruins.txt', 'r', encoding='utf-8') as f:
    read = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    bad_documents = [item['url'] for item in read]

# Put in this file the text of claims that are not good to include in the dataset
with open('claims_ruins.txt', 'r', encoding='utf-8') as f:
    read = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    bad_claims = [item['claim'] for item in read]

# Sets make each membership test O(1) instead of scanning the whole list for
# every claim; the rows that are kept are the same.
bad_document_set = set(bad_documents)
bad_claim_set = set(bad_claims)
claims = [
    claim for claim in claims
    if claim['document'] not in bad_document_set and claim['claim'] not in bad_claim_set
]

# Removing duplicates: keep the first row seen for each distinct claim text.
# A set of the texts already kept replaces the nested scan over the output
# list, so the pass is linear in the number of claims.
seen_claim_texts = set()
claims_without_duplicates = []
for claim in claims:
    if claim['claim'] in seen_claim_texts:
        continue
    seen_claim_texts.add(claim['claim'])
    claims_without_duplicates.append(claim)
claims = claims_without_duplicates

# Attach length statistics to every row and trim the publication date.
for claim in claims:
    claim_text = claim['claim']
    evidence_text = claim['evidence']
    # Drop the "Publicado em  " label and keep the text before the first space.
    published = claim['date_published'].replace("Publicado em  ", "")
    claim.update({
        'num_words_claim': len(claim_text.split(' ')),
        'num_words_evidence': len(evidence_text.split(' ')),
        'num_chars_claim': len(claim_text),
        'num_chars_evidence': len(evidence_text),
        'date_published': published.split(" ")[0],
    })

# limit 300 characters
claims = [claim for claim in claims if not claim['num_chars_claim'] > 300]

# Count the rows of each class.
qtd_fake = sum(1 for claim in claims if claim['class'] == 'FALSO')
qtd_true = sum(1 for claim in claims if claim['class'] == 'VERDADEIRO')

# removes excess of true claims: walk the list from the end and delete
# VERDADEIRO rows until `diferenca` of them are gone. Nothing is deleted when
# the difference is zero or negative.
diferenca = qtd_true - qtd_fake
removed = 0
for i in reversed(range(len(claims))):
    if removed >= diferenca:
        break
    if claims[i]['class'] != 'VERDADEIRO':
        continue
    del claims[i]
    removed += 1
print(qtd_fake, qtd_true, 'removed' , removed)

# Standardize dates: standardizeDate (defined above in this cell) applies the
# same dash replacement and the same dd/mm/yy then dd/mm/YYYY parsing that
# used to be repeated here. Unparseable dates keep only the dash replacement.
for c in claims:
    c['date_published'] = standardizeDate(c['date_published'])

# Renumber the surviving rows with consecutive ids, then write the raw joined
# dataset. Keys not listed in fieldnames are dropped (extrasaction='ignore').
row = 0
for row, claim in enumerate(claims):
    claim['id'] = row

with Path('bases/base3/base3_raw.tsv').open('w', encoding="utf-8", newline='') as f2:
    fieldnames = ['id', 'base', 'claim', 'evidence','document', 'date_published', 'domain', 'class', 'num_words_claim', 'num_words_evidence', 'num_chars_claim', 'num_chars_evidence']
    print(fieldnames)
    writer = csv.DictWriter(f2, fieldnames=fieldnames, delimiter='\t', extrasaction='ignore')
    writer.writeheader()
    writer.writerows(claims)

## Preprocess dataset

In [6]:

import csv
from pathlib import Path
import preprocessing

def containsText(t, ignore_sentences_containing):
    """Return True when any entry of ``ignore_sentences_containing`` occurs in ``t``."""
    return any(fragment in t for fragment in ignore_sentences_containing)

def clean_claim(text):
    """Strip known markers and boilerplate from a claim.

    The markers 'Versão 1:', 'Versão 2:' and '(sic)' are removed, the text is
    split with ``preprocessing.tokenize`` (project helper; the unit it returns
    is not visible here) and every piece that is empty, equals one of the exact
    sentences to ignore, or contains one of the ignored fragments is dropped.

    Returns:
        The remaining pieces joined with newlines.
    """
    for marker in ('Versão 1:', 'Versão 2:', '(sic)'):
        text = text.replace(marker, '')

    ignore_sentences_containing = ['Se você quiser sugerir um tema para o Boatos.org']
    ignore_exact_sentences = ['Ps.', 'Leia a mensagem que circula online:', '']

    kept = []
    for t in preprocessing.tokenize(text):
        if not t or t in ignore_exact_sentences:
            continue
        if containsText(t, ignore_sentences_containing):
            continue
        kept.append(t)
    return '\n'.join(kept)

def clean_evidence(text):
    """Strip the '_NOTICIA_ORIGINAL_' marker and boilerplate from an evidence text.

    After removing the marker, the text is split with
    ``preprocessing.tokenize`` (project helper; the unit it returns is not
    visible here). Pieces that are empty, equal one of the exact sentences to
    ignore, or contain one of the ignored fragments are dropped.

    Returns:
        The remaining pieces joined with newlines.
    """
    text = text.replace('_NOTICIA_ORIGINAL_', '')

    ignore_sentences_containing = [
        'Esse artigo é uma sugestão de leitores do Boatos.org',
        'Jornalista e caçador de falcatruas na internet',
        'Se você quiser sugerir um tema',
        'Esse artigo é uma sugestão de diversos leitores',
        'Clique nos links "bit.ly"',
    ]
    ignore_exact_sentences = ['Ps.', 'Leia a mensagem que circula online:', '']

    kept = []
    for t in preprocessing.tokenize(text):
        if not t or t in ignore_exact_sentences:
            continue
        if containsText(t, ignore_sentences_containing):
            continue
        kept.append(t)
    return '\n'.join(kept)

# Read the raw joined dataset and add cleaned claim/evidence columns.
claims = []
with Path('bases/base3/base3_raw.tsv').open('r', encoding="utf-8", newline='') as f:
    read = csv.DictReader(f, delimiter="\t", skipinitialspace=True)
    for item in read:
        item.update(
            claim_clean=clean_claim(item['claim']),
            evidence_clean=clean_evidence(item['evidence']),
        )
        claims.append(item)

# Write the dataset with both the original and the cleaned text columns.
with Path('bases/base3/base3.tsv').open('w', encoding="utf-8", newline='') as f2:
    fieldnames = ['id', 'base', 'claim', 'evidence', 'claim_clean', 'evidence_clean', 'document', 'date_published', 'domain', 'class', 'num_words_claim', 'num_words_evidence', 'num_chars_claim', 'num_chars_evidence']
    writer = csv.DictWriter(f2, fieldnames=fieldnames, delimiter='\t', extrasaction='ignore')
    writer.writeheader()
    writer.writerows(claims)

In [10]:
import os
import csv
from pathlib import Path

def readClaims(basepath):
    """Read a tab-separated claims file.

    Args:
        basepath: Path of the TSV file. It must have 'id' and 'document'
            columns.

    Returns:
        A ``(claims, documents)`` tuple. ``claims`` is the list of row dicts
        with 'id' converted to int. ``documents`` is the list of distinct
        'document' values in order of first appearance, so the result is the
        same on every run (a plain set would yield a hash-seed-dependent
        order and make any seeded split of the documents irreproducible).
    """
    claims = []
    documents = {}  # dict keys act as an insertion-ordered set
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(basepath, 'r', encoding='utf-8', newline='') as f:
        for item in csv.DictReader(f, delimiter="\t", skipinitialspace=True):
            item['id'] = int(item['id'])
            claims.append(item)
            documents[item['document']] = None
    return claims, list(documents)

def writeClaims(basepath, claims):
    """Write claim rows to a tab-separated file at ``basepath``.

    The header and the column order come from the fixed field list below, which
    is also printed. Keys that are not in the list are ignored.
    """
    fieldnames = ['id', 'base', 'claim', 'evidence', 'claim_clean', 'evidence_clean', 'document', 'date_published', 'domain', 'class', 'num_words_claim', 'num_words_evidence', 'num_chars_claim', 'num_chars_evidence']
    with Path(basepath).open('w', encoding="utf-8", newline='') as f2:
        print(fieldnames)
        writer = csv.DictWriter(f2, fieldnames=fieldnames, delimiter='\t', extrasaction='ignore')
        writer.writeheader()
        writer.writerows(claims)

In [None]:
# Load the preprocessed dataset: `claims` is the list of rows and `documents`
# the list of distinct values found in their 'document' column.
claims, documents = readClaims('bases/base3/base3.tsv')

## Divide dataset into training, test and validation

In [12]:
#Divide dataset

from sklearn.model_selection import train_test_split

# Split by document, not by claim, so all claims of one document land in the
# same partition: 20% of the documents go to test, then 15% of the remainder
# goes to validation.
documents_train, documents_test = train_test_split(documents, test_size=0.2, random_state=1)
documents_train, documents_valid = train_test_split(documents_train, test_size=0.15, random_state=3)

# Sets make each per-claim membership test O(1) instead of a list scan; the
# assignment of claims to partitions is the same.
train_document_set = set(documents_train)
test_document_set = set(documents_test)
valid_document_set = set(documents_valid)

claims_train = []
claims_valid = []
claims_test = []
for c in claims:
    if c['document'] in train_document_set:
        claims_train.append(c)
    elif c['document'] in test_document_set:
        claims_test.append(c)
    elif c['document'] in valid_document_set:
        claims_valid.append(c)

writeClaims('bases/base3/base3_train.tsv', claims_train)

writeClaims('bases/base3/base3_valid.tsv', claims_valid)

writeClaims('bases/base3/base3_test.tsv', claims_test)

['id', 'base', 'claim', 'evidence', 'claim_clean', 'evidence_clean', 'document', 'date_published', 'domain', 'class', 'num_words_claim', 'num_words_evidence', 'num_chars_claim', 'num_chars_evidence']
['id', 'base', 'claim', 'evidence', 'claim_clean', 'evidence_clean', 'document', 'date_published', 'domain', 'class', 'num_words_claim', 'num_words_evidence', 'num_chars_claim', 'num_chars_evidence']
['id', 'base', 'claim', 'evidence', 'claim_clean', 'evidence_clean', 'document', 'date_published', 'domain', 'class', 'num_words_claim', 'num_words_evidence', 'num_chars_claim', 'num_chars_evidence']


In [13]:
# Re-read the three partitions from disk; each call returns the rows and the
# distinct documents of that partition.
claims_train, doc_train = readClaims('bases/base3/base3_train.tsv')
claims_valid, doc_valid = readClaims('bases/base3/base3_valid.tsv')
claims_test, doc_test = readClaims('bases/base3/base3_test.tsv')

## Generate negative samples (samples with class 'Not enough information')

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Load the sentence-transformer model from the local models directory.
modelsbert = SentenceTransformer('models/portuguese_sentence_transformer')
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
modelsbert.to(device)

In [15]:
import scipy
import scipy.spatial

import torch
# This reference did it with the mean: https://medium.com/analytics-vidhya/few-shot-learning-using-sbert-95f8b08248bf
def sbert_sentence_embedding(claim, model = modelsbert):
    """Embed a newline-separated text as the mean of its line embeddings.

    The text is split on newlines, every line is encoded with
    ``model.encode`` and the encodings are averaged over the first dimension.

    Returns:
        A torch tensor holding the averaged embedding.
    """
    lines = claim.split("\n")
    line_embeddings = torch.Tensor(model.encode(lines))
    return line_embeddings.mean(dim=0)

def sbert_sentence_embedding_total(claim, model = modelsbert):
    """Encode every newline-separated line of ``claim`` and return the result of ``model.encode`` unchanged."""
    return model.encode(claim.split("\n"))

def cos_similarity(sent1_emb, sent2_emb):
    """Return the cosine similarity of two vectors, as one minus SciPy's cosine distance."""
    cosine_distance = scipy.spatial.distance.cosine(sent1_emb, sent2_emb)
    return 1 - cosine_distance

In [16]:
def generateSbertEmbedding(samples):
    """Add 'claim_sbert' and 'evidence_sbert' embeddings to every sample in place.

    Prints the number of samples first, then the current position every 100
    samples as a progress indicator.
    """
    print(len(samples))
    for position, claim in enumerate(samples):
        if position % 100 == 0:
            print(position)
        claim['claim_sbert'] = sbert_sentence_embedding(claim['claim_clean'])
        claim['evidence_sbert'] = sbert_sentence_embedding(claim['evidence_clean'])

def sortFunction(e):
    """Sort key that puts entries with a larger 'value' first."""
    value = e['value']
    return -value

def getSimilarities(claim_sbert, evidences_sbert):
    """Rank evidence embeddings by cosine similarity to a claim embedding.

    Returns:
        A list of ``{'row': index, 'value': similarity}`` dicts, one per
        evidence, ordered from most to least similar. ``row`` is the position
        of the evidence in ``evidences_sbert``.
    """
    ranked = [
        {'row': position, 'value': cos_similarity(evidence_sbert, claim_sbert)}
        for position, evidence_sbert in enumerate(evidences_sbert)
    ]
    ranked.sort(key=sortFunction)
    return ranked

def generateNegative(positive_samples):
    """Build 'INSUFICIENTE' samples by pairing claims with evidence from other documents.

    For every claim, the evidences of all samples are ranked by similarity to
    the claim and up to 5 of the most similar ones that come from a different
    document are paired with it. Fewer than 5 are produced when there are not
    enough evidences from other documents.

    Args:
        positive_samples: List of sample dicts carrying 'id', 'document',
            'claim_clean', 'evidence_clean', 'claim_sbert' and
            'evidence_sbert'. Each dict is modified in place: 'id_claim',
            'id_evidence' and 'document_evidence' are added.

    Returns:
        A list with the generated negative samples followed by the positive
        samples.
    """
    negative_samples = []
    # The evidence embeddings are the same for every claim, so collect them
    # once instead of rebuilding the list on each iteration.
    evidences_sbert = [evidence['evidence_sbert'] for evidence in positive_samples]

    for claim_row, claim in enumerate(positive_samples):
        if claim_row % 100 == 0:
            print(claim_row)
        #new fields
        claim['id_claim'] = claim['id']
        claim['id_evidence'] = claim['id']
        claim['document_evidence'] = claim['document']

        #Find most similar evidences
        similarities = getSimilarities(claim['claim_sbert'], evidences_sbert)

        #Take 5 evidences, from documents different from the true evidence, and with high similarity to the claim.
        # Iterating over the ranking (rather than indexing it in an unbounded
        # loop) stops cleanly when the candidates run out.
        count = 1
        for similarity in similarities:
            if count > 5:
                break
            evidence = positive_samples[similarity['row']]
            if evidence['document'] == claim['document']:
                continue
            claimNegative = claim.copy()
            claimNegative['class'] = 'INSUFICIENTE'
            # Negative ids are derived from the claim id and the 1-based
            # position of the negative among those generated for this claim.
            claimNegative['id'] = -(claim['id']*10 + count)
            claimNegative['id_claim'] = claim['id']
            claimNegative['id_evidence'] = evidence['id']
            claimNegative['claim_clean'] = claim['claim_clean']
            claimNegative['evidence_clean'] = evidence['evidence_clean']
            claimNegative['document_evidence'] = evidence['document']
            negative_samples.append(claimNegative)
            count += 1
    negative_samples.extend(positive_samples)
    return negative_samples


In [None]:
# Adds 'claim_sbert' and 'evidence_sbert' to every row of `claims` in place.
generateSbertEmbedding(claims)

In [18]:
# Copy the embeddings computed on `claims` onto the rows of each partition.
# The row's 'id' is used as an index into `claims`.
# NOTE(review): this assumes ids equal list positions in `claims` - confirm.
for split_claims in (claims_test, claims_train, claims_valid):
    for c in split_claims:
        embedded = claims[c['id']]
        c['claim_sbert'] = embedded['claim_sbert']
        c['evidence_sbert'] = embedded['evidence_sbert']

In [None]:
# Build the three-class version of each partition. Each result holds the
# generated negative samples followed by the partition's original rows.
claim_valid3classes = generateNegative(claims_valid)
claim_test3classes = generateNegative(claims_test)
claim_train3classes = generateNegative(claims_train)

In [None]:
def writeClaims3classes(basepath, claims):
    """Write three-class samples to a tab-separated file at ``basepath``.

    Only the fields listed below are written, in that order; the field list is
    also printed. Any other key of a sample is ignored.
    """
    fieldnames = ['id', 'base', 'id_claim', 'id_evidence', 'claim_clean', 'evidence_clean','document', 'document_evidence','class']
    with Path(basepath).open('w', encoding="utf-8", newline='') as f2:
        print(fieldnames)
        writer = csv.DictWriter(f2, fieldnames=fieldnames, delimiter='\t', extrasaction='ignore')
        writer.writeheader()
        writer.writerows(claims)

# Persist the three-class version of each partition.
writeClaims3classes('bases/base3/base3_valid_3classes.tsv', claim_valid3classes)
writeClaims3classes('bases/base3/base3_train_3classes.tsv', claim_train3classes)
writeClaims3classes('bases/base3/base3_test_3classes.tsv', claim_test3classes)