In [None]:
# Download the ClaimBuster dataset archive from Zenodo as data.zip, unpack it,
# and rename the extracted ClaimBuster_Datasets folder to `dataset`, which is
# the directory the data-loading cell reads from.
!wget -O data.zip https://zenodo.org/record/3836810/files/ClaimBuster_Datasets.zip?download=1
!unzip data.zip
!mv ClaimBuster_Datasets dataset

In [None]:
!pip install transformers
!pip install datasets
!pip install wandb
!pip install huggingface_hub
!pip install html2text

In [None]:
# Install git-lfs: register the packagecloud apt repository, install the
# package, then run `git lfs install`.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs
!git lfs install

# Interactive Hugging Face Hub login from inside the notebook.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Configure git (globally) to use the `store` credential helper.
!git config --global credential.helper store

In [None]:
import wandb
# Authenticate with Weights & Biases.
wandb.login()

# Set the WANDB_PROJECT environment variable to `claim_spotter` for this kernel.
%env WANDB_PROJECT=claim_spotter

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the classifier loaded later.
MODEL_NAME = "bert-base-multilingual-cased"
# Tokenizer matching MODEL_NAME; used by tokenize_function when mapping the dataset.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
import pandas as pd
import json
# Read the 2xNCS JSON file into memory.
with open('./dataset/datasets/2xNCS.json', 'r') as f:
  data = json.load(f)

# Build the frame in one chain: rename `label` -> `labels` and drop the
# `sentence_id` column.
df = (
  pd.DataFrame(data)
  .rename(columns={'label':'labels'})
  .drop(columns=['sentence_id'])
)
# Disabled alternative that would turn the integer labels into strings:
# df['labels'] = df['labels'].map({1:'claim', 0:'not_claim'})

# Persist the full, unsplit table.
df.to_csv('dataset.csv', index=False)

def build_split(df, train_size=0.8, seed=42):
    """Randomly partition `df` into a train part and a test part.

    The rows are shuffled with `seed`, the index is reset to 0..n-1, and the
    first `int(train_size * len(df))` rows become the train part; the
    remaining rows become the test part.

    Args:
        df: DataFrame to split.
        train_size: Fraction of rows that go to the train part.
        seed: Random state used for the shuffle.

    Returns:
        A `(train, test)` tuple of DataFrames.
    """
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    cut = int(train_size * len(shuffled))
    return shuffled.iloc[:cut], shuffled.iloc[cut:]

# Split with the defaults (80% train, seed 42) and write each part to CSV;
# these two files are what load_dataset reads in the next cell.
train, test = build_split(df)   
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [None]:
from datasets import load_dataset
# Load the two CSV files as the `train` and `test` splits of one dataset object.
dataset = load_dataset('csv', data_files={'train':'train.csv', 'test': 'test.csv'})
# Bare expression: displays the dataset summary as the cell output.
dataset

In [None]:
def tokenize_function(examples):
    """Tokenize the `text` field of a batch with the global `tokenizer`.

    Every example is padded to the maximum length and truncated when longer.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Apply tokenize_function to every split, passing examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Shuffle each tokenized split with a fixed seed.
# NOTE: despite the `small_` prefix no subset is selected here; each variable
# holds the whole shuffled split.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

In [None]:
from transformers import AutoModelForSequenceClassification

# Label names in both directions. Passing both mappings to from_pretrained
# keeps config.id2label and config.label2id consistent with each other, so the
# saved/pushed config does not mix the custom names with default placeholder
# names.
id2label = {0: 'not_claim', 1: 'claim'}
label2id = {name: idx for idx, name in id2label.items()}

# Two-class sequence classifier initialised from the MODEL_NAME checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
print('Loaded model')

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    """Score one evaluation pass with the global `metric`.

    Args:
        eval_pred: Pair of `(logits, labels)`.

    Returns:
        The result of `metric.compute` on the arg-max class of each logits
        row versus the reference labels.
    """
    scores, references = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

# F1 metric object used by compute_metrics.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("f1")

# Training configuration: checkpoints go to `claim-spotter-multilingual`,
# evaluation runs once per epoch, logs are reported to wandb, the model is
# pushed to the Hub, and training lasts two epochs.
training_args = TrainingArguments(
    output_dir="claim-spotter-multilingual", 
    evaluation_strategy="epoch", 
    report_to="wandb", 
    push_to_hub=True,
    num_train_epochs=2,
)

In [None]:
# Wire the model, training arguments, shuffled train/test splits, and the F1
# metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

# Inference

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Identifier of the fine-tuned checkpoint to load for inference.
trained_model = 'gzomer/claim-spotter-multilingual'
# Rebinds the global `tokenizer` and `model` that the prediction helpers use.
# The tokenizer comes from the base multilingual BERT checkpoint; the model is
# loaded from `trained_model` and is not moved to any device here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModelForSequenceClassification.from_pretrained(trained_model, num_labels=2)

In [None]:
# Example input sentence; not referenced by any other cell visible in this notebook.
claim = 'Ukraine is not a country'

In [None]:
def get_prediction(text, device='cuda'):
    """Classify a single text with the global `model` and `tokenizer`.

    Args:
        text: The sentence to classify (a single string).
        device: Device on which inference runs. The model is moved there too,
            so inputs and weights always live on the same device.

    Returns:
        The label name from `model.config.id2label` for the highest-scoring
        class.
    """
    import torch  # local import: torch is not imported at notebook level

    # The model is loaded without an explicit device, while the inputs are
    # sent to `device`; move the model as well to avoid a device mismatch.
    model.to(device)
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    label = probs.argmax().item()
    return model.config.id2label[label]

def get_predictions(texts, device='cuda'):
    """Classify a batch of texts with the global `model` and `tokenizer`.

    Args:
        texts: Iterable of sentences to classify.
        device: Device on which inference runs. The model is moved there too,
            so inputs and weights always live on the same device.

    Returns:
        A list of `(text, label_name)` tuples in input order; an empty list
        when `texts` is empty.
    """
    texts = list(texts)
    # Nothing to classify: return early instead of calling the tokenizer
    # with an empty batch.
    if not texts:
        return []

    import torch  # local import: torch is not imported at notebook level

    # The model is loaded without an explicit device, while the inputs are
    # sent to `device`; move the model as well to avoid a device mismatch.
    model.to(device)
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    labels = [model.config.id2label[idx] for idx in probs.argmax(1).tolist()]
    return list(zip(texts, labels))

In [None]:
from nltk import sent_tokenize

In [None]:
import nltk
# Fetch the `punkt` resource for NLTK.
# NOTE(review): presumably required by sent_tokenize — confirm.
nltk.download('punkt')

In [None]:
import requests
from bs4 import BeautifulSoup
from html2text import HTML2Text

def get_page_text(url, timeout=30):
  """Download a web page and return its main content as plain text.

  The text is taken from the page's first <article> element; when the page
  has no such element the whole document is converted instead.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The html2text rendering of the selected HTML.

  Raises:
    requests.HTTPError: If the server answers with an error status.
    requests.Timeout: If the request exceeds `timeout`.
  """
  # A timeout keeps the notebook from hanging on an unresponsive server.
  response = requests.get(url, timeout=timeout)
  # Fail loudly on 4xx/5xx instead of treating an error page as the article.
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'lxml')
  article = soup.find('article')
  # find() returns None when there is no <article>; str(None) would otherwise
  # be converted to the literal text "None".
  root = article if article is not None else soup
  return HTML2Text().handle(str(root))

In [None]:
def has_entities(text):
  """Return True when any character after the first one is uppercase.

  The first character is skipped, so a capital letter that merely starts the
  text does not count.
  """
  for ch in text[1:]:
    if ch.isupper():
      return True
  return False

def filter_sentence(sent):
  """Decide whether a sentence is kept as a candidate.

  A sentence is rejected when it contains a blank line (two consecutive
  newlines), any square bracket or parenthesis, has no uppercase character
  after its first one, or is shorter than 20 or longer than 150 characters.

  Returns:
    True to keep the sentence, False to discard it.
  """
  forbidden = ('\n\n', '[', ']', '(', ')')
  if any(token in sent for token in forbidden):
    return False
  if not has_entities(sent):
    return False
  return 20 <= len(sent) <= 150

def improve_sentences(sent):
  """Return `sent` with every newline character replaced by a single space."""
  pieces = sent.split('\n')
  return ' '.join(pieces)

def filter_sentences(sents):
  """Keep the usable sentences, clean them, and drop duplicates.

  Each sentence accepted by `filter_sentence` is passed through
  `improve_sentences`; duplicates are then removed while keeping the order of
  first appearance.

  Args:
    sents: Iterable of raw sentences.

  Returns:
    List of unique, cleaned sentences in their original relative order.
  """
  cleaned = (improve_sentences(sent) for sent in sents if filter_sentence(sent))
  # dict.fromkeys de-duplicates while preserving insertion order, so the
  # output is the same on every run (a set would reorder it arbitrarily).
  # Cleaning happens before de-duplication so sentences that differ only in
  # line breaks collapse into one.
  return list(dict.fromkeys(cleaned))

def find_claims(url):
  """Return the sentences of a web page that the model labels as claims.

  The page text is fetched, split into sentences, filtered, and classified.

  Args:
    url: Address of the page to analyse.

  Returns:
    List of stripped sentences whose predicted label is 'claim'; empty when
    no sentence survives filtering.
  """
  text = get_page_text(url)
  sents = sent_tokenize(text)
  filtered_sents = filter_sentences(sents)
  # Nothing left to classify: skip the model call, which cannot handle an
  # empty batch.
  if not filtered_sents:
    return []
  preds = get_predictions(filtered_sents)
  return [sentence.strip() for sentence, label in preds if label == 'claim']

# Example run: extract the claim sentences from one BBC News article.
url = 'https://www.bbc.com/news/uk-politics-61083402'
claims = find_claims(url)  

In [None]:
# Bare expression: displays the list of sentences labelled as claims.
claims