Exploration of various datasets with the aim of building a CARDS and claim-detection dataset.

Steps explored to make existing datasets relevant:
- climate-related classification
- CARDS classification

In [1]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
import torch
from datasets import load_dataset

In [6]:
# Both CARDS models are placed on this device.
device = "cpu"

# Hub checkpoints and the length setting handed to the tokenizer
MAX_LEN = 256
BINARY_MODEL_DIR = "crarojasca/BinaryAugmentedCARDS"
TAXONOMY_MODEL_DIR = "crarojasca/TaxonomyAugmentedCARDS"

# One tokenizer, loaded from the binary checkpoint, is used for both models.
tokenizer = AutoTokenizer.from_pretrained(
    BINARY_MODEL_DIR,
    max_length=MAX_LEN,
    padding="max_length",
    return_token_type_ids=True,
)

# 1. Binary model
print("Loading binary model: {}".format(BINARY_MODEL_DIR))
config = AutoConfig.from_pretrained(BINARY_MODEL_DIR)
binary_model = AutoModelForSequenceClassification.from_pretrained(
    BINARY_MODEL_DIR, config=config
).to(device)

# 2. Taxonomy model
print("Loading taxonomy model: {}".format(TAXONOMY_MODEL_DIR))
config = AutoConfig.from_pretrained(TAXONOMY_MODEL_DIR)
taxonomy_model = AutoModelForSequenceClassification.from_pretrained(
    TAXONOMY_MODEL_DIR, config=config
).to(device)

# Taxonomy class index -> CARDS label code
id2label = dict(enumerate([
    '1_1', '1_2', '1_3', '1_4', '1_6', '1_7', '2_1',
    '2_3', '3_1', '3_2', '3_3', '4_1', '4_2', '4_4',
    '4_5', '5_1', '5_2', '5_3',
]))

# Usage: `predict_cards` (next cell) runs the binary model first and returns
# "0_0" when it predicts class 0; otherwise it returns
# id2label[<taxonomy argmax>], e.g. for
# "Climate change is just a natural phenomenon".

Loading binary model: crarojasca/BinaryAugmentedCARDS
Loading taxonomy model: crarojasca/TaxonomyAugmentedCARDS


In [7]:
def predict_cards(text):
  """Classify ``text`` with the two-stage CARDS pipeline.

  The binary model runs first. When it predicts class 0 the function returns
  "0_0" without calling the taxonomy model; otherwise the taxonomy model's
  argmax index is translated to a label code through ``id2label``.

  Args:
    text: A single string. One text per call, because the prediction is
      extracted with ``.item()``, which requires a one-element tensor.

  Returns:
    str: "0_0" or one of the values of ``id2label`` (e.g. "5_2").
  """
  # Put the inputs on the device the model lives on instead of assuming CUDA,
  # so the function works with the CPU setup as well as on a GPU.
  tokenized_text = tokenizer(text, return_tensors="pt").to(binary_model.device)

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    # Running Binary Model
    outputs = binary_model(**tokenized_text)
    # .item() copies the scalar to the host from any device; no .to() needed.
    binary_prediction = torch.argmax(outputs.logits, dim=1).item()

    if binary_prediction == 0:
      return "0_0"

    # Running Taxonomy Model
    outputs = taxonomy_model(**tokenized_text)
    taxonomy_prediction = torch.argmax(outputs.logits, dim=1).item()

  return id2label[taxonomy_prediction]

### Climate-fever

In [4]:
# Fetch Climate-FEVER from the Hugging Face Hub and convert its "test" split
# to a pandas DataFrame.
ds_climate_fever = load_dataset(path="tdiggelm/climate_fever")
df = ds_climate_fever["test"].to_pandas()

README.md:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/869k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1535 [00:00<?, ? examples/s]

In [None]:
# Run the CARDS pipeline on every claim.
df["cards_label_predicted"] = df["claim"].apply(predict_cards)

# Rows with claim_label == 0 get cards_label 0; all other rows stay empty and
# are written to the "to_label" CSV.
df["cards_label"] = df.apply(
    lambda row: None if row["claim_label"] != 0 else 0,
    axis=1,
)
df_to_label = df.loc[df["cards_label"].isna()]
df_to_label.to_csv("../../data/exploration/cards-fever_to_label.csv", index=False)

# Distribution of the predicted CARDS labels
df["cards_label_predicted"].value_counts()

In [11]:
# Write the rows that still need a CARDS label, then show how the predicted
# labels are distributed.
df_to_label.to_csv(
    "../../data/exploration/cards-fever_to_label.csv",
    index=False,
)

df["cards_label_predicted"].value_counts()

cards_label_predicted
0_0    911
5_2    407
5_1    169
5_3     44
3_2      2
3_3      1
1_3      1
Name: count, dtype: int64

The CARDS classification models are over-predicting the 5th category (labels 5_x), as that category is prominent in the original CARDS Twitter dataset.

### Cards dataset

In [None]:
!wget -O sample_data/dataset_cards "https://drive.google.com/uc?export=download&id=14exmlYCT3-K2byYHFFrShAIYiemJQroi"
!unzip sample_data/dataset_cards.zip -d sample_data/cards_data

In [None]:
# Seed cards_label: 0 where claim_label == 0, missing everywhere else.
df["cards_label"] = df.apply(
    lambda row: None if row["claim_label"] != 0 else 0,
    axis=1,
)

### Climate-related classification

Using climatebert

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
import datasets
from tqdm.auto import tqdm
import torch

dataset_name = "climatebert/climate_detection"
model_name = "climatebert/distilroberta-base-climate-detector"

# If you want to use your own data, simply load them as 🤗 Datasets dataset, see https://huggingface.co/docs/datasets/loading
# dataset = datasets.load_dataset(dataset_name, split="test")

model_cbert = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer_cbert = AutoTokenizer.from_pretrained(model_name, max_len=512)
# Use the GPU when one is available and fall back to CPU otherwise, so this
# cell also runs on machines without CUDA.
model_cbert.to("cuda" if torch.cuda.is_available() else "cpu")

def predict_env(text):
  """Return the climate detector's predicted class index for ``text``.

  Args:
    text: A single string. One text per call, because the prediction is
      extracted with ``.item()``, which requires a one-element tensor.

  Returns:
    int: Argmax over the model logits. The cells that consume this value
    treat 1 as climate-related and 0 as not climate-related.
  """
  # Truncate to the tokenizer's maximum length and follow the model's device
  # instead of assuming CUDA.
  tokenized_text = tokenizer_cbert(
      text, return_tensors="pt", truncation=True
  ).to(model_cbert.device)
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    outputs = model_cbert(**tokenized_text)
  # .item() copies the scalar to the host from any device.
  return torch.argmax(outputs.logits, dim=1).item()

Testing recall on climate-fever (which is supposed to be 100% climate-related)

In [None]:
# Recall check: run the climate detector on a sample of Climate-FEVER claims
# and print examples from each predicted class.
df = ds_climate_fever["test"].to_pandas()
# Fixed seed so the sample (and the examples printed below) are reproducible;
# the size is capped because sample() raises when asked for more rows than exist.
df = df.sample(min(100, len(df)), random_state=42)
df["prediction_env"] = df["claim"].apply(predict_env)
# NOTE(review): the other export cells write under "../../data/exploration/";
# confirm that "../data/exploration/" is the intended location here.
df.to_csv("../data/exploration/climate-fever-env-prediction.csv",index=False)

print("NON CLIMATE RELATED")
for x in df[df["prediction_env"]==0].claim:
  print(x)

print("CLIMATE RELATED")
climate_related = df[df["prediction_env"]==1]
# Show at most 30 examples; fewer when the sample has fewer positives.
for x in climate_related.sample(min(30, len(climate_related)), random_state=42).claim:
  print(x)

### Filtering datasets to find climate samples

In [None]:
# Extract the MultiFC scoring archive stored under drive/MyDrive.
# NOTE(review): the disabled download below is named claimbuster.zip but uses
# the same Drive id as the CARDS download in the "Cards dataset" section;
# confirm the id before re-enabling it.
# !wget -O sample_data/claimbuster.zip "https://drive.google.com/uc?export=download&id=14exmlYCT3-K2byYHFFrShAIYiemJQroi"
!unzip drive/MyDrive/datasets/MultiFC_Codalab/scoring_multifc.zip -d drive/MyDrive/datasets/MultiFC_Codalab/scoring_multifc

In [None]:
import pandas as pd
# Read the MultiFC dev split (tab-separated, no header row) and name its
# 13 columns.
df = pd.read_csv(
    "../../datasets/MultiFC_Codalab/public_data/dev.tsv",
    sep="\t",
    header=None,
)
df.columns = [
    "claimID", "claim", "label", "claim_url", "reason", "category", "speaker",
    "checker", "tags", "title", "publish_date", "claim_date", "claim_entities",
]
print(df.size)
# Keep only rows that have a claim of at most 512 characters.
df = df.dropna(subset=["claim"]).loc[lambda frame: frame["claim"].str.len() <= 512]
print(df.size)
df.head()

In [None]:
# Collect the distinct tag names. Each string entry in `tags` has its first
# and last characters dropped, is split on commas, and every piece is stripped
# of single quotes and surrounding whitespace. Non-string entries are skipped.
tags = sorted({
    piece.replace("'", "").strip()
    for raw_tags in df.tags.unique()
    if isinstance(raw_tags, str)
    for piece in raw_tags[1:-1].split(",")
})

In [None]:
# The original cell contained an unfinished `for value in df.claim_date` loop
# (no colon, no body), which is a SyntaxError. Only the final assignment had
# any effect, so that is what is kept.
# NOTE(review): this rebinds `tags` from the list of tag names to the
# claim_date column; confirm that is intended.
tags = df.claim_date

In [None]:
from tqdm.auto import tqdm

# Register tqdm's progress_apply on pandas objects, then run the climate
# detector on the claim of every row.
tqdm.pandas(desc="detecting climate related...")
df["env_predict"] = df.progress_apply(
    lambda row: predict_env(row["claim"]),
    axis=1,
)

In [None]:
# Keep the rows predicted as class 1 by the climate detector and save them.
env_claims = df.loc[df["env_predict"] == 1]

env_claims.to_csv(
    "/content/drive/MyDrive/results/multiFC_env_claims.csv",
    index=False,
)