# Extract Heuristic Features from a Dataset

This is based on: https://github.com/AI-4-Sci/SciTweets/tree/main/heuristics


In [1]:
import os
import sys

sys.path.append("../../")

if not os.path.exists("./data"):
    ROOT_DIR = "../../data/"
else:
    ROOT_DIR = "./data/"

In [2]:
# Makse sure punktab is installed
import nltk

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/gb7776/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from climatesense_checkthat2025.heuristics.cat1_sciknowledge import (
    contains_arg_relation,
    contains_scientific_term,
    is_claim,
    is_claim_with_sciterm,
    load_scientific_terms,
)

## Scientific Knowledge Heuristics


In [4]:
scientific_terms = load_scientific_terms(
    wiki_sci_terms_path=os.path.join(ROOT_DIR, "heuristics/wiki_sci_terms.txt"),
    sc_methods_path=os.path.join(ROOT_DIR, "heuristics/sc_methods.txt"),
)


def contains_arg(text):
    if contains_arg_relation(text) != "":
        return True
    else:
        return False

In [None]:
txt = "lemonade a killing bacteria"

print(is_claim_with_sciterm(txt, scientific_terms))  # False
print(is_claim(txt)[0])  # True
print(contains_arg(txt))  # True
print(contains_scientific_term(txt, scientific_terms)[0])  # False

False
False
False
True


## Scientific URLs Heuristics


In [None]:
from climatesense_checkthat2025.heuristics.cat2_sciurl import (
    annotate_tweets,
    load_domain_files,
    prepare_urls,
)

In [None]:
import pandas as pd

from climatesense_checkthat2025.utils.data import extract_urls_from_texts

texts = [
    "this is a test http://this.com http://this2.com https://www.independent.co.uk/ https://www.science.org/ https://www.nbcnews.com/science/space/space-x-launch-crew-6-nasa-astronauts-international-space-station-rcna128123 https://www.nature.com/articles/s41586-022-05396-0",  # noqa: E501
    "this is another test",
    "this is a poster with a url https://cartographicperspectives.org/index.php/journal/article/view/100 https://www.sciencedirect.com/science/article/pii/S1364815219300011",
]
extracted_urls_df = pd.DataFrame({"text": texts, "urls": extract_urls_from_texts(texts)})


# replace [] with None:
extracted_urls_df["urls"] = extracted_urls_df["urls"].apply(lambda x: None if len(x) == 0 else x)

extracted_urls_df

Unnamed: 0,text,urls
0,this is a test http://this.com http://this2.co...,"[http://this.com, http://this2.com, https://ww..."
1,this is another test,
2,this is a poster with a url https://cartograph...,[https://cartographicperspectives.org/index.ph...


In [8]:
subdomains, sci_mags_domains, sci_news_domains = load_domain_files(
    subdomains_file=os.path.join(ROOT_DIR, "heuristics/repo_subdomains.csv"),
    sci_mags_file=os.path.join(ROOT_DIR, "heuristics/science_mags_domains.csv"),
    sci_news_file=os.path.join(ROOT_DIR, "heuristics/news_outlets_domains.csv"),
)

In [9]:
extracted_urls_df = prepare_urls(extracted_urls_df)
extracted_urls_df

Unnamed: 0,text,has_url,urls,processed_urls,tlds,domain_tlds,subdomain_domain_tlds
0,this is a test http://this.com http://this2.co...,True,"[http://this.com, http://this2.com, https://ww...","[http://this.com, http://this2.com, https://ww...","[com, com, co.uk, org, com, com]","[this.com, this2.com, independent.co.uk, scien...","[this.com, this2.com, www.independent.co.uk, w..."
1,this is a test with a url https://cartographic...,True,[https://cartographicperspectives.org/index.ph...,[https://cartographicperspectives.org/index.ph...,"[org, com]","[cartographicperspectives.org, sciencedirect.com]","[cartographicperspectives.org, www.sciencedire..."
1,this is another test,False,[],[],[],[],[]


In [None]:
extracted_urls_df = annotate_tweets(extracted_urls_df, subdomains, sci_mags_domains, sci_news_domains)
extracted_urls_df

100%|██████████| 2/2 [00:00<00:00, 284.57it/s]
100%|██████████| 2/2 [00:00<00:00, 13252.15it/s]
100%|██████████| 2/2 [00:00<00:00, 3087.45it/s]


Unnamed: 0,text,has_url,urls,processed_urls,tlds,domain_tlds,subdomain_domain_tlds,sci_subdomain,has_sci_subdomain,sci_mag_domain,has_sci_mag_domain,sci_news_domain,has_sci_news_domain
0,this is a test http://this.com http://this2.co...,True,"[http://this.com, http://this2.com, https://ww...","[http://this.com, http://this2.com, https://ww...","[com, com, co.uk, org, com, com]","[this.com, this2.com, independent.co.uk, scien...","[this.com, this2.com, www.independent.co.uk, w...",www.science.org; www.nature.com,True,www.science.org,True,www.nbcnews.com,True
1,this is a test with a url https://cartographic...,True,[https://cartographicperspectives.org/index.ph...,[https://cartographicperspectives.org/index.ph...,"[org, com]","[cartographicperspectives.org, sciencedirect.com]","[cartographicperspectives.org, www.sciencedire...",cartographicperspectives.org; www.sciencedirec...,True,www.sciencedirect.com,True,,False
2,this is another test,False,[],[],[],[],[],[],False,[],False,[],False


In [11]:
extracted_urls_df["has_sci_domain"] = extracted_urls_df[
    ["has_sci_subdomain", "has_sci_mag_domain", "has_sci_news_domain"]
].any(axis="columns")
extracted_urls_df

Unnamed: 0,text,has_url,urls,processed_urls,tlds,domain_tlds,subdomain_domain_tlds,sci_subdomain,has_sci_subdomain,sci_mag_domain,has_sci_mag_domain,sci_news_domain,has_sci_news_domain,has_sci_domain
0,this is a test http://this.com http://this2.co...,True,"[http://this.com, http://this2.com, https://ww...","[http://this.com, http://this2.com, https://ww...","[com, com, co.uk, org, com, com]","[this.com, this2.com, independent.co.uk, scien...","[this.com, this2.com, www.independent.co.uk, w...",www.science.org; www.nature.com,True,www.science.org,True,www.nbcnews.com,True,True
1,this is a test with a url https://cartographic...,True,[https://cartographicperspectives.org/index.ph...,[https://cartographicperspectives.org/index.ph...,"[org, com]","[cartographicperspectives.org, sciencedirect.com]","[cartographicperspectives.org, www.sciencedire...",cartographicperspectives.org; www.sciencedirec...,True,www.sciencedirect.com,True,,False,True
2,this is another test,False,[],[],[],[],[],[],False,[],False,[],False,False


## Research Heuristics


In [None]:
from climatesense_checkthat2025.heuristics.cat3_research import (
    annotate_tweets as research_annotate_tweets,
)
from climatesense_checkthat2025.heuristics.cat3_research import load_methods

In [13]:
methods = load_methods(os.path.join(ROOT_DIR, "heuristics/sc_methods.txt"))

In [14]:
extracted_urls_df

Unnamed: 0,text,has_url,urls,processed_urls,tlds,domain_tlds,subdomain_domain_tlds,sci_subdomain,has_sci_subdomain,sci_mag_domain,has_sci_mag_domain,sci_news_domain,has_sci_news_domain,has_sci_domain
0,this is a test http://this.com http://this2.co...,True,"[http://this.com, http://this2.com, https://ww...","[http://this.com, http://this2.com, https://ww...","[com, com, co.uk, org, com, com]","[this.com, this2.com, independent.co.uk, scien...","[this.com, this2.com, www.independent.co.uk, w...",www.science.org; www.nature.com,True,www.science.org,True,www.nbcnews.com,True,True
1,this is a test with a url https://cartographic...,True,[https://cartographicperspectives.org/index.ph...,[https://cartographicperspectives.org/index.ph...,"[org, com]","[cartographicperspectives.org, sciencedirect.com]","[cartographicperspectives.org, www.sciencedire...",cartographicperspectives.org; www.sciencedirec...,True,www.sciencedirect.com,True,,False,True
2,this is another test,False,[],[],[],[],[],[],False,[],False,[],False,False


In [17]:
research_annotate_tweets(extracted_urls_df, methods)

100%|██████████| 3/3 [00:00<00:00, 1863.58it/s]


Unnamed: 0,text,urls,is_related_to_research,mentions_science_research_in_general,mentions_scientist,mentions_publications,mentions_research_method
0,this is a test http://this.com http://this2.co...,"[http://this.com, http://this2.com, https://ww...",False,False,False,False,False
1,this is another test,,False,False,False,False,False
2,this is a poster with a url https://cartograph...,[https://cartographicperspectives.org/index.ph...,False,False,False,False,False


In [35]:
# Extract ever
# Load the provided trainning and test data:
subtask4a_train_df = pd.read_csv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_train_clean.tsv"),
    sep="\t",
)
subtask4a_test_df = pd.read_csv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_dev_clean.tsv"),
    sep="\t",
)

subtask4a_eval_df = pd.read_csv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_eval_clean.tsv"),
    sep="\t",
)

In [None]:
def heuristics(df, subdomains, sci_mags_domains, sci_news_domains, methods):
    # Apply cat1 heuristics to the data
    df["is_claim_with_sciterm"] = df["text"].apply(lambda x: is_claim_with_sciterm(x, scientific_terms))
    df["is_claim"] = df["text"].apply(lambda x: is_claim(x)[0])
    df["contains_arg"] = df["text"].apply(lambda x: contains_arg(x))
    df["contains_scientific_term"] = df["text"].apply(lambda x: contains_scientific_term(x, scientific_terms)[0])

    # Apply cat2 heuristics to the data:
    df["urls"] = extract_urls_from_texts(df["text"])
    df["urls"] = df["urls"].apply(lambda x: None if len(x) == 0 else x)
    df = prepare_urls(df)

    annotated_df = annotate_tweets(df, subdomains, sci_mags_domains, sci_news_domains)
    df["has_sci_domain"] = annotated_df[["has_sci_subdomain", "has_sci_mag_domain", "has_sci_news_domain"]].any(
        axis="columns"
    )
    df["has_sci_subdomain"] = annotated_df["has_sci_subdomain"]
    df["has_sci_mag_domain"] = annotated_df["has_sci_mag_domain"]
    df["has_sci_news_domain"] = annotated_df["has_sci_news_domain"]

    # Apply cat3 heuristics to the data:
    annotated_df2 = research_annotate_tweets(df, methods)
    df["is_related_to_research"] = annotated_df2["is_related_to_research"]
    df["mentions_science_research_in_general"] = annotated_df2["mentions_science_research_in_general"]
    df["mentions_scientist"] = annotated_df2["mentions_scientist"]
    df["mentions_publications"] = annotated_df2["mentions_publications"]
    df["mentions_research_method"] = annotated_df2["mentions_research_method"]

    return df

In [None]:
subtask4a_train_df = heuristics(
    subtask4a_train_df,
    subdomains,
    sci_mags_domains,
    sci_news_domains,
    methods,
)
subtask4a_test_df = heuristics(
    subtask4a_test_df,
    subdomains,
    sci_mags_domains,
    sci_news_domains,
    methods,
)
subtask4a_eval_df = heuristics(
    subtask4a_eval_df,
    subdomains,
    sci_mags_domains,
    sci_news_domains,
    methods,
)

100%|██████████| 632/632 [00:00<00:00, 1974.79it/s]
100%|██████████| 632/632 [00:00<00:00, 619780.25it/s]
100%|██████████| 632/632 [00:00<00:00, 93906.76it/s]
100%|██████████| 1228/1228 [00:01<00:00, 924.58it/s] 
100%|██████████| 76/76 [00:00<00:00, 1963.77it/s]
100%|██████████| 76/76 [00:00<00:00, 292178.83it/s]
100%|██████████| 76/76 [00:00<00:00, 78669.08it/s]
100%|██████████| 137/137 [00:00<00:00, 942.99it/s]
100%|██████████| 136/136 [00:00<00:00, 1969.49it/s]
100%|██████████| 136/136 [00:00<00:00, 387517.22it/s]
100%|██████████| 136/136 [00:00<00:00, 88246.50it/s]
100%|██████████| 240/240 [00:00<00:00, 748.80it/s]


In [None]:
# for each dataframe count the proportion of rows with has_urls == True
def count_proportion(df):
    return df["has_url"].sum() / len(df)


print("Proportion of rows with urls in train data: ", count_proportion(subtask4a_train_df))
print("Proportion of rows with urls in test data: ", count_proportion(subtask4a_test_df))
print("Proportion of rows with urls in eval data: ", count_proportion(subtask4a_eval_df))

Proportion of rows with urls in train data:  0.5146579804560261
Proportion of rows with urls in test data:  0.5547445255474452
Proportion of rows with urls in eval data:  0.5666666666666667


In [None]:
# Save the dataframes to csv files:
subtask4a_train_df.to_csv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_train_clean_heuristics.tsv"),
    sep="\t",
    index=False,
)

subtask4a_test_df.to_csv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_dev_clean_heuristics.tsv"),
    sep="\t",
    index=False,
)

subtask4a_eval_df.to_csv(
    os.path.join(ROOT_DIR, "processed/task4/subtask_4a/ct_eval_heuristics.tsv"),
    sep="\t",
    index=False,
)

In [None]:
# Also process the oversamples:

for cl in ["scientific_claim", "scientific_reference", "scientific_entities"]:
    oversample_df = pd.read_csv(
        os.path.join(ROOT_DIR, f"processed/task4/subtask_4a/ct_train_oversamples_{cl}.tsv"),
        sep="\t",
        names=["text", cl],
    )
    # Ensure all values in the 'text' column are strings
    oversample_df["text"] = oversample_df["text"].astype(str)

    oversample_df = heuristics(
        oversample_df,
        subdomains,
        sci_mags_domains,
        sci_news_domains,
        methods,
    )
    oversample_df.to_csv(
        os.path.join(
            ROOT_DIR,
            f"processed/task4/subtask_4a/ct_train_oversamples_{cl}_heuristics.tsv",
        ),
        sep="\t",
        index=False,
    )

100%|██████████| 85/85 [00:00<00:00, 2246.77it/s]
100%|██████████| 85/85 [00:00<00:00, 37017.53it/s]
100%|██████████| 85/85 [00:00<00:00, 64248.66it/s]
100%|██████████| 332/332 [00:00<00:00, 635.73it/s]
100%|██████████| 261/261 [00:00<00:00, 2040.18it/s]
100%|██████████| 261/261 [00:00<00:00, 526304.49it/s]
100%|██████████| 261/261 [00:00<00:00, 96527.06it/s]
100%|██████████| 672/672 [00:02<00:00, 256.01it/s]
100%|██████████| 215/215 [00:00<00:00, 1922.23it/s]
100%|██████████| 215/215 [00:00<00:00, 429826.20it/s]
100%|██████████| 215/215 [00:00<00:00, 84380.59it/s]
100%|██████████| 612/612 [00:01<00:00, 384.39it/s]


In [47]:
oversample_df

Unnamed: 0,text,scientific_entities,is_claim_with_sciterm,is_claim,contains_arg,contains_scientific_term,has_url,urls,processed_urls,tlds,...,subdomain_domain_tlds,has_sci_domain,has_sci_subdomain,has_sci_mag_domain,has_sci_news_domain,is_related_to_research,mentions_science_research_in_general,mentions_scientist,mentions_publications,mentions_research_method
0,"""*Remembering when we used to tweet about scie...",1.0,False,False,False,True,True,[https://www.ncbi.nlm.nih.gov/pmc/articles/PMC...,[https://www.ncbi.nlm.nih.gov/pmc/articles/PMC...,[gov],...,[www.ncbi.nlm.nih.gov],True,True,False,False,False,False,False,False,False
1,"""-demand-side energy policies flourish like vi...",1.0,False,False,False,False,True,[https://t.co/UKERCpublications/],[https://t.co/UKERCpublications/],[co],...,[t.co],False,False,False,False,False,False,False,False,False
2,"""@allvisionary✨ 25% eye-costly increases! Imag...",1.0,False,False,False,True,True,[https://health--insurance.me/articulo.php?id=...,[https://health--insurance.me/articulo.php?id=...,[me],...,[health--insurance.me],False,False,False,False,False,False,False,False,False
3,"""@user The trials aren’t focused on whether th...",1.0,False,False,False,True,True,[https://www.bmj.com/content/371/bmj.m4037],[https://www.bmj.com/content/371/bmj.m4037],[com],...,[www.bmj.com],True,True,False,False,False,False,False,False,False
4,"""@user check out this incredible journey of di...",1.0,False,False,False,True,True,[https://en.m.wikipedia.org/wiki/Nobel_disease],[https://en.m.wikipedia.org/wiki/Nobel_disease],[org],...,[en.m.wikipedia.org],False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,Mild temperatures alone won't stop the virus; ...,1.0,False,False,True,True,False,[],[],[],...,[],False,False,False,False,False,False,False,False,False
607,Hey there! Just wanted to share this cool rese...,1.0,False,False,False,True,False,[],[],[],...,[],False,False,False,False,False,False,False,False,False
608,"""@GregLaden: The War on Science is real... but...",1.0,False,False,False,True,False,[],[],[],...,[],False,False,False,False,True,True,False,False,False
610,"""When marijuana users undergo endoscopies or c...",1.0,True,True,True,True,False,[],[],[],...,[],False,False,False,False,False,False,False,False,False
