In [1]:
!pip install datasets spacy pandas torch_geometric transformers gensim stanza

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting emoji (from stanza)
  Downloading

In [1]:
import os
import json
import spacy
import stanza
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
import gensim.downloader as api
from datasets import load_dataset
from gensim.models import KeyedVectors
from typing import List, Tuple, Dict, Optional

In [2]:
# Fetch the environmental-claims corpus from the Hugging Face Hub
dataset = load_dataset('climatebert/environmental_claims')

# Materialise each split as its own pandas DataFrame
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-98aa5228a06a17(…):   0%|          | 0.00/215k [00:00<?, ?B/s]

data/validation-00000-of-00001-2553e47d4(…):   0%|          | 0.00/28.9k [00:00<?, ?B/s]

data/test-00000-of-00001-79fd931297fff76(…):   0%|          | 0.00/28.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2117 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/265 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

In [9]:
# spaCy English pipeline used for dependency parsing
nlp = spacy.load('en_core_web_sm')

def create_dependency_graph(sentence):
    """Parse `sentence` with spaCy and describe every token as a 4-tuple.

    Returns a list with one ``(token text, dependency label, head text,
    POS tag)`` tuple per token, in document order.
    """
    rows = []
    for token in nlp(sentence):
        rows.append((token.text, token.dep_, token.head.text, token.pos_))
    return rows

# Parse the text of every split and keep the result in a new column
for split_df in (df_train, df_val, df_test):
    split_df['Dependency_Graph'] = split_df['text'].apply(create_dependency_graph)



# Module-level tallies of dependency labels and POS tags seen so far
dep_types = Counter()
pos_tag_types = Counter()

def count_types(dependency_graph):
    """Add one graph's dependency labels and POS tags to the global tallies.

    `dependency_graph` is an iterable of 4-tuples whose second item is the
    dependency label and whose fourth item is the POS tag. The function
    updates `dep_types` and `pos_tag_types` in place and returns None.
    """
    rows = list(dependency_graph)
    dep_types.update(dep for _, dep, _, _ in rows)
    pos_tag_types.update(pos for _, _, _, pos in rows)

# Tally the labels of every graph in every split
for df in (df_train, df_val, df_test):
    df['Dependency_Graph'].map(count_types)

# Number the labels in the order the counters first saw them
dep_type_to_index = dict(zip(dep_types, range(len(dep_types))))
pos_to_index = dict(zip(pos_tag_types, range(len(pos_tag_types))))

# Persist both lookup tables so the model code can reload them
mappings = {'dep_to_index': dep_type_to_index, 'pos_to_index': pos_to_index}
with open('mappings.json', 'w') as f:
    f.write(json.dumps(mappings))

print(f"Found {len(dep_type_to_index)} unique dependency types.")
print(f"Found {len(pos_to_index)} unique POS tag types.")

In [3]:
# Stanza English pipeline; this rebinds `nlp`, replacing any earlier parser.
nlp = stanza.Pipeline(
    lang="en",
    processors=",".join(["tokenize", "mwt", "pos", "lemma", "depparse"]),
    use_gpu=True,  # request GPU execution; pass False to force CPU
)

def create_dependency_graph(sentence: str) -> List[Tuple[str, str, str, str]]:
    """
    Parse `sentence` with the Stanza pipeline bound to `nlp`.

    Returns a list of tuples: (token_text, ud_deprel, head_text, upos),
    one per syntactic word, across all sentences Stanza finds in the text.
    The head of a sentence root is reported as the literal string "ROOT".

    Missing values (None, NaN, other pandas NA scalars) and empty or
    whitespace-only strings yield an empty list without calling the
    pipeline. Previously a NaN cell was stringified to "nan" and parsed
    as if it were a real word.
    """
    if not isinstance(sentence, str):
        # Only scalars can be "missing"; pd.isna on a container would
        # return an array instead of a bool.
        is_missing = pd.api.types.is_scalar(sentence) and pd.isna(sentence)
        sentence = "" if is_missing else str(sentence)

    # Nothing to parse: avoid depending on how the pipeline treats empty input.
    if not sentence.strip():
        return []

    doc = nlp(sentence)
    triples = []

    for sent in doc.sentences:
        words = sent.words  # syntactic words
        for w in words:
            # In Stanza, w.head is the index (1-based) of the head in this sentence; 0 means ROOT.
            head_text = "ROOT" if w.head == 0 else words[w.head - 1].text
            triples.append((w.text, w.deprel, head_text, w.upos))

    return triples

# Parse the text of every split and keep the result in a new column
for split_df in (df_train, df_val, df_test):
    split_df["Dependency_Graph"] = split_df["text"].apply(create_dependency_graph)



# Fresh module-level tallies of dependency labels and POS tags
dep_types = Counter()
pos_tag_types = Counter()

def count_types(dependency_graph):
    """Record the labels of one dependency graph in the global counters.

    Each entry of `dependency_graph` is unpacked as a 4-tuple; its second
    item is counted in `dep_types` and its fourth in `pos_tag_types`.
    Returns None.
    """
    entries = list(dependency_graph)
    dep_types.update(relation for _, relation, _, _ in entries)
    pos_tag_types.update(tag for _, _, _, tag in entries)

# Tally the labels of every graph in every split
for df in (df_train, df_val, df_test):
    df['Dependency_Graph'].map(count_types)

# Number the labels in the order the counters first saw them
dep_type_to_index = dict(zip(dep_types, range(len(dep_types))))
pos_to_index = dict(zip(pos_tag_types, range(len(pos_tag_types))))

# Persist both lookup tables so the model code can reload them
mappings = {'dep_to_index': dep_type_to_index, 'pos_to_index': pos_to_index}
with open('mappings_new.json', 'w') as f:
    f.write(json.dumps(mappings))

print(f"Found {len(dep_type_to_index)} unique dependency types.")
print(f"Found {len(pos_to_index)} unique POS tag types.")

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/tokenize/combined.pt:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/mwt/combined.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/pos/combined_charlm.pt:   0%| …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/lemma/combined_nocharlm.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/depparse/combined_charlm.pt:  …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/backward_charlm/1billion.pt:  …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/forward_charlm/1billion.pt:   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/pretrain/conll17.pt:   0%|    …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [11]:
# Map each dependency label to an integer index (order in which dep_types first saw it)
dep_type_to_index = dict(zip(dep_types, range(len(dep_types))))
dep_type_to_index

{'det': 0,
 'nsubj': 1,
 'aux': 2,
 'ROOT': 3,
 'amod': 4,
 'dobj': 5,
 'prep': 6,
 'cc': 7,
 'conj': 8,
 'compound': 9,
 'pobj': 10,
 'advmod': 11,
 'punct': 12,
 'dep': 13,
 'ccomp': 14,
 'acomp': 15,
 'mark': 16,
 'advcl': 17,
 'csubj': 18,
 'relcl': 19,
 'poss': 20,
 'nmod': 21,
 'attr': 22,
 'xcomp': 23,
 'nummod': 24,
 'preconj': 25,
 'intj': 26,
 'npadvmod': 27,
 'pcomp': 28,
 'case': 29,
 'prt': 30,
 'nsubjpass': 31,
 'auxpass': 32,
 'agent': 33,
 'dative': 34,
 'parataxis': 35,
 'acl': 36,
 'appos': 37,
 'quantmod': 38,
 'neg': 39,
 'expl': 40,
 'meta': 41,
 'oprd': 42,
 'predet': 43,
 'csubjpass': 44}

In [6]:
print("Loading Word2Vec model (this may take a while)...")

# Pre-trained Word2Vec vectors (Google News), obtained through gensim's downloader
word2vec_model = api.load(name='word2vec-google-news-300')

# Helper function to convert words to vectors using Word2Vec
def word_to_vec(word):
    """Return the Word2Vec embedding of `word` from `word2vec_model`.

    Words missing from the model's vocabulary map to an all-zero vector of
    length `word2vec_model.vector_size`. The fallback uses the dtype of the
    model's own vector matrix, so every returned array has the same dtype
    (np.zeros alone would default to float64).
    """
    try:
        return word2vec_model[word]
    except KeyError:
        # Out-of-vocabulary word: zero vector matching the model's shape and dtype
        return np.zeros(word2vec_model.vector_size, dtype=word2vec_model.vectors.dtype)

Loading Word2Vec model (this may take a while)...


In [12]:
# Inspect the parsed graph of the first training example (row label 0)
df_train.loc[0, 'Dependency_Graph']

[('The', 'det', 'project', 'DET'),
 ('project', 'nsubj', 'make', 'NOUN'),
 ('will', 'aux', 'make', 'AUX'),
 ('make', 'ROOT', 'make', 'VERB'),
 ('a', 'det', 'contribution', 'DET'),
 ('significant', 'amod', 'contribution', 'ADJ'),
 ('contribution', 'dobj', 'make', 'NOUN'),
 ('to', 'prep', 'contribution', 'ADP'),
 ('the', 'det', 'strategy', 'DET'),
 ('German', 'amod', 'strategy', 'ADJ'),
 ('and', 'cc', 'German', 'CCONJ'),
 ('European', 'conj', 'German', 'ADJ'),
 ('hydrogen', 'compound', 'strategy', 'NOUN'),
 ('strategy', 'pobj', 'to', 'NOUN'),
 ('and', 'cc', 'make', 'CCONJ'),
 ('hence', 'advmod', 'to', 'ADV'),
 ('to', 'conj', 'make', 'PART'),
 ('achievement', 'pobj', 'to', 'NOUN'),
 ('of', 'prep', 'achievement', 'ADP'),
 ('the', 'det', 'targets', 'DET'),
 ('climate', 'compound', 'targets', 'NOUN'),
 ('targets', 'pobj', 'of', 'NOUN'),
 ('.', 'punct', 'make', 'PUNCT')]

In [13]:
def _resolve_head_position(position, head, dep, positions_by_text):
    """Return the token index that `head` refers to, or None if no token has that text."""
    candidates = positions_by_text.get(head)
    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]
    if str(dep).lower() != 'root':
        # Only a root may be its own head (spaCy reports the root's head as
        # itself), so any other token must not attach to itself.
        candidates = [c for c in candidates if c != position] or candidates
    # The head is only known by its text; when that text occurs several times,
    # pick the occurrence closest to the dependent (ties go to the earlier one).
    return min(candidates, key=lambda c: abs(c - position))


def _build_graph(dependency_graph):
    """Turn one list of (token, dep, head, pos) tuples into (edges, node_features, pos_indices)."""
    tokens = list(dependency_graph)

    # Every position at which each surface form occurs, in sentence order
    positions_by_text = {}
    for position, (token, _, _, _) in enumerate(tokens):
        positions_by_text.setdefault(token, []).append(position)

    # One node per token occurrence; node id == position in the sentence
    node_features = [word_to_vec(token).tolist() for token, _, _, _ in tokens]
    pos_indices = [pos_to_index.get(pos, 0) for _, _, _, pos in tokens]  # 0 for unknown tags

    synthetic_ids = {}  # head text that matches no token (e.g. "ROOT") -> node id
    edges = []
    for position, (_, dep, head, _) in enumerate(tokens):
        head_id = _resolve_head_position(position, head, dep, positions_by_text)
        if head_id is None:
            head_id = synthetic_ids.get(head)
            if head_id is None:
                # Head is not a token of this text: give it its own node,
                # tagged 'UNKNOWN' (which falls back to index 0).
                head_id = len(node_features)
                synthetic_ids[head] = head_id
                node_features.append(word_to_vec(head).tolist())
                pos_indices.append(pos_to_index.get('UNKNOWN', 0))
        dep_id = dep_type_to_index.get(dep, 0)  # 0 for unknown deps
        edges.append((position, dep_id, head_id))

    return edges, node_features, pos_indices


def process_and_save_graph_data(df, filename):
    """
    Processes a dataframe to create graph structures and saves them to a JSON file.

    For every row, the 'Dependency_Graph' column (a list of
    (token, dep, head, pos) tuples) becomes one record with the keys:
      - 'targets': [label] taken from the 'label' column
      - 'graph': list of (source node id, dependency index, head node id)
      - 'node_features': one word_to_vec embedding (as a list) per node
      - 'pos_indices': one POS tag index per node

    Each token occurrence gets its own node, so repeated words such as
    "the" or "to" are no longer merged into a single node (which used to
    fuse their edges and keep only the first occurrence's POS tag).

    Heads are identified by text only. If the head text occurs more than
    once, the occurrence nearest to the dependent is used; this is a
    heuristic, since the tuples carry no head position. A head text that
    matches no token (for example a "ROOT" placeholder) gets one extra node
    per text, appended after the token nodes.

    Dependency labels and POS tags missing from `dep_type_to_index` /
    `pos_to_index` map to index 0.
    """
    processed_graphs = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {filename}"):
        edges, node_features, pos_indices = _build_graph(row['Dependency_Graph'])
        processed_graphs.append({
            'targets': [row['label']],
            'graph': edges,
            'node_features': node_features,
            'pos_indices': pos_indices,
        })

    with open(filename, 'w') as f:
        json.dump(processed_graphs, f)

# Output file for each split
train_json_path = "env_claim_train.json"
val_json_path = "env_claim_val.json"
test_json_path = "env_claim_test.json"

# Convert and write the splits in train / validation / test order
for split_df, json_path in (
    (df_train, train_json_path),
    (df_val, val_json_path),
    (df_test, test_json_path),
):
    process_and_save_graph_data(split_df, json_path)

Processing env_claim_train.json: 100%|██████████| 2117/2117 [00:01<00:00, 1985.17it/s]
Processing env_claim_val.json: 100%|██████████| 265/265 [00:00<00:00, 1339.81it/s]
Processing env_claim_test.json: 100%|██████████| 265/265 [00:00<00:00, 2233.15it/s]
