In [None]:
!pip install datasets spacy pandas torch_geometric transformers gensim

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDown

In [None]:
import os
import json
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
import gensim.downloader as api
from datasets import load_dataset
from gensim.models import KeyedVectors

In [4]:
# Fetch the environmental-claims corpus
dataset = load_dataset('climatebert/environmental_claims')

# Materialise each split as its own pandas DataFrame
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

(…)-00000-of-00001-98aa5228a06a17d0.parquet:   0%|          | 0.00/215k [00:00<?, ?B/s]

(…)-00000-of-00001-2553e47d408fab28.parquet:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

(…)-00000-of-00001-79fd931297fff765.parquet:   0%|          | 0.00/28.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2117 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/265 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

In [5]:
# Load the spaCy 'en_core_web_sm' pipeline once at module level.
# create_dependency_graph reads each token's dependency label, head and POS tag
# from the documents this pipeline produces.
nlp = spacy.load('en_core_web_sm')

def create_dependency_graph(sentence):
    """Parse ``sentence`` with the module-level ``nlp`` pipeline and describe each token.

    Returns a list holding one ``(token text, dependency label, head text, POS tag)``
    tuple per token, in the order the pipeline yields the tokens. The POS tag is
    the token's own tag, not its head's.
    """
    rows = []
    for tok in nlp(sentence):
        rows.append((tok.text, tok.dep_, tok.head.text, tok.pos_))
    return rows

# Parse every sentence of every split and store the result next to the text
for split_df in (df_train, df_val, df_test):
    split_df['Dependency_Graph'] = split_df['text'].apply(create_dependency_graph)

In [None]:
# Running tallies of every dependency label and POS tag seen so far
dep_types, pos_tag_types = Counter(), Counter()

def count_types(dependency_graph):
    """Add one sentence's dependency labels and POS tags to the global tallies.

    ``dependency_graph`` is an iterable of 4-tuples whose second item is the
    dependency label and whose fourth item is the POS tag. The function updates
    ``dep_types`` and ``pos_tag_types`` in place and returns nothing.
    """
    for _token, dep_label, _head, pos_tag in dependency_graph:
        dep_types.update((dep_label,))
        pos_tag_types.update((pos_tag,))

# Tally labels over all three splits so the vocabularies cover every sentence
for df in (df_train, df_val, df_test):
    for sentence_graph in df['Dependency_Graph']:
        count_types(sentence_graph)

# Number the labels in first-seen order to obtain the final index mappings
dep_type_to_index = dict(zip(dep_types, range(len(dep_types))))
pos_to_index = dict(zip(pos_tag_types, range(len(pos_tag_types))))

# Persist both mappings so the model code can reuse the exact same indices
mappings = {'dep_to_index': dep_type_to_index, 'pos_to_index': pos_to_index}
with open('mappings.json', 'w') as f:
    json.dump(mappings, f)

print(f"Found {len(dep_type_to_index)} unique dependency types.")
print(f"Found {len(pos_to_index)} unique POS tag types.")

print("Loading Word2Vec model (this may take a while)...")

# Pre-trained Word2Vec embeddings (Google News vectors)
word2vec_model = api.load('word2vec-google-news-300')

# Helper function to convert words to vectors using Word2Vec
def word_to_vec(word):
    """Return the embedding of ``word`` from the module-level ``word2vec_model``.

    Args:
        word: Token text to look up.

    Returns:
        The model's vector for ``word``; for a word missing from the vocabulary,
        a zero vector of length ``word2vec_model.vector_size``.
    """
    try:
        return word2vec_model[word]
    except KeyError:
        # Out-of-vocabulary word: fall back to zeros, created with the same dtype
        # as the model's vectors so known and unknown words never mix dtypes.
        return np.zeros(word2vec_model.vector_size, dtype=word2vec_model.vectors.dtype)

# Show the dependency-label -> integer-index mapping built earlier in this cell
# (it is an index lookup, not a one-hot encoding, and needs no rebuilding).
dep_type_to_index

{'det': 0,
 'nsubj': 1,
 'aux': 2,
 'ROOT': 3,
 'amod': 4,
 'dobj': 5,
 'prep': 6,
 'cc': 7,
 'conj': 8,
 'compound': 9,
 'pobj': 10,
 'advmod': 11,
 'punct': 12,
 'meta': 13,
 'ccomp': 14,
 'acomp': 15,
 'mark': 16,
 'advcl': 17,
 'csubj': 18,
 'relcl': 19,
 'poss': 20,
 'nmod': 21,
 'predet': 22,
 'appos': 23,
 'xcomp': 24,
 'attr': 25,
 'nummod': 26,
 'preconj': 27,
 'dative': 28,
 'intj': 29,
 'pcomp': 30,
 'case': 31,
 'npadvmod': 32,
 'prt': 33,
 'nsubjpass': 34,
 'auxpass': 35,
 'agent': 36,
 'parataxis': 37,
 'acl': 38,
 'quantmod': 39,
 'dep': 40,
 'neg': 41,
 'expl': 42,
 'oprd': 43,
 'csubjpass': 44}

In [None]:
# Inspect the parsed graph stored for the training row labelled 0
df_train.loc[0, 'Dependency_Graph']

[('The', 'det', 'project', 'NOUN'),
 ('project', 'nsubj', 'make', 'VERB'),
 ('will', 'aux', 'make', 'VERB'),
 ('make', 'ROOT', 'make', 'VERB'),
 ('a', 'det', 'contribution', 'NOUN'),
 ('significant', 'amod', 'contribution', 'NOUN'),
 ('contribution', 'dobj', 'make', 'VERB'),
 ('to', 'prep', 'contribution', 'NOUN'),
 ('the', 'det', 'strategy', 'NOUN'),
 ('German', 'amod', 'strategy', 'NOUN'),
 ('and', 'cc', 'German', 'ADJ'),
 ('European', 'conj', 'German', 'ADJ'),
 ('hydrogen', 'compound', 'strategy', 'NOUN'),
 ('strategy', 'pobj', 'to', 'ADP'),
 ('and', 'cc', 'make', 'VERB'),
 ('hence', 'advmod', 'to', 'ADP'),
 ('to', 'conj', 'make', 'VERB'),
 ('achievement', 'pobj', 'to', 'ADP'),
 ('of', 'prep', 'achievement', 'NOUN'),
 ('the', 'det', 'targets', 'NOUN'),
 ('climate', 'compound', 'targets', 'NOUN'),
 ('targets', 'pobj', 'of', 'ADP'),
 ('.', 'punct', 'make', 'VERB')]

In [None]:
def process_and_save_graph_data(df, filename):
    """
    Build one token-level dependency graph per row of ``df`` and save them as JSON.

    Each sentence in ``df['text']`` is parsed with the module-level ``nlp``
    pipeline. Every token becomes its own node whose id is the token's position
    in the parsed sentence, so a word that occurs several times (e.g. two "to"
    tokens) yields several distinct nodes, and every edge points at the token's
    actual syntactic head rather than at whichever token shares the head's text.

    Args:
        df: DataFrame with a 'text' column (sentence string) and a 'label' column.
        filename: Path of the JSON file to write.

    The JSON file holds a list with one dict per row:
        'targets'       -- ``[label]``
        'graph'         -- one ``(token_id, dependency_index, head_id)`` edge per
                           token; the root token is its own head, giving a self-loop
        'node_features' -- per-node vector from ``word_to_vec`` (as a plain list)
        'pos_indices'   -- per-node index looked up in ``pos_to_index``

    Raises:
        KeyError: if ``df`` lacks the 'text' or 'label' column.
    """
    missing_columns = [name for name in ('text', 'label') if name not in df.columns]
    if missing_columns:
        raise KeyError(f"DataFrame is missing required column(s): {missing_columns}")

    processed_graphs = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {filename}"):
        label = row['label']

        node_features = []
        pos_indices = []
        edges = []

        # Node ids are token positions, which keeps repeated words apart and
        # makes head references unambiguous.
        for token in nlp(row['text']):
            node_features.append(word_to_vec(token.text).tolist())
            pos_indices.append(pos_to_index.get(token.pos_, 0))  # Default to 0 for unknown tags
            dep_id = dep_type_to_index.get(token.dep_, 0)  # Default to 0 for unknown deps
            edges.append((token.i, dep_id, token.head.i))

        processed_graphs.append({
            'targets': [label],
            'graph': edges,
            'node_features': node_features,
            'pos_indices': pos_indices,
        })

    with open(filename, 'w') as f:
        json.dump(processed_graphs, f)

# Destination file for each split's serialised graphs
train_json_path = "env_claim_train_pos.json"
val_json_path = "env_claim_val_pos.json"
test_json_path = "env_claim_test_pos.json"

# Convert and write the splits one after another: train, validation, test
for split_df, json_path in (
    (df_train, train_json_path),
    (df_val, val_json_path),
    (df_test, test_json_path),
):
    process_and_save_graph_data(split_df, json_path)