In [1]:
pip install datasets spacy pandas torch_geometric transformers gensim

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDown

In [3]:
import os
import ast
import json
import spacy
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import networkx as nx
from tqdm import tqdm
import torch.optim as optim
from torch.optim import Adam
import matplotlib.pyplot as plt
import torch.nn.functional as F
from datasets import load_dataset
from gensim.models import Word2Vec
from torch.nn import CrossEntropyLoss
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from transformers import RobertaModel, RobertaTokenizer
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool

In [4]:
# Fetch the environmental-claims corpus (train / validation / test splits)
dataset = load_dataset('climatebert/environmental_claims')

# Materialise each split as its own pandas DataFrame
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

(…)-00000-of-00001-98aa5228a06a17d0.parquet:   0%|          | 0.00/215k [00:00<?, ?B/s]

(…)-00000-of-00001-2553e47d408fab28.parquet:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

(…)-00000-of-00001-79fd931297fff765.parquet:   0%|          | 0.00/28.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2117 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/265 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

In [5]:
# spaCy's small English pipeline, used here for dependency parsing and POS tagging
nlp = spacy.load('en_core_web_sm')

def create_dependency_graph(sentence):
    """Parse ``sentence`` with spaCy and describe every token as one tuple.

    Returns a list, in document order, of
    ``(token text, dependency label, head token text, token POS tag)``.
    The POS tag is the token's own tag, not its head's.
    """
    edges = []
    for tok in nlp(sentence):
        edges.append((tok.text, tok.dep_, tok.head.text, tok.pos_))
    return edges

# Parse every sentence of every split and store the result in a new column
for frame in (df_train, df_val, df_test):
    frame['Dependency_Graph'] = frame['text'].apply(create_dependency_graph)

In [7]:
# Vocabulary accumulator: every distinct token text seen in any dependency graph
unique_tokens = set()

def extract_tokens(dependency_graph_str):
    """Record the dependent and head text of every entry in ``unique_tokens``.

    Despite the parameter name, the argument is an iterable of tuples whose
    element 0 is the dependent token text and element 2 is the head token text
    (the layout produced by ``create_dependency_graph``), not a string.
    The function works purely by side effect and returns None.
    """
    register = unique_tokens.add
    for entry in dependency_graph_str:
        register(entry[0])  # dependent token text
        register(entry[2])  # head token text

# Collect the vocabulary from all three splits.
# extract_tokens works by side effect (it fills unique_tokens), so a plain loop
# is used instead of .apply, which would only build throw-away Series of None.
for split_frame in (df_train, df_val, df_test):
    for split_graph in split_frame['Dependency_Graph']:
        extract_tokens(split_graph)

# Sort before numbering: iteration order of a set of strings is not stable
# across interpreter sessions, so enumerating the raw set would hand out
# different indices every time the kernel is restarted.
token_to_index = {token: idx for idx, token in enumerate(sorted(unique_tokens))}

# Number of unique tokens identified
len(unique_tokens)

8822

In [10]:
# Frequency tables for dependency labels and POS tags (keys stay in first-seen order)
dep_types = Counter()
pos_tag_types = Counter()

def count_types(dependency_graph):
    """Tally the dependency label and POS tag of every 4-tuple in ``dependency_graph``.

    Each entry is unpacked as ``(token, dep, head, pos)``; ``dep`` is counted in
    ``dep_types`` and ``pos`` in ``pos_tag_types``. Returns None.
    """
    for entry in dependency_graph:
        _, dep_label, _, pos_tag = entry
        dep_types.update([dep_label])
        pos_tag_types.update([pos_tag])

# Feed every dependency graph of every split through the counters
for df in (df_train, df_val, df_test):
    for parsed_graph in df['Dependency_Graph']:
        count_types(parsed_graph)

embedding_dim = 16  # Dimension of the dense POS embeddings
POS_EMBEDDING_SEED = 42  # Fixed seed so the random POS embeddings are identical on every run
pos_to_index = {pos: idx for idx, pos in enumerate(pos_tag_types)}  # POS tag -> row index, in first-seen order
# Random (untrained) dense embeddings, one row per POS tag. They are drawn from a
# seeded generator because they end up inside the saved node features; an
# unseeded draw would make every regenerated JSON file different.
pos_embeddings = np.random.default_rng(POS_EMBEDDING_SEED).standard_normal(
    (len(pos_to_index), embedding_dim)
)

# Load pre-trained Word2Vec model (Google News vectors) from Google Drive.
# The cell that mounts Drive sits further down the notebook, so on a
# top-to-bottom run the path below would not exist yet; mount it here when missing.
if not os.path.isdir('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')
word2vec_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz', binary=True)

def word_to_vec(word):
    """Look up the Word2Vec embedding of ``word``.

    Returns the stored vector when the word is in the model's vocabulary;
    otherwise returns an all-zero vector of length ``word2vec_model.vector_size``.
    """
    if word in word2vec_model:
        return word2vec_model[word]
    # Out-of-vocabulary word: fall back to a zero vector
    return np.zeros(word2vec_model.vector_size)

# Assign each dependency label an integer index, numbered in first-seen order
# (this is an index lookup table, not a one-hot encoding)
dep_type_to_index = dict(zip(dep_types, range(len(dep_types))))
dep_type_to_index

{'det': 0,
 'nsubj': 1,
 'aux': 2,
 'ROOT': 3,
 'amod': 4,
 'dobj': 5,
 'prep': 6,
 'cc': 7,
 'conj': 8,
 'compound': 9,
 'pobj': 10,
 'advmod': 11,
 'punct': 12,
 'meta': 13,
 'ccomp': 14,
 'acomp': 15,
 'mark': 16,
 'advcl': 17,
 'csubj': 18,
 'relcl': 19,
 'poss': 20,
 'nmod': 21,
 'predet': 22,
 'appos': 23,
 'xcomp': 24,
 'attr': 25,
 'nummod': 26,
 'preconj': 27,
 'dative': 28,
 'intj': 29,
 'pcomp': 30,
 'case': 31,
 'npadvmod': 32,
 'prt': 33,
 'nsubjpass': 34,
 'auxpass': 35,
 'agent': 36,
 'parataxis': 37,
 'acl': 38,
 'quantmod': 39,
 'dep': 40,
 'neg': 41,
 'expl': 42,
 'oprd': 43,
 'csubjpass': 44}

In [9]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used elsewhere in this notebook resolve. Run this before any cell that
# reads from or writes to those paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show the parsed dependency tuples of the first training sentence
df_train.loc[0, 'Dependency_Graph']

[('The', 'det', 'project', 'NOUN'),
 ('project', 'nsubj', 'make', 'VERB'),
 ('will', 'aux', 'make', 'VERB'),
 ('make', 'ROOT', 'make', 'VERB'),
 ('a', 'det', 'contribution', 'NOUN'),
 ('significant', 'amod', 'contribution', 'NOUN'),
 ('contribution', 'dobj', 'make', 'VERB'),
 ('to', 'prep', 'contribution', 'NOUN'),
 ('the', 'det', 'strategy', 'NOUN'),
 ('German', 'amod', 'strategy', 'NOUN'),
 ('and', 'cc', 'German', 'ADJ'),
 ('European', 'conj', 'German', 'ADJ'),
 ('hydrogen', 'compound', 'strategy', 'NOUN'),
 ('strategy', 'pobj', 'to', 'ADP'),
 ('and', 'cc', 'make', 'VERB'),
 ('hence', 'advmod', 'to', 'ADP'),
 ('to', 'conj', 'make', 'VERB'),
 ('achievement', 'pobj', 'to', 'ADP'),
 ('of', 'prep', 'achievement', 'NOUN'),
 ('the', 'det', 'targets', 'NOUN'),
 ('climate', 'compound', 'targets', 'NOUN'),
 ('targets', 'pobj', 'of', 'ADP'),
 ('.', 'punct', 'make', 'VERB')]

In [11]:
def _node_feature_vector(word, pos, word2vec_func):
    """Return one node's feature list: the word vector followed by its POS embedding."""
    pos_idx = pos_to_index.get(pos, 0)  # Unknown POS tags fall back to row 0
    return np.concatenate([word2vec_func(word), pos_embeddings[pos_idx]]).tolist()


def process_and_save_graph_data_fixed(df, word2vec_func, dep_type_idx, filename):
    """Turn every row's dependency parse into a graph record and write all records to JSON.

    Parameters
    ----------
    df : DataFrame-like
        Iterated with ``iterrows()``. Each row must provide ``'Dependency_Graph'``
        (an iterable of ``(token, dep, head, pos)`` tuples, where ``pos`` is the
        POS tag of ``token``) and ``'label'``.
    word2vec_func : callable
        Maps a token string to its embedding vector.
    dep_type_idx : dict
        Maps a dependency label to its integer index. Labels missing from the
        mapping are written as index 0.
    filename : str
        Path of the JSON file to create (overwritten if it exists).

    Output
    ------
    A JSON list with one object per row:
    ``{'targets': [label], 'graph': [[src, dep_idx, dst], ...], 'node_features': [...]}``.
    ``src`` is the dependent's node index and ``dst`` the head's node index;
    ``node_features[i]`` is the word vector of node ``i`` concatenated with its
    POS embedding. The function returns None.

    Notes
    -----
    * Reads the module-level ``pos_to_index`` and ``pos_embeddings`` tables.
    * Nodes are keyed by token *text*, so a word that occurs several times in one
      sentence is represented by a single shared node. The tuples carry no token
      position, so occurrences cannot be told apart here.
    """
    graphs = []

    for _, row in df.iterrows():
        dependency_graph = row['Dependency_Graph']
        label = row['label']

        # POS tag of each token text, taken from the tuple in which that text is
        # the dependent (first occurrence wins). A head is often seen before its
        # own tuple, and the tuple at hand only carries the dependent's POS, so
        # the head's tag has to be looked up rather than reused.
        pos_by_token = {}
        for token, _, _, pos in dependency_graph:
            pos_by_token.setdefault(token, pos)

        token_to_idx = {}
        node_features = []
        edges = []

        for token, dep, head, pos in dependency_graph:
            # Register the dependent first, then the head, so node numbering
            # follows first appearance in the parse.
            for word in (token, head):
                if word not in token_to_idx:
                    token_to_idx[word] = len(token_to_idx)
                    # Fall back to the tuple's POS only if the word never
                    # appears as a dependent in this graph.
                    word_pos = pos_by_token.get(word, pos)
                    node_features.append(_node_feature_vector(word, word_pos, word2vec_func))

            src_idx = token_to_idx[token]
            dst_idx = token_to_idx[head]
            dep_idx = dep_type_idx.get(dep, 0)  # Numerical index for the dependency label

            edges.append((src_idx, dep_idx, dst_idx))

        graphs.append({
            'targets': [label],
            'graph': edges,
            'node_features': node_features
        })

    # Save the processed graphs to a JSON file
    with open(filename, 'w') as file:
        json.dump(graphs, file)

# Destination files on the mounted Drive, one per split
train_json_path = "/content/drive/MyDrive/env_claim_train.json"
val_json_path = "/content/drive/MyDrive/env_claim_val.json"
test_json_path = "/content/drive/MyDrive/env_claim_test.json"

# Build and save the graph records of every split with the same word-vector
# lookup and dependency-label index
for split_df, json_path in (
    (df_train, train_json_path),
    (df_val, val_json_path),
    (df_test, test_json_path),
):
    process_and_save_graph_data_fixed(split_df, word_to_vec, dep_type_to_index, json_path)