In [20]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder
import ast

In [38]:
def string_to_list_of_lists(df, col_name):
    df[col_name] = df[col_name].apply(lambda x: ast.literal_eval(x) if not isinstance(x, list) else x)
    return df

def integer_encode_list(series):
    label_encoder = LabelEncoder()
    
    # Concatenate all the lists in the series to fit the encoder
    concatenated = [item for sublist in series for item in sublist if isinstance(item, str)]
    unique_elements = list(set(concatenated))
    label_encoder.fit(unique_elements)
    
    # Transform each list separately and store in a new series
    encoded_series = series.apply(lambda x: [label_encoder.transform([elem]).tolist()[0] if isinstance(elem, str) else elem for elem in x])
    
    return encoded_series

def create_graph_instance(tokens, pos_encoded, heads, word_embeddings, article_id):
    sentence_graphs = []
    
    for sentence_tokens, sentence_pos, sentence_heads, sentence_word_embeddings in zip(tokens, pos_encoded, heads, word_embeddings):
        # Create a dictionary to map tokens to their corresponding embeddings
        token_embedding_map = {embedding_tuple[1]: embedding_tuple[2] for embedding_tuple in sentence_word_embeddings}
        
        # Create a list of tuples containing the pos tag and the corresponding word embedding, or zeros if the token has no embedding
        node_features = [(pos, token_embedding_map.get(token, np.zeros(300))) for pos, token in zip(sentence_pos, sentence_tokens)]
        
        # Create nodes and assign features
        node_features = torch.tensor([([pos] if type(pos) == int else [pos]) + list(embedding) for pos, embedding in node_features], dtype=torch.float).view(-1, 301)
        
        # Convert heads from string to int and create edges using head indices
        sentence_heads = list(map(int, sentence_heads))
        edge_index = [[head_idx, idx] for idx, head_idx in enumerate(sentence_heads)]
        
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        
        # Assign article_id as ground truth
        y = torch.tensor(article_id, dtype=torch.long)
        
        # Create Data instance for PyTorch Geometric
        graph = Data(x=node_features, edge_index=edge_index, y=y)
        sentence_graphs.append(graph)
    
    return sentence_graphs

def process_dataframe(hdf5_path):
    # Load the dataframe from the hdf5 file
    df = pd.read_hdf(hdf5_path)

    # Convert string representations of lists to actual lists
    columns_to_convert = ['pos', 'dep', 'heads', 'tokens', 'word_embeddings']
    for col in columns_to_convert:
        df = string_to_list_of_lists(df, col)

    # Integer encode the pos and dep columns
    pos_encoded = integer_encode_list(df['pos'])
    dep_encoded = integer_encode_list(df['dep'])

    # Create graphs
    graphs = []
    for i, row in df.iterrows():
        tokens = row['tokens']
        pos = pos_encoded.loc[i]
        dep = dep_encoded.loc[i]
        heads = row['heads']
        word_embeddings = row['word_embeddings']

        # Check if the current dataframe is a question dataframe
        if 'article_ids' in df.columns:
            article_ids = ast.literal_eval(row['article_ids'])
            if isinstance(article_ids, tuple):
                for article_id in article_ids:
                    graphs.extend(create_graph_instance(tokens, pos, heads, word_embeddings, article_id))
            else:
                graphs.extend(create_graph_instance(tokens, pos, heads, word_embeddings, article_ids))
        else:
            article_id = row['id']
            graphs.extend(create_graph_instance(tokens, pos, heads, word_embeddings, article_id))

    return graphs

In [39]:
q_test_graphs = process_dataframe("../../local_datasets/bsard_extra/gsard_expert_questions_test.h5")

ValueError: too many dimensions 'str'

In [6]:
df = pd.read_hdf("../../local_datasets/bsard_extra/gsard_expert_questions_test.h5")

## Testing

In [40]:
def string_to_list_of_lists(df, col_name):
    df[col_name] = df[col_name].apply(lambda x: ast.literal_eval(x) if not isinstance(x, list) else x)
    return df

In [45]:
df.columns

Index(['id', 'category', 'subcategory', 'question', 'extra_description',
       'article_ids', 'normalized_question', 'pos', 'dep', 'heads', 'tokens',
       'word_embeddings'],
      dtype='object')

In [51]:
df = string_to_list_of_lists(df, "tokens")

In [52]:
df["tokens"][0]

[['quels', 'sont', 'les', 'critères', 'communaux', 'd', "'", 'insalubrité']]

In [53]:
def integer_encode_list(series):
    label_encoder = LabelEncoder()
    
    # Concatenate all the lists in the series to fit the encoder
    concatenated = [item for sublist in series for item in sublist if isinstance(item, str)]
    unique_elements = list(set(concatenated))
    label_encoder.fit(unique_elements)
    
    # Transform each list separately and store in a new series
    encoded_series = series.apply(lambda x: [label_encoder.transform([elem]).tolist()[0] if isinstance(elem, str) else elem for elem in x])
    
    return encoded_series

In [56]:
df["pos"][0]

[['ADJ', 'AUX', 'DET', 'NOUN', 'ADJ', 'ADP', 'ADP', 'NOUN']]

In [54]:
integer_encode_list(df["pos"])

0           [[ADJ, AUX, DET, NOUN, ADJ, ADP, ADP, NOUN]]
1      [[AUX, PRON, PRON, NOUN, ADP, DET, DET, NOUN, ...
2      [[DET, NOUN, VERB, PRON, VERB, ADV, ADP, ADP, ...
3             [[ADV, PRON, VERB, DET, NOUN, ADP, PROPN]]
4      [[VERB, PRON, PRON, PRON, VERB, ADP, NOUN, ADP...
                             ...                        
217    [[PRON, VERB, NOUN, ADJ, PUNCT, VERB, PRON, VE...
218           [[ADP, PRON, VERB, PRON, VERB, DET, NOUN]]
219    [[PRON, AUX, ADJ, VERB, VERB, PUNCT, PRON, VER...
220    [[AUX, PRON, SCONJ, PRON, VERB, VERB, DET, NOU...
221    [[PRON, AUX, NOUN, ADP, NOUN, ADJ, PUNCT, ADP,...
Name: pos, Length: 222, dtype: object