<a href="https://colab.research.google.com/github/digwit678/Can-Language-Models-Follow-Discussions/blob/main/probing_sw4_v6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# local path
#pickle_file_path = "../../../data_preparation/kialo_data/3_parsed_node_data/parsed_discussion_graphs.pickle"
#with open(pickle_file_path, 'rb') as f:
   # parsed_discussion_graphs = pickle.load(f)

In [1]:
!pip install igraph

Collecting igraph
  Downloading igraph-0.10.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable>=1.6.2 (from igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.10.8 texttable-1.7.0


In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import igraph as ig
import yaml
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
base_path_gdrive = "/content/drive/MyDrive/Can Language Models Follow Discussions?/"

## Utility Functions (often used)

In [12]:
def save_final_files_to_drive(probing_example_nr, df, num_inputs, subsample=False, additional_info="", control_task_type="NONE", num_labels=2, num_probe_folds=1, probe_task_type="", probes_samples_path="sample_probes"):
    """
    Save the final probing files to Google Drive.

    Parameters:
    - probing_example_nr: The probing example number used in the folder path.
    - df: The DataFrame to save.
    - num_inputs: Number of input sentences (1 for single sentence, 2 for two sentences).
    - subsample: Boolean flag to indicate if it's a subsample or full data (default: False).
    - additional_info:  is added to the end of the probe name if specified
    - control_task_type: The type of control task (default: "NONE").
    - num_labels: Number of labels (default: 2).
    - num_probe_folds: Number of probing folds (default: 1).
    - probe_task_type: The task type of the probe (default: "SENTENCE").
    - probes_samples_path: Relative path to the probe (default: "sample_probes").
    """

    # Determine folder based on subsample or full
    folder_type = "subsample" if subsample else "full"

    # Set the base path
    base_path = f"/content/drive/MyDrive/Can Language Models Follow Discussions?/{probing_example_nr}/final probing files/{folder_type}/{(additional_info if additional_info != '' else '')}"

    # Create folder if not exists
    if not os.path.exists(base_path):
        os.makedirs(base_path)

    # Determine the config file name
    config_name = "config"
    if num_inputs == 2:
        config_name += "-bi"
    if subsample:
        config_name += "-rand"
    config_name += "-none.yaml"

    # Create YAML config content
    config_content = {
        'control_task_type': control_task_type,
        'num_inputs': num_inputs,
        'num_labels': num_labels,
        'num_probe_folds': num_probe_folds,
        'probe_name': f"probe-{('single' if num_inputs == 1 else 'two')}_sentence_probing_task_{(additional_info if additional_info != '' else '')}_{probing_example_nr}",
        'probe_task_type': ("SENTENCE" if num_inputs == 1 else "SENTENCE_PAIR_BI") if probe_task_type =="" else probe_task_type,
        'probes_samples_path': f"{probes_samples_path}/{('single' if num_inputs == 1 else 'two')}_sentence_probing_task"
    }

    # Save YAML file
    with open(base_path + config_name, 'w') as file:
        yaml.safe_dump(config_content, file, default_flow_style=False)

    # Save DataFrame as CSV
    df.to_csv(base_path + "folds.csv", index=True)

    print(f"Files saved at {base_path}")


In [5]:
def load_file(path_end, file_type):
  """
  - path_end: provide number + file name
  - file_type: "p" for pickle or "" for csv

  """
  base_path_gdrive = "/content/drive/MyDrive/Can Language Models Follow Discussions?/"
  if file_type == "p":
    with open (base_path_gdrive + path_end , 'rb') as f:
        file_out = all_node_info = pickle.load(f)
  else:
        file_out = pd.read_csv(base_path_gdrive + path_end)
  return file_out

In [6]:
def save_dataframe_to_drive(probing_example_nr, df, file_type='e', task_name='task'):
    # Set the base path
    base_path = f"/content/drive/MyDrive/Can Language Models Follow Discussions?/{probing_example_nr}/final probing files/"

    # Create the filename
    filename = f"{probing_example_nr}_{task_name}_data"

    # Full file path
    if file_type == 'p':
        full_file_path = f"{base_path}{filename}.pkl"
        df.to_pickle(full_file_path)
    else:
        full_file_path = f"{base_path}{filename}.xlsx"
        df.to_excel(full_file_path, index=False)

    print(f"File saved at {full_file_path}")

# Example usage:
# save_dataframe_to_drive(1, df, file_type='e', task_name='my_task')


In [7]:
def print_first_n_entries(dictionary, n, max_inner_items=5):
    """
    Print the first 'n' entries of a nested dictionary, showing only the first 'max_inner_items' key-value pairs of the inner dictionary at each depth level.

    Args:
        dictionary (dict): The nested dictionary to print.
        n (int): The number of entries to print.
        max_inner_items (int): The maximum number of key-value pairs to print from the inner dictionary at each depth level.

    Returns:
        None
    """
    def print_nested_dict(d, depth):
        if depth >= max_depth:
            return

        for key, value in d.items():
            if len(entries) >= n:
                return

            entries.append((key, value))
            print(f"Entry {len(entries)}:")
            print(f"Key: {key}")
            print("Value:")

            if isinstance(value, dict):
                sub_dict = {k: value[k] for k in list(value)[:max_inner_items]}
                print_nested_dict(sub_dict, depth + 1)
            else:
                print(f"  {value}\n")

    max_depth = max_inner_items
    entries = []

    print_nested_dict(dictionary, 0)



In [133]:

def create_general_dataframe(claim_pairs, context, topics, org_labels, string_labels, identifiers):
    """
    Creates a DataFrame for probing tasks with the given parameters.

    Parameters:
        claim_pairs (list of tuple): List of tuples where each tuple contains two claims to compare.
        context (list of str): List of context for each claim pair. Could be empty or NaN.
        topics (list of str): List of topics or base claims for each claim pair.
        org_labels (list of str): List of original labels ("Consecutive", "Non-Consecutive", etc.).
        string_labels (list of int): List of string labels (1 for "Consecutive", 0 for "Non-Consecutive").
        identifiers (list of int): List of unique identifiers for each claim pair.

    Returns:
        DataFrame: A DataFrame containing the data in the specified format.
    """

    df = pd.DataFrame({
        'inputs': claim_pairs,
        'context': context,
        'topic': topics,
        'org_label': org_labels,
        'id': identifiers,
        'string_label': string_labels,
        'label': string_labels  # assuming that string_label and label are the same
    })

    # Split
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    train, dev = train_test_split(train, test_size=0.25, random_state=42)

    # Assign set
    train['set-0'] = 'train'
    dev['set-0'] = 'dev'
    test['set-0'] = 'test'

    # Combine
    final_df = pd.concat([train, dev, test])

    return final_df


# Data Structure

General ID-Structure: [DiscussionID . ClaimID], claim_ids are not always in chronological order (so it not always follows 0.2 on 0.1)
Some ID´s are also skipped / missing / deleted


In [None]:
print("First Discussion: Overview", parsed_discussion_graphs[0][0:5])

In [None]:
print("total number of discussion graphs ", len(parsed_discussion_graphs))

total number of discussion graphs  17832


In [None]:
print("each discussion is a sequential igraph object storing edges (relations) and nodes (content) per claim: ",
      parsed_discussion_graphs[0].vs)

each discussion is a sequential igraph object storing edges (relations) and nodes (content) per claim:  <igraph.seq.VertexSeq object at 0x000001F4CA8670B0>


## Claim Level

Claims are accessed on the igraph object by the ".vs" - attribute.

For each claim it is possible to access its node-attributes (claim id and content) and its edge-attribute (relation)

For each claim-node 2 attributes are stored:
- id : the claims id
- text : the claims content


In [None]:
print(
    "to see the objects more clearly let´s print the first discussion´s nodes in readable format with the help of the name attribute (selected number=",
    len(parsed_discussion_graphs[0].vs["name"][0:20]), ") : \n\n",
    parsed_discussion_graphs[0].vs["name"][0:20])

to see the objects more clearly let´s print the first discussion´s nodes in readable format with the help of the name attribute (selected number= 20 ) : 

 [{'id': '333.0', 'text': 'Anonymous currency discussions'}, {'id': '333.1', 'text': 'An anonymous untrackable digital currency \\(ADC\\), like Bitcoin, is beneficial for civilized societies.'}, {'id': '333.2', 'text': "Cash is a necessary part of any functional society's economic model and cryptocurrencies are an easier and online-ready form of cash. They are the natural progression of what money is becoming in the digital age."}, {'id': '333.3', 'text': 'Most ordinary citizens would not use such a currency anyway, rendering all potential benefits moot.'}, {'id': '333.4', 'text': 'Most people pay taxes and banks report all their in/outflows above certain limits.'}, {'id': '333.5', 'text': 'Cryptocurrencies are not like cash.'}, {'id': '333.7', 'text': 'If people really wanted anonymity, they would use more cash and make less use of 

In [None]:
# accessing id
print(parsed_discussion_graphs[0].vs["name"][0]["id"])

333.0


In [None]:
# accessing text
print(parsed_discussion_graphs[0].vs["name"][0]["text"])

Anonymous currency discussions


For each edge (relation) we can access its target and source nodes , its relation type and its edge_id.
One edge is always a relation between 2 claims.

In [None]:
edge_example = parsed_discussion_graphs[0].es[0]
print("1st edge (edge 0) ", edge_example)
print("2nd edge (edge 1) ", parsed_discussion_graphs[0].es[1])

1st edge (edge 0)  igraph.Edge(<igraph.Graph object at 0x000001F4C6A1EB40>, 0, {'relation': 1})
2nd edge (edge 1)  igraph.Edge(<igraph.Graph object at 0x000001F4C6A1EB40>, 1, {'relation': -1})


In [None]:
print("The type of the 1st relation. 1 is pro and -1 is con:", parsed_discussion_graphs[0].es[0]["relation"])

The type of the 1st relation. 1 is pro and -1 is con: 1


In [None]:
# the node/vertex, where the relation comes from
print("the node/claim, from where the 1st relation comes from: \n", edge_example.source_vertex)
print()
print("access only claim_id of the source_claim (so claim id of 222.5 is 5): \n", edge_example.source)
print("access full id of the source claim: \n", edge_example.source_vertex["name"]["id"])

the node/claim, from where the 1st relation comes from: igraph.Vertex(<igraph.Graph object at 0x000001F4C6A1EB40>, 1, {'name': {'id': '333.1', 'text': 'An anonymous untrackable digital currency \\(ADC\\), like Bitcoin, is beneficial for civilized societies.'}})

access only claim_id of the source_claim (so claim id of 222.5 is 5):  1
access full id of the source claim: 333.1


In [None]:
''  # the node where the relation goes to
print("the node, to where the 1st relation goes to \n", edge_example.target_vertex)
print()
print("claim id of the target claim: ", edge_example.target)
print("access full id of the target claim:", edge_example.source_vertex["name"]["id"])

the node, to where the 1st relation goes to  igraph.Vertex(<igraph.Graph object at 0x000001F4C6A1EB40>, 2, {'name': {'id': '333.2', 'text': "Cash is a necessary part of any functional society's economic model and cryptocurrencies are an easier and online-ready form of cash. They are the natural progression of what money is becoming in the digital age."}})

claim id of the target claim:  2
access full id of the target claim: 333.1


In [None]:
print("access claim_id tuple of source and target claim (source,target): ", edge_example.tuple)

access claim_id tuple of source and target claim (source,target):  (1, 2)


## Data Cleaning 2: Removing empty nodes & duplicates

### Further investigating the .0 claim id

- already known by random sample inspection: a lot of claims have the text 'Please enter the multiple choice question.' as their first claim (in .0 position) and this claim can be removed since it is not a real claim

In [None]:
{'id': '444.0', 'text': 'Please enter the multiple choice question.'}

{'id': '444.0', 'text': 'Please enter the multiple choice question.'}

In [None]:
noise_start_claim_texts = []
errors = []

In [None]:
for i in range(len(parsed_discussion_graphs)):
    graph = parsed_discussion_graphs[i]

    if graph is not None:
        if hasattr(graph, 'vs') and len(graph.vs) > 0:
            try:
                if graph.vs["name"][0]["text"] == 'Please enter the multiple choice question.' or graph.vs["name"][0][
                    "text"] == '' or graph.vs["name"][0]["text"] is None:
                    print(graph.vs["name"][0]["text"])
                    print("--------------------------------------------------")
                    noise_start_claim_texts.append((graph.vs["name"][0]["id"],
                                                    graph.vs["name"][0]["text"],
                                                    graph))
            except KeyError:
                print(f"KeyError encountered for graph at index {i}")
                errors.append((parsed_discussion_graphs[i], "KeyError"))
        else:
            print(f"Graph at index {i} has no vertices.")
            errors.append((parsed_discussion_graphs[i], "has no vertices"))
    else:
        print(f"Graph at index {i} is None.")
        errors.append((parsed_discussion_graphs[i], "None"))


In [None]:
print(
    "Number of discussions with noise ('' or Multiple Choice Question) at position vs[""name""][0] for claim content and claim id: ",
    len(noise_start_claim_texts))
print("Number of discussions with Errors (KeyError,None,No Vertices): ", len(errors))

Number of discussions with noise ('' or Multiple Choice Question) at position vs[name][0] for claim content and claim id:  14637
Number of discussions with Errors (KeyError,None,No Vertices):  851


since graphs in errors folder have no further use, all graphs in errors can be entirely removed

for the noisy .0 positions we just remove this position

In [None]:
# Length before cleaning
original_len_nodes = sum(len(graph.vs) if graph is not None else 0 for graph in parsed_discussion_graphs)
original_len_graphs = len(parsed_discussion_graphs)
print(original_len_nodes, original_len_graphs)

631841 17832


In [None]:
import pickle

# Step 1: Identify IDs and Indices to Remove
ids_to_remove = set([entry[0] for entry in noise_start_claim_texts])
indices_to_remove = set([parsed_discussion_graphs.index(entry[0]) for entry in errors if entry[0] is not None])

# Step 2: Iterate Through Graphs and Remove Noise and Errors
cleaned_graphs = []
for i, graph in enumerate(parsed_discussion_graphs):
    if i in indices_to_remove:
        continue  # Skip graphs that are in the errors list
    if graph is not None:
        if hasattr(graph, 'vs') and len(graph.vs) > 0:
            try:
                node_id = graph.vs["name"][0]["id"]
                if node_id in ids_to_remove:
                    graph.delete_vertices(0)
                cleaned_graphs.append(graph)
            except KeyError:
                continue

# Step 3: Save Cleaned Data
with open("cleaned_parsed_discussion_graphs.pkl", "wb") as f:
    pickle.dump(cleaned_graphs, f)


In [None]:
# Length after cleaning
cleaned_len_graphs = len(cleaned_graphs)
cleaned_len_nodes = sum(len(graph.vs) for graph in cleaned_graphs)

# Calculate the number of removed graphs and nodes
removed_graphs = original_len_graphs - cleaned_len_graphs
removed_nodes = original_len_nodes - cleaned_len_nodes

print(f"Removed graphs: {removed_graphs}, should be around 800")
print(f"Removed nodes: {removed_nodes}, should be around 14000")

Removed graphs: 851, should be around 800
Removed nodes: 14637, should be around 14000


In [None]:
# remove empty graphs (v1)

# Initialize a list to keep track of cleaned graphs
cleaned_graphs_v2 = []

# Traverse through each graph in the cleaned_graphs list
for graph in cleaned_graphs:
    if graph is None:
        cleaned_graphs_v2.append(None)
        continue

    # Identify vertices to remove based on empty or None 'id' and 'text'
    remove_indices = [v.index for v in graph.vs if
                      not graph.vs["name"][v.index]["id"] or graph.vs["name"][v.index]["id"] == '' or
                      not graph.vs["name"][v.index]["text"] or graph.vs["name"][v.index]["text"] == '']

    # Create a new graph with the vertices removed
    new_graph = graph.subgraph([v.index for v in graph.vs if v.index not in remove_indices])

    cleaned_graphs_v2.append(new_graph)

# Now cleaned_graphs_v2 contains graphs with empty or None 'id' and 'text' vertices removed
print("number of nodes remaining:", len(cleaned_graphs_v2),"/",original_len_graphs) # todo reexectute


number of empty nodes removed): 16981


Save the cleaned_graphs_v2 list to a pickle file for future use  



```
import pickle

cleaned_graphs_path_v2 = "cleaned_graphs_v2.pkl"

with open(cleaned_graphs_path_v2, "wb") as f:
    pickle.dump(cleaned_graphs_v2, f)

cleaned_graphs_path_v2
```




In [None]:
# local code
cleaned_graphs_path_v3 = "cleaned_graphs_v3.pkl"
with open(cleaned_graphs_path_v3, 'rb') as f:
    cleaned_graphs_v3 = pickle.load(f)


In [None]:
# colab
import pickle
with open('/content/drive/MyDrive/Can Language Models Follow Discussions?/Data - kialo.com/cleaned_graphs_v3.pkl', 'rb') as f:
    cleaned_graphs_v3 = pickle.load(f)

Converting text ids to float numbers for faster access

In [None]:
# examine datatypes
print("claim id: ", type(cleaned_graphs_v3[0].vs["name"][0]["id"]), "edges : ", type(cleaned_graphs_v3[0].es[0]),
      " relation type : ", type(cleaned_graphs_v3[0].es[0]["relation"]), "source vertex id: ",
      type(cleaned_graphs_v3[0].es[0].source_vertex["name"]["id"]), "target vertex id:  ",
      type(cleaned_graphs_v3[0].es[0].target_vertex["name"]["id"]))

claim id:  <class 'str'> edges :  <class 'igraph.Edge'>  relation type :  <class 'int'> source vertex id:  <class 'str'> target vertex id:   <class 'str'>


In [None]:
"""def convert_ids_to_float(graphs):
    for graph in graphs:
        # Convert vertex IDs
        for vertex in graph.vs:
            vertex["name"]["id"] = float(vertex["name"]["id"])

        # Convert edge source and target IDs
        for edge in graph.es:
            source_idx = edge.source
            target_idx = edge.target
            graph.es[source_idx].source_vertex["name"]["id"] = float(graph.es[source_idx].source_vertex["name"]["id"])
            if (graph.es[target_idx] and graph.es[target_idx]):
              graph.es[target_idx].target_vertex["name"]["id"] = float(graph.es[target_idx].target_vertex["name"]["id"])
              graph.vs[source_idx]["name"]["id"] = float(graph.vs[source_idx]["name"]["id"])
              graph.vs[target_idx]["name"]["id"] = float(graph.vs[target_idx]["name"]["id"])
    return graphs


# Assuming cleaned_graphs_v2 contains your list of graphs
cleaned_graphs_v4 = convert_ids_to_float(cleaned_graphs_v3) """

IndexError: ignored

In [None]:
def convert_ids_to_float(graphs):
    for graph in graphs:
        # Convert vertex IDs
        for vertex in graph.vs:
            vertex["name"]["id"] = float(vertex["name"]["id"])

        # Convert edge source and target IDs
        for edge in graph.es:
            source_vertex = edge.source_vertex  # Source vertex object
            target_vertex = edge.target_vertex  # Target vertex object

            # Convert to float
            source_vertex["name"]["id"] = float(source_vertex["name"]["id"])
            target_vertex["name"]["id"] = float(target_vertex["name"]["id"])

    return graphs

# Assuming cleaned_graphs_v3 contains your list of graphs
cleaned_graphs_v4 = convert_ids_to_float(cleaned_graphs_v3)


In [None]:

print("claim id: ", type(cleaned_graphs_v4[0].vs["name"][0]["id"]), "edges : ", type(cleaned_graphs_v4[0].es[0]),
      " relation type : ", type(cleaned_graphs_v4[0].es[0]["relation"]), "source vertex id: ",
      type(cleaned_graphs_v4[0].es[0].source_vertex["name"]["id"]), "target vertex id:  ",
      type(cleaned_graphs_v4[0].es[0].target_vertex["name"]["id"]))

claim id:  <class 'float'> edges :  <class 'igraph.Edge'>  relation type :  <class 'int'> source vertex id:  <class 'float'> target vertex id:   <class 'float'>


In [None]:
cleaned_graphs_v4[0].es[0].target_vertex["name"]["id"]

333.2

In [None]:
example_graph = cleaned_graphs_v4[0]

Is the source claim on edges(idx) the same claim as the claim on nodes (idx) where idx is the same ?

In [None]:
print("claim 1 with id = ", cleaned_graphs_v4[0].vs["name"][1]["id"], "has the following target vertex: ",
      cleaned_graphs_v4[0].es[1].target_vertex["name"]["id"],
      "it can be verified that the source vertex to this target vertex is the claim 1 with id = ",
      cleaned_graphs_v4[0].es[1].source_vertex["name"]["id"])
#cleaned_graphs_v4[0].es[1]

claim 1 with id =  333.1 has the following target vertex:  333.3 it can be verified that the source vertex to this target vertex is the claim 1 with id =  333.1


## Data Cleaning pt 3: Duplicate Discussions + Spam Removal

set random seed ?
methods should be validated, reproducible and transparent

Refined Data Cleaning Plan
Goals
1.	Remove duplicates
2.	Convert IDs to numeric format
3.	Remove or flag spam comments
4.	Ensure base claims are not mislabeled Steps


### Duplicate Removal

In [None]:
# Assuming that cleaned_graphs_v2 is a list of igraph objects and that the first graph has vertices
# Here is how you would retrieve the "id" attribute of the first vertex in the first graph in both notations:

# First Notation
first_id_notation1 = cleaned_graphs_v2[0].vs["name"][0]["id"]

# Second Notation (within a loop, but just picking the first vertex for demonstration)
first_graph = cleaned_graphs_v2[0]
first_vertex = first_graph.vs[0]
first_id_notation2 = first_vertex["name"]["id"]

first_id_notation1, first_id_notation2, first_id_notation1 == first_id_notation2


(333.0, 333.0, True)

check for spam at position 0.3 since we observed a  lot of those in the data ( # probing 1 data preparation)

In [None]:
import pickle
import re

# Initialize counters and lists
spam_at_03_count = 0
total_spam_count = 0

# Loop through each graph in your parsed_discussion_graphs list
for graph in cleaned_graphs_v2:
    for vertex in graph.vs:
        # Convert Claim IDs to float
        claim_id = float(vertex["name"]["id"])
        claim_text = vertex["name"]["text"]

        # Update the vertex with the float ID
        vertex["name"]["id"] = claim_id

        # Check for spam keywords
        if re.search(r'\b(Dumps|Coupon Code|Discount|Exam|Pass)\b', claim_text, re.IGNORECASE):
            total_spam_count += 1
            print(f"Debug - Claim ID: {claim_id}, Decimal Part: {claim_id % 1}")  # Debugging line
            tolerance = 1e-10  # 0.3000000000029104, which is very close to 0.3 but not exactly equal.
            # Check if ID has a decimal part close to .3
            if abs(claim_id % 1 - 0.3) < tolerance:
                spam_at_03_count += 1

print(f"Total spam count: {total_spam_count}")
print(f"Spam at position 0.3: {spam_at_03_count}")


**OUTPUT**  
...  
Debug - Claim ID: 46606.3, Decimal Part: 0.3000000000029104
Debug - Claim ID: 46607.3, Decimal Part: 0.3000000000029104
Debug - Claim ID: 46618.3, Decimal Part: 0.3000000000029104  

Total spam count: 2433  
Spam at position 0.3: 943  

TODO: REFACTOR as FCT + ID to FLOAT + SAVE OUTPUTS FOR REPORT

### Spam Removal



assumptions: spam text is short ( =...), and is written in capitalized letters, contains often some kind of adress/code/number , contains a key word such as "Discount", "Coupon Code" or something similar.  

In [None]:
import re
import pickle

# Initialize a list to hold sample spam entries for verification
sample_spam_entries = []

# Initialize counters to track the number of spam entries
total_spam_count = 0
spam_at_03_count = 0


# Function to check if most words in a string start with a capital letter
def check_capitalized_words(text):
    words = text.split()
    capitalized_words = [word for word in words if word.istitle()]
    return len(capitalized_words) > len(words) // 2


# Loop through each graph
for graph_index, graph in enumerate(cleaned_graphs_v2):
    for vertex_index, vertex in enumerate(graph.vs):
        claim_id = float(vertex["name"]["id"])
        claim_text = vertex["name"]["text"]

        # Check for spam keywords
        if re.search(r'\b(Dumps|Coupon Code|Discount|Exam|Pass)\b', claim_text, re.IGNORECASE):

            # Check if ID has a decimal part of .3 (with tolerance)
            if abs(claim_id % 1 - 0.3) < 1e-9:

                # Additional constraints: Minimum number of words should be greater than 4 and most words should start with capital letters
                if len(claim_text.split()) > 4 and check_capitalized_words(claim_text):
                    total_spam_count += 1  # Moved this line here
                    spam_at_03_count += 1

                    sample_spam_entries.append({
                        "Graph Index": graph_index,
                        "Vertex Index": vertex_index,
                        "Claim ID": claim_id,
                        "Claim Text": claim_text
                    })

# Limit the sample size to 5 for quick verification
sample_spam_entries = sample_spam_entries[:5]

# Print spam statistics
print(f"Total spam count: {total_spam_count}")
print(f"Spam at position 0.3: {spam_at_03_count}")

# Print sample spam entries for verification
print("Sample spam entries for verification:")
print(sample_spam_entries)


#for entry in sample_spam_entries:
#graph_index = entry["Graph Index"]
#vertex_index = entry["Vertex Index"]
#cleaned_graphs_v2[graph_index].delete_vertices(vertex_index)


Total spam count: 694
Spam at position 0.3: 694
Sample spam entries for verification:
[{'Graph Index': 12250, 'Vertex Index': 0, 'Claim ID': 47773.3, 'Claim Text': 'Discount Rate, Reserve Requirements & Money Supply'}, {'Graph Index': 12459, 'Vertex Index': 0, 'Claim ID': 48581.3, 'Claim Text': 'Splunk SPLK-1002 Dumps Study Material | 100% Success Guarantee'}, {'Graph Index': 15551, 'Vertex Index': 0, 'Claim ID': 43811.3, 'Claim Text': 'How Can You Pass Zend ZF-100-500 Exam In First Attempt ?'}, {'Graph Index': 15555, 'Vertex Index': 0, 'Claim ID': 43815.3, 'Claim Text': 'Get 2021 New Year Discount Offer On JN0-348 Exam Dumps | Coupon Code "PASS30NY21"'}, {'Graph Index': 15556, 'Vertex Index': 0, 'Claim ID': 43816.3, 'Claim Text': 'Get Christmas Special Offer Huawei H13-321 Dumps Using Coupon Code "CHRIS30"'}]


since we verified by inspection that the spams samples are now correctly identified, we can remove them from the data set and save a new version of the data set

In [None]:
# Initialize a list to store cleaned graphs
cleaned_graphs_v3 = []


# Function to check if most words start with a capital letter
def check_capitalized_words(text):
    words = text.split()
    capitalized_words = [word for word in words if word.istitle()]
    return len(capitalized_words) > len(words) // 2


# Loop through each graph
for graph in cleaned_graphs_v2:
    # Initialize a list to store vertices to remove
    vertices_to_remove = []

    # Loop through each vertex in the graph
    for vertex in graph.vs:
        claim_id = float(vertex["name"]["id"])
        claim_text = vertex["name"]["text"]

        # Check for spam keywords or if most words are capitalized
        if re.search(r'\b(Dumps|Coupon Code|Discount|Exam|Pass)\b', claim_text,
                     re.IGNORECASE) or check_capitalized_words(claim_text):
            # Add vertex index to the list of vertices to remove
            vertices_to_remove.append(vertex.index)

    # Remove spam vertices
    graph.delete_vertices(vertices_to_remove)

    # Add cleaned graph to the new list
    cleaned_graphs_v3.append(graph)

# Probing 1

Nodes and edges being saved in igraph-sequences

In [None]:
example_graph_edges = example_graph.es
example_graph_nodes = example_graph.vs
print(example_graph_edges, example_graph_nodes)

<igraph.seq.EdgeSeq object at 0x7d2a0ba07be0> <igraph.seq.VertexSeq object at 0x7d2a2493bf10>


In [None]:
for i, node in enumerate(example_graph_nodes):
  if i < 10:
    print("source: ", node["name"]["id"])
    print("target: ", example_graph_edges[i].target_vertex["name"]["id"])
    print("-----------------------------------")



source:  333.0
target:  333.2
-----------------------------------
source:  333.1
target:  333.3
-----------------------------------
source:  333.2
target:  333.7
-----------------------------------
source:  333.3
target:  333.12
-----------------------------------
source:  333.4
target:  333.17
-----------------------------------
source:  333.5
target:  333.18
-----------------------------------
source:  333.7
target:  333.19
-----------------------------------
source:  333.9
target:  333.2
-----------------------------------
source:  333.1
target:  333.21
-----------------------------------
source:  333.11
target:  333.22
-----------------------------------


**Observation**: The same node (see 333.1) is being saved n times, whereas n = number of outgoing edges

In [None]:

# Save as cleaned_graphs_v3.pkl
with open('cleaned_graphs_v3.pkl', 'wb') as f:
    pickle.dump(cleaned_graphs_v3, f)

In [None]:
"total number of edges , nodes in graph/discussion 0", len(cleaned_graphs_v4[0].es), len(cleaned_graphs_v4[0].vs)

('total number of edges , nodes in graph/discussion 0', 603, 592)

In [None]:
for i, node in enumerate(cleaned_graphs_v4[0].vs["name"][1:]):  # exclude base claim at position 0
  # restrict output
  if i < 10 :
    if len(cleaned_graphs_v4[0].es[i].target_vertex) > 1:  # is there more than one target vertex ? should not according to previous observation
        print("More than 1 target vertex on the same edge ?!")

In [None]:
import pickle

with open('/content/drive/MyDrive/Can Language Models Follow Discussions?/Data - kialo.com/cleaned_graphs_v3.pkl', 'rb') as f:
    cleaned_graphs_v3 = pickle.load(f)

In [None]:
"all nodes (subsampled to 5 examples)/claims in graph/discussion 0", cleaned_graphs_v4[0].vs["name"][0:5]

('all nodes (subsampled to 5 examples)/claims in graph/discussion 0',
 [{'id': 333.0, 'text': 'Anonymous currency discussions'},
  {'id': 333.1,
   'text': 'An anonymous untrackable digital currency \\(ADC\\), like Bitcoin, is beneficial for civilized societies.'},
  {'id': 333.2,
   'text': "Cash is a necessary part of any functional society's economic model and cryptocurrencies are an easier and online-ready form of cash. They are the natural progression of what money is becoming in the digital age."},
  {'id': 333.3,
   'text': 'Most ordinary citizens would not use such a currency anyway, rendering all potential benefits moot.'},
  {'id': 333.4,
   'text': 'Most people pay taxes and banks report all their in/outflows above certain limits.'}])

In [None]:
import pickle

def create_outgoing_dict_for_all_graphs(graphs):
    """
    saves for each claim its outgoing claims directly in the same order as interger keys
    into a dict such that we can reduce the access time of node´s target nodes to O(1)
    """
    all_outgoing_dicts = {}
    for graph_index, graph in enumerate(graphs):
        outgoing_dict = {}
        for edge in graph.es:
            src = edge.source_vertex["name"]["id"]
            tgt = edge.target_vertex["name"]["id"]

            if src in outgoing_dict:
                outgoing_dict[src].append(tgt)
            else:
                outgoing_dict[src] = [tgt]

        all_outgoing_dicts[graph_index] = outgoing_dict

    return all_outgoing_dicts

# Create the dictionary
all_outgoing_dicts = create_outgoing_dict_for_all_graphs(cleaned_graphs_v4)

# Save it using pickle
with open("all_outgoing_dicts.pkl", "wb") as f:
    pickle.dump(all_outgoing_dicts, f)


In [None]:
len(all_outgoing_dicts)

16981

In [None]:
print("Integer Keys: \n")
print(list(all_outgoing_dicts.keys())[0:5])

Integer Keys: 

[0, 1, 2, 3, 4]


In [None]:
# TODO
def verify_base_claims(cleaned_graphs):
    duplicates = []
    base_claim_ids = set()
    for graph_index, graph in enumerate(cleaned_graphs):
        if len(graph.vs) == 0:
            print(f"Empty graph found at index {graph_index}. Skipping.")
            continue
        first_node_id = graph.vs[0]["name"]["id"]
        if  first_node_id in base_claim_ids:
            print(f"Duplicate base claim ID {first_node_id} found at index {graph_index}.")
            duplicates.append((first_node_id,graph_index))
        base_claim_ids.add(first_node_id)
    print(f"Set of all unique base claim IDs: {base_claim_ids}")
    print("all duplicates: ", len(duplicates))
    return True
verify_base_claims(cleaned_graphs_v4)

**OUTPUT**  
...   
Empty graph found at index 16979. Skipping.
Empty graph found at index 16980. Skipping.

Set of all unique base claim IDs: {32784.3, 32786.3, 32789.3, 32793.3, 32797.3, 32799.3, ...

all duplicates:  966

True

In [None]:
"""def check_for_duplicates(cleaned_graphs):

    duplicate_indexes = []  # Store indexes of duplicate ids in the 1st node position
    base_claim_dict = {}  # Store base claim IDs and their corresponding indexes

    for index, graph in enumerate(cleaned_graphs):
        if len(graph.vs) == 0:
            continue

        first_node_id = graph.vs[0]["name"]["id"] # floating point id format
        if first_node_id in base_claim_dict:
            duplicate_indexes.append((base_claim_dict[first_node_id], index))
        else:
            base_claim_dict[first_node_id] = index

    # further check those indexes representative for 1 discussion
    true_duplicate_indexes = []
    for index1, index2 in duplicate_indexes:
        graph1 = cleaned_graphs[index1]
        graph2 = cleaned_graphs[index2]
        if graph1.es and graph2.es:
          #print(graph1.es)
        #print(graph1.vs["name"] ,graph1.es[0])
        #print(graph1.es[0].target_vertex["name"]["id"])
        # duplicate if node id, node text and edge target vertex are the same (updated)
          #print(graph1.es[0].target_vertex["name"]["id"])
          if graph1.vs["name"]["id"] == graph2.vs["name"]["id"] and graph1.es[0].target_vertex["name"]["id"] == graph2.es[0].target_vertex["name"]["id"]:
               print(graph1.vs["name"], "and", graph2.vs["name"], "are duplicates, which point to the same target node, see:", target_id1, target_id2)
                true_duplicate_indexes.append(index2)
              #print(f"Graphs at index {index1} and {index2} are true duplicates.")
              true_duplicate_indexes.append(index2)
    print(len(true_duplicate_indexes), "true duplicates found")
check_for_duplicates(cleaned_graphs_v4)"""

In [None]:
def check_for_duplicates(cleaned_graphs):
    """
    checks for duplicate values in the first position, where the unique base claim is assumed to be
    """
    duplicate_indexes = []
    base_claim_dict = {}

    for index, graph in enumerate(cleaned_graphs):
        if len(graph.vs) == 0:
            continue

        first_node_id = graph.vs[0]["name"]["id"]
        if first_node_id in base_claim_dict:
            duplicate_indexes.append((base_claim_dict[first_node_id], index))
        else:
            base_claim_dict[first_node_id] = index

    true_duplicate_indexes = []
    for index1, index2 in duplicate_indexes:
        graph1 = cleaned_graphs[index1]
        graph2 = cleaned_graphs[index2]

        if graph1.es and graph2.es:
            first_node_id1 = graph1.vs[0]["name"]["id"]
            first_node_id2 = graph2.vs[0]["name"]["id"]

            target_id1 = graph1.es[0].target_vertex["name"]["id"]
            target_id2 = graph2.es[0].target_vertex["name"]["id"]

            if first_node_id1 == first_node_id2 and target_id1 == target_id2:
                print(graph1.vs["name"], "and", graph2.vs["name"], "are duplicates, which point to the same target node, see:", target_id1, target_id2)
                true_duplicate_indexes.append(index2)
                #print(f"Graphs at index {index1} and {index2} are true duplicates.")
                true_duplicate_indexes.append(index2)

    print(f"{len(true_duplicate_indexes)} true duplicates found.")
    check_for_duplicates(cleaned_graphs_v4)

    """ conclusion: no duplicate base claims found """


In [None]:
def examine_graphs(cleaned_graphs, index1, index2):
    """
    Examine vertices and edges of two graphs based on their indices and print them.
    """
    # Fetching graphs by index
    graph1 = cleaned_graphs[index1]
    graph2 = cleaned_graphs[index2]

    # Extracting vertices' id and text
    vertices1 = graph1.vs["name"]
    vertices2 = graph2.vs["name"]

    # Extracting edges
    edges1 = [(edge.source, edge.target) for edge in graph1.es]
    edges2 = [(edge.source, edge.target) for edge in graph2.es]

    # Printing details
    print(f"Graph at index {index1}\n{'-'*24}")
    print(f"Vertices: {vertices1}\nEdges: {edges1}")

    print(f"\nGraph at index {index2}\n{'-'*24}")
    print(f"Vertices: {vertices2}\nEdges: {edges2}")

# Function to check for empty graphs
def check_empty_graphs(cleaned_graphs):
    """
    Identify and return indices of empty graphs.
    """
    return [index for index, graph in enumerate(cleaned_graphs) if len(graph.vs) == 0]

# Function to remove graphs based on indices
def remove_graphs(graphs, indices_to_remove):
    """
    Remove graphs based on a list of indices.
    """
    return [graph for i, graph in enumerate(graphs) if i not in indices_to_remove]

# Example usage:
# --------------

# Examine first duplicate graphs: 10681 and 11022
examine_graphs(cleaned_graphs_v4, 10681, 11022)

# Examine first empty graphs: 451 and 670
examine_graphs(cleaned_graphs_v4, 451, 670)

# Identify duplicate and empty graph indices
# (Make sure check_for_duplicates returns a list)
#duplicate_indices = check_for_duplicates(cleaned_graphs_v4) or []
empty_indices = check_empty_graphs(cleaned_graphs_v4)

# Merge, sort, and deduplicate indices
indices_to_remove = sorted(set(empty_indices))


# Merge, sort, and deduplicate indices
#indices_to_remove = sorted(set(duplicate_indices + empty_indices))

# Remove graphs and verify
cleaned_graphs_v5 = remove_graphs(cleaned_graphs_v4, indices_to_remove)

# Print length before and after to verify removal
print(f"Length before: {len(cleaned_graphs_v4)}, Length after: {len(cleaned_graphs_v5)}")


Graph at index 10681
------------------------
Vertices: [{'id': 34419.0, 'text': 'Kialo should provide separate places to debate the following kinds of pros and cons. We would use the outcome of these debates to generate distinct scores.'}, {'id': 34419.3, 'text': '1\\) "Truth" or "Veracity." This score would combine with "Importance Scores" to determine a "Conclusion Score." The Conclusion Score would attempt to identify a belief\'s value, aside from claims that it supports other conclusions.'}, {'id': 34419.5, 'text': "Even if two arguments have similar chances of being true, it doesn't mean they will always have the same impact if we assumed they were true. Therefore, truth can't be all that matters, and we must rate the importance of arguments, separate from their truth and Relevance."}, {'id': 34419.7, 'text': 'True relevant arguments can be less important than other evidence.'}, {'id': 34419.9, 'text': 'We need to help users organize different parts of debates.'}, {'id': 34419.11

In [None]:
print(len(cleaned_graphs_v4) - len(cleaned_graphs_v5), "empty graphs having len(graph.vs) == 0 (<=> 0 nodes/claims) removed")

1737 empty graphs having len(graph.vs) == 0 (<=> 0 nodes/claims) removed


In [8]:
!pip install langdetect
import langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993224 sha256=7ce4a224895e5eb301fede925904f50025c95cb0eff8627e296e1b2ed41b68f3
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [9]:
from langdetect import detect
import re

def clean_text(text):
    # Remove URLs and special characters to help detection
    return re.sub(r'http\S+|\\|[@#$%^&*()_+={}\[\]:;"\'<>,.?/`~\n]', '', text)

def is_sentence_english(sentence):
    sentence_cleaned = clean_text(sentence)
    try:
        return detect(sentence_cleaned) == 'en'
    except:
        return False  # If language detection fails, assume non-English

def filter_non_english_discussions(cleaned_graphs):
    non_english_graphs = []
    english_graphs = []

    for graph in cleaned_graphs:
        graph_ids = [int(float(node["name"]["id"])) for node in graph.vs]
        if graph_ids:
            base_claim_id = min(graph_ids)
            base_claim_text = [node["name"]["text"] for node in graph.vs if int(float(node["name"]["id"])) == base_claim_id][0]

            if is_sentence_english(base_claim_text):
                english_graphs.append(graph)
            else:
                non_english_graphs.append(base_claim_id)  # Save the ID of the base claim of the non-English discussion

    print(f"Total number of non-English graphs: {len(non_english_graphs)}")
    print(f"IDs of non-English graphs: {non_english_graphs}")

    return english_graphs


cleaned_graphs_v6 = filter_non_english_discussions(cleaned_graphs_v5)


NameError: ignored

In [None]:
# Step 3: Save Cleaned Data
with open(base_path_gdrive+"2/cleaned_graphs_v6.pkl", "wb") as f:
    pickle.dump(cleaned_graphs_v6, f)

In [None]:
def build_node_relations(graph):
    """
    fct that creates a dictionary, that stores for each node its relation
    graphs are stored in same order as the cleaned_graphs list
    """
    node_relations = {}
    for edge in graph.es:
        source_idx = edge.source
        relation = edge["relation"]

        if source_idx in node_relations:
            node_relations[source_idx].append(relation)
        else:
            node_relations[source_idx] = [relation]

    return node_relations

# Build node_relations for all graphs
all_node_relations_v2 = [build_node_relations(graph) for graph in cleaned_graphs_v6]


In [None]:

n = 10
print_first_n_entries(all_node_relations, n)

Entry 1:
Key: 0
Value:
Entry 2:
Key: 0
Value:
Entry 3:
Key: outgoing_nodes
Value:
  [151]

Entry 4:
Key: relations
Value:
  [0]

Entry 5:
Key: 1
Value:
Entry 6:
Key: outgoing_nodes
Value:
  [2, 3, 10, 14, 29, 59, 100, 150, 154, 65]

Entry 7:
Key: relations
Value:
  [1, -1, -1, 1, 1, -1, -1, -1, 1, 1]

Entry 8:
Key: 2
Value:
Entry 9:
Key: outgoing_nodes
Value:
  [151, 496, 497, 72, 79, 24]

Entry 10:
Key: relations
Value:
  [1, 1, 1, -1, -1, 1]



In [None]:
NODE_INFO_PATH = base_path_gdrive+"2/all_node_info_v2.pkl"
with open (NODE_INFO_PATH  , 'rb') as f:
    all_node_info = pickle.load(f)

In [None]:
CLEANED_GRAPHS_V6_PATH = NODE_INFO_PATH = base_path_gdrive+"/2/cleaned_graphs_v6.pkl"
with open (CLEANED_GRAPHS_V4_PATH, 'rb') as f:
    cleaned_graphs_v4 = pickle.load(f)

In [None]:
def create_node_info(cleaned_graphs):
    node_info = {}

    for graph_index, graph in enumerate(cleaned_graphs):
        node_info[graph_index] = {}

        for vertex_index, vertex in enumerate(graph.vs):
            outgoing_edges = [edge for edge in graph.es if edge.source == vertex.index]
            outgoing_nodes = [edge.target for edge in outgoing_edges]
            relations = [edge["relation"] for edge in outgoing_edges]

            node_info[graph_index][vertex_index] = {
                'outgoing_nodes': outgoing_nodes,
                'relations': relations
            }

    return node_info

all_node_info = create_node_info(cleaned_graphs_v6)


In [None]:
# Try saving the sample node_relations dictionary to a pickle file again
with open(base_path_gdrive+"2/all_node_info_v2.pkl", 'wb') as f:
    pickle.dump(all_node_info, f)

'all_node_info_v2.pkl'

'all_node_info_v2.pkl'

In [None]:
# Try saving the sample node_relations dictionary to a pickle file again
with open(base_path_gdrive + "1/cleaned_graphs_v4.pkl", 'wb') as f:
    pickle.dump(cleaned_graphs_v4, f)

'cleaned_graphs_v4.pkl has been saved'

In [None]:

n = 10
print_first_n_entries(all_node_info, n)

Entry 1:
Key: 0
Value:
Entry 2:
Key: 0
Value:
Entry 3:
Key: outgoing_nodes
Value:
  [151]

Entry 4:
Key: relations
Value:
  [0]

Entry 5:
Key: 1
Value:
Entry 6:
Key: outgoing_nodes
Value:
  [2, 3, 10, 14, 29, 59, 100, 150, 154, 65]

Entry 7:
Key: relations
Value:
  [1, -1, -1, 1, 1, -1, -1, -1, 1, 1]

Entry 8:
Key: 2
Value:
Entry 9:
Key: outgoing_nodes
Value:
  [151, 496, 497, 72, 79, 24]

Entry 10:
Key: relations
Value:
  [1, 1, 1, -1, -1, 1]



In [None]:
len(all_node_info)

11337

In [None]:
len(cleaned_graphs_v4)

16981

In [None]:
all_node_info[0][0]

{'outgoing_nodes': [151], 'relations': [0]}

## CURRENT POINT

In [None]:
NODE_INFO_PATH = base_path_gdrive+"1/all_node_info.pkl"
with open (NODE_INFO_PATH , 'rb') as f:
    all_node_info = pickle.load(f)

In [None]:
import random

# Setting random seed for reproducibility
random.seed(42)

def random_traversal_debugged(cleaned_graphs, node_info):
    all_traversal_data = []
    duplicate_count = 0
    skip_count = 0

    for graph_index, graph in enumerate(cleaned_graphs):
        #print(f"Working on graph {graph_index + 1}/{len(cleaned_graphs)}")

        # Initialize
        traversal_data = []

        # Skip graphs with fewer than 3 nodes
        if len(graph.vs) < 3:
            #print("Skipping graph due to insufficient nodes.")
            skip_count += 1
            continue

        first_claim_id = None  # To store the first claim ID

        for i in range(2):  # We are interested in two claims
            current_node_idx = 0  # Start from the base claim
            current_stance = 1  # Initialize stance as 1 (Pro)

            # Get outgoing nodes and relations
            outgoing_nodes = node_info[graph_index].get(current_node_idx, {}).get('outgoing_nodes', [])

            if not outgoing_nodes:
                #print("No outgoing nodes found, breaking loop.")
                break

            # Choose a random next node that is not the base claim and not the first claim
            next_node_idx = random.choice([node for node in outgoing_nodes if node != current_node_idx and node != first_claim_id])


            claim_text = graph.vs[next_node_idx]["name"]["text"]
            """
            if any(d['Claim Text'] == claim_text for d in traversal_data):
                duplicate_count += 1
                #print(f"Duplicate found: {claim_text}. Total duplicates so far: {duplicate_count}")
                break"""

            # Fetch the relation of the edge between current_node and next_node
            relations = node_info[graph_index].get(current_node_idx, {}).get('relations', [])
            edge_relation = relations[outgoing_nodes.index(next_node_idx)]
            current_stance *= edge_relation

            # Update traversal data
            claim_id = graph.vs[next_node_idx]["name"]["id"]
            traversal_data.append({
                "Claim ID": claim_id,
                "Claim Text": claim_text,
                "Stance": current_stance
            })

            # Update the first_claim_id if this is the first iteration
            if i == 0:
                first_claim_id = claim_id

        # Add the traversal data for this graph to the all_traversal_data list
        all_traversal_data.append(traversal_data)
    print("number of graphs skipped ", skip_count)
    return all_traversal_data, duplicate_count
# Example usage
#traversal_data, duplicate_count = random_traversal_debugged(cleaned_graphs_v4, all_node_info)
#print(f"Total number of duplicates found: {duplicate_count}")


In [None]:
traversal_data_2, duplicate_count_2 = random_traversal_debugged(cleaned_graphs_v6, all_node_info)

number of graphs skipped  3499


## Reorganizing the data for probing

In [114]:
def create_base_claim_dict(cleaned_graphs): # TODO : rename
    """
    - cleaned_graphs: list of igraph objects

    Returns: Discussion Ids
    """
    base_claim_dict = {}
    for graph in cleaned_graphs:
        graph_ids = [int(float(node["name"]["id"])) for node in graph.vs]
        if graph_ids:
          base_claim_id = min(graph_ids)
          base_claim_text = [node["name"]["text"] for node in graph.vs if int(float(node["name"]["id"])) == base_claim_id][0]
          discussion_id = base_claim_id // 1  # get the integer part of the base claim ID
          base_claim_dict[discussion_id] = (base_claim_text, base_claim_id)
    return base_claim_dict

base_claim_dict = create_base_claim_dict(cleaned_graphs_v6)


In [118]:
n = 10
print_first_n_entries(base_claim_dict, n )

Entry 1:
Key: 333
Value:
  ('Anonymous currency discussions', 333)

Entry 2:
Key: 444
Value:
  ('There should be a 100% inheritance tax on all inherited wealth above 1 million Euros.', 444)

Entry 3:
Key: 486
Value:
  ('The West should build working autonomous killing machines \\(AKMs\\) as quickly as possible.', 486)

Entry 4:
Key: 801
Value:
  ('Capital punishment should be abolished in the United States.', 801)

Entry 5:
Key: 864
Value:
  ('The world would be better off without organized religion.', 864)

Entry 6:
Key: 921
Value:
  ('God is an invention of the elites and religious rules are made to ensure and enhance their power.', 921)

Entry 7:
Key: 939
Value:
  ('Polygamy should be legal.', 939)

Entry 8:
Key: 946
Value:
  ("What is the EU's best strategy to deal with refugees coming from Africa and Near/Middle East?", 946)

Entry 9:
Key: 963
Value:
  ('Parental consent should be required for pregnant minors to have abortions.', 963)

Entry 10:
Key: 969
Value:
  ('The right of po

In [115]:
def get_base_claim(claim_id):
    discussion_id = int(float(claim_id))
    base_claim_text, _ = base_claim_dict.get(discussion_id, (None, None))
    return base_claim_text

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def organize_and_create_dataframe_final(traversal_data, cleaned_graphs):
    data = []

    for idx, claim_group in tqdm(enumerate(traversal_data)):
        # Safety checks
        if len(claim_group) >= 2:
            claim1 = claim_group[0]
            claim2 = claim_group[1]

            if 'Claim Text' in claim1 and 'Stance' in claim1 and 'Claim ID' in claim1 and \
               'Claim Text' in claim2 and 'Stance' in claim2:
                claim1_text = claim1['Claim Text']
                stance1 = claim1['Stance']
                claim2_text = claim2['Claim Text']
                stance2 = claim2['Stance']
                label = 1 if stance1 == stance2 else 0
                claim1_id = claim1["Claim ID"]
                base_claim = get_base_claim(claim1_id)


            if base_claim and (claim1_text != claim2_text):  # Only add if a base claim exists and claims are not the same
              data.append(((claim1_text, claim2_text), "", base_claim, "Pro" if label == 1 else "Con", idx, label, label))
    # Create DataFrame
    df = pd.DataFrame(data, columns=['inputs', 'context', 'topic', 'org_label', 'id', 'string_label', 'label'])

    # Split Data
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    train, dev = train_test_split(train, test_size=0.25, random_state=42)

    # Assign set
    train['set-0'] = 'train'
    dev['set-0'] = 'dev'
    test['set-0'] = 'test'

    # Combine and Save
    final_df = pd.concat([train, dev, test])

    # Saving to Google Drive
    final_df.to_csv('/content/drive/MyDrive/Can Language Models Follow Discussions?/1/folds_1_v4.csv', index=True)

    return final_df

# Call function
#organize_and_create_dataframe_final(traversal_data,cleaned_graphs_v4)

In [None]:
probing_1_df = organize_and_create_dataframe_final(traversal_data_2,cleaned_graphs_v6)

7838it [00:00, 361426.08it/s]


In [None]:
"""def get_base_claim_v0(claim_id, cleaned_graphs):
    discussion_id = int(claim_id)  # Extract the integer part of the claim ID
    #print("discussion id= ", discussion_id)
    for graph in cleaned_graphs:
        graph_discussion_id = int(graph.vs[0]["name"]["id"])
        #print(graph_discussion_id)
        if graph_discussion_id == discussion_id:
            print("one after base claim : \n",graph.vs[1]["name"]["id"], graph.vs[1]["name"]["text"],"\nbase claim :")
            return graph.vs[0]["name"]["text"], graph.vs[0]["name"]["id"]
    return None  # If the base claim wasn't found

# Example usage
claim_id = 1474.146
base_claim = get_base_claim_v0(claim_id, cleaned_graphs_v4)
print(base_claim)"""

one after base claim : 
 1474.144 Widespread public awareness of a sufficiently cruel punishment is likely to deter even those who struggle with self control. 
base claim :
('This is particularly true for the kind of crimes that attract the death penalty.', 1474.143)


In [None]:
 #get_base_claim_v0(1715.444, cleaned_graphs_v4)

one after base claim : 
 1715.3 Affirmative action reinforces negative racial stigmas. 
base claim :


('It is time to end [affirmative action](http://www.ncsl.org/research/education/affirmative-action-overview.aspx) in college admissions.',
 1715.1)

We can observe that the base claim isnt always at the 1st position therefore we access it by using min

In [None]:
def get_base_claim_deprecated(claim_id, cleaned_graphs):
    """
    finds base claim by taking the smallest node id per graph (min)
    """
    discussion_id = int(float(claim_id))  # Handles claim IDs with multiple decimal points
    for graph in cleaned_graphs:
        graph_ids = [int(float(node["name"]["id"])) for node in graph.vs]
        if discussion_id in graph_ids:
            base_claim_id = min([node["name"]["id"] for node in graph.vs if int(float(node["name"]["id"])) == discussion_id])
            base_claim_text = [node["name"]["text"] for node in graph.vs if node["name"]["id"] == base_claim_id][0]
            return base_claim_text, base_claim_id
    return None  # If the base claim wasn't found

# Example usage
claim_id = 1715.444
base_claim = get_base_claim(claim_id, cleaned_graphs_v4)
print(base_claim)


In [None]:
 get_base_claim(1470.7, cleaned_graphs_v4)

('Convicted felons in the United States \\(that is: sentenced of a crime punishable with at least one year in prison or death\\) should permanently lose their voting rights.',
 1470.1)

In [None]:
 get_base_claim(1715.444, cleaned_graphs_v4)

('It is time to end [affirmative action](http://www.ncsl.org/research/education/affirmative-action-overview.aspx) in college admissions.',
 1715.1)

In [None]:
print(len(traversal_data), traversal_data)

2 [{'Claim ID': 0, 'Claim Text': 'Deberían existir las redes de pesca?', 'Stance': 1}, {'Claim ID': 0, 'Claim Text': 'Deberían existir las redes de pesca?', 'Stance': 1}]


In [None]:
cleaned_graphs_v4[0].es[1].target_vertex["name"]["id"]

333.3

In [None]:
cleaned_graphs_v4[0].es[0].target_vertex["name"]["id"]

333.2

In [None]:
import random
import igraph as ig


# Function to calculate max depth from the base claim
def calculate_max_depth(graph, base_claim_index=0):
    return max([len(path) for path in graph.get_all_shortest_paths(base_claim_index)])



In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Can Language Models Follow Discussions?/1/probing files v1/folds_1_v3 (1).csv')

In [None]:
df

Unnamed: 0,index,inputs,context,topic,org_label,string_label,label,set-0
0,0,('För mycket jobb att tänka på detta som stör ...,,Ska Arcada gå inför en aktivitetsbaserad arbet...,Con,0,0,train
1,1,"('YES! When people do something so horrific, t...",,Should the death penalty be allowed?,Con,0,0,train
2,2,('We would get a better understanding of the c...,,Should college cost be more regulated by the g...,Pro,1,1,train
3,3,('This would make it easier for fresh faces wi...,,Congress should have limited terms.,Con,0,0,train
4,4,('Friendships with ex-partners are likely to b...,,Should I stay friends with my ex-partner while...,Pro,1,1,train
...,...,...,...,...,...,...,...,...
7237,7237,('Maus es una ficción porque a pesar de estar ...,,Maus es una no ficción porque trata acerca de ...,Pro,1,1,test
7238,7238,('Article 11\\(1\\) prohibits for-profit web s...,,The EU was right to approve the [Directive on ...,Pro,1,1,test
7239,7239,('Modern industrial medicine has improved heal...,,Has the Industrial Revolution and its conseque...,Pro,1,1,test
7240,7240,"('Emotions relate to ethical judgements.', 'Em...",,Are our emotions a useful potential source of ...,Pro,1,1,test


## Probing 1 Example extended with topic added to claim 1

Loading the cleaned data containing only English examples to avoid confunding variable "language" for probes

In [61]:
cleaned_graphs_v6 = load_file("1/final probing files/full/only_claims/folds.csv","")

In [None]:
import pandas as pd
#df_only_english = pd.read_csv('/content/drive/MyDrive/Can Language Models Follow Discussions?/1/final probing files/folds_1_all_only_english.csv')
#

In [21]:
cleaned_graphs_v6.columns

Index(['Unnamed: 0', 'inputs', 'context', 'topic', 'org_label', 'id',
       'string_label', 'label', 'set-0'],
      dtype='object')

In [22]:
cleaned_graphs_v6.head()

Unnamed: 0.1,Unnamed: 0,inputs,context,topic,org_label,id,string_label,label,set-0
0,3346,"('The Republican Party has had a successful, n...",,Republican members of the US Congress are sign...,Pro,4852,1,1,train
1,4753,"(""Cutting the genitals of boys is essentially ...",,Laws against FGM and anti FGM campaigns breach...,Con,6879,0,0,train
2,1012,('Users would have to practice to be proficien...,,Why not develop semi-automatic tranq guns and ...,Con,1408,0,0,train
3,4058,"('Any stable two parent family functions well,...",,Children need a father.,Con,5905,0,0,train
4,4823,('Peer pressure is not always bad. It can help...,,Peer Pressure is a good thing.,Pro,6978,1,1,train


In [60]:
# Define a function to append the topic to the first claim
def append_topic_to_first_claim(inputs_tuple):
   #print(type(row['inputs']))
    # Extracting the first and second claims

    first_claim, second_claim = inputs_tuple

    # Appending the topic to the first claim
    new_first_claim = f"{inputs_tuple}. {first_claim}"

    # Creating a new tuple with the modified first claim
    new_input = new_first_claim, second_claim

    return new_input

In [62]:
print(cleaned_graphs_v6['inputs'].apply(type).value_counts())
# convert string to tuples
import ast
cleaned_graphs_v6['inputs'] = cleaned_graphs_v6['inputs'].apply(ast.literal_eval)

<class 'str'>    5435
Name: inputs, dtype: int64


In [63]:
print(cleaned_graphs_v6['inputs'].apply(type).value_counts())

<class 'tuple'>    5435
Name: inputs, dtype: int64


In [64]:
# Function to append topic to the first claim
def append_topic_to_first_claim(row):
    first_claim, second_claim = row['inputs']
    topic = row['topic']
    new_first_claim = f"{first_claim} {topic}"
    return (new_first_claim, second_claim)

# Apply function and overwrite 'inputs' column
cleaned_graphs_v6['inputs'] = cleaned_graphs_v6.apply(append_topic_to_first_claim, axis=1)

# Show the updated DataFrame
print(cleaned_graphs_v6)

      Unnamed: 0                                             inputs  context  \
0           3346  (The Republican Party has had a successful, na...      NaN   
1           4753  (Cutting the genitals of boys is essentially t...      NaN   
2           1012  (Users would have to practice to be proficient...      NaN   
3           4058  (Any stable two parent family functions well, ...      NaN   
4           4823  (Peer pressure is not always bad. It can help ...      NaN   
...          ...                                                ...      ...   
5430         757  (These so called happy few have no interest wh...      NaN   
5431        1941  (Popular protest and civil disobedience have h...      NaN   
5432        1041  (It's incredibly unlikely to happen - banks ha...      NaN   
5433        1634  (Credit unions are better than shareholder ban...      NaN   
5434        2715  (They have too much debt Puerto Rico should be...      NaN   

                                       

In [65]:
# Reset index
cleaned_graphs_v6.reset_index(drop=True, inplace=True)
cleaned_graphs_v6.drop(["Unnamed: 0", "id"], inplace=True, axis = 1)
cleaned_graphs_v6.head()

Unnamed: 0,inputs,context,topic,org_label,string_label,label,set-0
0,"(The Republican Party has had a successful, na...",,Republican members of the US Congress are sign...,Pro,1,1,train
1,(Cutting the genitals of boys is essentially t...,,Laws against FGM and anti FGM campaigns breach...,Con,0,0,train
2,(Users would have to practice to be proficient...,,Why not develop semi-automatic tranq guns and ...,Con,0,0,train
3,"(Any stable two parent family functions well, ...",,Children need a father.,Con,0,0,train
4,(Peer pressure is not always bad. It can help ...,,Peer Pressure is a good thing.,Pro,1,1,train


In [69]:
 # Saving to Google Drive
cleaned_graphs_v6.to_csv('/content/drive/MyDrive/Can Language Models Follow Discussions?/1/final probing files/full/topic_claim_1/folds.csv', index=True)

In [None]:
# save_dataframe_to_drive(1, df, file_type='e', task_name='my_task')

In [None]:

save_final_files_to_drive(1, df_only_english_with_context, num_inputs=2, additional_info="topic_claim_1")

Files saved at /content/drive/MyDrive/Can Language Models Follow Discussions?/1/final probing files/full/topic_claim_1


In [13]:
!pip install tqdm



In [None]:
import pickle
import pandas as pd
from tqdm import tqdm  # for the progress bar

def extract_node_attributes(cleaned_graphs, node_info, base_path_gdrive):
    """
    Extracts attributes for each node in a list of graphs.

    Parameters:
    - cleaned_graphs: List of cleaned iGraph objects.
    - node_info: Dictionary containing outgoing nodes and relations for each node.
    - base_path_gdrive: Base path for saving the output file.

    Returns:
    - all_node_attributes: Dictionary containing attributes for each node in each graph.
    """

    all_node_attributes = {}  # Dictionary to store attributes for each graph

    # Progress bar for tracking
    for graph_index, graph in enumerate(tqdm(cleaned_graphs, desc='Processing graphs')):

        # Skip graphs with fewer than 3 nodes
        if len(graph.vs) < 3:
            continue

        graph_attributes = {}  # Dictionary to store attributes for this graph
        queue = [(0, 0, 0)]  # Initialize BFS queue with (node_index, depth, parent_node)

        while queue:
            current_node_idx, depth, parent_node = queue.pop(0)

            # Error handling: Check if node exists
            if current_node_idx >= len(graph.vs):
                continue

            # Get node ID for more meaningful indexing
            node_id = graph.vs["name"][current_node_idx]["id"]

            # Initialize parent_node as None for the base claim
            if current_node_idx == 0:
                parent_node = None

            # Calculate breadth (order in which nodes at the same depth are visited)
            breadth = sum(1 for node in graph_attributes.values() if node['Depth'] == depth)

            # Get outgoing nodes and relations
            outgoing_nodes = node_info[graph_index].get(current_node_idx, {}).get('outgoing_nodes', [])
            relations = node_info[graph_index].get(current_node_idx, {}).get('relations', [])

            # Calculate overall stance score
            stance_score = sum(relations)  # Sum of relations
            normalized_stance_score = stance_score / len(relations) if relations else 0  # Handle division by zero

            # Save attributes
            graph_attributes[node_id] = {
                "Depth": depth,
                "Breadth": breadth,
                "Parent Node": parent_node,
                "Normalized Stance Score": normalized_stance_score,
                "Outgoing Nodes": outgoing_nodes
            }

            # Add child nodes to queue
            for child_node in outgoing_nodes:
                queue.append((child_node, depth + 1, node_id))

        all_node_attributes[graph_index] = graph_attributes

    # Save to an external file as a pickle
    with open(base_path_gdrive+'2/node_attributes.pkl', 'wb') as f:
        pickle.dump(all_node_attributes, f)

    # Or save as a DataFrame for easier manipulation
    df = pd.DataFrame.from_dict({(i,j): all_node_attributes[i][j]
                                for i in all_node_attributes.keys()
                                for j in all_node_attributes[i].keys()},
                               orient='index')
    df.to_csv(base_path_gdrive+'2/node_attributes.csv')

    return all_node_attributes


In [None]:
all_node_attributes = extract_node_attributes(cleaned_graphs_v6, all_node_info, base_path_gdrive)

Processing graphs: 100%|██████████| 11337/11337 [00:08<00:00, 1350.82it/s]


In [80]:
all_node_attributes_df = load_file("2/node_attributes.csv","")

In [82]:
all_node_attributes_df.rename(columns={"Unnamed: 0" : "Graph_Index", "Unnamed: 1": "Node_ID"}, inplace = True)

In [83]:
all_node_attributes_df.head()

Unnamed: 0,Graph_Index,Node_ID,Depth,Breadth,Parent Node,Normalized Stance Score,Outgoing Nodes
0,0,333.0,0,0,,0.0,[151]
1,0,333.167,1,0,333.0,0.454545,"[225, 152, 368, 396, 454, 474, 478, 541, 564, ..."
2,0,333.338,2,0,333.167,-0.142857,"[226, 227, 237, 398, 468, 580, 585]"
3,0,333.168,2,1,333.167,1.0,"[494, 495]"
4,0,333.644,2,2,333.167,0.333333,"[370, 476, 400]"


In [None]:
def update_outgoing_ids(df, cleaned_graphs):
    updated_outgoing_nodes = []

    for index, row in df.iterrows():
        graph_idx = int(row['Graph_Index'])
        outgoing_indices = row['Outgoing Nodes']

        if isinstance(outgoing_indices, str):
            outgoing_indices = eval(outgoing_indices)

        graph = cleaned_graphs[graph_idx]

        try:
            outgoing_ids = [graph.vs[node]['name']['id'] for node in outgoing_indices]
            updated_outgoing_nodes.append(outgoing_ids)

        except KeyError as e:
            print(f"KeyError at index {index}, graph index {graph_idx}, outgoing indices {outgoing_indices}")
            raise e  # re-raise the exception to stop the execution and debug

        except Exception as e:
            print(f"An unknown error occurred: {e}")
            raise e  # re-raise the exception to stop the execution and debug

    df['Outgoing_Node_IDs'] = updated_outgoing_nodes

# Error handling and DataFrame update
try:
    update_outgoing_ids(all_node_attributes_df, cleaned_graphs_v6)
    all_node_attributes_df.to_csv(base_path_gdrive+'2/node_attributes_v2.csv')
except Exception as e:
    print(f"An error occurred: {e}")


In [84]:
def update_outgoing_ids(df, cleaned_graphs_v6):
    """
    Update the DataFrame with original IDs for outgoing nodes.

    Parameters:
    - df: DataFrame containing node attributes
    - cleaned_graphs_v6: List of igraph Graph objects

    Returns:
    Updated DataFrame with a new column 'Outgoing_Node_IDs' containing original IDs for outgoing nodes.

    Note:
    This function is designed to mitigate the dependency on the order of nodes.
    It fetches the original IDs that are saved in the graph attributes, ensuring a consistent mapping.
    """
    updated_outgoing_nodes = []

    for index, row in df.iterrows():
        graph_idx = int(row['Graph_Index'])  # Ensuring it's an integer
        outgoing_indices = row['Outgoing Nodes']

        # Convert to list of integers if it's not already
        if isinstance(outgoing_indices, str):
            outgoing_indices = eval(outgoing_indices)

        graph = cleaned_graphs_v6[graph_idx]

        try:
            outgoing_ids = []
            for edge in graph.es:
                if edge.source in outgoing_indices:
                    outgoing_ids.append(edge.target_vertex["name"]["id"])

            updated_outgoing_nodes.append(outgoing_ids)
        except KeyError as e:
            print(f"Error at index {index}, graph index {graph_idx}, outgoing_indices {outgoing_indices}")
            print(f"Available attributes for graph.vs[i]: {graph.vs[0]['name']}")
            raise e  # re-raise the exception to stop execution and debug

    df['Outgoing_Node_IDs'] = updated_outgoing_nodes

# Error handling and DataFrame update
try:
    update_outgoing_ids(all_node_attributes_df, cleaned_graphs_v6)
    all_node_attributes_df.to_csv(base_path_gdrive+'2/node_attributes_v2.csv')
except Exception as e:
    print(f"An error occurred: {e}")


An error occurred: 0


In [86]:
all_node_attributes_df = load_file("2/node_attributes_v2.csv","")

In [90]:
all_node_attributes_df.rename(columns={"Unnamed: 0" : "Index"}, inplace = True)

In [93]:
all_node_attributes_df.to_csv(base_path_gdrive+'2/node_attributes_v2.csv')

In [92]:
all_node_attributes_df

Unnamed: 0,Index,Graph_Index,Node_ID,Depth,Breadth,Parent Node,Normalized Stance Score,Outgoing Nodes,Outgoing_Node_IDs
0,0,0,333.000,0,0,,0.000000,[151],"[333.338, 333.168, 333.644, 333.69, 333.794, 3..."
1,1,0,333.167,1,0,333.000,0.454545,"[225, 152, 368, 396, 454, 474, 478, 541, 564, ...","[333.339, 333.34, 333.355, 333.646, 333.694, 3..."
2,2,0,333.338,2,0,333.167,-0.142857,"[226, 227, 237, 398, 468, 580, 585]","[333.341, 333.695, 333.707, 333.898, 333.899, ..."
3,3,0,333.168,2,1,333.167,1.000000,"[494, 495]",[]
4,4,0,333.644,2,2,333.167,0.333333,"[370, 476, 400]","[333.659, 333.856, 333.858, 333.857, 333.927]"
...,...,...,...,...,...,...,...,...,...
241654,241654,11336,46614.171,1,76,46614.300,0.000000,[],[]
241655,241655,11336,46614.173,1,77,46614.300,0.000000,[],[]
241656,241656,11336,46614.170,2,0,46614.500,0.000000,[],[]
241657,241657,11336,46614.450,2,1,46614.230,0.000000,[],[]




In [124]:
# The function to create base_claim_dict remains unchanged
def create_base_claim_dict(cleaned_graphs):
    base_claim_dict = {}
    for graph in cleaned_graphs:
        graph_ids = [node["name"]["id"] for node in graph.vs]
        if graph_ids:
            base_claim_id = min(graph_ids)
            base_claim_dict[int(base_claim_id)] = base_claim_id
    return base_claim_dict

In [None]:
cleaned_graphs_v6 = load_file("2/cleaned_graphs_v6.pkl", "p")
base_claim_dict = create_base_claim_dict(cleaned_graphs_v6) # dict key is an integer (discussion id) and the value is the corresponding base claim of the discussion

In [126]:
cleaned_graphs_v6 = load_file("2/cleaned_graphs_v6.pkl", "p")
# Assume df is your DataFrame containing the node attributes
def add_base_claim_id_column(df, cleaned_graphs):
    base_claim_dict = create_base_claim_dict(cleaned_graphs)
    # Create a new column for Base_Claim_ID
    df['Base_Claim_ID'] = df['Node_ID'].apply(lambda x: base_claim_dict.get(int(x), None))
    # Save to Google Drive or any other location
    df.to_csv(base_path_gdrive + "2/"+"node_attributes_with_base_claim.csv")
    return df

# Usage
all_node_attributes_df_2 = add_base_claim_id_column(all_node_attributes_df, cleaned_graphs_v6)

In [127]:
all_node_attributes_df_2

Unnamed: 0,Index,Graph_Index,Node_ID,Depth,Breadth,Parent Node,Normalized Stance Score,Outgoing Nodes,Outgoing_Node_IDs,Base_Claim_ID
0,0,0,333.000,0,0,,0.000000,[151],"[333.338, 333.168, 333.644, 333.69, 333.794, 3...",333.000
1,1,0,333.167,1,0,333.000,0.454545,"[225, 152, 368, 396, 454, 474, 478, 541, 564, ...","[333.339, 333.34, 333.355, 333.646, 333.694, 3...",333.000
2,2,0,333.338,2,0,333.167,-0.142857,"[226, 227, 237, 398, 468, 580, 585]","[333.341, 333.695, 333.707, 333.898, 333.899, ...",333.000
3,3,0,333.168,2,1,333.167,1.000000,"[494, 495]",[],333.000
4,4,0,333.644,2,2,333.167,0.333333,"[370, 476, 400]","[333.659, 333.856, 333.858, 333.857, 333.927]",333.000
...,...,...,...,...,...,...,...,...,...,...
241654,241654,11336,46614.171,1,76,46614.300,0.000000,[],[],46614.101
241655,241655,11336,46614.173,1,77,46614.300,0.000000,[],[],46614.101
241656,241656,11336,46614.170,2,0,46614.500,0.000000,[],[],46614.101
241657,241657,11336,46614.450,2,1,46614.230,0.000000,[],[],46614.101


In [142]:
def extend_dataframe_with_igraph_info(df, cleaned_graphs_v6):
    """
    Extends the DataFrame with additional information available in the igraph objects.

    Parameters:
    - df: DataFrame containing node attributes
    - cleaned_graphs_v6: List of igraph objects representing the discussion graphs

    Returns:
    - df: Extended DataFrame
    """

    # Initialize empty lists to hold the new data
    claim_texts = []
    outgoing_nodes_ids_2 = []
    base_claim_texts = []
    relations_to_outgoing = []

    # Iterate over each unique graph in the DataFrame
    for graph_index, graph_df in tqdm(df.groupby('Graph_Index'), desc='Processing graphs'):

        # Fetch the corresponding igraph object
        graph = cleaned_graphs_v6[graph_index]

        # Iterate through the DataFrame rows
        for _, row in graph_df.iterrows():

            # Get the Node_ID of the current row
            current_node_id = row['Node_ID']

            # Retrieve the claim text for the current node
            current_text = graph.vs.find(lambda vertex: vertex['name']['id'] == current_node_id)['name']['text']
            claim_texts.append(current_text)

            # Get the text of the base claim (assuming it's the first node in the graph)
            base_claim_text = graph.vs[0]['name']['text']
            base_claim_texts.append(base_claim_text)

            # Initialize empty lists for the new outgoing node IDs and their relations
            new_outgoing_node_ids = []
            new_relations = []

            # Iterate through the edges to find outgoing nodes and relations for the current node
            for edge in graph.es:
                source_id = edge.source_vertex['name']['id']
                if source_id == current_node_id:
                    target_id = edge.target_vertex['name']['id']
                    relation = edge['relation']
                    new_outgoing_node_ids.append(target_id)
                    new_relations.append(relation)

            # Append the new data
            outgoing_nodes_ids_2.append(new_outgoing_node_ids)
            relations_to_outgoing.append(new_relations)

    # Add the new columns to the DataFrame
    df['Claim_Text'] = claim_texts
    df['Outgoing_Node_IDs_2'] = outgoing_nodes_ids_2
    df['Base_Claim_Text'] = base_claim_texts
    df['Relations_To_Outgoing'] = relations_to_outgoing

    return df

extended_df = extend_dataframe_with_igraph_info(all_node_attributes_df_2, cleaned_graphs_v6)


Processing graphs: 100%|██████████| 7838/7838 [03:22<00:00, 38.76it/s]


In [148]:
def save(df, name, probing_task_nr, input_type = "df", output_type = "csv"):
  base_path_gdrive= "/content/drive/MyDrive/Can Language Models Follow Discussions?/"
  if input_type == "df":
    if output_type == "csv":
          df.to_csv(base_path_gdrive + f"{probing_task_nr}/{name}.{output_type}")
          print(f"{name}.{output_type} saved to {base_path_gdrive}{probing_task_nr}/")

save(extended_df, "node_attributes_v3", 2)

node_attributes_v3.csv saved to /content/drive/MyDrive/Can Language Models Follow Discussions?/2/


In [149]:
extended_df.head()

Unnamed: 0,Index,Graph_Index,Node_ID,Depth,Breadth,Parent Node,Normalized Stance Score,Outgoing Nodes,Outgoing_Node_IDs,Base_Claim_ID,Claim_Text,Outgoing_Node_IDs_2,Base_Claim_Text,Relations_To_Outgoing
0,0,0,333.0,0,0,,0.0,[151],"[333.338, 333.168, 333.644, 333.69, 333.794, 3...",333.0,Anonymous currency discussions,[333.167],Anonymous currency discussions,[0]
1,1,0,333.167,1,0,333.0,0.454545,"[225, 152, 368, 396, 454, 474, 478, 541, 564, ...","[333.339, 333.34, 333.355, 333.646, 333.694, 3...",333.0,"Cryptocurrencies, such as Bitcoin and others, ...","[333.338, 333.168, 333.644, 333.69, 333.794, 3...",Anonymous currency discussions,"[-1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1]"
2,2,0,333.338,2,0,333.167,-0.142857,"[226, 227, 237, 398, 468, 580, 585]","[333.341, 333.695, 333.707, 333.898, 333.899, ...",333.0,Cryptocurrency mining is a waste of resources.,"[333.339, 333.34, 333.355, 333.694, 333.824, 3...",Anonymous currency discussions,"[1, -1, 1, -1, -1, -1, 1]"
3,3,0,333.168,2,1,333.167,1.0,"[494, 495]",[],333.0,Cryptocurrencies and the technologies built on...,"[333.891, 333.894]",Anonymous currency discussions,"[1, 1]"
4,4,0,333.644,2,2,333.167,0.333333,"[370, 476, 400]","[333.659, 333.856, 333.858, 333.857, 333.927]",333.0,A cryptocurrency is harder to counterfeit than...,"[333.646, 333.84, 333.697]",Anonymous currency discussions,"[-1, 1, 1]"


In [156]:
#extended_df.drop(["Outgoing Nodes","Outgoing_Node_IDs"],axis=1,inplace=True)
extended_df.rename({"Outgoing_Node_IDs_2":"Outgoing_Node_IDs"},axis=1, inplace=True)

In [157]:
extended_df

Unnamed: 0,Index,Graph_Index,Node_ID,Depth,Breadth,Parent Node,Normalized Stance Score,Base_Claim_ID,Claim_Text,Outgoing_Node_IDs,Base_Claim_Text,Relations_To_Outgoing
0,0,0,333.000,0,0,,0.000000,333.000,Anonymous currency discussions,[333.167],Anonymous currency discussions,[0]
1,1,0,333.167,1,0,333.000,0.454545,333.000,"Cryptocurrencies, such as Bitcoin and others, ...","[333.338, 333.168, 333.644, 333.69, 333.794, 3...",Anonymous currency discussions,"[-1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1]"
2,2,0,333.338,2,0,333.167,-0.142857,333.000,Cryptocurrency mining is a waste of resources.,"[333.339, 333.34, 333.355, 333.694, 333.824, 3...",Anonymous currency discussions,"[1, -1, 1, -1, -1, -1, 1]"
3,3,0,333.168,2,1,333.167,1.000000,333.000,Cryptocurrencies and the technologies built on...,"[333.891, 333.894]",Anonymous currency discussions,"[1, 1]"
4,4,0,333.644,2,2,333.167,0.333333,333.000,A cryptocurrency is harder to counterfeit than...,"[333.646, 333.84, 333.697]",Anonymous currency discussions,"[-1, 1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...
241654,241654,11336,46614.171,1,76,46614.300,0.000000,46614.101,JAJAJAJAJAJAJAJAJAJAJAJAJAAJJAJAJAJAAJAJJAJAJA...,[],Shoud uniforms be required?,[]
241655,241655,11336,46614.173,1,77,46614.300,0.000000,46614.101,WKGYEFUgfwdfiwudfysdgfygdsuagkdfgshfgsdjfkjdsg...,[],Shoud uniforms be required?,[]
241656,241656,11336,46614.170,2,0,46614.500,0.000000,46614.101,I don't like the uniforms because I prefer to ...,[],Shoud uniforms be required?,[]
241657,241657,11336,46614.450,2,1,46614.230,0.000000,46614.101,"Bulliying is not because our style, it's becau...",[],Shoud uniforms be required?,[]


In [158]:
# Sample code snippet to re-calculate the "Normalized Stance Score" and reorder columns
# Assumes that you have a dataframe named extended_df

# Recalculate "Normalized Stance Score"
# The Relations_To_Outgoing column is assumed to be a list of integers
# NaN and empty lists are treated as having an average of 0
extended_df['Normalized Stance Score'] = extended_df['Relations_To_Outgoing'].apply(
    lambda x: sum(x) / len(x) if x else 0
)

# Reorder columns
columns_order = ['Index', 'Graph_Index', 'Node_ID', 'Depth', 'Breadth', 'Parent Node',
                 'Base_Claim_ID', 'Base_Claim_Text', 'Claim_Text', 'Outgoing_Node_IDs', 'Relations_To_Outgoing',
                  'Normalized Stance Score']

extended_df = extended_df[columns_order]


In [160]:
save(extended_df, "node_attributes_v3", 2)

node_attributes_v3.csv saved to /content/drive/MyDrive/Can Language Models Follow Discussions?/2/


In [130]:
high_stance_nodes = all_node_attributes_df_2[all_node_attributes_df_2['Normalized Stance Score'] > 0.5]
high_stance_nodes

Unnamed: 0,Index,Graph_Index,Node_ID,Depth,Breadth,Parent Node,Normalized Stance Score,Outgoing Nodes,Outgoing_Node_IDs,Base_Claim_ID
3,3,0,333.1680,2,1,333.167,1.0,"[494, 495]",[],333.000
6,6,0,333.7940,2,4,333.167,1.0,"[455, 456, 458]","[333.797, 333.865, 333.96, 333.1075]",333.000
9,9,0,333.1055,2,7,333.167,1.0,[551],[333.1074],333.000
10,10,0,333.1089,2,8,333.167,1.0,"[565, 566, 567, 568]",[],333.000
31,31,0,333.7960,3,18,333.794,1.0,[457],[],333.000
...,...,...,...,...,...,...,...,...,...,...
241438,241438,11322,46431.1590,2,34,46431.124,1.0,[74],[],46431.101
241445,241445,11322,46431.2070,3,4,46431.600,1.0,[97],[],46431.101
241478,241478,11322,46431.9100,4,6,46431.270,1.0,"[45, 53]",[],46431.101
241488,241488,11322,46431.5200,5,1,46431.390,1.0,[25],[],46431.101


In [132]:
grouped_by_graph = all_node_attributes_df_2.groupby('Graph_Index').agg({'Normalized Stance Score': 'mean', 'Outgoing Nodes': 'count'})
grouped_by_graph

Unnamed: 0_level_0,Normalized Stance Score,Outgoing Nodes
Graph_Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-0.021729,80
1,-0.165577,229
2,-0.074221,767
3,-0.009770,116
4,-0.262887,97
...,...,...
11330,-0.083333,4
11331,-0.055556,9
11332,0.000000,7
11335,0.062222,15


# Probing 2 Example

## Task Definition

The primary objective of this research is to determine ***whether language models have the capability to identify the sequential relationship between two claims within the same discussion***.

**Input:** Two claims originating from the same discussion.

**Output:** A binary label indicating if the second claim directly responds to the first claim.

## Variations and Considerations

### Multi-level Consecutiveness
This task could be expanded to not only identify if a claim is a direct response, but also to determine the number of levels separating the two claims from each other.

### Base Claim Context
Incorporating the base claim as context could either simplify or complicate the task, depending on the model's effectiveness in utilizing it.

In [None]:
# Initialize lists to store data for DataFrame
claim_pairs = []
context = []
topics = []
org_labels = []
string_labels = []
identifiers = []

# Loop through DataFrame to generate data for tasks
for index, row in df.iterrows():
    parent_node = row['Parent Node']
    current_node = row['Node_ID']
    graph_index = row['Graph_Index']
    stance = row['Normalized Stance Score']
    depth = row['Depth']

    # 1. Recognizing Consecutive Claims
    is_consecutive = parent_node == row['Parent Node']
    claim_pairs.append((parent_node, current_node))
    context.append(None)  # Assuming no context is needed
    topics.append(row['Base_Claim_ID'])  # Base claim as topic
    org_labels.append("Consecutive" if is_consecutive else "Non-Consecutive")
    string_labels.append(1 if is_consecutive else 0)
    identifiers.append(index)

# Create DataFrame using the general function
final_df = create_general_dataframe(claim_pairs, context, topics, org_labels, string_labels, identifiers)


In [None]:
# 2. Multi-level Consecutiveness (Just an example, you can set your own levels)
levels_away = abs(depth - df[df['Node_ID'] == parent_node]['Depth'].values[0])
org_labels[-1] = f"Levels_{levels_away}"
string_labels[-1] = levels_away


In [138]:
from collections import defaultdict
import pandas as pd
import random

def generate_balanced_claim_pairs(df, cleaned_graphs_v6):
    """
    Generates a balanced set of consecutive and non-consecutive claim pairs from the DataFrame.

    Parameters:
    - df: DataFrame containing the discussion graph data
    - cleaned_graphs_v6: List of cleaned graphs

    Returns:
    - final_df: DataFrame containing balanced claim pairs
    """

    # Initialize lists to store data for DataFrame
    claim_pairs = []
    context = []
    topics = []
    org_labels = []
    string_labels = []
    identifiers = []

    # Loop through DataFrame to generate data for tasks
    for graph_index, group_df in tqdm(df.groupby('Graph_Index'), desc='Processing graphs'):

        graph = cleaned_graphs_v6[graph_index]

        # Lists to hold consecutive and non-consecutive pairs
        balanced_pairs = []

        # Temp lists for one round of consecutive and non-consecutive pair
        temp_consecutive = None
        temp_non_consecutive = None

        for index, row in group_df.iterrows():
            parent_node = row['Parent Node']
            current_node = row['Node_ID']

            # Get claim text for parent and current nodes
            parent_text = graph.vs.find(name=str(parent_node))['text']
            current_text = graph.vs.find(name=str(current_node))['text']

            # Exclude NaN or None text
            if parent_text is None or current_text is None:
                continue

            # Recognizing Consecutive Claims
            if parent_node == row['Parent Node']:
                temp_consecutive = (parent_text, current_text)
            else:
                temp_non_consecutive = (parent_text, current_text)

            # If both consecutive and non-consecutive pairs are found, append them to the list
            if temp_consecutive and temp_non_consecutive:
                balanced_pairs.extend([temp_consecutive, temp_non_consecutive])
                temp_consecutive = None
                temp_non_consecutive = None

        # Shuffle the balanced pairs for randomness
        random.shuffle(balanced_pairs)

        for pair in balanced_pairs:
            parent, child = pair
            is_consecutive = parent == child  # Verify if it's a consecutive pair
            claim_pairs.append(pair)
            context.append(None)
            topics.append(group_df['Base_Claim_ID'].iloc[0])
            org_labels.append("Consecutive" if is_consecutive else "Non-Consecutive")
            string_labels.append(1 if is_consecutive else 0)
            identifiers.append(f"{graph_index}_{parent}_{child}")

    # Create DataFrame using the general function (Assuming this function exists)
    final_df = create_general_dataframe(claim_pairs, context, topics, org_labels, string_labels, identifiers)

    return final_df

In [139]:
import pandas as pd
import random
from tqdm import tqdm

def generate_balanced_claim_pairs(df, cleaned_graphs_v6):
    """
    Generate a balanced dataset for the task of identifying if two claims are consecutive or not.

    Parameters:
    df: DataFrame containing the node attributes
    cleaned_graphs_v6: List of igraph objects representing the discussion graphs

    Returns:
    final_df: DataFrame suitable for the task
    """

    # Initialize lists to store the DataFrame data
    claim_pairs, contexts, topics, org_labels, string_labels, identifiers = [], [], [], [], [], []

    # Iterate over each unique graph in the DataFrame
    for graph_index, graph_df in df.groupby("Graph_Index"):

        # Fetch the corresponding igraph object
        graph = cleaned_graphs_v6[graph_index]

        # Temporary lists to hold consecutive and non-consecutive claim pairs
        consecutive_pairs = []
        non_consecutive_pairs = []

        # Iterate through the DataFrame rows
        for _, row in graph_df.iterrows():
            parent_node_id = row['Parent Node']
            current_node_id = row['Node_ID']

            # Retrieve the claim texts based on the node IDs
            parent_text = graph.vs.find(lambda vertex: vertex['name']['id'] == parent_node_id)['name']['text']
            current_text = graph.vs.find(lambda vertex: vertex['name']['id'] == current_node_id)['name']['text']

            # Skip if any text is None
            if parent_text is None or current_text is None:
                continue

            # Check if the current node is a direct child of the parent node
            is_consecutive = parent_node_id in graph_df.loc[graph_df['Node_ID'] == current_node_id, 'Parent Node'].values

            # Append to the appropriate list
            if is_consecutive:
                consecutive_pairs.append((parent_text, current_text))
            else:
                non_consecutive_pairs.append((parent_text, current_text))

        # Shuffle the pairs to randomize
        random.shuffle(consecutive_pairs)
        random.shuffle(non_consecutive_pairs)

        # Make the dataset balanced
        min_length = min(len(consecutive_pairs), len(non_consecutive_pairs))
        balanced_pairs = consecutive_pairs[:min_length] + non_consecutive_pairs[:min_length]

        # Randomly shuffle the balanced pairs
        random.shuffle(balanced_pairs)

        # Extract and store the information needed for the DataFrame
        for parent, child in balanced_pairs:
            claim_pairs.append((parent, child))
            contexts.append(None)
            topics.append(graph.vs[0]['name']['text'])  # Assuming the first node is the base claim
            org_labels.append("Consecutive" if (parent, child) in consecutive_pairs else "Non-Consecutive")
            string_labels.append(1 if (parent, child) in consecutive_pairs else 0)
            identifiers.append(f"{graph_index}_{parent}_{child}")

    # Create the final DataFrame
    final_df = pd.DataFrame({
        'inputs': claim_pairs,
        'context': contexts,
        'topic': topics,
        'org_label': org_labels,
        'id': identifiers,
        'string_label': string_labels,
        'label': string_labels  # You can modify this if needed
    })

    return final_df

# Example usage
# final_df = generate_balanced_claim_pairs(df, cleaned_graphs_v6)


In [None]:
generate_balanced_claim_pairs(all_node_attributes_df_2, cleaned_graphs_v6)