### Import Statements

In [None]:
import numpy as np
import pandas as pd
import pickle
import json
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from matplotlib.ticker import PercentFormatter

from scipy.stats import entropy
from collections import Counter

import os
from annoy import AnnoyIndex
from bisect import bisect_left
import re

from copy import deepcopy

from itertools import combinations
import random
from datetime import datetime

from union_find import *

import networkx as nx

### Outline

- Load DF without Clusters
- Load Embeddings
- Add Columns for clusters

- Number of Fact-checks; Singleton Ratio over time 
- Consistency Measures Mode & Entropy by Verdict
- Percentage of Clusters by Threshold by number of languages
- Actual vs Random Percentage by number of Languages
- Mean Intra-Cluster Distance 
- Average Inter-Cluster Distance
- Density of most dissimilar claims
- Cosine Similarity of most dissimilar claims

### Load DF Without Clusters

In [None]:
# Read the gzip-compressed fact-check table
df = pd.read_csv("../Data/minimal_FactCheckData_local.csv.gz", compression="gzip")

# Parse datePublished; values that cannot be parsed are coerced to NaT
df["datePublished"] = pd.to_datetime(df["datePublished"], errors="coerce")
n_missing_dates = df["datePublished"].isna().sum()
print(f"Number of NaNs in datePublished: {n_missing_dates}")

# Rows without a usable publication date are dropped
df = df.dropna(subset=["datePublished"])

# claim id -> publication date, for quick lookup in the cluster mapping
# (for duplicated ids the last row wins)
dates = dict(zip(df["claim_minimal"], df["datePublished"]))

# Claim ids in row order, for the cluster mapping
ORIGINAL_IDS = df["claim_minimal"].to_list()

In [None]:
# Functions to create cluster mapping
def create_edgelist(msd, threshold):
    """
    Build a list of (claim, neighbour) edges from the similarity dictionary.

    Parameters
    ----------
    msd : dict
        Maps a claim id to an iterable of (neighbour_id, similarity) pairs.
    threshold : float
        Minimum similarity (inclusive) a pair needs to become an edge.

    Returns
    -------
    list of tuple
        (claim_id, neighbour_id) pairs whose similarity is at least
        ``threshold`` and whose neighbour was not published later than the
        claim itself. Publication dates are read from the module-level
        ``dates`` dict.
    """
    return [
        (claim_id, neighbour[0])
        for claim_id, neighbours in msd.items()
        for neighbour in neighbours
        if neighbour[1] >= threshold and dates[neighbour[0]] <= dates[claim_id]
    ]


# Retain dictionary structure but remove edges below threshold
def create_dict(msd, threshold):
    """
    Return a copy of the similarity dictionary without weak edges.

    Parameters
    ----------
    msd : dict
        Maps a key to an iterable of (neighbour_id, similarity) pairs.
    threshold : float
        Pairs with a similarity below this value are dropped.

    Returns
    -------
    dict
        Same keys as ``msd``; each value is a new list holding only the pairs
        whose similarity is at least ``threshold`` (possibly empty).
    """
    return {
        key: [pair for pair in pairs if pair[1] >= threshold]
        for key, pairs in msd.items()
    }

In [None]:
# id -> domain lookup. `domains` stays a Series (as before); the plain dict is
# used inside the loop because dict lookups are much cheaper than Series lookups.
domains = df.set_index("claim_minimal")["domain"]
domain_of = domains.to_dict()

# Load Edge List
# NOTE(security): pickle executes arbitrary code on load - only open trusted files.
with open("../Data/edge_list.pkl", "rb") as f:
    msd = pickle.load(f)

# Change to list of (neighbour_id, similarity) tuples
msd = {k: list(v.items()) for k, v in msd.items()}

# Prune the dictionary with minimal similarity of 0.75
msd_small = create_dict(msd, 0.75)

# Generate list of all keys, initialize the removed keys, shuffle keys.
# NOTE(review): the shuffle is unseeded, so which member of a near-duplicate
# group survives differs between runs.
all_keys = list(msd_small.keys())
removed_keys = []  # removal order, kept as a list for backward compatibility
removed_lookup = set()  # same content as removed_keys, for O(1) membership tests
np.random.shuffle(all_keys)

# 1) Scramble the keys to assure that everything is random.
# 2) Iterate through the keys. Remove a key iff it has a neighbour (other than
#    itself) with similarity >= 0.95 from the same domain that has not been
#    removed already.
# 3) Afterwards, drop every removed key from the neighbour lists of the
#    remaining keys.
#
# The original cleanup tested `ann in msd_small[key]`, but the lists hold
# (id, similarity) tuples, so nothing was ever removed. Skipping already
# removed neighbours in step 2 and filtering once in step 3 gives the result
# the per-key cleanup was meant to produce, without the quadratic scan.
for ann in tqdm(all_keys, desc="Removing Entries with high similarity and same domain"):
    if ann in removed_lookup:
        raise Exception("This should not happen")

    for neighbour, similarity in msd_small[ann]:
        if similarity < 0.95:
            # Within each key the values are sorted by similarity, so once one
            # is < 0.95 all the rest will be too.
            break
        # Not the key itself, still present, and published by the same domain
        if (
            ann != neighbour
            and neighbour not in removed_lookup
            and domain_of[ann] == domain_of[neighbour]
        ):
            msd_small.pop(ann)
            removed_keys.append(ann)
            removed_lookup.add(ann)
            break

# Remove the removed KEYS from all VALUES (only existing keys are reassigned,
# so the dict does not change size while iterating).
for key in msd_small:
    msd_small[key] = [edge for edge in msd_small[key] if edge[0] not in removed_lookup]

In [None]:
# Similarity cut-offs for which clusters are computed
thresholds = [0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95]
clusters = {}
edge_lists = {}

# One edge list and one set of components per threshold
for threshold in tqdm(thresholds, desc="Creating Cluster for threshold"):
    edge_list = create_edgelist(msd_small, threshold)
    edge_lists[threshold], clusters[threshold] = edge_list, find_components(edge_list)

# Create Cluster Mapping: claim id -> 1-based position of its component
id_2_cluster = {}
for threshold, components in clusters.items():
    mapping = {}
    for cluster_id, members in enumerate(components, start=1):
        for name in members:
            mapping[name] = cluster_id
    id_2_cluster[threshold] = mapping

# One column per threshold; claims that appear in no component get cluster id 0
for threshold, mapping in id_2_cluster.items():
    column = f"cluster_{threshold}"
    df[column] = df["claim_minimal"].map(mapping).fillna(0)

In [None]:
# Write the frame, now including the cluster_<threshold> columns, to disk
# (the row index is not written)
df.to_csv(
    path_or_buf="../Data/df_with_clusters_local_translated_with_clusters.csv",
    index=False,
)

### Number of Fact-checks; Singleton Ratio over time 

In [None]:
# Cluster id 0 marks claims that belong to no cluster at the 0.875 threshold
df["is_singleton_0.875"] = df["cluster_0.875"].map(lambda x: 1 if x == 0 else 0)
df["datePublished"] = pd.to_datetime(df.datePublished, errors="coerce")

# Top panel (A) is ax2, bottom panel (B) is ax1
fig, (ax2, ax1) = plt.subplots(2, 1, figsize=(10, 6), sharex=True)

# Rows inside the plotted period, indexed by publication date
in_period = (df["datePublished"] > "2019-01-01") & (df["datePublished"] < "2024-12-01")
by_date = df[in_period].set_index("datePublished")

panel_letter_style = dict(fontsize=18, fontweight="bold", va="top", ha="right")

# Panel B: share of singletons per 3-month bin
by_date.resample("3M")["is_singleton_0.875"].mean().plot(
    ax=ax1, marker="o", color="blue", alpha=1, linestyle="--"
)
ax1.set_ylim(0.85, 0.96)
ax1.set_ylabel("Singleton Ratio")
ax1.text(-0.1, 1.05, "B", transform=ax1.transAxes, **panel_letter_style)

# Panel A: number of fact-checks (rows) per 3-month bin
by_date.resample("3M")["cluster_0.875"].size().plot(
    ax=ax2, color="red", linestyle="--", alpha=1, marker="o"
)
ax2.set_ylabel("Number of Fact-checks")
ax2.text(-0.1, 1.05, "A", transform=ax2.transAxes, **panel_letter_style)

# Vertical markers with rotated captions; the caption height is given in the
# data units of the respective panel
for axis, caption_y in ((ax1, 0.94), (ax2, 2.5 * 10**4)):
    for day, caption in (("2020-01-01", "Jan. 2020"), ("2023-01-01", "Jan. 2023")):
        marker_date = pd.to_datetime(day)
        axis.axvline(marker_date, color="black", linestyle="--", alpha=0.5)
        axis.text(
            marker_date,
            caption_y,
            caption,
            rotation=90,
            verticalalignment="center",
            horizontalalignment="right",
            fontsize=12,
        )

ax2.set_ylim(0, 3.1 * 10**4)
ax1.set_xlabel("Date")

# Show the singleton ratio as a percentage
ax1.yaxis.set_major_formatter(PercentFormatter(1))

# Save as raster (300 dpi) and as vector graphic
figure_stem = "../Plots/Singleton_Ratio_and_Number_of_Fact-Checks_over_Time"
plt.savefig(figure_stem + ".png", dpi=300, bbox_inches="tight")
plt.savefig(figure_stem + ".pdf", bbox_inches="tight")

#### Remove Nodes & Edges that are outside of observation window

### Filter Fact-Checks Published Between January 2020 and January 2023

In [None]:
# Keep only fact-checks published strictly inside the observation window.
# `.copy()` turns the filtered selection into an independent frame, so the
# column assignments that follow do not write into a slice of the unfiltered
# data (which triggers pandas' SettingWithCopyWarning).
window_start = pd.to_datetime("2020-01-01", utc=True)
window_end = pd.to_datetime("2023-01-01", utc=True)
df = df[(df.datePublished < window_end) & (df.datePublished > window_start)].copy()

cluster_cols = [col for col in df.columns if "cluster_" in col]


def _lone_clusters_to_zero(col):
    """Set every cluster id that occurs exactly once in ``col`` to 0 (singleton)."""
    # value_counts is computed once; rows with a missing id map to NaN and
    # are left untouched.
    occurrences = col.map(col.value_counts())
    return col.mask(occurrences == 1, 0)


# A cluster with a single member left inside the window is a singleton
df[cluster_cols] = df[cluster_cols].apply(_lone_clusters_to_zero)

In [None]:
present_claims = set(df.claim_minimal)

# Keep only edges whose two endpoints are both still present in df
for threshold in tqdm(edge_lists.keys()):
    edge_lists[threshold] = [
        (x[0], x[1])
        for x in edge_lists[threshold]
        if x[0] in present_claims and x[1] in present_claims
    ]

# Initialize dictionaries
# to_alter[threshold]: new cluster id -> members of a split-off component
# to_singleton[threshold]: claims that ended up without any cluster partner
to_alter = {k: {} for k in thresholds}
to_singleton = {k: [] for k in thresholds}

# Check connectedness
for threshold in thresholds:
    cluster_dict_thresh = (
        df.groupby(f"cluster_{threshold}")["claim_minimal"].apply(list).to_dict()
    )
    max_cluster = max(cluster_dict_thresh.keys())

    G = nx.Graph(edge_lists[threshold])
    # A graph built from an edge list only knows nodes that occur in an edge,
    # and Graph.subgraph silently ignores unknown nodes. Without this line a
    # claim that lost all of its edges would be invisible below and would
    # stay in its cluster whenever the rest of the cluster is still connected.
    G.add_nodes_from(present_claims)

    for cluster, nodes in cluster_dict_thresh.items():
        if cluster == 0:
            continue

        # Create subgraph for this cluster
        subG = G.subgraph(nodes)

        # Find connected components in this subgraph
        connected_components = list(nx.connected_components(subG))
        if len(connected_components) <= 1:
            # Still one connected piece: nothing to change
            continue
        for component in connected_components:
            if len(component) == 1:
                to_singleton[threshold].extend(component)
            else:
                # Every multi-member piece of a split cluster gets a fresh id
                max_cluster += 1
                to_alter[threshold][max_cluster] = list(component)

# Update df
for threshold in thresholds:
    alter_map = {old: new for new, olds in to_alter[threshold].items() for old in olds}
    singleton_set = set(to_singleton[threshold])
    column = f"cluster_{threshold}"

    # Singletons get id 0, members of split-off components their new id,
    # everything else keeps its id. One pass over two columns replaces the
    # row-wise df.apply(axis=1).
    df[column] = [
        0 if claim in singleton_set else alter_map.get(claim, current)
        for claim, current in zip(df["claim_minimal"], df[column])
    ]

present_claims = set(df.claim_minimal)
for threshold in tqdm(edge_lists.keys()):
    # NOTE(review): id_2_cluster is overwritten on every pass and ends up as
    # the flat id -> cluster mapping of the last threshold only.
    id_2_cluster = df.set_index("claim_minimal")[f"cluster_{threshold}"].to_dict()
    # No rows were dropped since the first filter, so this repeats it
    edge_lists[threshold] = [
        (x[0], x[1])
        for x in edge_lists[threshold]
        if x[0] in present_claims and x[1] in present_claims
    ]

### Calculate Statistics

In [None]:
# Get a dictionary of ID -> Verdict for quick lookup
verdicts = df.set_index("claim_minimal").verdict.to_dict()
# Count verdict values, leaving out missing ones (not every verdict was mapped
# to a verdict value). Filtering while counting works when there is no missing
# verdict at all and for any missing-value object, whereas
# `counts.pop(np.nan)` raises KeyError in the first case and only removes the
# np.nan object itself.
counts = Counter(v for v in verdicts.values() if not pd.isna(v))
# Normalize to get probabilities (the total is computed once, not per key)
n_rated = sum(counts.values())
verdict_probabilities = {k: v / n_rated for k, v in counts.items()}
# Map ID -> Language
id_2_lang = df.set_index("claim_minimal")["language"].to_dict()


def calc_measures(labels, just_two=False, random=False):
    """
    Calculate majority share, entropy and Gini impurity of the verdicts of
    one cluster, plus the number of verdicts the measures are based on.

    Parameters
    ----------
    labels : list
        Claim ids of the cluster. Ids that are not strings, are not in the
        module-level ``verdicts`` dict, or whose verdict is missing are ignored.
    just_two : bool, optional
        If True, every verdict containing "False" becomes "False" and every
        other verdict becomes "True". The default is False.
    random : bool, optional
        If True, the verdicts are replaced by draws from
        ``verdict_probabilities`` (null model). The default is False.
        (Inside this function the name shadows the ``random`` module.)

    Returns
    -------
    tuple
        ``(majority, entropy, gini, total)``; all zeros when the cluster has
        no usable verdict. Entropy is in bits (base 2).
    """
    observed = [verdicts[x] for x in labels if x in verdicts and isinstance(x, str)]
    observed = [x for x in observed if str(x) != "nan"]  # removing 'nan' values

    if len(observed) == 0:
        return 0, 0, 0, 0

    if random:
        # The null model draws exactly as many verdicts as the cluster has
        # usable ones. Drawing one per id (as before) also counted ids without
        # a verdict, so random and observed clusters had different sizes and
        # different weights in the size-weighted averages.
        labels = np.random.choice(
            list(verdict_probabilities.keys()),
            size=len(observed),
            p=list(verdict_probabilities.values()),
        )
    else:
        labels = observed

    if just_two:
        # Collapse to a binary label
        labels = ["False" if "False" in label else "True" for label in labels]

    counter = Counter(labels)
    total = len(labels)
    majority = max(counter.values()) / total
    label_counts = np.array(list(counter.values()))
    probs = label_counts / total
    ent = entropy(probs, base=2)
    gini = 1 - sum((count / total) ** 2 for count in counter.values())
    return majority, ent, gini, total


def measure_cluster_consistency(cluster_dict, just_two=False, random=False):
    """
    Size-weighted averages of the calc_measures results over all clusters.

    Parameters
    ----------
    cluster_dict : dict or iterable
        Either a dictionary mapping cluster_id -> list of ids, or a plain
        iterable of id lists. For a dictionary the values are used; iterating
        it directly would hand the cluster ids to calc_measures instead of
        the members.
    just_two : bool, optional
        If True, we map all labels to True or False. The default is False.
    random : bool, optional
        If True, we sample the labels from the distribution of verdicts. The default is False.

    Returns
    -------
    tuple
        ``(average_majority, average_entropy, average_gini)``, each weighted
        by the size calc_measures reports per cluster. ``(0, 0, 0)`` when the
        sizes sum to zero (e.g. no clusters), mirroring the all-zero result
        of calc_measures for an empty cluster.
    """
    if isinstance(cluster_dict, dict):
        groups = cluster_dict.values()
    else:
        groups = cluster_dict

    majority_sum = 0
    entropy_sum = 0
    gini_sum = 0
    total_size = 0

    for members in groups:
        majority, ent, gini, size = calc_measures(members, just_two, random)
        majority_sum += majority * size
        entropy_sum += ent * size
        gini_sum += gini * size
        total_size += size

    if total_size == 0:
        # Nothing to average; avoid dividing by zero
        return 0, 0, 0

    return majority_sum / total_size, entropy_sum / total_size, gini_sum / total_size


# Per-threshold summary statistics
#
# NOTE(review): `clusters` holds the components computed before df was
# restricted to the observation window, while N_CLAIMS and id_2_lang come from
# the filtered df (ids missing from id_2_lang are looked up as None) - confirm
# that this mix is intended.

N_CLAIMS = df.shape[0]

stats = {k: {} for k in thresholds}
# Null model: every claim gets a language drawn from the observed language shares
lang_distr = pd.Series(id_2_lang.values()).value_counts(normalize=True).to_dict()


def _share(groups, condition):
    """Fraction of ``groups`` whose number of distinct entries fulfils ``condition``."""
    hits = sum(1 for members in groups if condition(len(set(members))))
    return hits / len(groups)


# (column suffix, condition on the number of distinct languages)
_LANGUAGE_BUCKETS = (
    ("mono", lambda n_langs: n_langs == 1),
    ("two", lambda n_langs: n_langs == 2),
    ("more", lambda n_langs: n_langs >= 3),
)

for key in tqdm(stats.keys(), desc="Calculating Statistics"):
    row = stats[key]
    multi_member = [x for x in clusters[key] if len(x) > 1]
    num_per_cluster = [len(x) for x in clusters[key]]
    sizes_multi = [size for size in num_per_cluster if size > 1]

    # Observed and randomly drawn languages of every cluster with > 1 claim
    langs = [list(map(id_2_lang.get, x)) for x in multi_member]
    rand_langs = [
        list(
            np.random.choice(
                list(lang_distr.keys()),
                len(x),
                p=list(lang_distr.values()),
                replace=True,
            )
        )
        for x in multi_member
    ]

    # 1. Number of clusters
    row["num_cluster"] = len(clusters[key])
    # 2./3. Median and mean number of claims per cluster (clusters with > 1 claim)
    row["median_num"] = np.median(sizes_multi)
    row["mean_num"] = np.mean(sizes_multi)
    # 4. Number of claims in the largest cluster
    row["size_largest"] = max(num_per_cluster)
    # 5. Share of fact-checks that appear in no cluster
    N_NODES = len({item for sublist in clusters[key] for item in sublist})
    row["perc_singletons"] = (N_CLAIMS - N_NODES) / N_CLAIMS

    # Share of clusters with one, two, and three or more languages,
    # each followed by its null-model counterpart
    for bucket, condition in _LANGUAGE_BUCKETS:
        row[f"perc_{bucket}"] = _share(langs, condition)
        row[f"perc_{bucket}_rand"] = _share(rand_langs, condition)

    # Verdict consistency: all verdict values / binary, observed / null model
    observed_all = measure_cluster_consistency(clusters[key])
    observed_two = measure_cluster_consistency(clusters[key], just_two=True)
    random_all = measure_cluster_consistency(clusters[key], random=True)
    random_two = measure_cluster_consistency(clusters[key], just_two=True, random=True)

    for prefix, suffix, measures in (
        ("", "_rand", random_all),
        ("just_two_", "_rand", random_two),
        ("", "", observed_all),
        ("just_two_", "", observed_two),
    ):
        for measure_name, value in zip(("mode", "entropy", "gini"), measures):
            row[f"{prefix}{measure_name}{suffix}"] = value

stats = pd.DataFrame(stats).T.reset_index().rename(columns={"index": "threshold"})

#### Calculate Consistency Measures

In [None]:
# Majority ("mode") shares per threshold as percentages, rounded to four
# decimals before scaling
_stats_by_threshold = stats.set_index("threshold")
mode = _stats_by_threshold["mode"].round(4) * 100
two_mode = _stats_by_threshold["just_two_mode"].round(4) * 100
mode_rand = _stats_by_threshold["mode_rand"].round(4) * 100
two_mode_rand = _stats_by_threshold["just_two_mode_rand"].round(4) * 100


def print_latex_table():
    """
    Print a LaTeX table with one column per threshold.

    Reads the module-level ``thresholds`` list and the ``two_mode``,
    ``two_mode_rand``, ``mode`` and ``mode_rand`` series. Every measure gets a
    row with the observed values (two decimals) followed by a row with the
    null-model values in small print and parentheses.
    """

    def two_decimals(values):
        return [f"{value:.2f}" for value in values]

    def observed_row(title, cells):
        return title + " & " + " & ".join(cells) + " \\\\"

    def null_model_row(title, cells):
        wrapped = " & ".join(f"\\small{{({cell})}}" for cell in cells)
        return title + " & " + wrapped + " \\\\ \\hline"

    column_titles = [str(threshold) for threshold in thresholds]

    lines = [
        # Start of the table
        "\\begin{table}[H]",
        "\\centering",
        "\\begin{tabular}{l" + "c" * len(thresholds) + "}",
        "\\hline",
        # Header: empty corner cell followed by the thresholds
        " & ".join([""] + column_titles) + " \\\\ \\hline",
        # Two Mode Rows
        observed_row("Two Mode", two_decimals(two_mode)),
        null_model_row("\\small{(Two Mode Rand)}", two_decimals(two_mode_rand)),
        # Four Mode Rows
        observed_row("Four Mode", two_decimals(mode)),
        null_model_row("\\small{(Four Mode Rand)}", two_decimals(mode_rand)),
        # End of the table
        "\\end{tabular}",
        "\\caption{Your Table Title}",
        "\\label{tab:your_label}",
        "\\end{table}",
    ]
    for line in lines:
        print(line)


# Print the LaTeX code
print_latex_table()

#### Percentage of clusters by Number of languages across thresholds

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize=(10, 5))

# (stats column, legend label). `perc_more` counts clusters with >= 3
# languages, so it is labelled "three or more" (the previous label
# "More than three-lingual" described a different set).
language_series = (
    ("perc_mono", "Mono-lingual"),
    ("perc_two", "Two-lingual"),
    ("perc_more", "Three or more languages"),
)

# Lines first, then the markers on top; the scatter points carry no legend entry
for column, legend_label in language_series:
    plt.plot(stats["threshold"], stats[column], label=legend_label)
for column, _ in language_series:
    plt.scatter(stats["threshold"], stats[column], label=None)

plt.xlabel("Threshold")
plt.ylabel("Percentage")
# Log2-scaled y-axis, ticks formatted as percentages
plt.yscale("log", base=2)
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))

# Set title
plt.title(
    "Percentage of clusters by number of languages very stable across Thresholds"
)
plt.xticks(np.arange(0.75, 0.975, 0.025))
# Legend is placed outside the axes, to the right
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)
plt.savefig("../Plots/perc_clusters_by_lang.png", bbox_inches="tight", dpi=300)

##### Actual vs Random Percentage of Clusters by Number of Languages

In [None]:
sns.set_style("whitegrid")

plt.figure(figsize=(6, 6))
# 45 degree reference line (drawn in axes coordinates)
plt.plot([0, 1], [0, 1], transform=plt.gca().transAxes, color="black", linestyle="--")

# Shade the area below the diagonal with diagonal hatching
plt.fill_between(
    [0, 1],
    [0, 1],
    [0, 0],
    hatch="/",
    facecolor="lightblue",
    edgecolor="blue",
    alpha=0.2,
)

# Shade the area above the diagonal with horizontal hatching
plt.fill_between(
    [0, 1], [0, 1], [1, 1], hatch="-", facecolor="lightpink", edgecolor="red", alpha=0.2
)

# Draw arrows between the points of consecutive thresholds. The scatter plots
# below leave out the last threshold, so the last arrow ends at the
# second-to-last one.
for i in range(stats.shape[0] - 2):
    for var in ["mono", "two", "more"]:
        expected = stats["perc_" + var + "_rand"]
        observed = stats["perc_" + var]
        plt.arrow(
            expected.iloc[i],
            observed.iloc[i],
            expected.iloc[i + 1] - expected.iloc[i],
            observed.iloc[i + 1] - observed.iloc[i],
            head_width=0.01,
            head_length=0.02,
            fc="k",
            ec="k",
            length_includes_head=True,
            alpha=0.8,
        )

# (stats column suffix, legend label, marker, colour). `perc_more` counts
# clusters with >= 3 languages, hence "Three or More" instead of the previous
# "More than Three".
scatter_series = (
    ("mono", "Mono-lingual", "D", "blue"),
    ("two", "Two Languages", "o", "red"),
    ("more", "Three or More Languages", "^", "lightgreen"),
)
for var, legend_label, marker, color in scatter_series:
    plt.scatter(
        stats["perc_" + var + "_rand"].iloc[:-1],
        stats["perc_" + var].iloc[:-1],
        label=legend_label,
        marker=marker,
        color=color,
        edgecolors="black",
    )

plt.xlabel("Expected", fontsize=14)
plt.ylabel("Observed", fontsize=14)

# Format axes to percentage
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.gca().xaxis.set_major_formatter(PercentFormatter(1))

plt.legend(loc="upper right", borderaxespad=0)
plt.xlim(0, 0.8)
plt.ylim(0, 0.8)

# Add x-axis tick marks at regular intervals
plt.xticks(np.arange(0, 0.9, 0.1), fontsize=12)

# One call is enough to switch the grid off (it was issued twice before)
plt.grid(False)
plt.savefig("../Plots/actual_vs_random_accessible.png", dpi=300, bbox_inches="tight")
plt.savefig("../Plots/actual_vs_random_accessible.pdf", bbox_inches="tight")

#### Threshold vs size of the largest cluster and percentage of singletons

In [None]:
# Plot threshold vs size of the largest cluster & percentage of singletons on
# one figure with two y-axes. All calls go through the axes objects, so the
# result does not depend on which axes pyplot currently treats as active.
fig, ax1 = plt.subplots()

# Left axis: size of the largest cluster (log scale)
ax1.plot(stats.threshold, stats.size_largest)
ax1.set_xlabel("Threshold")
ax1.set_ylabel("Size of the Largest Cluster", color="blue")
ax1.set_yscale("log")

# Right axis: share of singletons, formatted as percentage
ax2 = ax1.twinx()
ax2.plot(stats.threshold, stats.perc_singletons, color="red")
ax2.set_ylabel("Percentage of Singletons", color="red")
ax2.yaxis.set_major_formatter(PercentFormatter(1))

# Remove grid
ax1.grid(False)
ax2.grid(False)

# The title names what is plotted: the size of the largest cluster, not the
# number of clusters
ax1.set_title("Threshold vs Size of the Largest Cluster & Percentage of Singletons")
# NOTE: this figure is only displayed; it is not written to disk.

### Calculate Inter and Intra Cluster Variance

#### Load all Embeddings

In [None]:
def load_embeddings(file_path, ids_order=ORIGINAL_IDS):
    """
    Load one embeddings file and return its entries in the order of ``ids_order``.

    Parameters
    ----------
    file_path : str
        Path to a numpy file that holds a single pickled dict-like object
        (id -> embedding). NOTE(security): ``allow_pickle=True`` can execute
        arbitrary code; only load trusted files.
    ids_order : list, optional
        Ids to keep, in the desired order. Ids missing from the file are
        skipped. The default is ORIGINAL_IDS as defined when this function
        was created.

    Returns
    -------
    dict or None
        The selected id -> embedding pairs, or None if anything went wrong
        (the error is printed, not raised).
    """
    try:
        stored = np.load(file_path, allow_pickle=True).item()
        selected = {}
        for claim_id in ids_order:
            if claim_id in stored:
                selected[claim_id] = stored[claim_id]
        return selected
    except Exception as e:
        print(f"Failed to load embeddings from {file_path} with error {e}")
        return None


def load_all_embeddings(folder_path):
    """
    Load and merge all ``.npy`` embedding files of a directory.

    Files are processed in ascending order of the first number in their file
    name; when two files contain the same id, the file processed later wins.

    Parameters
    ----------
    folder_path : str
        Directory that contains the embedding files.

    Returns
    -------
    dict
        Merged id -> embedding mapping (empty if the directory holds no
        ``.npy`` file).
    """

    def _file_number(file_name):
        # First run of digits in the name; names without any digit sort last
        # instead of raising AttributeError on `None.group()`.
        match = re.search(r"\d+", file_name)
        return int(match.group()) if match else float("inf")

    # Select the embedding files BEFORE sorting, so unrelated files in the
    # folder (e.g. a README without digits in its name) cannot break the sort.
    npy_files = [
        name
        for name in os.listdir(folder_path)
        if not name.startswith(".") and name.endswith(".npy")
    ]

    embeddings_dict = {}
    for file_name in sorted(npy_files, key=_file_number):
        loaded = load_embeddings(os.path.join(folder_path, file_name))
        if loaded is None:
            # load_embeddings already printed the error; dict.update(None)
            # would raise a TypeError that hides the real cause.
            continue
        embeddings_dict.update(loaded)

    return embeddings_dict


# Load all Embeddings
embeddings = load_all_embeddings("../Data/embeddings/")
# Work on a deep copy, so that reshaping variance_calc leaves `clusters` untouched
variance_calc = deepcopy(clusters)


# Calculate the variance of a cluster
def intra_cluster_variance(cluster):
    """
    Total variance of the embeddings of one cluster.

    The variance is taken per embedding dimension across the claims of the
    cluster and then summed over all dimensions. Embeddings are looked up in
    the module-level ``embeddings`` dict.
    """
    return np.var([embeddings[claim] for claim in cluster], axis=0).sum()


# Per threshold: keep clusters with at least 2 claims and give each of them
# the id "<threshold>_<position>"
for key in variance_calc:
    multi_member = [x for x in variance_calc[key] if len(x) > 1]
    variance_calc[key] = {f"{key}_{i}": x for i, x in enumerate(multi_member)}

# Per threshold: variance and number of claims of every remaining cluster
variance_stats = {}
for threshold in tqdm(thresholds, desc="Calculating Variance"):
    members_by_id = variance_calc[threshold]
    variance_stats[threshold] = {
        "intra_variances": {
            k: intra_cluster_variance(v) for k, v in members_by_id.items()
        },
        "sizes": {k: len(v) for k, v in members_by_id.items()},
    }


def centroid(cluster):
    """
    Mean embedding of one cluster.

    The embeddings of all claims in ``cluster`` are looked up in the
    module-level ``embeddings`` dict and averaged per dimension.
    """
    return np.mean([embeddings[claim] for claim in cluster], axis=0)


def sample_combinations(cluster_tuples, n=10000):
    """
    Build pairs of values that belong to two different keys.

    If the number of unordered key pairs is at most `n`, every pair is
    returned. Otherwise `n` pairs are drawn at random with the stdlib
    `random` module (seed it with `random.seed` for reproducible results);
    the same pair may be drawn more than once.

    Parameters
    ----------
    cluster_tuples : dict
        Mapping from an id to a value (the caller passes cluster centroids).
    n : int
        Maximum number of pairs to return.

    Returns
    -------
    list of tuple
        `(value_a, value_b)` tuples. Both branches return this same shape,
        and the two values always come from two different keys.
    """
    keys = list(cluster_tuples.keys())
    # Number of unordered pairs of k keys: k * (k - 1) / 2
    n_pairs = len(keys) * (len(keys) - 1) // 2

    if n_pairs <= n:
        # Few enough pairs: enumerate all of them
        return [
            (cluster_tuples[first], cluster_tuples[second])
            for first, second in combinations(keys, 2)
        ]

    combs = []
    for _ in range(n):
        # random.sample draws two distinct positions, so first != second
        first, second = random.sample(keys, 2)
        combs.append((cluster_tuples[first], cluster_tuples[second]))
    return combs


# Seed both RNGs: `sample_combinations` draws with the stdlib `random` module,
# which is not affected by `np.random.seed`
np.random.seed(42)
random.seed(42)

# Create a dict with the centroids of each cluster
centroids = {}
for threshold in tqdm(thresholds, desc="Creating centroids"):
    # Clusters with only 1 claim are left out
    centroids[threshold] = {
        key: centroid(members)
        for key, members in variance_calc[threshold].items()
        if len(members) > 1
    }

# Create a dict with (sampled) pairs of centroids of different clusters
centroid_tuples = {}
for threshold in tqdm(thresholds, desc="Creating centroid tuples"):
    centroid_tuples[threshold] = sample_combinations(centroids[threshold], n=1_000)

# Cosine similarity between the two centroids of every pair.
# NOTE(review): despite the name `centroid_distances`, the stored values are
# cosine *similarities* (higher = closer), not distances - confirm that the
# downstream "inter cluster distance" plot interprets them as intended.
centroid_distances = {}
for threshold in tqdm(thresholds, desc="Calculating centroid distances"):
    centroid_distances[threshold] = [
        cosine_similarity(
            np.array(pair[0]).reshape(1, -1), np.array(pair[1]).reshape(1, -1)
        )[0][0]
        for pair in centroid_tuples[threshold]
    ]

#### Plot Intra-Cluster Variance and Inter-Cluster Distance

In [None]:
# Mean intra-cluster variance per threshold
avg_intra_variances = []
for threshold in thresholds:
    per_cluster_variances = list(variance_stats[threshold]["intra_variances"].values())
    avg_intra_variances.append(np.mean(per_cluster_variances))

# Mean absolute value of the centroid-pair scores per threshold
avg_inter_variances = []
for threshold in thresholds:
    avg_inter_variances.append(np.mean(np.abs(centroid_distances[threshold])))

# Threshold with the smallest intra value and threshold with the largest inter value
min_avg_intra_variance = min(avg_intra_variances)
min_threshold_intra = thresholds[avg_intra_variances.index(min_avg_intra_variance)]

max_avg_inter_variance = max(avg_inter_variances)
max_threshold_inter = thresholds[avg_inter_variances.index(max_avg_inter_variance)]

# Two stacked bar charts (A: intra, B: inter); the selected threshold is drawn in red
fig, ax = plt.subplots(2, 1, figsize=(12, 8))

panels = (
    # (axes, bar heights, highlighted threshold, x label, y label, panel letter)
    (ax[0], avg_intra_variances, min_threshold_intra, "", "Average Intra Cluster Variance", "A"),
    (ax[1], avg_inter_variances, max_threshold_inter, "Thresholds", "Average Inter Cluster Distance", "B"),
)

for axis, heights, highlighted, x_label, y_label, letter in panels:
    for threshold, height in zip(thresholds, heights):
        axis.bar(
            x=threshold,
            height=height,
            width=0.02,
            color="red" if threshold == highlighted else "grey",
            edgecolor="black",
        )

    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label, fontsize=14)

    # Bold panel letter slightly outside the top-left corner of the axes
    axis.text(
        -0.075,
        1.07,
        letter,
        transform=axis.transAxes,
        fontsize=18,
        fontweight="bold",
        va="top",
        ha="right",
    )

    # One tick per threshold
    axis.set_xticks(thresholds)

for target in ("../Plots/avg_intra_inter_variance.png",):
    plt.savefig(target, dpi=300, bbox_inches="tight")
plt.savefig("../Plots/avg_intra_inter_variance.pdf", bbox_inches="tight")

## Similarity over Time

In [None]:
# Initialize dictionaries
# threshold -> {new cluster id: [claims that form that connected component]}
to_alter = {k: {} for k in thresholds}
# threshold -> [claims that end up without any connection inside their cluster]
to_singleton = {k: [] for k in thresholds}

# Check connectedness: a cluster whose claims are not all connected through
# edges of that threshold is split into its connected components
for threshold in thresholds:
    cluster_dict_thresh = (
        df.groupby(f"cluster_{threshold}")["claim_minimal"].apply(list).to_dict()
    )
    max_cluster = max(cluster_dict_thresh.keys())

    G = nx.Graph(edge_lists[threshold])

    for cluster, nodes in cluster_dict_thresh.items():
        # Cluster id 0 holds the singletons
        if cluster == 0:
            continue

        # Subgraph of this cluster (claims that are not in G are not part of it)
        subG = G.subgraph(nodes)
        connected_components = list(nx.connected_components(subG))

        if len(connected_components) == 0:
            # None of the claims appears in the graph: all become singletons
            to_singleton[threshold].extend(nodes)
            continue
        if len(connected_components) == 1:
            # Cluster is connected: nothing to change
            continue

        for component in connected_components:
            if len(component) == 1:
                to_singleton[threshold].extend(component)
            else:
                # Every larger component gets a fresh cluster id
                max_cluster += 1
                to_alter[threshold][max_cluster] = list(component)

# Update df (vectorised instead of a row-wise apply)
claim_ids = df["claim_minimal"]
for threshold in thresholds:
    cluster_col = f"cluster_{threshold}"
    alter_map = {old: new for new, olds in to_alter[threshold].items() for old in olds}
    singleton_set = set(to_singleton[threshold])

    moved = claim_ids.isin(set(alter_map))
    if moved.any():
        df.loc[moved, cluster_col] = claim_ids[moved].map(alter_map).to_numpy()

    # Singletons take precedence over a re-assignment, as before
    became_singleton = claim_ids.isin(singleton_set)
    if became_singleton.any():
        df.loc[became_singleton, cluster_col] = 0

# Drop edges with an endpoint that is not (or no longer) part of df
present_claims = set(df["claim_minimal"])
for threshold in tqdm(edge_lists.keys()):
    # Not used in this cell; kept because later cells may rely on it - TODO confirm
    id_2_cluster = df.set_index("claim_minimal")[f"cluster_{threshold}"].to_dict()
    edge_lists[threshold] = [
        (x[0], x[1])
        for x in edge_lists[threshold]
        if x[0] in present_claims and x[1] in present_claims
    ]

In [None]:
plt.figure(figsize=(10, 6))

all_diffs = {}
present_claims = set(list(df.claim_minimal))

# Adjusted list of thresholds for the plot
adjusted_thresholds = [0.8, 0.825, 0.85, 0.875, 0.9]

# Days for which the cumulative share of edges is evaluated. They are used as
# the index of `perc`, so that `perc[x]` and the x position of the curve both
# refer to "time difference <= x days".
day_range = range(1, 30)

# Vertical shift of the end-of-line label per threshold (avoids overlaps)
label_offsets = {0.9: 0.0, 0.8: -0.02}
default_label_offset = -0.03

for threshold in tqdm(
    [k for k in edge_lists.keys() if k in adjusted_thresholds],
    desc="Calculating time differences",
):
    # Remove edges that contain nodes outside of the observation period
    ana_cl = [
        x
        for x in edge_lists[threshold]
        if x[0] in present_claims and x[1] in present_claims
    ]
    # Publication dates of both endpoints of every edge
    ana_cl_dates = [[dates[x] for x in edges] for edges in ana_cl]

    # Absolute time difference of every edge in whole days
    diffs = [abs(x[0] - x[1]).days for x in ana_cl_dates]
    all_diffs[threshold] = diffs

    # Share of edges with a time difference of at most x days
    diff_series = pd.Series(diffs)
    perc = pd.Series([np.mean(diff_series <= x) for x in day_range], index=day_range)
    (line,) = plt.plot(perc, label=f"{threshold}")
    line_color = line.get_color()

    # Label near the end of each line
    plt.text(
        perc.index[-1] - 2,
        perc.iloc[-1] + label_offsets.get(threshold, default_label_offset),
        f"Threshold\n{threshold}",
        color=line_color,
        fontsize=12,
        verticalalignment="center",
        bbox=dict(boxstyle="round", fc="w", alpha=0.5, ec="k"),
    )

    # Annotate the share reached after one and after three weeks
    for x in [7, 21]:
        plt.annotate(
            f"{perc.loc[x]*100:.2f}%",
            (x, perc.loc[x]),
            textcoords="offset points",
            xytext=(0, 10),
            ha="center",
            color=line_color,
            bbox=dict(boxstyle="round", fc="w", alpha=0.5, ec="k"),
        )

# Vertical lines and texts
for x in [7, 14, 21, 28]:
    plt.axvline(x=x, color="grey", linestyle="--", linewidth=0.5)
    plt.text(
        x,
        1.01,
        f"{x} Days",
        transform=plt.gca().get_xaxis_transform(),
        ha="center",
        va="bottom",
        fontsize=8,
        color="grey",
    )

# Show the y-axis as percentages. A formatter (instead of fixed tick labels)
# stays correct if the tick positions change afterwards.
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))

# Improve x-axis
plt.xticks(range(0, 30, 1))

# Labels and title
plt.xlabel("Days", fontsize=12)
plt.ylabel("Percentage of Edges with Time Difference ≤ x", fontsize=12)
plt.savefig("../Plots/percentage_edges_time_diff.png", dpi=300, bbox_inches="tight")
plt.savefig("../Plots/percentage_edges_time_diff.pdf", bbox_inches="tight")

## Similarities over Time

In [None]:
clustered_time = {}

# Publication date of every claim (the same for all thresholds)
dates_cluster = df.set_index("claim_minimal")["datePublished"].to_dict()

similarity_columns = ["claim_1", "claim_2", "cosine_similarity", "date_1", "date_2"]

for threshhold in [0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95]:
    cluster_col = f"cluster_{threshhold}"
    # Get the number of claims in each cluster
    # (named `cluster_sizes` so that the global `clusters` is not overwritten)
    cluster_sizes = df[cluster_col].value_counts()
    # We remove clusters with only 1 or 2 claims. 2 is trivial, as similarity is necessarily equal to the threshold.
    clusters_to_incl = cluster_sizes[cluster_sizes > 2].index.to_list()
    # Filter df to only include claims in clusters with more than 2 claims
    small = df[(df[cluster_col].isin(clusters_to_incl)) & (df[cluster_col] != 0)]

    all_simis = []
    for cluster in tqdm(
        small[cluster_col].unique(),
        desc=f"Calculating similarities for threshold {threshhold}",
    ):
        if cluster == 0:
            # Skip singletons
            continue

        # Get all claims in the cluster
        claims_in_cluster = small.loc[small[cluster_col] == cluster, "claim_minimal"]

        # Very large clusters are down-sampled (seeded, so reruns are reproducible)
        if claims_in_cluster.shape[0] > 1000:
            claims_in_cluster = claims_in_cluster.sample(1000, random_state=42)
        claims_in_cluster = claims_in_cluster.to_list()

        # Load embeddings for claims in cluster
        embeddings_cluster = [embeddings[x] for x in claims_in_cluster]
        # Row/column position in the similarity matrix -> claim id. Built from
        # the very list the embeddings were loaded with, so the order always
        # matches (also for down-sampled clusters).
        index_to_claim = dict(enumerate(claims_in_cluster))

        # Pairwise cosine similarity, flattened to one row per (claim_1, claim_2)
        cosine_similarity_df = (
            pd.DataFrame(cosine_similarity(embeddings_cluster)).stack().reset_index()
        )
        cosine_similarity_df.columns = ["claim_1", "claim_2", "cosine_similarity"]
        cosine_similarity_df["claim_1"] = cosine_similarity_df["claim_1"].map(
            index_to_claim
        )
        cosine_similarity_df["claim_2"] = cosine_similarity_df["claim_2"].map(
            index_to_claim
        )
        # Join with the publication dates
        cosine_similarity_df["date_1"] = cosine_similarity_df["claim_1"].map(
            dates_cluster
        )
        cosine_similarity_df["date_2"] = cosine_similarity_df["claim_2"].map(
            dates_cluster
        )
        # Remove all directly connected claims. Any claim above the threshold is directly connected to itself.
        all_simis.append(
            cosine_similarity_df[cosine_similarity_df["cosine_similarity"] < threshhold]
        )

    if not all_simis:
        # No cluster with more than 2 claims at this threshold
        clustered_time[threshhold] = pd.DataFrame(
            columns=similarity_columns + ["time_difference"]
        )
        continue

    # Concatenate all dataframes
    time_df = pd.concat(all_simis)
    # The similarity matrix is symmetric: keep every pair only once, oriented
    # so that claim_2 was published after claim_1
    time_df = time_df[time_df["date_2"] > time_df["date_1"]].copy()
    # Calculate the time difference in whole days
    time_df["time_difference"] = (time_df["date_2"] - time_df["date_1"]).dt.days
    clustered_time[threshhold] = time_df

In [None]:
plt.figure(figsize=(12, 8))

for thresh, pl_df in clustered_time.items():
    # Adds a "day" column to the frame stored in clustered_time
    pl_df["day"] = pl_df["time_difference"] // 1
    # Restrict to time differences below 365 days
    first_year = pl_df[pl_df["day"] < 365]
    # Keep only days with more than 10 observations
    first_year = first_year.groupby("day").filter(lambda group: len(group) > 10)
    # Mean and variance of the similarity per day
    similarity_by_day = first_year.groupby("day")["cosine_similarity"]
    pl_df_mean = similarity_by_day.mean().reset_index()
    pl_df_var = similarity_by_day.var().reset_index()

    # Thresholds above 0.875 are not drawn
    if thresh > 0.875:
        print(f"Skipping {thresh}")
        continue

    numeric = {"day": float, "cosine_similarity": float}
    pl_df_mean = pl_df_mean.dropna().astype(numeric)
    pl_df_var = pl_df_var.dropna().astype(numeric)

    # Mean line
    (mean_line,) = plt.plot(
        pl_df_mean["day"], pl_df_mean["cosine_similarity"], label=thresh
    )

    # Band of mean -/+ variance around the line
    days = pl_df_mean["day"].to_numpy()
    lower = (pl_df_mean["cosine_similarity"] - pl_df_var["cosine_similarity"]).to_numpy()
    upper = (pl_df_mean["cosine_similarity"] + pl_df_var["cosine_similarity"]).to_numpy()
    plt.fill_between(days, lower, upper, alpha=0.2)

    # Label at the end of the line; the 0.75 label is shifted down slightly
    last_day = days[-1]
    last_cosine_similarity = pl_df_mean["cosine_similarity"].iloc[-1]
    if thresh == 0.75:
        label_y, label_text = last_cosine_similarity - 0.02, f"{thresh}"
    else:
        label_y, label_text = last_cosine_similarity, f" {thresh}"
    plt.text(
        last_day,
        label_y,
        label_text,
        verticalalignment="center",
        fontsize=14,
        color=mean_line.get_color(),
        bbox=dict(boxstyle="round", fc="w", alpha=0.5, ec="k"),
    )

plt.xlim(0, 365)
plt.xlabel("Days", fontsize=14)
plt.ylabel("Average similarity", fontsize=14)
plt.savefig(
    "../Plots/avg_cosine_similarity_time_diff.png", dpi=300, bbox_inches="tight"
)
plt.savefig("../Plots/avg_cosine_similarity_time_diff.pdf", bbox_inches="tight")

In [None]:
def _similarity_stats_by_time_difference(time_df):
    """
    Mean and variance of the cosine similarity per time difference.

    Returns a float DataFrame with the columns "time_difference", "mean" and
    "var", sorted by time difference. "var" is NaN for time differences with
    a single observation.
    """
    return (
        time_df.groupby("time_difference")["cosine_similarity"]
        .agg(["mean", "var"])
        .dropna(subset=["mean"])
        .reset_index()
        .astype(float)
    )


def _plot_mean_with_variance_band(ax, stats, label):
    """Draw the mean line and a mean -/+ variance band; return the line."""
    (line,) = ax.plot(stats["time_difference"], stats["mean"], label=label)
    ax.fill_between(
        stats["time_difference"].to_numpy(),
        (stats["mean"] - stats["var"]).to_numpy(),
        (stats["mean"] + stats["var"]).to_numpy(),
        alpha=0.2,
    )
    return line


fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

# Panel A: threshold 0.875
_plot_mean_with_variance_band(
    ax1, _similarity_stats_by_time_difference(clustered_time[0.875]), label="0.875"
)
ax1.set_xlim(0, 30)
ax1.set_ylim(0.73, 0.82)

# Manual vertical shift of the line labels in panel B (avoids overlaps)
y_change = {0.75: -0.04, 0.775: 0, 0.8: 0.01, 0.825: 0, 0.85: 0.03}
# The labels sit at the right edge of the visible x-range
last_day = 30

# Panel B: other thresholds
for threshold in clustered_time.keys():
    if threshold >= 0.875:
        continue

    stats = _similarity_stats_by_time_difference(clustered_time[threshold])
    if stats.empty:
        # Nothing to draw for this threshold
        continue

    line = _plot_mean_with_variance_band(ax2, stats, label=threshold)

    # Mean similarity at day 30, looked up by time difference (interpolated if
    # that exact day is missing) rather than by row position
    last_cosine_similarity = np.interp(
        last_day, stats["time_difference"].to_numpy(), stats["mean"].to_numpy()
    )

    ax2.text(
        last_day,
        last_cosine_similarity + y_change.get(threshold, 0),
        f" {threshold}",
        verticalalignment="center",
        fontsize=14,
        color=line.get_color(),
        bbox=dict(boxstyle="round", fc="w", alpha=0.5, ec="k"),
    )


ax2.set_xlim(0, 30)
ax2.set_ylim(0.35, 0.75)
ax1.set_xlabel("Timedifference in Days", fontsize=14)
ax2.set_xlabel("Timedifference in Days", fontsize=14)
ax1.set_ylabel("Cosine Similarity (Thresh. 0.875)", fontsize=14)
ax2.set_ylabel("Cosine Similarity (Other Thresh.)", fontsize=14)
# Leave a small margin at the bottom and the top of the figure
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Add Labels to the Panels (A / B) - bold
for axis, letter in ((ax1, "A"), (ax2, "B")):
    axis.text(
        -0.075,
        1.07,
        letter,
        transform=axis.transAxes,
        fontsize=18,
        fontweight="bold",
        va="top",
        ha="right",
    )

# Grid on the lower panel.
# NOTE(review): the original `plt.grid(True)` only affected the current axes
# (ax2); confirm whether panel A was meant to get a grid as well.
ax2.grid(True)

plt.savefig("../Plots/cosine_similarity_time.png", dpi=300, bbox_inches="tight")
plt.savefig("../Plots/cosine_similarity_time.pdf", bbox_inches="tight")

In [None]:
# Set of languages per cluster (threshold 0.875), reduced to the clusters
# that contain more than one language
multilingual_clusters_0875 = (
    df.groupby("cluster_0.875")["language"]
    .apply(set)
    .reset_index()
    .assign(num_lang=lambda frame: frame["language"].apply(len))
    .loc[lambda frame: frame["num_lang"] > 1]
)
# Flag every claim whose cluster id is among those clusters
multilingual_ids = multilingual_clusters_0875["cluster_0.875"]
df["is_multilingual"] = df["cluster_0.875"].isin(multilingual_ids)


def get_range(x):
    """
    Spread of the non-missing values of one group.

    input: the values of one group (anything with `dropna`, `max` and `min`,
           e.g. a pandas Series).
    output: max minus min of the values after dropping NAs; np.nan if that
            computation raises.
    """
    try:
        observed = x.dropna()
        spread = observed.max() - observed.min()
    except Exception:
        # Best effort: anything that cannot be reduced counts as "no range"
        return np.nan
    return spread


# Parse the claim dates as UTC timestamps; unparseable values become NaT
df["claim_date"] = pd.to_datetime(df["claim_date"], utc=True, errors="coerce")

# Per cluster (threshold 0.875): span between the earliest and the latest claim
# date in days; missing spans are passed through unchanged. The row of
# cluster id 0 is removed.
a = (
    df.groupby("cluster_0.875")["claim_date"]
    .apply(get_range)
    .apply(lambda span: span if pd.isnull(span) else float(span.days))
    .drop(0)
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp
import matplotlib.gridspec as gridspec

legend_fontsize = 20
title_fontsize = 24
label_fontsize = 20
tick_fontsize = 16

max_days = 28
# `timedifference` is divided by this to obtain whole days
SECONDS_PER_DAY = 24 * 60 * 60


def _cumulative_percentage(values, max_days):
    """
    Cumulative share (in percent) of `values` per day.

    The shares are computed over all non-missing values; afterwards only
    finite days <= `max_days` are kept. Returns two float arrays
    `(days, percentages)` sorted by day.
    """
    cumulative = values.value_counts(normalize=True).sort_index().cumsum() * 100
    days = np.array(cumulative.index.to_numpy(), dtype=np.float64)
    percentages = np.array(cumulative.to_numpy(), dtype=np.float64)
    keep = np.isfinite(days) & (days <= max_days)
    return days[keep], percentages[keep]


def _plot_group_comparison(ax, groups, labels, colors, max_days):
    """
    Draw the cumulative curves of several groups and shade the gap between
    consecutive curves. Uses the font size constants defined in this cell.
    """
    curves = []
    for values, label, color in zip(groups, labels, colors):
        days, percentages = _cumulative_percentage(values, max_days)
        ax.plot(days, percentages, linewidth=2, label=label, color=color)
        ax.scatter(days, percentages, marker="o", s=40, color=color)
        curves.append((days, percentages))

    for (days1, pct1), (days2, pct2) in zip(curves, curves[1:]):
        if len(days1) == 0 or len(days2) == 0:
            continue
        # The two groups need not have observations on the same days:
        # evaluate both (linearly interpolated, like the drawn lines) on the
        # union of their days so the shaded area lies between the curves.
        common_days = np.union1d(days1, days2)
        ax.fill_between(
            common_days,
            np.interp(common_days, days1, pct1),
            np.interp(common_days, days2, pct2),
            alpha=0.2,
            color="red",
        )

    ax.set_xlabel("Number of Days Since Claim Emergence", fontsize=label_fontsize)
    ax.set_ylabel("Cumulative Percentage of Claims", fontsize=label_fontsize)

    xticks = np.arange(0, max_days + 1, 7)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticks, fontsize=tick_fontsize)
    yticks = np.arange(30, 101, 10)
    ax.set_yticks(yticks)
    ax.set_yticklabels([f"{tick}%" for tick in yticks], fontsize=tick_fontsize)
    ax.grid(axis="y", linestyle="--", alpha=0.7)
    ax.legend(fontsize=legend_fontsize)


def _add_panel_letter(ax, letter, x, y):
    """Bold panel letter positioned in axes coordinates."""
    ax.text(
        x,
        y,
        letter,
        transform=ax.transAxes,
        fontsize=26,
        fontweight="bold",
        va="top",
        ha="right",
    )


# Time difference in whole days
df["timedifference_days"] = df["timedifference"] // SECONDS_PER_DAY

has_timedifference = ~(df["timedifference"].isna())
# Cluster id 0 holds the singletons
in_cluster = df["cluster_0.875"] > 0.1

multilingual = df.loc[
    (df["is_multilingual"]) & in_cluster & has_timedifference, "timedifference_days"
]
monolingual = df.loc[
    (~df["is_multilingual"]) & in_cluster & has_timedifference, "timedifference_days"
]
singleton = df.loc[
    (df["is_singleton_0.875"]) & has_timedifference, "timedifference_days"
]
non_signleton = df.loc[
    (~df["is_singleton_0.875"]) & has_timedifference, "timedifference_days"
]

data_singleton = [singleton, non_signleton]
labels_singleton = ["Singleton", "Non-Singleton"]
data_language = [monolingual, multilingual]
labels_language = ["Monolingual", "Multilingual"]

colors_singleton = ["#1f77b4", "#ff7f0e"]
colors_language = ["#ff7f0e", "#1f77b4"]

fig = plt.figure(figsize=(16, 12))
# The thin middle row only adds space between the upper and the lower panels
gs = gridspec.GridSpec(3, 2, height_ratios=[0.66, 0.005, 1], width_ratios=[1, 1])

# Panel A: all claims
ax0 = fig.add_subplot(gs[0, :])
days, percentages = _cumulative_percentage(df["timedifference_days"], max_days)
ax0.plot(days, percentages, linewidth=2, color="#1f77b4")
ax0.scatter(days, percentages, color="#1f77b4", marker="o", s=40)
ax0.fill_between(days, percentages, alpha=0.2, color="#1f77b4")
ax0.set_xlabel("Number of Days Since Claim Emergence", fontsize=label_fontsize)
ax0.set_ylabel("Cumulative Percentage of Claims", fontsize=label_fontsize)

xticks = np.arange(0, max_days + 1, max(1, max_days // 10))
ax0.set_xticks(xticks)
ax0.set_xticklabels(xticks, fontsize=tick_fontsize)
yticks = np.arange(30, 101, 10)
ax0.set_yticks(yticks)
ax0.set_yticklabels([f"{tick}%" for tick in yticks], fontsize=label_fontsize)
ax0.grid(axis="y", linestyle="--", alpha=0.7)
ax0.set_ylim(30, 100)

# Panel B: singleton vs. non-singleton claims
ax1 = fig.add_subplot(gs[2, 0])
_plot_group_comparison(ax1, data_singleton, labels_singleton, colors_singleton, max_days)

# Panel C: monolingual vs. multilingual clusters
ax2 = fig.add_subplot(gs[2, 1])
_plot_group_comparison(ax2, data_language, labels_language, colors_language, max_days)

# Add Labels to the Panels (A / B / C) - bold
_add_panel_letter(ax0, "A", -0.05, 1.175)
_add_panel_letter(ax1, "B", -0.1, 1.1)
_add_panel_letter(ax2, "C", -0.1, 1.1)

plt.tight_layout()
plt.savefig("../Plots/cumulative_percentage_claims.png", dpi=300, bbox_inches="tight")
plt.savefig("../Plots/cumulative_percentage_claims.pdf", bbox_inches="tight")

## Paths

In [None]:
def find_most_dissimilar(cluster, threshhold):
    """
    Return the two claims of a cluster whose embeddings are least similar.

    Parameters
    ----------
    cluster : int
        The cluster id.
    threshhold : float
        The cosine similarity threshhold; selects the
        ``cluster_{threshhold}`` column of the global ``df``.

    Returns
    -------
    tuple
        Two ``claim_minimal`` values from the cluster: the pair with the
        lowest pairwise cosine similarity of their ``embeddings`` entries.
    """
    # Collect the claims assigned to this cluster at the given threshold
    in_cluster = df[f"cluster_{threshhold}"] == cluster
    members = df.loc[in_cluster, "claim_minimal"].to_list()

    # Pairwise cosine similarity between all member embeddings
    member_vectors = [embeddings[claim] for claim in members]
    similarity = cosine_similarity(member_vectors)

    # A claim compared with itself must never be picked as the minimum
    np.fill_diagonal(similarity, np.inf)

    # Convert the flat position of the smallest entry back to (row, column)
    flat_position = similarity.argmin()
    row, col = np.unravel_index(flat_position, similarity.shape)
    return members[row], members[col]

In [None]:
# paths[threshold][cluster] -> shortest path between the two most dissimilar
# claims of that cluster, plus its endpoints and their cosine similarity.
paths = {}

for threshhold in thresholds:
    # Get the edge list for the threshold
    edge_list = edge_lists[threshhold]

    # Create a graph
    G = nx.Graph(edge_list)

    # Count the claims per cluster once instead of filtering df for every cluster
    cluster_sizes = df[f"cluster_{threshhold}"].value_counts().to_dict()

    # Create a dict with all paths
    paths[threshhold] = {}
    for cluster in tqdm(
        df[f"cluster_{threshhold}"].unique(),
        desc=f"Calculating paths for threshold {threshhold}",
    ):
        # Skip singletons
        if cluster == 0:
            continue

        # Only keep clusters with more than two and at most 50 claims.
        # The two comparisons must be joined with `or`: `|` binds tighter than
        # `<=` / `>`, so `n <= 2 | n > 50` is evaluated as the chained
        # comparison `n <= (2 | n) > 50`, which never rejects small clusters.
        n_claims = cluster_sizes.get(cluster, 0)
        if n_claims <= 2 or n_claims > 50:
            continue

        # Find the most dissimilar claims in the cluster
        from_node, to_node = find_most_dissimilar(cluster, threshhold)

        # Calculate the path between the two claims; clusters whose endpoints
        # are missing from the graph or are not connected are left out.
        try:
            path = nx.shortest_path(G, source=from_node, target=to_node)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            continue

        # Add the path to the dict
        paths[threshhold][cluster] = {
            "path": path,
            "origin": from_node,
            "destination": to_node,
            "cosine_similarity": cosine_similarity(
                [embeddings[from_node], embeddings[to_node]]
            )[0][1],
        }

In [None]:
# Language of every claim, keyed by claim_minimal
langs = df.set_index("claim_minimal").language.to_dict()

# Enrich every stored path in place with language information
for threshhold, clusters_at_threshold in paths.items():
    for cluster, path_record in clusters_at_threshold.items():
        # Languages of the claims along the path, in path order
        languages_on_path = [langs[claim] for claim in path_record["path"]]

        path_record["origin_lang"] = langs[path_record["origin"]]
        path_record["destination_lang"] = langs[path_record["destination"]]
        path_record["path_lang"] = languages_on_path
        # Length counts the claims (nodes) on the path
        path_record["length"] = len(path_record["path"])
        # A switch is any step whose claim language differs from the previous one
        path_record["number_of_language_switches"] = sum(
            1
            for previous_lang, current_lang in zip(
                languages_on_path, languages_on_path[1:]
            )
            if current_lang != previous_lang
        )
        # Number of unique languages
        path_record["number_of_unique_languages"] = len(set(languages_on_path))

In [None]:
# Per-threshold vertical nudges (in data units) for the inline curve labels;
# thresholds that are not listed get no nudge.
y_change = {0.825: 0.06, 0.8: 0.015, 0.775: -0.015, 0.75: -0.06}

# Two subplots in two rows
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
for threshold in paths.keys():
    # A threshold without any stored path has no "length" column to group by
    if not paths[threshold]:
        continue
    # Mean / count / std of the endpoint cosine similarity per path length
    stats = (
        pd.DataFrame(paths[threshold])
        .T.groupby("length")["cosine_similarity"]
        .agg(["mean", "count", "std"])
    )
    # Keep path lengths up to 8 that are backed by more than 10 clusters
    stats = stats[(stats.index <= 8) & (stats["count"] > 10)].copy()
    # Nothing left to draw; taking the last row below would raise IndexError
    if stats.empty:
        continue
    # The "std" column is turned into the standard error of the mean
    stats["std"] = (stats["std"] / np.sqrt(stats["count"])).astype(float)
    stats["mean"] = stats["mean"].astype(float)
    (line,) = ax1.plot(
        stats.index, stats["mean"], "o-", markersize=2, label=f"Threshold {threshold}"
    )
    # Label the curve at its last point, in the colour of the curve
    end = stats.index[-1]
    height = stats["mean"].iloc[-1]
    ax1.text(
        end,
        height + y_change.get(threshold, 0),
        f"Threshold {threshold}",
        fontsize=10,
        verticalalignment="center",
        bbox=dict(boxstyle="round", fc="w", alpha=0.5, ec="k"),
        color=line.get_color(),
    )

# Axis configuration does not depend on the threshold, so it is applied once
ax1.set_xlim(-0.5, 8.5)
ax1.set_ylim(0.3, 1)
ax1.grid(True, which="both", axis="both", color="grey", linestyle=":", linewidth=0.5)
# ax1.set_xticklabels([])
ax1.set_ylabel("Cosine Similarity", fontsize=14)
ax1.set_xlabel("Length of Shortest Path", fontsize=14)


# Per-threshold vertical nudges (in data units) for the inline curve labels of
# the second panel; thresholds that are not listed get no nudge.
y_change_2 = {0.875: 0.03, 0.85: 0.0, 0.825: -0.03, 0.775: -0.06, 0.75: -0.03}
for threshold in paths.keys():
    # A threshold without any stored path has no column to group by
    if not paths[threshold]:
        continue
    # Mean / count / std of the endpoint cosine similarity per number of
    # language switches along the path
    stats = (
        pd.DataFrame(paths[threshold])
        .T.groupby("number_of_language_switches")["cosine_similarity"]
        .agg(["mean", "count", "std"])
    )
    # Keep only groups backed by more than 10 clusters
    stats = stats[stats["count"] > 10].copy()
    # Nothing left to draw; taking the last row below would raise IndexError
    if stats.empty:
        continue
    # The "std" column is turned into the standard error of the mean
    stats["std"] = (stats["std"] / np.sqrt(stats["count"])).astype(float)
    stats["mean"] = stats["mean"].astype(float)
    (line,) = ax2.plot(
        stats.index, stats["mean"], "o-", markersize=2, label=f"Threshold {threshold}"
    )
    # Label the curve at its last point, in the colour of the curve
    end = stats.index[-1]
    height = stats["mean"].iloc[-1]
    ax2.text(
        end,
        height + y_change_2.get(threshold, 0),
        f"Threshold {threshold}",
        fontsize=10,
        verticalalignment="center",
        bbox=dict(boxstyle="round", fc="w", alpha=0.5, ec="k"),
        color=line.get_color(),
    )

# Axis configuration does not depend on the threshold, so it is applied once
ax2.set_xlim(-0.5, 8.5)
ax2.set_ylim(0.3, 1)
ax2.set_ylabel("Cosine Similarity", fontsize=14)
ax2.set_xlabel("Number of Language Switches", fontsize=14)

# ax2.legend()
# Set the vertical gap between the two stacked panels
plt.subplots_adjust(hspace=0.2)

# Bold panel letters (A / B), positioned in axes-fraction coordinates just
# outside the top-left corner of each panel
for panel_ax, panel_letter in ((ax1, "A"), (ax2, "B")):
    panel_ax.text(
        -0.075,
        1.07,
        panel_letter,
        transform=panel_ax.transAxes,
        fontsize=18,
        fontweight="bold",
        va="top",
        ha="right",
    )

# Write the figure twice: a 300 dpi PNG and a PDF
for target, extra_options in (
    ("../Plots/average_similarity_by_lengthandlanguages.png", {"dpi": 300}),
    ("../Plots/average_similarity_by_lengthandlanguages.pdf", {}),
):
    plt.savefig(target, bbox_inches="tight", **extra_options)