In [2]:
import re
import torch
import pickle
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn
from scipy.special import softmax
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

In [3]:
def _load_generated(csv_path, dataset_name):
    """Read one generated-disinformation CSV and tag it for merging.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    dataset_name : str
        Value written to the ``DATASET`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with a ``DATASET`` column set to ``dataset_name`` and
        a ``claim`` column copied from ``generated_label``.
    """
    frame = pd.read_csv(csv_path)
    frame["DATASET"] = dataset_name
    frame["claim"] = frame["generated_label"]
    return frame


cards_data = pd.read_csv(
    "../datasets/augmented/9834838408490912248/cards_augmented_0_V1.csv")

# NOTE(review): the file names / display names below ("CHATGPT_V22.csv",
# "GPT-4_V2.csv" tagged "GPT-4 (1)") are kept exactly as they were; confirm
# they are intentional.
augmentedv1 = _load_generated(
    "../datasets/generated_disinformation/taxonomy/CHATGPT_V1.csv", "CHATGPT (1)")
augmentedv2 = _load_generated(
    "../datasets/generated_disinformation/taxonomy/CHATGPT_V22.csv", "CHATGPT2 (2)")
augmentedv3 = _load_generated(
    "../datasets/generated_disinformation/taxonomy/GPT-4_V2.csv", "GPT-4 (1)")

# ignore_index=True gives the merged frame a unique 0..n-1 index. Without it
# every source frame contributes its own labels starting at 0, so index labels
# are duplicated and cannot be used as row positions into per-row arrays.
data = pd.concat(
    [cards_data, augmentedv1, augmentedv2, augmentedv3], ignore_index=True)

# Every row that does not come from the "cards" dataset is used for training.
data.loc[data.DATASET != "cards", "PARTITION"] = "TRAIN"
data.DATASET.value_counts()

cards           28945
CHATGPT (1)     18891
CHATGPT2 (2)     8500
GPT-4 (1)        4490
Name: DATASET, dtype: int64

In [4]:
# Restore the fitted label encoder (used below through `transform` /
# `inverse_transform`; presumably a scikit-learn LabelEncoder).
# SECURITY: pickle.load can execute arbitrary code -- only load files you trust.
# NOTE(review): the scikit-learn "model persistence" link in the recorded output
# presumably comes from a version-mismatch warning raised while unpickling.
with open('../cards/models/label_encoder.pkl', mode='rb') as encoder_file:
    le = pickle.load(encoder_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Load and pre-process the text data
# Define text pre-processing functions
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span (brackets included) from ``text``.

    A span starts at ``[`` and ends at the first following ``]``; the text
    around it is left untouched, so surrounding spaces are not merged.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\[[^]]*\]', '', text)
def remove_non_ascii(text):
    """Return ``text`` reduced to its ASCII characters.

    The string is first NFKD-normalised, so characters with a compatibility
    decomposition keep their ASCII part (e.g. an accented letter keeps its
    base letter); anything still outside ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
def strip_underscores(text):
    """Replace each run of one or more underscores in ``text`` with one space."""
    underscore_run = re.compile(r'_+')
    return underscore_run.sub(' ', text)
def remove_multiple_spaces(text):
    """Collapse each run of two or more whitespace characters into one space.

    The pattern matches any whitespace (spaces, tabs, newlines). A single
    whitespace character on its own is left unchanged.
    """
    collapsed = re.sub(r'\s{2,}', ' ', text)
    return collapsed

# Merge text pre-processing functions
def denoise_text(text):
    """Run all cleaning steps on ``text`` and trim the result.

    Steps, in order: drop ``[...]`` spans, drop non-ASCII characters, turn
    underscore runs into spaces, collapse repeated whitespace. Leading and
    trailing whitespace is stripped from the final string.
    """
    cleaning_steps = (
        remove_between_square_brackets,
        remove_non_ascii,
        strip_underscores,
        remove_multiple_spaces,
    )
    for clean_step in cleaning_steps:
        text = clean_step(text)
    return text.strip()

# Cleaned copy of the raw text, one entry per row.
# NOTE(review): astype(str) turns missing values into the literal string "nan",
# which is then cleaned and embedded like any other text -- confirm `text` has
# no missing values.
data["p_text"] = data["text"].astype(str).map(denoise_text)

In [6]:
from simcse import SimCSE

# Local SimCSE checkpoint used to embed the cleaned texts.
SIMCSE_CHECKPOINT = "../SimCSE/models/roberta-large-RANDOM_hard_negatives"
model = SimCSE(SIMCSE_CHECKPOINT)

Some weights of the model checkpoint at ../SimCSE/models/roberta-large-RANDOM_hard_negatives were not used when initializing RobertaModel: ['mlp.dense.weight', 'mlp.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Embed every cleaned text, in the row order of `data`.
# The recorded run of this cell took roughly ten minutes.
texts = data.p_text.to_list()
embeddings = model.encode(texts)

100%|██████████| 951/951 [10:20<00:00,  1.53it/s]


In [8]:
# Sanity check: expect one embedding row per row of `data`.
embeddings.size()

torch.Size([60826, 1024])

In [9]:
# Class centroids: the mean embedding of the "cards" rows of each claim.
means = {}
is_cards = (data.DATASET == "cards").to_numpy()
cards_data = data[is_cards]
for claim in np.sort(data.claim.unique()):
    # Use row *positions*, not index labels: `embeddings` is aligned with the
    # row order of `data`, and index labels only coincide with positions when
    # the index happens to be a clean 0..n-1 range.
    idxs = np.flatnonzero(is_cards & (data.claim == claim).to_numpy())
    # NOTE(review): a claim with no "cards" rows gives an empty selection and
    # therefore a NaN centroid -- confirm every claim occurs in "cards".
    means[claim] = embeddings[idxs, :].mean(axis=0)
means

{'0_0': tensor([-0.0545, -0.0249,  0.0127,  ..., -0.0420, -0.0410,  0.0135]),
 '1_1': tensor([-0.0557, -0.0345,  0.0081,  ..., -0.0503, -0.0369,  0.0078]),
 '1_2': tensor([-0.0562, -0.0352,  0.0136,  ..., -0.0496, -0.0401,  0.0104]),
 '1_3': tensor([-0.0555, -0.0317,  0.0086,  ..., -0.0454, -0.0394,  0.0070]),
 '1_4': tensor([-0.0566, -0.0341,  0.0123,  ..., -0.0499, -0.0404,  0.0097]),
 '1_6': tensor([-0.0565, -0.0309,  0.0081,  ..., -0.0497, -0.0371,  0.0094]),
 '1_7': tensor([-0.0566, -0.0315,  0.0108,  ..., -0.0483, -0.0406,  0.0120]),
 '2_1': tensor([-0.0562, -0.0343,  0.0126,  ..., -0.0501, -0.0395,  0.0114]),
 '2_3': tensor([-0.0568, -0.0314,  0.0144,  ..., -0.0486, -0.0402,  0.0127]),
 '3_1': tensor([-0.0568, -0.0308,  0.0137,  ..., -0.0489, -0.0398,  0.0125]),
 '3_2': tensor([-0.0553, -0.0254,  0.0192,  ..., -0.0458, -0.0400,  0.0139]),
 '3_3': tensor([-0.0556, -0.0247,  0.0194,  ..., -0.0440, -0.0417,  0.0141]),
 '4_1': tensor([-0.0548, -0.0243,  0.0117,  ..., -0.0402, -0.040

In [12]:
# Cosine similarity between the first embedding and every class centroid.
means_m = torch.stack(tuple(means.values()), dim=0)
cos = nn.CosineSimilarity(dim=-1)
first_embedding = embeddings[0]
cos(first_embedding, means_m)

tensor([0.9738, 0.9946, 0.9976, 0.9908, 0.9987, 0.9948, 0.9939, 0.9974, 0.9971,
        0.9975, 0.9871, 0.9827, 0.9657, 0.9784, 0.9660, 0.9657, 0.9972, 0.9852])

In [None]:
# Assign every text to the class whose centroid is closest in cosine similarity.
cos = nn.CosineSimilarity(dim=-1)
means_m = torch.stack(list(means.values()))
# Centroid row i belongs to the i-th key of `means`; decode with these keys so
# the result does not depend on another object sharing the same class order.
claim_names = np.array(list(means.keys()))

with torch.no_grad():
    # Cosine similarity of all rows against all centroids in one matrix product
    # of unit-length vectors, instead of one Python-level call per row.
    # Values can differ from `cos` in the last float digits only.
    unit_embeddings = nn.functional.normalize(embeddings, dim=-1)
    unit_means = nn.functional.normalize(means_m, dim=-1)
    cluster_class = (unit_embeddings @ unit_means.T).argmax(dim=-1).tolist()

data["cluster_class"] = claim_names[cluster_class]

In [117]:
# Agreement between annotated claim and nearest-centroid class, "cards" rows only.
cards_rows = data.loc[data.DATASET == "cards", ["claim", "cluster_class"]]
(cards_rows["claim"] == cards_rows["cluster_class"]).value_counts()

False    21399
True      7546
dtype: int64

In [118]:
# Same agreement check over every row of `data`.
agreement = data["claim"] == data["cluster_class"]
agreement.value_counts()

False    37986
True     22840
dtype: int64

In [61]:
# Shape of a single embedding vector.
embeddings[0].shape

torch.Size([1024])

In [58]:
# Shape of the stacked centroid matrix.
# NOTE(review): the recorded output below shows the transposed shape; stacking
# the per-claim mean vectors puts the claims on the first axis, so that output
# presumably comes from an earlier definition of `means_m` -- re-run to refresh.
means_m.size()

torch.Size([1024, 18])

In [8]:
import plotly.express as px

# Integer-encoded claim of every row.
labels = le.transform(data.claim.to_list())

# Project the embeddings to 2-D for plotting. t-SNE is stochastic, so the seed
# is fixed to make the layout reproducible across kernel restarts.
TSNE_SEED = 42
tsne = TSNE(perplexity=30, n_components=2, init='pca', random_state=TSNE_SEED)
# scikit-learn works on NumPy arrays; convert the tensor explicitly.
data[["x", "y"]] = tsne.fit_transform(embeddings.detach().cpu().numpy())



In [9]:
# 2-D projection coloured by claim: one facet column per PARTITION value and
# one facet row per DATASET value.
scatter_options = dict(
    x="x",
    y="y",
    color="claim",
    facet_col="PARTITION",
    facet_row="DATASET",
    width=900,
    height=800,
    color_discrete_sequence=px.colors.qualitative.Light24,
)
fig = px.scatter(data, **scatter_options)
# Also save a standalone interactive copy of the figure.
fig.write_html("clustering.html")
fig