In [1]:
import re
import torch
import pickle
import unicodedata
import pandas as pd
import matplotlib.pyplot as plt

from scipy.special import softmax
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

In [2]:
cards_data = pd.read_csv(
    f"../datasets/augmented/9834838408490912248/cards_augmented_0_V1.csv")
augmentedv1 = pd.read_csv(
    f"../datasets/generated_disinformation/taxonomy/CHATGPT_V1.csv")
augmentedv1["DATASET"] = "CHATGPT (1)"
augmentedv1["claim"] = augmentedv1["generated_label"] 

augmentedv2 = pd.read_csv(
    f"../datasets/generated_disinformation/taxonomy/CHATGPT_V22.csv")
augmentedv2["DATASET"] = "CHATGPT2 (2)"
augmentedv2["claim"] = augmentedv2["generated_label"] 

augmentedv3 = pd.read_csv(
    f"../datasets/generated_disinformation/taxonomy/GPT-4_V2.csv")
augmentedv3["DATASET"] = "GPT-4 (1)"
augmentedv3["claim"] = augmentedv3["generated_label"]

data = pd.concat([cards_data, augmentedv1, augmentedv2, augmentedv3])
data.loc[data.DATASET!="cards", "PARTITION"] = "TRAIN"
data.DATASET.value_counts()

cards           28945
CHATGPT (1)     18891
CHATGPT2 (2)     8500
GPT-4 (1)        4490
Name: DATASET, dtype: int64

In [3]:
with open('../cards/models/label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Load and pre-process the text data
# Define text pre-processing functions
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
def remove_non_ascii(text):
    """Remove non-ASCII characters from list of tokenized words"""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
def strip_underscores(text):
    return re.sub(r'_+', ' ', text)
def remove_multiple_spaces(text):
    return re.sub(r'\s{2,}', ' ', text)

# Merge text pre-processing functions
def denoise_text(text):
    text = remove_between_square_brackets(text)
    text = remove_non_ascii(text)
    text = strip_underscores(text)
    text = remove_multiple_spaces(text)
    return text.strip()

data["p_text"] = data.text.astype(str).apply(denoise_text)

In [5]:
from simcse import SimCSE
model = SimCSE("../SimCSE/models")

Some weights of RobertaModel were not initialized from the model checkpoint at ../SimCSE/models and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
texts = data["p_text"].tolist()
embeddings = model.encode(texts)

100%|██████████| 951/951 [05:20<00:00,  2.96it/s]


In [7]:
embeddings.shape

torch.Size([60826, 768])

In [8]:
import plotly.express as px

# def cluster_data(data, embeddings, labels):    
#     preds, model_outputs, all_embedding_outputs, all_layer_hidden_states = roberta_model.predict(texts)
# data_train = data[data.PARTITION=="TRAIN"]
# texts = data.p_text.tolist()

labels = le.transform(data.claim.to_list())
# cluster_data(data, embeddings.numpy(), labels)

tsne = TSNE(perplexity=30, n_components=2, init='pca')
data[["x", "y"]] = tsne.fit_transform(embeddings)



In [9]:
fig = px.scatter(
    data, 
    x="x", y="y", color="claim", 
    facet_col="PARTITION", facet_row="DATASET", width=900, height=800,
    color_discrete_sequence=px.colors.qualitative.Light24,
)
fig.write_html("clustering.html")
fig

In [10]:
data.DATASET.value_counts()

cards           28945
CHATGPT (1)     18891
CHATGPT2 (2)     8500
GPT-4 (1)        4490
Name: DATASET, dtype: int64

In [11]:
data_train = data[data.PARTITION=="VALID"]
texts = data.p_text.tolist()
labels = le.transform(data.claim.to_list())
cluster_data(texts, labels)

NameError: name 'cluster_data' is not defined

In [None]:
data_train = data[data.PARTITION=="TEST"]
texts = data.p_text.tolist()
labels = le.transform(data.claim.to_list())
cluster_data(embeddings.numpy(), labels)