In [15]:
import typing

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re

from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.metrics.pairwise import cosine_similarity
import config
import src
import requests
import tqdm
import json
import numpy as np
import logging
import torch

from transformers import AutoTokenizer, AutoModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sjoerdstolwijk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sjoerdstolwijk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
CFG = config.Config()

In [4]:
dataset: pd.DataFrame = pd.read_csv('data/publicsphere/full_data.csv')
dataset

Unnamed: 0,StartDate,RecordedDate,IPAddress,Finished,Coder,ID,Mark_ID,Genre,topiccode,Platform,...,dislikeCount_video,likeCount_video,date_difference,commentCount_video,replyCount_comment,topic,subscribers,HATELIST_FOCUSED_DUMMY,Time_comment_year,Time_video_year
0,5/30/2021 13:03:17,5/30/2021 13:04:17,62.194.51.29,1,6,UgyPHwv8G0cDE6-wEgl4AaABAg.8_0ZjJKSJty8_0kXGkAd2U,119,0,0,1,...,,,,,,,,0,2017,2017.0
1,10/11/2021 10:34:05,10/11/2021 10:36:46,213.127.109.191,1,6,Ugx2WXq9UdV8mPPjejJ4AaABAg.8yHCKV0Boe58yYRxEQEF45,282,1,2,1,...,195.0,3817.0,743.0,1748.0,,economy,3630000.0,0,2019,2019.0
2,9/9/2021 18:49:48,9/9/2021 18:51:32,213.127.110.0,1,6,1110578710648890000,372,2,4,2,...,,,,,,,,0,2019,
3,6/6/2021 16:12:46,6/6/2021 16:16:16,213.127.76.145,1,6,UgwUPFScjJ0MCeaP2F54AaABAg.8lvp3fc9Euf8lvvgsUgEgV,769,0,0,1,...,,,,,,,,0,2018,2018.0
4,6/13/2021 13:25:49,6/13/2021 13:27:28,213.127.82.232,1,6,UgwWKCWtSJdFvjGHvTp4AaABAg.8kUC5dGrQ2H8kUDRihE2f3,1206,0,0,1,...,,,,,,,,0,2018,2018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3857,8/19/2021 14:50:13,8/19/2021 14:54:28,62.194.51.29,1,6,1152219467579100000,10000695,0,4,2,...,,,,,,,,0,2019,
3858,8/19/2021 15:10:27,8/19/2021 15:12:21,62.194.51.29,1,6,1085362296472430000,10007008,1,4,2,...,,,,,,,,0,2019,
3859,10/6/2021 16:08:39,10/6/2021 16:10:42,213.127.113.113,1,6,UghFY3QJ6nmT_ngCoAEC.7-H0Z7--wxd8goqpaPs-bl,20000102,0,3,1,...,2820.0,12475.0,3803.0,4785.0,,east,6740000.0,0,2018,2010.0
3860,10/15/2021 18:30:04,10/15/2021 18:35:40,213.127.109.191,1,6,UgyWabsmmnq3zam4DgZ4AaABAg,20000418,2,3,1,...,118.0,31761.0,1531.0,2206.0,0.0,east,6800000.0,0,2018,2015.0


In [5]:
#first just try pre-processing and a simple tf-ivf:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [35]:
def preprocess(texts):
    x_train = []
    for sent in tqdm.tqdm(texts):
        sent = re.sub(r'@[^ ]+', '', sent)  #remove all usernames
        sent = re.sub(r'https?://[^ ]+', '', sent) #remove all hyperlinks
        sent = re.sub(r'#', '', sent) #remove all hashtags
        sent = re.sub(r'([A-Za-z])\1{2,}', r'\1', sent) #normalize language use by replacing duplicate letters by single letters
        sent = re.sub("[^a-zA-Z ]", "", sent) #remove all non-words
        sent = sent.lower().split()
        sent = [lemmatizer.lemmatize(word) for word in sent if word not in set(stop_words)]
        sent = ' '.join(sent)
        x_train.append(sent)
    return x_train

In [36]:
X = preprocess(dataset["commentText"])

100%|██████████| 3862/3862 [00:05<00:00, 654.85it/s] 


In [37]:
tfidf = TfidfVectorizer(max_features=5000, analyzer='word', ngram_range=(1,2), stop_words='english')
X_tfidf = tfidf.fit_transform(X).toarray()

In [12]:
X_tfidf[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [73]:
dataset['tfidf_embedding'] = [torch.tensor(X_tfidf[i], dtype=torch.float32).flatten().tolist() for i in range(X_tfidf.shape[0])]


In [None]:
dataset['tfidf_embedding']

In [38]:
cosine_similarities = cosine_similarity(X_tfidf)
cosine_similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.0110684 , 0.06654491,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.08009685, 0.        ,
        0.        ],
       ...,
       [0.        , 0.0110684 , 0.08009685, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.06654491, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [54]:
num_documents = len(cosine_similarities)

# Initialize a list to store the most similar document pairs
top_similar_pairs = []

# Iterate through all document pairs
for i in range(num_documents):
    for j in range(i + 1, num_documents):
        similarity = cosine_similarities[i][j]
        
        # Check if the similarity is NaN or zero
        if not np.isnan(similarity) and similarity >= 1:
            pair = (i, j)
            top_similar_pairs.append((similarity, pair))
            
# Sort the list based on similarity in descending order
top_similar_pairs.sort(key=lambda x: x[0], reverse=True)
            


In [56]:
top_similar_pairs

[(1.0000000000000002, (139, 1990)),
 (1.0000000000000002, (246, 1179)),
 (1.0000000000000002, (362, 736)),
 (1.0000000000000002, (722, 739)),
 (1.0, (21, 1856)),
 (1.0, (75, 366)),
 (1.0, (75, 1022)),
 (1.0, (75, 2122)),
 (1.0, (75, 2421)),
 (1.0, (75, 2946)),
 (1.0, (101, 850)),
 (1.0, (133, 1057)),
 (1.0, (141, 3838)),
 (1.0, (164, 1240)),
 (1.0, (164, 2182)),
 (1.0, (198, 3213)),
 (1.0, (242, 2779)),
 (1.0, (242, 3278)),
 (1.0, (259, 490)),
 (1.0, (313, 2607)),
 (1.0, (341, 352)),
 (1.0, (366, 1022)),
 (1.0, (366, 2122)),
 (1.0, (366, 2421)),
 (1.0, (366, 2946)),
 (1.0, (395, 3340)),
 (1.0, (400, 585)),
 (1.0, (400, 877)),
 (1.0, (400, 962)),
 (1.0, (400, 2099)),
 (1.0, (400, 2848)),
 (1.0, (400, 3752)),
 (1.0, (436, 3211)),
 (1.0, (487, 936)),
 (1.0, (487, 2484)),
 (1.0, (546, 911)),
 (1.0, (559, 3708)),
 (1.0, (585, 877)),
 (1.0, (585, 962)),
 (1.0, (585, 2099)),
 (1.0, (585, 2848)),
 (1.0, (585, 3752)),
 (1.0, (643, 1401)),
 (1.0, (673, 1743)),
 (1.0, (678, 2244)),
 (1.0, (678, 2

In [55]:
result = [pair for value, pair in top_similar_pairs if value >= 1]
print(result)

[(139, 1990), (246, 1179), (362, 736), (722, 739), (21, 1856), (75, 366), (75, 1022), (75, 2122), (75, 2421), (75, 2946), (101, 850), (133, 1057), (141, 3838), (164, 1240), (164, 2182), (198, 3213), (242, 2779), (242, 3278), (259, 490), (313, 2607), (341, 352), (366, 1022), (366, 2122), (366, 2421), (366, 2946), (395, 3340), (400, 585), (400, 877), (400, 962), (400, 2099), (400, 2848), (400, 3752), (436, 3211), (487, 936), (487, 2484), (546, 911), (559, 3708), (585, 877), (585, 962), (585, 2099), (585, 2848), (585, 3752), (643, 1401), (673, 1743), (678, 2244), (678, 2481), (678, 2843), (678, 2856), (678, 2971), (702, 1392), (709, 1800), (753, 930), (753, 1564), (753, 1887), (753, 2728), (793, 2935), (813, 1872), (816, 1088), (877, 962), (877, 2099), (877, 2848), (877, 3752), (894, 1060), (914, 2707), (914, 2914), (919, 3578), (930, 1564), (930, 1887), (930, 2728), (936, 2484), (937, 1005), (937, 1192), (955, 1404), (962, 2099), (962, 2848), (962, 3752), (969, 1196), (969, 1449), (969, 

In [62]:
for first, second in result:
    print(dataset["commentText"][[first, second]])
    print()

139          vote trump
1990    Vote Trump 2020
Name: commentText, dtype: object

246                        hell yeah fam
1179    @TheDailyShow Hells to the Yeah.
Name: commentText, dtype: object

362                @James Persinger \n apple and orange.
736    @hardball @MSNBC Just a few apples and oranges...
Name: commentText, dtype: object

722             @hardball @KenSalazar Good grief!
739    @NBCNews My opinion......lol.  Good grief.
Name: commentText, dtype: object

21                     Mika is low IQ
1856    @Rusty you still very low IQ.
Name: commentText, dtype: object

75                                               I agree
366    Kai Watson So you agree that Haiti is a Shithole?
Name: commentText, dtype: object

75                                 I agree
1022    @RealTimers @SteveSchmidtSES Agree
Name: commentText, dtype: object

75                 I agree
2122    Rtb boone i agree.
Name: commentText, dtype: object

75                  I agree
2421    Braulio V.  I agre

In [61]:
print(X[246])
print(X[1179])
print(X[139])
print(X[1990])
print(X[21])
print(X[1856])
print(X[722])
print(X[739])
print(X[362])
print(X[736])

hell yeah fam
hell yeah
vote trump
vote trump
mika low iq
still low iq
good grief
opinionlol good grief
persinger apple orange
apple orange


In [74]:
grouped_data = dataset.groupby("Platform")
dist = torch.nn.PairwiseDistance()
resultsplatform: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in tqdm.tqdm(grouped_data['tfidf_embedding'], total=grouped_data.ngroups):
    for model_2, c_2 in tqdm.tqdm(grouped_data['tfidf_embedding'], total=grouped_data.ngroups):

        if (
            (model_1, model_2) in resultsplatform.keys() or 
            (model_2, model_1) in resultsplatform.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        resultsplatform[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

KeyboardInterrupt: 

In [None]:
grouped_data = dataset.groupby("topiccode")
dist = torch.nn.PairwiseDistance()
resultstopic: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in tqdm.tqdm(grouped_data['tfidf_embedding'], total=grouped_data.ngroups):
    for model_2, c_2 in tqdm.tqdm(grouped_data['tfidf_embedding'], total=grouped_data.ngroups):

        if (
            (model_1, model_2) in resultstopic.keys() or 
            (model_2, model_1) in resultstopic.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        resultstopic[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

In [None]:
grouped_data = dataset.groupby("Genre")
dist = torch.nn.PairwiseDistance()
resultsgenre: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in tqdm.tqdm(grouped_data['tfidf_embedding'], total=grouped_data.ngroups):
    for model_2, c_2 in tqdm.tqdm(grouped_data['tfidf_embedding'], total=grouped_data.ngroups):

        if (
            (model_1, model_2) in resultsgenre.keys() or 
            (model_2, model_1) in resultsgenre.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        resultsgenre[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

In [4]:
MODEL: str = 'mixtral:8x7b-instruct-v0.1-q6_K' # options: 'gemma:7b-instruct-q6_K', 'gemma2:27b-instruct-q6_K', 'llama3.1:8b-instruct-q6_K', 'llama3.1:70b-instruct-q6_K', 'mistral:7b-instruct-v0.3-q6_K', 'mistral-large:123b-instruct-2407-q6_K', 'mixtral:8x7b-instruct-v0.1-q6_K', 'mixtral:8x22b-instruct-v0.1-q6_K', 'phi3:14b-medium-128k-instruct-q6_K' or 'qwen2:72b-instruct-q6_K'


In [5]:
embed_MXP: typing.Dict[str, np.ndarray] = {}

In [10]:
context = 'social media replies to a news- or infotainment-post'
for index, row in tqdm.tqdm(dataset["commentText"].items(), total=len(dataset)):
    try: 
        embed = np.array(requests.post(
            'https://inf.cl.uni-trier.de/embed/',
            json={'model': MODEL, 
                  'prompt': 'You help me get embeddings for a sentence. I provide you with a context and a sentence and you reply only with that exact sentence. Context = ' + context + '; Sentence: ' + row}
            ).json()["response"])
    except Exception as _e:
        logging.warning(_e)
        embed = None
    
    embed_MXP[index] = embed

  3%|▎         | 123/3862 [04:58<2:31:24,  2.43s/it]


KeyboardInterrupt: 

In [None]:
dataset_w_embeds = dataset.join(pd.Series(embed_MXP, name="embed_MXP"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

In [None]:
#create BERT-base embeddings for each word:

model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
#create Twhin-BERT-base embeddings for each word:
model_name = "Twitter/twhin-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [12]:
#calculate DiffCSE embeddings:
model_name = "voidism/diffcse-roberta-base-sts"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at voidism/diffcse-roberta-base-sts were not used when initializing RobertaModel: ['aux_bert.embeddings.LayerNorm.bias', 'aux_bert.embeddings.LayerNorm.weight', 'aux_bert.embeddings.position_embeddings.weight', 'aux_bert.embeddings.position_ids', 'aux_bert.embeddings.token_type_embeddings.weight', 'aux_bert.embeddings.word_embeddings.weight', 'aux_bert.encoder.layer.0.attention.output.LayerNorm.bias', 'aux_bert.encoder.layer.0.attention.output.LayerNorm.weight', 'aux_bert.encoder.layer.0.attention.output.dense.bias', 'aux_bert.encoder.layer.0.attention.output.dense.weight', 'aux_bert.encoder.layer.0.attention.self.key.bias', 'aux_bert.encoder.layer.0.attention.self.key.weight', 'aux_bert.encoder.layer.0.attention.self.query.bias', 'aux_bert.encoder.layer.0.attention.self.query.weight', 'aux_bert.encoder.layer.0.attention.self.value.bias', 'aux_bert.encoder.layer.0.attention.self.value.weight', 'aux_bert.encoder.layer.0.intermediate.dense.bias', 'aux

In [13]:
#function for encoding sentence embeddings based on the DiffCSE approach which is based on (and refers to) SimCSE/evaluation.py 
def encode_sentence(sentence):
    input_ids = tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=512, padding='max_length')
    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
    attention_mask = torch.ones_like(input_ids_tensor)  # Creating attention mask 
    with torch.no_grad():
            outputs = model(input_ids_tensor, output_hidden_states=True, return_dict=True, attention_mask=attention_mask)
            sentence_embedding = outputs.last_hidden_state[:, 0].cpu()
            return sentence_embedding.tolist()

In [15]:
#for DiffCSE: Create embeddings for each argument and store them in a new column
dataset.loc[:, 'DiffCSE_embedding'] = dataset["commentText"].apply(encode_sentence)

KeyboardInterrupt: 