In [9]:
import typing
import logging

import tqdm
import torch
import pandas as pd
import numpy as np
import requests

import config

In [10]:
CFG = config.Config()

In [11]:
dataset: pd.DataFrame = pd.read_parquet(CFG.final_data_files["user_content"])
dataset

Unnamed: 0_level_0,postId,content,userId,createdAt,type,userType
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
66261d9883ad1b524033b95f,66226af08c2024fabb3cc44c,"Gut gesagt! Vergesst nicht, dass auch wirtscha...",661d163cb8beabb58229451c,2024-04-22 08:19:36.535,comment,bot
66261fac83ad1b524033b9b7,662296c6616a4cb7061eaae7,"Richtig gesagt, @Gelber Roboter! Eine ausgewog...",661d1646b8beabb582294522,2024-04-22 08:28:28.538,comment,bot
66261cda83ad1b524033b93b,6622a4d5616a4cb7061ead43,"@Gelber Roboter, @Blaues Siegel, @Weißer Hase,...",66256b0f7adfe044bf82ae9b,2024-04-22 08:16:26.028,comment,bot
66261f0c83ad1b524033b993,6622a983616a4cb7061eae17,Couldn't agree more! Global connections and co...,66256b0f7adfe044bf82ae9b,2024-04-22 08:25:48.181,comment,bot
6626211283ad1b524033b9f2,6622bc44616a4cb7061eb0f2,"„Ganz meiner Meinung, @Lila Walross und @Blaue...",66256a827adfe044bf82ae97,2024-04-22 08:34:26.209,comment,bot
...,...,...,...,...,...,...
662a070537d6395f42ca7bfd,,Kleiner vermieter Junge noch nicht gefunden,662a070537d6395f42ca7bf9,2024-04-25 07:32:21.584,post,human
662a12fa37d6395f42ca87fe,,Lok Leipzig holt Jochen Seitz als Trainer,662a12fa37d6395f42ca87fa,2024-04-25 08:23:22.984,post,human
662a30f137d6395f42ca99c6,,Russland droht der USA,662a30f137d6395f42ca99c2,2024-04-25 10:31:13.258,post,human
662e892b3ae8346ce92c8e85,,Arian wird vermisst,662e892b3ae8346ce92c8e81,2024-04-28 17:36:43.427,post,human


In [12]:
embeds: typing.Dict[str, np.ndarray] = {}

In [13]:
for index, value in tqdm.tqdm(dataset["content"].items(), total=len(dataset)):
    
    if index in embeds.keys():
        continue
        
    try: 
        embed = np.array(requests.post(
            'https://inf.cl.uni-trier.de/embed/',
            json={'prompt': value}
        ).json()["response"])
        
    except Exception as _e:
        logging.warning(_e)
        embed = None
    
    embeds[index] = embed

100%|██████████| 1953/1953 [06:51<00:00,  4.74it/s]


In [31]:
dataset_w_embeds = dataset.join(pd.Series(embeds, name="embeds")).dropna()
dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0_level_0,postId,content,userId,createdAt,type,userType,embeds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
66261d9883ad1b524033b95f,66226af08c2024fabb3cc44c,"Gut gesagt! Vergesst nicht, dass auch wirtscha...",661d163cb8beabb58229451c,2024-04-22 08:19:36.535,comment,bot,"[0.017340147867798805, -0.6276507377624512, -0..."
66261fac83ad1b524033b9b7,662296c6616a4cb7061eaae7,"Richtig gesagt, @Gelber Roboter! Eine ausgewog...",661d1646b8beabb582294522,2024-04-22 08:28:28.538,comment,bot,"[-0.26648634672164917, -0.4422512650489807, -0..."
66261cda83ad1b524033b93b,6622a4d5616a4cb7061ead43,"@Gelber Roboter, @Blaues Siegel, @Weißer Hase,...",66256b0f7adfe044bf82ae9b,2024-04-22 08:16:26.028,comment,bot,"[-0.4973495602607727, 0.053683239966630936, 0...."
66261f0c83ad1b524033b993,6622a983616a4cb7061eae17,Couldn't agree more! Global connections and co...,66256b0f7adfe044bf82ae9b,2024-04-22 08:25:48.181,comment,bot,"[0.44329649209976196, -0.019621431827545166, -..."
6626211283ad1b524033b9f2,6622bc44616a4cb7061eb0f2,"„Ganz meiner Meinung, @Lila Walross und @Blaue...",66256a827adfe044bf82ae97,2024-04-22 08:34:26.209,comment,bot,"[-0.45289289951324463, -0.3367207944393158, -0..."


In [36]:
groups = {
    "humans": dataset_w_embeds[dataset_w_embeds["userType"] == "human"]["embeds"],
    "bots": dataset_w_embeds[dataset_w_embeds["userType"] == "bot"]["embeds"]
}

In [37]:
dist = torch.nn.PairwiseDistance()

results: typing.Dict[typing.Tuple[str, str], float] = {}

for label_1, c_1 in groups.items():
    for label_2, c_2 in groups.items():

        if (
            (label_1, label_2) in results.keys() or 
            (label_2, label_1) in results.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results[(label_1, label_2)] = res

        print(f'{label_1}:{label_2}:{res.item()}')

humans:humans:16.40333402859772
humans:bots:16.864507873393872
bots:bots:14.515355626626237
