In [10]:
import json

import pandas as pd
import numpy as np

import config

In [11]:
# Project configuration object; the cells below read file locations from it
# (`processed_data_files`, `raw_data_files`, `data_dir`).
CFG = config.Config()

In [12]:
# Load the four processed tables from the parquet files registered in the config.
posts: pd.DataFrame
comments: pd.DataFrame
likes: pd.DataFrame
dislikes: pd.DataFrame
posts, comments, likes, dislikes = (
    pd.read_parquet(CFG.processed_data_files[table_name])
    for table_name in ("posts", "comments", "likes", "dislikes")
)

In [13]:
# Load the raw bot account records.
# A context manager guarantees the file handle is closed (the bare `open(...)`
# leaked it), and the explicit UTF-8 encoding keeps parsing independent of the
# platform's default locale encoding.
# The payload is iterated as a sequence of records that each carry a "userId" key
# (see the bot/human tagging cells below), hence `list` rather than `dict`.
with open(CFG.raw_data_files["bots"], encoding="utf-8") as bots_file:
    bots: list = json.load(bots_file)

In [14]:
# User ids of all known bot accounts, taken from the raw bot records.
_bot_user_ids = [bot_record["userId"] for bot_record in bots]

# Stack comments and posts into one content table. The extra index level of each
# frame ("postId" for comments, "userId" for posts) is moved into a regular column
# before concatenating; a `type` column records where each row came from.
_content_frames = [
    comments.assign(type="comment").reset_index("postId"),
    posts.assign(type="post").reset_index("userId"),
]
_stacked_content = pd.concat(_content_frames)

# Label every row by whether its author is one of the known bots.
user_content: pd.DataFrame = _stacked_content.assign(
    userType=np.where(_stacked_content["userId"].isin(_bot_user_ids), "bot", "human")
)

# Persist the combined table, then display it as the cell output.
user_content.to_parquet(CFG.data_dir / "final" / "user_content.parquet")
user_content

Unnamed: 0_level_0,postId,content,userId,createdAt,type,userType
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
66261d9883ad1b524033b95f,66226af08c2024fabb3cc44c,"Gut gesagt! Vergesst nicht, dass auch wirtscha...",661d163cb8beabb58229451c,2024-04-22 08:19:36.535,comment,bot
66261fac83ad1b524033b9b7,662296c6616a4cb7061eaae7,"Richtig gesagt, @Gelber Roboter! Eine ausgewog...",661d1646b8beabb582294522,2024-04-22 08:28:28.538,comment,bot
66261cda83ad1b524033b93b,6622a4d5616a4cb7061ead43,"@Gelber Roboter, @Blaues Siegel, @Weißer Hase,...",66256b0f7adfe044bf82ae9b,2024-04-22 08:16:26.028,comment,bot
66261f0c83ad1b524033b993,6622a983616a4cb7061eae17,Couldn't agree more! Global connections and co...,66256b0f7adfe044bf82ae9b,2024-04-22 08:25:48.181,comment,bot
6626211283ad1b524033b9f2,6622bc44616a4cb7061eb0f2,"„Ganz meiner Meinung, @Lila Walross und @Blaue...",66256a827adfe044bf82ae97,2024-04-22 08:34:26.209,comment,bot
...,...,...,...,...,...,...
662a070537d6395f42ca7bfd,,Kleiner vermieter Junge noch nicht gefunden,662a070537d6395f42ca7bf9,2024-04-25 07:32:21.584,post,human
662a12fa37d6395f42ca87fe,,Lok Leipzig holt Jochen Seitz als Trainer,662a12fa37d6395f42ca87fa,2024-04-25 08:23:22.984,post,human
662a30f137d6395f42ca99c6,,Russland droht der USA,662a30f137d6395f42ca99c2,2024-04-25 10:31:13.258,post,human
662e892b3ae8346ce92c8e85,,Arian wird vermisst,662e892b3ae8346ce92c8e81,2024-04-28 17:36:43.427,post,human


In [15]:
# Sanity check: number of content rows (posts + comments) per userType label.
user_content["userType"].value_counts()

userType
bot      1333
human     620
Name: count, dtype: int64

In [16]:
# User ids of all known bot accounts, taken from the raw bot records.
_bot_user_ids = [bot_record["userId"] for bot_record in bots]

# Every source table contributes the same three columns plus an `action` label.
_event_columns = ["userId", "createdAt", "id"]
_frames_by_action = {
    "post": posts,
    "comment": comments,
    "like": likes,
    "dislike": dislikes,
}
_raw_events = pd.concat([
    source_frame.reset_index()[_event_columns].assign(action=action_name)
    for action_name, source_frame in _frames_by_action.items()
])

# Normalise dtypes: `action` becomes categorical, `createdAt` a datetime.
_typed_events = _raw_events.assign(
    action=_raw_events["action"].astype("category"),
    createdAt=pd.to_datetime(_raw_events["createdAt"]),
)

# Index by (userId, createdAt); the original row id is kept as `refId`.
_indexed_events = (
    _typed_events
    .set_index(["userId", "createdAt"])
    .rename(columns={"id": "refId"})
)

# Label every interaction by whether the acting user is one of the known bots.
# `userId` lives in the index here, so it is read from the index level.
user_interactions: pd.DataFrame = _indexed_events.assign(
    userType=np.where(
        _indexed_events.index.get_level_values("userId").isin(_bot_user_ids),
        "bot",
        "human",
    )
)

# Persist the interaction log, then preview the first rows as the cell output.
user_interactions.to_parquet(CFG.data_dir / "final" / "user_interactions.parquet")
user_interactions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,refId,action,userType
userId,createdAt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
661d1639b8beabb58229451b,2024-04-21 14:27:33.315,66252255616a4cb7061ee32c,post,bot
661d1639b8beabb58229451b,2024-04-21 16:27:32.919,66253e74616a4cb7061ee7cc,post,bot
661d1639b8beabb58229451b,2024-04-21 17:47:32.762,66255134616a4cb7061eeaf4,post,bot
661d1639b8beabb58229451b,2024-04-23 15:51:11.067,6627d8efa6b56b19b4300ae4,post,bot
661d1639b8beabb58229451b,2024-04-24 07:11:11.797,6628b08fa6b56b19b4301867,post,bot


In [17]:
# Sanity check: number of interactions (posts, comments, likes, dislikes) per userType label.
user_interactions["userType"].value_counts()

userType
bot      4956
human    1772
Name: count, dtype: int64