In [1]:
import json

import pandas as pd

import config

In [2]:
# Project configuration object; later cells read `raw_files` (input JSON paths)
# and `data_path` (output root) from it.
CFG = config.Config()

In [3]:
# Show the mapping of raw dataset name -> JSON file path that the cells below load.
CFG.raw_files

{'users': PosixPath('data/2024-05--Simulation/raw/users.json'),
 'commentlikes': PosixPath('data/2024-05--Simulation/raw/commentlikes.json'),
 'postdislikes': PosixPath('data/2024-05--Simulation/raw/postdislikes.json'),
 'commentdislikes': PosixPath('data/2024-05--Simulation/raw/commentdislikes.json'),
 'postlikes': PosixPath('data/2024-05--Simulation/raw/postlikes.json'),
 'reposting': PosixPath('data/2024-05--Simulation/raw/reposting.json'),
 'comments': PosixPath('data/2024-05--Simulation/raw/comments.json'),
 'posts': PosixPath('data/2024-05--Simulation/raw/posts.json'),
 'readposts': PosixPath('data/2024-05--Simulation/raw/readposts.json')}

In [4]:
# Flatten the per-user post lists of the raw export into one row per post,
# indexed by (userId, id), and persist the result as parquet.
#
# The file is read through a context manager so the handle is closed
# deterministically, and decoded explicitly as UTF-8 (the JSON interchange
# encoding) rather than whatever the locale default happens to be.
with open(CFG.raw_files["posts"], encoding="utf-8") as posts_file:
    users_with_posts = json.load(posts_file)

posts: pd.DataFrame = (
    pd.json_normalize([
        # Post records are nested under their author; attach the author's id.
        post | {"userId": user["userId"]}
        for user in users_with_posts
        for post in user["posts"]
    ])
    .rename(columns={"postId": "id", "desc": "content"})
    .drop(columns=["rank", "updatedAt"])
    .set_index(["userId", "id"])
)
posts.to_parquet(CFG.data_path / "processed" / "posts.parquet")
posts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,content,createdAt
userId,id,Unnamed: 2_level_1,Unnamed: 3_level_1
661d1639b8beabb58229451b,66252255616a4cb7061ee32c,#UkraineKrieg: Beunruhigende Entwicklungen in ...,2024-04-21 14:27:33.315
661d1639b8beabb58229451b,66253e74616a4cb7061ee7cc,"Bedauerlich, dass gewaltsame Konflikte weiterh...",2024-04-21 16:27:32.919
661d1639b8beabb58229451b,66255134616a4cb7061eeaf4,Gewalt im Nahen Osten & Ukraine nicht isoliert...,2024-04-21 17:47:32.762
661d1639b8beabb58229451b,6627d8efa6b56b19b4300ae4,"Unterstützen wir unsere Verbündeten, respektie...",2024-04-23 15:51:11.067
661d1639b8beabb58229451b,6628b08fa6b56b19b4301867,Geschlossene Tür für Unsicherheit: US-Repräsen...,2024-04-24 07:11:11.797


In [5]:
# Flatten the per-post comment lists of the raw export into one row per
# comment, indexed by (postId, id), replace the author's username with the
# corresponding userId, and persist the result as parquet.
#
# Both input files are read through context managers so the handles are closed
# deterministically, and decoded explicitly as UTF-8 (the JSON interchange
# encoding) rather than the locale default.
with open(CFG.raw_files["comments"], encoding="utf-8") as comments_file:
    posts_with_comments = json.load(comments_file)

with open(CFG.raw_files["users"], encoding="utf-8") as users_file:
    # Lookup table used to translate comment authors (usernames) into userIds.
    user_id_by_username = {
        user["username"]: user["userId"] for user in json.load(users_file)
    }

comments: pd.DataFrame = (
    pd.json_normalize([
        # Comment records are nested under their post; attach the post's id.
        comment | {"postId": post["postId"]}
        for post in posts_with_comments
        for comment in post["comments"]
    ])
    .rename(columns={"commentId": "id", "body": "content", "username": "userId"})
    .drop(columns=["updatedAt"])
    .set_index(["postId", "id"])
    # After the rename, "userId" still holds usernames; map them to real ids.
    # Usernames missing from the lookup table become NaN.
    .assign(userId=lambda _df: _df["userId"].map(user_id_by_username))
)
comments.to_parquet(CFG.data_path / "processed" / "comments.parquet")
comments.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,content,userId,createdAt
postId,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
66226af08c2024fabb3cc44c,66261d9883ad1b524033b95f,"Gut gesagt! Vergesst nicht, dass auch wirtscha...",661d163cb8beabb58229451c,2024-04-22 08:19:36.535
662296c6616a4cb7061eaae7,66261fac83ad1b524033b9b7,"Richtig gesagt, @Gelber Roboter! Eine ausgewog...",661d1646b8beabb582294522,2024-04-22 08:28:28.538
6622a4d5616a4cb7061ead43,66261cda83ad1b524033b93b,"@Gelber Roboter, @Blaues Siegel, @Weißer Hase,...",66256b0f7adfe044bf82ae9b,2024-04-22 08:16:26.028
6622a983616a4cb7061eae17,66261f0c83ad1b524033b993,Couldn't agree more! Global connections and co...,66256b0f7adfe044bf82ae9b,2024-04-22 08:25:48.181
6622bc44616a4cb7061eb0f2,6626211283ad1b524033b9f2,"„Ganz meiner Meinung, @Lila Walross und @Blaue...",66256a827adfe044bf82ae97,2024-04-22 08:34:26.209


In [6]:
# Combine comment likes and post likes into a single table with one row per
# like, indexed by (itemId, userId), and persist the result as parquet.
#
# Each input file is read through a context manager so the handle is closed
# deterministically, and decoded explicitly as UTF-8 (the JSON interchange
# encoding) rather than the locale default.
liked_items = []
for raw_key in ("commentlikes", "postlikes"):  # same order as the raw concat
    with open(CFG.raw_files[raw_key], encoding="utf-8") as likes_file:
        liked_items.extend(json.load(likes_file))

likes: pd.DataFrame = (
    pd.json_normalize([
        # An entry is either a post or a comment: use its postId when present,
        # otherwise fall back to its commentId.
        like | {"itemId": item.get("postId", item.get("commentId"))}
        for item in liked_items
        for like in item["likes"]
    ])
    .drop(columns=["updatedAt", "likeId"])
    .set_index(["itemId", "userId"])
)
likes.to_parquet(CFG.data_path / "processed" / "likes.parquet")
likes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,createdAt
itemId,userId,Unnamed: 2_level_1
66260b4c83ad1b524033b671,661d164ab8beabb582294524,2024-04-22 11:18:13.938
66260b4c83ad1b524033b671,661d1646b8beabb582294522,2024-04-23 11:59:11.550
66260b4c83ad1b524033b671,6626a9b9f62e42fb4c5c74b7,2024-04-23 14:29:04.830
66260b4c83ad1b524033b671,661d1641b8beabb58229451f,2024-04-25 12:45:56.266
66260b4c83ad1b524033b671,661d163fb8beabb58229451e,2024-04-28 08:31:00.852


In [8]:
# Combine comment dislikes and post dislikes into a single table with one row
# per dislike, indexed by (itemId, userId), and persist the result as parquet.
#
# Each input file is read through a context manager so the handle is closed
# deterministically, and decoded explicitly as UTF-8 (the JSON interchange
# encoding) rather than the locale default.
disliked_items = []
for raw_key in ("commentdislikes", "postdislikes"):  # same order as the raw concat
    with open(CFG.raw_files[raw_key], encoding="utf-8") as dislikes_file:
        disliked_items.extend(json.load(dislikes_file))

dislikes: pd.DataFrame = (
    pd.json_normalize([
        # An entry is either a post or a comment: use its postId when present,
        # otherwise fall back to its commentId.
        dislike | {"itemId": item.get("postId", item.get("commentId"))}
        for item in disliked_items
        for dislike in item["dislikes"]
    ])
    .drop(columns=["updatedAt", "dislikeId"])
    .set_index(["itemId", "userId"])
)
dislikes.to_parquet(CFG.data_path / "processed" / "dislikes.parquet")
dislikes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,createdAt
itemId,userId,Unnamed: 2_level_1
65e9f1f965b9102926398486,65e9f1c865b910292639845d,2024-03-07 16:57:32.633
662612ff83ad1b524033b7af,66278a3cf62e42fb4c5cb401,2024-04-26 06:23:06.411
66261b7383ad1b524033b908,66278cb1f62e42fb4c5cb80a,2024-04-23 15:53:30.733
6626308783ad1b524033bc19,6627937af62e42fb4c5ccb3e,2024-04-23 11:08:02.447
662642d583ad1b524033bec3,66278cb1f62e42fb4c5cb80a,2024-04-23 15:30:57.538
