In [1]:
import os
import json
import pandas as pd

# Directory (relative to the notebook's working directory) that holds the Weibo
# JSON files; it is passed to process_all_json_files, which reads every
# "*.json" entry found directly inside it.
json_directory = "./dataset/rumdect/Weibo/"

# Function to process individual JSON files
def process_json_file(file_path):
    """Flatten one JSON file into parallel lists of post and user records.

    The file is opened as UTF-8 and parsed with ``json.load``. Every top-level
    entry is read with ``.get``, so a key that is absent from an entry shows up
    as ``None`` in the output record.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``(posts, users)`` tuple of lists of dicts, one dict per entry in
        each list. If the file is not valid JSON, a message is printed and
        ``([], [])`` is returned.
    """
    # Output column -> key looked up on each JSON entry.
    post_fields = {
        "post_id": "id",
        "thread_id": "mid",
        "user_id": "uid",
        "text": "text",
        "reposts_count": "reposts_count",
        "likes_count": "attitudes_count",
        "comments_count": "comments_count",
        "parent_thread_id": "parent",
        "picture_url": "picture",
        "timestamp": "t",
    }
    # The user's "last_activity" comes from the same "t" key as the post timestamp.
    user_fields = {
        "user_id": "uid",
        "username": "username",
        "friends_count": "friends_count",
        "followers_count": "followers_count",
        "bi_followers_count": "bi_followers_count",
        "user_created_at": "user_created_at",
        "last_activity": "t",
    }

    with open(file_path, "r", encoding="utf-8") as handle:
        try:
            records = json.load(handle)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from file: {file_path}")
            return [], []

    posts = [
        {column: record.get(key) for column, key in post_fields.items()}
        for record in records
    ]
    users = [
        {column: record.get(key) for column, key in user_fields.items()}
        for record in records
    ]
    return posts, users

# Process all JSON files in the directory
def process_all_json_files(directory):
    """Collect post and user records from every ``*.json`` file in a directory.

    Files are visited in sorted filename order. ``os.listdir`` makes no
    ordering guarantee, so without the sort the row order of the result (and
    anything downstream that breaks ties by position) could change from run
    to run or machine to machine.

    Args:
        directory: Directory to scan. Only entries directly inside it whose
            name ends with ".json" are processed.

    Returns:
        A ``(posts, users)`` tuple of lists, each the concatenation of the
        lists returned by ``process_json_file`` for every matching file.
    """
    consolidated_posts = []
    consolidated_users = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        posts, users = process_json_file(os.path.join(directory, filename))
        consolidated_posts.extend(posts)
        consolidated_users.extend(users)
    return consolidated_posts, consolidated_users

# Process Labels
def process_labels(file_path):
    """Parse a tab-separated label file into a DataFrame.

    Each non-blank line is split on tabs. For the first two fields only the
    text after the last ":" is kept; the third field is stripped of
    surrounding whitespace and stored as-is.

    Args:
        file_path: Path of the label file. It is read as UTF-8 and bytes that
            cannot be decoded are silently dropped (``errors="ignore"``).

    Returns:
        A DataFrame with string columns "post_id", "label" and "children",
        one row per non-blank line. The three columns are present even when
        the file contains no records, so later joins on "post_id" still work.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
    """
    columns = ["post_id", "label", "children"]
    rows = []
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) carry no record; without
            # this guard they fail on post_data[1] below.
            if not line.strip():
                continue
            post_data = line.split("\t")
            rows.append({
                "post_id": post_data[0].split(":")[-1],
                "label": post_data[1].split(":")[-1],
                "children": post_data[2].strip()
            })
    # Pin the columns so an empty file still yields the expected schema.
    return pd.DataFrame(rows, columns=columns)

In [2]:
# Read every JSON file under json_directory, then wrap the resulting record
# lists in DataFrames (one row per record).
all_posts, all_users = process_all_json_files(json_directory)
posts_df = pd.DataFrame(all_posts)
users_df = pd.DataFrame(all_users)

In [3]:
# Keep one row per user_id: the row whose last_activity is the largest within
# that user's group. idxmax returns the index label of that row for each group.
latest_row_labels = users_df.groupby("user_id")["last_activity"].idxmax()
users_df = users_df.loc[latest_row_labels]

In [4]:
# Load the labels and attach the post fields to each labelled post_id.
# Inner join: a row survives only if its post_id appears in both frames.
# NOTE(review): process_labels yields post_id as str — confirm the JSON "id"
# values are strings too, otherwise the join keys will not line up.
post_label_df = process_labels("./dataset/rumdect/Weibo.txt")
posts_merged_df = post_label_df.merge(posts_df, how="inner", on="post_id")

In [5]:
# Write the three tables to CSV; index=False leaves the DataFrame index out of
# the files.
posts_df.to_csv(path_or_buf="./dataset/weibo_posts_eann_df.csv", index=False)
users_df.to_csv(path_or_buf="./dataset/weibo_users_eann_df.csv", index=False)
posts_merged_df.to_csv(
    path_or_buf="./dataset/weibo_posts_merged_eann_df.csv", index=False
)