In [None]:
# Sanity check: print the path of every parquet file that contains at least
# one row whose "embedding" value is null.
for file in all_file:
    df = pd.read_parquet(
        file,
        columns=["userID", "parentID", "post_id", "embedding"],
    )
    if df["embedding"].isna().any():
        print(file)

In [None]:
import json
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

# Lookup from a subject key (the keys of `data_embedding`) to that subject's
# target user ID; the embedding-merge cell reads it as
# `subject_to_targetUser[subject]`.
# NOTE(review): nothing in this cell fills the dict — confirm where it is
# populated before the merge cell runs.
subject_to_targetUser = {}


def parse_datetime_to_timestamp(dt_str: str) -> float:
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string (optionally suffixed with
    ``" UTC"``) into a POSIX timestamp.

    The wall-clock value is interpreted as UTC. Previously the parsed datetime
    was naive, so ``.timestamp()`` applied the *local* timezone of whatever
    machine ran the notebook and the same input produced different numbers on
    different hosts.

    Args:
        dt_str: Date-time text such as ``"2022-01-01 00:00:00 UTC"``.

    Returns:
        Seconds since the Unix epoch as a float.

    Raises:
        ValueError: If the text does not match ``%Y-%m-%d %H:%M:%S`` once the
            ``" UTC"`` marker and surrounding whitespace are removed.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    s = dt_str.replace(" UTC", "").strip()
    parsed = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    # Make the datetime timezone-aware so the result is host-independent.
    return parsed.replace(tzinfo=timezone.utc).timestamp()


def parse_subject_json_to_csv(
    input_path: str,
    output_dir: Optional[str] = None,
) -> str:
    """Flatten one subject's thread dump (JSON) into a per-event CSV.

    The input file must contain a non-empty JSON sequence of thread objects.
    Each thread may carry a ``"submission"`` object and a ``"comments"`` list.
    Every kept submission or comment becomes one CSV row with the columns
    ``userID, parentID, timestamp, post_id, conversation_id``.

    Rules applied while building rows:

    * A thread whose submission lacks ``submission_id``, ``user_id`` or
      ``created_utc`` is skipped entirely, comments included.
    * Posts authored by ``"AutoModerator"`` produce no row. The same holds
      for posts whose stripped text is exactly ``"[deleted]"`` or
      ``"[removed]"`` (for a submission the text is title + blank line +
      body). A dropped submission does not drop its comments.
    * Comments lacking ``comment_id``, ``user_id`` or ``created_utc``
      produce no row.
    * An author equal to ``"[deleted]"`` is renamed ``deleted_<n>``; the
      counter is shared across the whole file.
    * ``parentID`` is the (renamed) author of the post named by the comment's
      ``parent_id`` when that post is known within the same thread,
      otherwise ``None``. Submissions always have ``parentID`` ``None``.
    * ``timestamp`` is ``None`` whenever ``parse_datetime_to_timestamp``
      raises for the post's ``created_utc`` value.

    The CSV is named ``<target_subject>.csv`` where ``target_subject`` is the
    ``user_id`` of the first submission whose ``"target"`` field is ``True``,
    falling back to the input file's stem.

    Args:
        input_path: Path of the JSON file to read.
        output_dir: Directory for the CSV; defaults to the input file's
            directory. Created if missing.

    Returns:
        The path of the written CSV, as a string.

    Raises:
        ValueError: If the decoded JSON has length zero.
    """
    bot_author = "AutoModerator"
    deleted_author = "[deleted]"
    placeholder_texts = ("[deleted]", "[removed]")
    expected_cols = ["userID", "parentID", "timestamp", "post_id", "conversation_id"]

    def _safe_timestamp(raw: Any) -> Optional[float]:
        # Best effort: any parsing failure yields a missing timestamp.
        try:
            return parse_datetime_to_timestamp(raw)
        except Exception:
            return None

    source = Path(input_path)
    with source.open("r", encoding="utf-8") as handle:
        threads = json.load(handle)

    if len(threads) == 0:
        raise ValueError("JSON file is empty")

    # The file stem is the fallback name; the first submission flagged as the
    # target overrides it with its author (when that author is non-empty).
    target_subject = source.stem
    for thread in threads:
        flagged = thread.get("submission") or {}
        if flagged.get("target") is True:
            target_subject = flagged.get("user_id") or target_subject
            break

    destination_dir = source.parent if output_dir is None else Path(output_dir)
    output_path = destination_dir / f"{target_subject}.csv"

    events: List[Dict[str, Any]] = []
    deleted_counter = 0

    for thread in threads:
        submission = thread.get("submission") or {}
        root_id = submission.get("submission_id")
        root_author = submission.get("user_id")
        root_date = submission.get("created_utc")
        root_title = (submission.get("title") or "").strip()
        root_body = (submission.get("body") or "").strip()

        if any(field is None for field in (root_id, root_author, root_date)):
            continue

        # Decide under which name (if any) the submission appears.
        if root_author == bot_author:
            root_name: Optional[str] = None
        elif (root_title + "\n\n" + root_body).strip() in placeholder_texts:
            root_name = None
        elif root_author == deleted_author:
            root_name = f"deleted_{deleted_counter}"
            deleted_counter += 1
        else:
            root_name = root_author

        # Per-thread map: post id -> name used for that post's author.
        id_to_author: Dict[str, str] = {}

        if root_name is not None:
            id_to_author[root_id] = root_name
            events.append(
                {
                    "userID": root_name,
                    "parentID": None,
                    "timestamp": _safe_timestamp(root_date),
                    "post_id": root_id,
                    "conversation_id": root_id,
                }
            )

        comments = thread.get("comments") or []

        # Pass 1: register an author name for every usable comment so that
        # replies can resolve their parent regardless of list order.
        for comment in comments:
            cid = comment.get("comment_id")
            cauthor = comment.get("user_id")
            cbody = (comment.get("body") or "").strip()

            if (
                cid is None
                or cauthor is None
                or cauthor == bot_author
                or cbody in placeholder_texts
            ):
                continue

            if cauthor == deleted_author:
                id_to_author[cid] = f"deleted_{deleted_counter}"
                deleted_counter += 1
            else:
                id_to_author[cid] = cauthor

        # Pass 2: emit one event per usable comment.
        for comment in comments:
            comment_id = comment.get("comment_id")
            author = comment.get("user_id")
            date_str = comment.get("created_utc")
            parent_token = comment.get("parent_id")
            body = (comment.get("body") or "").strip()

            if (
                comment_id is None
                or author is None
                or date_str is None
                or author == bot_author
                or body in placeholder_texts
                or comment_id not in id_to_author
            ):
                continue

            # Unknown or empty parent references resolve to None.
            parent_author = id_to_author.get(parent_token) if parent_token else None

            events.append(
                {
                    "userID": id_to_author[comment_id],
                    "parentID": parent_author,
                    "timestamp": _safe_timestamp(date_str),
                    "post_id": comment_id,
                    "conversation_id": root_id,
                }
            )

    frame = pd.DataFrame(events)

    # With zero events the frame has no columns yet; add whatever is missing
    # so the CSV header is always complete.
    absent_cols = [col for col in expected_cols if col not in frame.columns]
    for col in absent_cols:
        frame[col] = None
    frame = frame[expected_cols]

    output_path.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(output_path, index=False)

    print(f"Saved {len(events)} events to {output_path}")
    print(f"Target subject: {target_subject}")
    print(f"Total synthetic deleted_* users: {deleted_counter}")

    return str(output_path)


if __name__ == "__main__":
    # Example usage: no output_dir is passed, so the CSV is written next to
    # the input JSON and named after the detected target subject.
    parse_subject_json_to_csv(
        input_path="tgn_depression/data/sample_data/2022_subject1200.json",
    )
    # Output: tgn_depression/data/sample_data/metsadeer.csv


Saved 1807 events to tgn_depression\data\sample_data\metsadeer.csv
Target subject: metsadeer
Total synthetic deleted_* users: 188


In [None]:
import numpy as np

# Attach the embeddings of subject "2022_subject1301" to the rows of `df`
# that share the same post_id. Relies on `df` and `data_embedding` having been
# created by earlier cells.
# NOTE(review): this cell accepts vectors of shape (768,), while the
# all-subjects merge cell below accepts (1024,) — confirm which is intended.
post_ids_in_df = df["post_id"].unique()

# Make sure the target column exists before writing into individual cells.
if "embedding" not in df.columns:
    df["embedding"] = None

for post_id, embedding in data_embedding["2022_subject1301"].items():
    if post_id not in post_ids_in_df:
        print(f"Warning: {post_id} not in df")
        continue

    emb_np = np.array(embedding)
    if emb_np.shape != (768,):
        print(f"Warning: Embedding for {post_id} does not have shape (768,): {emb_np.shape}")
        continue

    idx = df["post_id"] == post_id
    # Write cell by cell: assigning an array to several rows at once would be
    # treated as a sequence to spread over them and raise ValueError.
    for i in df.index[idx]:
        df.at[i, "embedding"] = emb_np

NameError: name 'data_embedding' is not defined

In [None]:
# For every subject with pre-computed embeddings: load that subject's event
# CSV (from the neg/ or pos/ folder, depending on which user list contains the
# target user), attach each embedding to the rows with the matching post_id,
# and save the result as /kaggle/working/2022/<subject>.csv.
import sys  # needed only for the print-threshold override used when saving

for subject in data_embedding.keys():
    targetUser = subject_to_targetUser[subject]
    if targetUser in neg_user:
        df = pd.read_csv(f"/kaggle/working/2022/neg/{targetUser}.csv")
        # NOTE(review): output_path is assigned but never used; the result is
        # written to the per-subject path at the end of the loop. Confirm
        # which destination is intended.
        output_path = f"/kaggle/working/2022/neg/{targetUser}.csv"
    elif targetUser in pos_user:
        df = pd.read_csv(f"/kaggle/working/2022/pos/{targetUser}.csv")
        output_path = f"/kaggle/working/2022/pos/{targetUser}.csv"
    else:
        print(f"Cant find {targetUser}")
        # Skip this subject. Falling through would reuse the DataFrame left
        # over from the previous iteration and save it under this subject's
        # name.
        continue

    if "embedding" not in df.columns:
        df["embedding"] = None

    post_ids_in_df = df["post_id"].unique()
    # Set lookup instead of scanning the NumPy array for every embedding.
    known_post_ids = set(post_ids_in_df)
    count_error_shape, count_error_post_id = 0, 0
    for post_id, embedding in data_embedding[subject].items():
        if post_id in known_post_ids:
            idx = df["post_id"] == post_id
            emb_np = np.array(embedding)
            if emb_np.shape == (1024,):
                # Assign each row individually to avoid ValueError with ndarray/list assignment
                for i in df.index[idx]:
                    df.at[i, "embedding"] = emb_np
            else:
                count_error_shape += 1
        else:
            count_error_post_id += 1
    print(f"{targetUser}: Error shape: {count_error_shape}, Error post_id: {count_error_post_id}")

    # to_csv stringifies each ndarray cell, and NumPy abbreviates arrays with
    # more than 1000 elements as "[a b c ... x y z]" by default. Without
    # lifting the threshold the 1024-element vectors would be truncated in
    # the file.
    with np.printoptions(threshold=sys.maxsize):
        df.to_csv(f"/kaggle/working/2022/{subject}.csv", index=False)

In [None]:
with open("/kaggle/input/erisk2025-t2-official/labels/2025.txt", "r") as f:
    for line in f: