# Compose a data frame with all post descriptions

In [1]:
!pip install emoji unicodedata2



In [2]:
import json
import pandas as pd
import re
import emoji
import unicodedata

In [3]:
def extract_full_text(message):
    """Return the complete text of a message, or None if it has none.

    The ``'text'`` entry is accepted either as a plain string or as an
    iterable of parts, where each part is a plain string or a dict carrying
    a ``'text'`` key. Parts of any other shape are skipped.

    Args:
        message: Mapping describing one message; only ``'text'`` is read.

    Returns:
        The concatenated text, or ``None`` when the entry is missing, empty,
        or contains no textual parts.
    """
    content = message.get('text')
    if not content:
        # Missing key, None, '' or [] all mean "no text".
        return None
    if isinstance(content, str):
        # Already a single string; no need to rebuild it character by character.
        return content

    fragments = []
    for part in content:
        if isinstance(part, str):
            fragments.append(part)
        elif isinstance(part, dict) and 'text' in part:
            fragments.append(part['text'])
    # Callers rely on None (not '') to signal the absence of text.
    return ''.join(fragments) or None


def preprocess_event_text(text):
    """Normalize a raw post into plain text.

    Steps, in order: drop emoji, apply NFKC Unicode normalization, remove
    ``<...>`` markup tags, remove URLs, discard every character that is not a
    word character, whitespace or one of ``. , ! ? ; : ( ) " ' -``, and finally
    collapse whitespace runs into single spaces and trim both ends.

    Args:
        text: The raw post text.

    Returns:
        The cleaned text.
    """
    cleaned = unicodedata.normalize('NFKC', emoji.replace_emoji(text, replace=''))

    # Applied sequentially: markup tags, links, then disallowed characters.
    removal_patterns = (
        r'<[^>]+>',
        r'https?://\S+|www\.\S+',
        r'[^\w\s.,!?;:()"\'-]',
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()

In [4]:
# The posts contain emoji and non-ASCII characters, so decode explicitly as
# UTF-8 instead of relying on the platform's default encoding.
file_path = "../data/o4u_logs_Jun_07_2025.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [5]:
# Build one record per textual post: the raw text, its cleaned version and an
# empty slot for the embedding.
processed_posts = []
for message in data.get("messages", []):
    # Keep only regular messages that carry text; service messages are skipped.
    if message.get("type") != "message" or not message.get("text"):
        continue

    post_id = message["id"]
    raw_post = extract_full_text(message)

    # extract_full_text returns None when it finds no textual part; calling
    # .strip() on that would raise AttributeError, so check for None first.
    if raw_post is None or not raw_post.strip():
        continue

    preprocessed_post = preprocess_event_text(raw_post)

    processed_posts.append({
        "post_id": post_id,
        "post": raw_post,
        "preprocessed_post": preprocessed_post,
        "embedding": None,  # placeholder
    })

# Explicit columns keep the table schema stable even when no post qualifies.
df_posts = pd.DataFrame(
    processed_posts,
    columns=["post_id", "post", "preprocessed_post", "embedding"],
)

In [6]:
# Preview the assembled table; pandas abbreviates the middle rows of long frames.
df_posts

Unnamed: 0,post_id,post,preprocessed_post,embedding
0,3,"Dear students,\n\nThis channel advertises mino...","Dear students, This channel advertises minor e...",
1,6,📣Hi there!\n\nStudent Affairs is urgently look...,Hi there! Student Affairs is urgently looking ...,
2,7,📣Hi there! Want any of these?\n\nStudent Affai...,Hi there! Want any of these? Student Affairs a...,
3,8,Bonjour! Ça va?\nС'est la vie.. \nCroissant.\n...,Bonjour! Ça va? С'est la vie.. Croissant. If t...,
4,9,"📣On December 14, comedian Vladimir Marconi arr...","On December 14, comedian Vladimir Marconi arri...",
...,...,...,...,...
2845,3237,"🔥 ""Slippers of the Year"" Contest!\n\nYour slip...","""Slippers of the Year"" Contest! Your slippers ...",
2846,3238,🌍 ECO ACTION for World Environment Day!\n\n5 J...,ECO ACTION for World Environment Day! 5 June i...,
2847,3239,"📣 PreParty of the City Day for homies ""Super I...","PreParty of the City Day for homies ""Super Inn...",
2848,3240,🚀 International Acceleration Program 2025\n\nT...,International Acceleration Program 2025 The pr...,


# Populate the `embedding` field of the table

In [7]:
# TODO: placeholder cell — presumably the embeddings for df_posts are meant to
# be computed here and stored in the "embedding" column (not implemented yet).
...

Ellipsis