In [2]:
# ==========================================
# PHASE 1 FIX (Smart Extractor)
# Goal: Extract topics hidden inside dictionary-like strings.
# Language: English Only
# ==========================================

import pandas as pd
import glob
import re
import ast

# 1. Load Files
# Collect every gzip-compressed pickle in the current working directory.
file_pattern = "*.pkl"
# glob returns paths in arbitrary (filesystem) order; sorting makes the
# concatenation order, and therefore any downstream tie-breaking, reproducible.
file_list = sorted(glob.glob(file_pattern))
print(f"[SYSTEM] Found {len(file_list)} pickle files.")

# NOTE(security): unpickling can execute arbitrary code. Only run this on
# pickle files that come from a trusted source.
dfs = []
for file in file_list:
    try:
        df = pd.read_pickle(file, compression='gzip')
    except Exception as exc:
        # Best-effort loading: one unreadable file must not abort the whole
        # run, but it is reported instead of being dropped silently.
        print(f"[WARN] Skipping '{file}': {type(exc).__name__}: {exc}")
    else:
        dfs.append(df)

# First quoted run in a string. The backreference (\1) forces the closing
# quote to be the same character as the opening one, so an apostrophe inside a
# double-quoted topic (or vice versa) does not end the match early.
# Compiled once here instead of being looked up on every loop iteration.
_FIRST_QUOTED = re.compile(r"(['\"])(.*?)\1")


def _extract_root(item):
    """Return the root topic segment found in *item*, or None if unusable.

    If *item* contains a quoted run (e.g. "{'home/temp': ...}"), the text
    inside the first pair of matching quotes is taken as the topic;
    otherwise the whole string is the topic. The root is the part before
    the first '/', stripped of surrounding whitespace. Empty roots and
    roots that still contain a brace are rejected.
    """
    match = _FIRST_QUOTED.search(item)
    full_topic = match.group(2) if match else item
    root = full_topic.split('/')[0].strip()
    if not root or "{" in root or "}" in root:
        return None
    return root


if dfs:
    full_df = pd.concat(dfs, ignore_index=True)

    # 2. Smart Extraction Logic
    # Only the first column is inspected; every value is treated as text.
    raw_data = full_df.iloc[:, 0].astype(str)
    extracted_roots = [
        root for root in map(_extract_root, raw_data) if root is not None
    ]

    # 3. Count and Select Top 50
    # Explicit dtype keeps the Series well-defined even when nothing was
    # extracted (an empty list has no dtype to infer).
    root_series = pd.Series(extracted_roots, dtype="object")
    top_50 = root_series.value_counts().head(50).index.tolist()

    print(f"[INFO] Extracted {len(top_50)} unique root topics.")
    print(f"Sample: {top_50[:5]}")

    # 4. Save to clean_topics.txt
    # One line per root, each suffixed with "/#".
    filename = "clean_topics.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(f"{root}/#\n" for root in top_50)

    print(f"\n✅ SUCCESS! '{filename}' is ready.")
    print("Please check the file content now. It should contain names like 'ActiveMQ/#'.")

elif not file_list:
    print("❌ Error: No .pkl files found.")
else:
    # Files matched the pattern but every one of them failed to load; saying
    # "no files found" here would point the user at the wrong problem.
    print(f"❌ Error: Found {len(file_list)} .pkl files but none could be loaded.")

[SYSTEM] Found 3 pickle files.
[INFO] Extracted 50 unique root topics.
Sample: ['ActiveMQ', 'gateway', 'tele', 'teslamate', 'zigbee2mqtt']

✅ SUCCESS! 'clean_topics.txt' is ready.
Please check the file content now. It should contain names like 'ActiveMQ/#'.
