In [None]:
import polars as pl
from dtale import show
import dtale.global_state as global_state

# Raise D-Tale's maximum column width to 300.
global_state.set_app_settings({"max_column_width": 300})

# Path of the JSON export that is loaded below.
msg = "/Users/ma9o/Downloads/latest 2/Estela.json"

# Read the export, parse the "datetime" strings into pl.Datetime values and
# order the rows chronologically.
raw_messages = pl.read_json(msg)
parsed_datetime = pl.col("datetime").str.strptime(pl.Datetime)
df = raw_messages.with_columns(parsed_datetime.alias("datetime")).sort("datetime")

# Inspect the loaded frame in a D-Tale browser tab.
show(df.to_pandas()).open_browser()


In [None]:
# Collapse runs of consecutive rows that share the same role and content into a
# single row, keeping the earliest timestamp of the run and the run length.
df = (
    df.with_columns(
        (
            # ne_missing treats null as an ordinary value, so the comparison is
            # never null: the first row (whose shifted neighbour is null) gets a
            # proper group id instead of a null one, and two consecutive null
            # contents count as equal. With plain `!=` the first row ended up in
            # a group of its own and was never merged with identical followers.
            pl.col("role").ne_missing(pl.col("role").shift())
            | pl.col("content").ne_missing(pl.col("content").shift())
        )  # True at each boundary between consecutive message groups
        .cast(pl.Int8)  # boolean -> 0/1
        .cum_sum()      # running count of boundaries = id of the current run
        .alias("group_id")
    )
    # group_id keeps identical messages from different runs apart
    .group_by(["group_id", "role", "content"])
    .agg([
        pl.col("datetime").min().alias("datetime"),
        pl.len().alias("message_count"),
    ])
    .drop("group_id")
    .sort("datetime")
)

show(df.to_pandas()).open_browser()


In [None]:
# Total number of messages per hour (hours after 2025-01-01 00:00 only), with
# hours that have no messages filled in as 0.
hourly_messages = (
    df
    # "datetime" is already a pl.Datetime column (it is parsed when the JSON is
    # loaded), so it must not be run through .str.strptime again here.
    .with_columns(pl.col("datetime").dt.truncate("1h").alias("hour"))
    .filter(pl.col("hour").gt(pl.datetime(2025, 1, 1)))
    .group_by("hour")
    .agg(pl.col("message_count").sum().alias("total_messages"))
    # group_by does not preserve row order, and upsample needs its time column
    # sorted, so the sort has to happen after the aggregation.
    .sort("hour")
    # Insert a row for every missing hour; their totals are null until filled.
    .upsample(
        time_column="hour",
        every="1h"
    )
    .fill_null(0)
)

# Inspect the hourly totals in D-Tale
show(hourly_messages.to_pandas()).open_browser()

In [None]:
import polars as pl
import numpy as np
from ruptures import Pelt
import time

def add_chat_sessions(df: pl.DataFrame) -> pl.DataFrame:
    """Label every row with a session id obtained from PELT change-point detection.

    A two-column signal is built from the rows in their current order: the gap
    in seconds to the previous row (0 for the first row) and ``message_count``.
    PELT (rbf cost) is run on that signal and each detected change point starts
    a new session.

    Args:
        df: Frame with a ``datetime`` column (temporal dtype) and a
            ``message_count`` column. Rows are used in the order given.

    Returns:
        ``df`` with an extra integer ``session`` column; sessions are numbered
        from 0 in row order. An empty frame is returned with an empty
        ``session`` column.
    """
    if df.height == 0:
        # Nothing to segment; indexing seconds[0] below would raise.
        return df.with_columns(session=pl.Series([], dtype=pl.Int64))

    # Only differences between consecutive timestamps are used, so no common
    # base time needs to be subtracted first.
    seconds = df.get_column('datetime').dt.epoch(time_unit='s').to_numpy()
    message_counts = df['message_count'].to_numpy()

    # 2D signal: time gap to the previous message, and message count
    time_gaps = np.diff(seconds, prepend=seconds[0])
    signal = np.column_stack((time_gaps, message_counts))

    print("Running PELT")
    start = time.perf_counter()

    # Apply PELT with a penalty that grows with the log of the signal length
    algo = Pelt(model="rbf", min_size=3, jump=1).fit(signal)
    change_points = algo.predict(pen=np.log(len(signal)))

    print(f"PELT took {time.perf_counter() - start} seconds")

    print(change_points)

    # The last change point is always the signal length, so it is dropped.
    # A row's session id is the number of change points at or before its
    # index, computed in one pass instead of rewriting the tail of the label
    # array once per change point.
    boundaries = np.asarray(change_points[:-1], dtype=int)
    session_labels = np.searchsorted(boundaries, np.arange(len(df)), side="right")

    # Add session labels to DataFrame
    return df.with_columns(
        session=pl.Series(session_labels)
    )

In [None]:
# Attach session labels to the messages and open the result in D-Tale.
df2 = add_chat_sessions(df)
sessions_view = show(df2.to_pandas())
sessions_view.open_browser()

In [None]:
import hdbscan
from hdbscan.prediction import all_points_membership_vectors


def cluster_messages(time_dim: np.ndarray, density_dim: np.ndarray) -> pl.Series:
    """Cluster points described by two features with HDBSCAN (soft assignment).

    Args:
        time_dim: First feature, one value per point.
        density_dim: Second feature, one value per point.

    Returns:
        A Series holding, for every point, the index of the cluster with the
        highest membership score (argmax over the membership vector).
    """
    # One row per point, one column per feature.
    features = np.column_stack((time_dim, density_dim))

    model = hdbscan.HDBSCAN(
        min_cluster_size=5,
        metric='euclidean',
        prediction_data=True,
    )
    model.fit(features)

    # Per-point membership score for each cluster; keep the strongest one.
    soft_memberships = all_points_membership_vectors(model)
    return pl.Series(soft_memberships.argmax(axis=1))

In [None]:
import numpy as np
import math
from llama_cpp.llama_chat_format import format_llama3
from huggingface_hub import hf_hub_download
import llama_cpp


# Name of the GGUF file to fetch from the Hub repository below.
filename = "llama-3.2-1b-instruct-q8_0.gguf"

# Download into the current directory, as a real file rather than a symlink.
model_path = hf_hub_download(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename=filename,
    local_dir=".",
    local_dir_use_symlinks=False,
)

# Load the model. logits_all=True is required here: the perplexity computation
# reads the logits of every evaluated token, not just the last one.
llm = llama_cpp.Llama(
    model_path=model_path,
    n_ctx=2048,    # context window size
    n_batch=512,   # batch size for prompt processing
    n_threads=8,
    n_gpu_layers=-1,
    logits_all=True,
)


def get_perplexity(text: str):
    """Return the perplexity of a chat prompt containing `text` under `llm`.

    `text` is placed as the user message after a fixed system message, the
    conversation is rendered with the Llama 3 chat template, and the result is
    scored token by token with the module-level `llm`.

    Args:
        text: Message content to score.

    Returns:
        exp of the mean negative log-probability of every prompt token after
        the first, each conditioned on the tokens before it. Note that the
        template and system-message tokens are part of the scored sequence.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant, trying to guess the context behind single messages from a chat app."
        },
        {
            "role": "user",
            "content": text
        },
    ]

    chat = format_llama3(messages)
    prompt = chat.prompt + chat.stop

    llm.reset()  # important to reset the model state before each run

    # special=True makes the tokenizer map the template's control markers
    # (e.g. "<|start_header_id|>", "<|eot_id|>") to their dedicated token ids.
    # With the default special=False they are split up as ordinary text, so the
    # model is scored on a sequence that does not match its chat format.
    tokens = llm.tokenize(prompt.encode(), special=True)
    llm.eval(tokens)
    logits = np.array(llm.eval_logits)
    logprobs = llm.logits_to_logprobs(logits)

    # Row i holds the prediction made after token i, so token i + 1 is looked
    # up in row i. The first token has no preceding context and is skipped.
    selected_logprobs = [
        logprobs[position, token_id]
        for position, token_id in enumerate(tokens[1:])
    ]

    # Perplexity = exp(mean negative log-likelihood)
    cross_entropy = -sum(selected_logprobs) / len(selected_logprobs)
    perplexity = math.exp(cross_entropy)

    return perplexity

In [None]:
from tqdm.notebook import tqdm

# Score every message, showing a progress bar while the model runs
perplexities = [
    get_perplexity(message_text)
    for message_text in tqdm(df['content'], desc="Calculating perplexities")
]

# Attach the scores to the frame as a "perplexity" column
df = df.with_columns(pl.Series("perplexity", perplexities))

In [None]:
import gzip

def compression_ratio(text):
    """Return the gzip-compressed size of `text` divided by its UTF-8 size.

    Values below 1 mean the text compresses well; very short texts give values
    above 1 because of the fixed gzip overhead.

    Args:
        text: String to measure.

    Returns:
        compressed byte length / original byte length, as a float. For an
        empty string the original length is treated as 1 byte, so the result
        is the finite gzip overhead rather than a ZeroDivisionError.
    """
    text_bytes = text.encode('utf-8')
    compressed = gzip.compress(text_bytes)
    # max(..., 1) guards the empty-string case against division by zero
    return len(compressed) / max(len(text_bytes), 1)

In [None]:
# Step 1: add the per-message compression ratio. This needs its own frame
# because the clustering below reads the "compression" column, which does not
# exist yet on `df` (and `df4` cannot be referenced before it is assigned).
df_compressed = df.with_columns(
    compression=pl.col('content').map_elements(compression_ratio, return_dtype=pl.Float64)
)

# Step 2: cluster on (timestamp, compression ratio) and attach the labels.
# NOTE(review): the two features are on very different scales and are passed
# to a euclidean metric unscaled — confirm this is intended.
df4 = df_compressed.with_columns(
    clusters=cluster_messages(
        df_compressed['datetime'].dt.timestamp().to_numpy(),
        df_compressed['compression'].to_numpy(),
    )
)

show(df4.to_pandas()).open_browser()