In [46]:
import pandas as pd

In [52]:
df = pd.read_csv("raw-data.csv")

In [18]:
df = pd.read_csv("raw-data-2.csv")

In [None]:
df = pd.read_csv("test.csv")

In [55]:
# Get user's content only
df = df[df["role"] == "user"].copy()
df["content"] = (
    df["content"]
      .astype(str)
      .str.replace("\r\n", "\n", regex=False)   # normalize Windows newlines
      .str.replace("\r", "\n", regex=False)     # normalize old Mac newlines
      .str.replace("\n", r"\n", regex=False)    # make newline visible
      .str.slice(0, 2000)                       # trim to max 2000 chars
)

# 2) parse + sort (oldest first within each conversation)
df["date"] = pd.to_datetime(df["date"], utc=True)
# tiebreaker: user first, then assistant
role_order = {"user": 0, "assistant": 1}
df["_role_order"] = df["role"].map(role_order).fillna(9).astype(int)
df = df.sort_values(["conversation_id", "date","_role_order"], ascending=True)

# Separator that is very easy for an LLM to parse
SEP = "\n<USER_TURN>\n"

# Aggregate per conversation_id
out = (
    df.groupby("conversation_id", as_index=False)
      .agg(
          content=("content", lambda s: SEP.join(s.tolist())),
          assistant_id=("assistant_id", "first"),   # same within a conversation in your data
          user_count=("content", "size")            # number of user rows
      )
)

# Keep only requested columns (order matters)
out = out[["conversation_id", "content", "user_count"]]

# Save
out.to_csv("users-by-conversation.csv", index=False)



In [None]:
# Count how many conversations have N user messages
counts = (
    out["user_count"]
      .value_counts()
      .sort_index()
)

print(counts)


user_count
1     3825
2     2980
3     1967
4      996
5      644
6      365
7      225
8      155
9       90
10      51
11      32
12      24
13      12
14      10
15       6
16       5
17       3
18       2
19       4
21       1
22       4
23       5
24       1
25       4
26       1
28       1
30       1
31       1
33       1
35       2
39       1
40       1
50       1
52       1
Name: count, dtype: int64


In [59]:
# Total rows (total conversations)
total_rows = len(out)
print("Total rows:", total_rows)

# Average questions per conversation
avg_questions = out["user_count"].mean()
print("Average questions asked:", round(avg_questions, 2))

Total rows: 11422
Average questions asked: 2.73


In [None]:
import json

# 1) keep only user + assistant
df = df[df["role"].isin(["user", "assistant"])].copy()

# Trim user and assistant's content to avoid exceeding LLM context window
max_user_chars = 2000
max_assistant_chars = 500
def trim_by_role(row: pd.Series) -> str:
    text = "" if pd.isna(row["content"]) else str(row["content"])
    limit = max_user_chars if row["role"] == "user" else max_assistant_chars if row["role"] == "assistant" else None
    return text[:limit] if limit is not None else text
df["content"] = df.apply(trim_by_role, axis=1)

# 2) parse + sort (oldest first within each conversation)
df["date"] = pd.to_datetime(df["date"], utc=True)
# tiebreaker: user first, then assistant
role_order = {"user": 0, "assistant": 1}
df["_role_order"] = df["role"].map(role_order).fillna(9).astype(int)
df = df.sort_values(["conversation_id", "date","_role_order"], ascending=True)

# 3) build a per-conversation "full_conversation" JSONL-style string (easy to parse)
def build_full_conversation(group: pd.DataFrame) -> str:
    msgs = [{"role": r, "content": c} for r, c in zip(group["role"], group["content"])]
    # one JSON object per line (JSONL) is very LLM-friendly
    return "\n".join(json.dumps(m, ensure_ascii=False) for m in msgs)

df_conversations = (
    df.groupby("conversation_id", as_index=False)
      .apply(lambda g: pd.Series({"full_conversation": build_full_conversation(g)}))
      .reset_index(drop=True)
)

  .apply(lambda g: pd.Series({"full_conversation": build_full_conversation(g)}))


In [41]:
pd.set_option("display.max_colwidth", None)

print(final_df.loc[1, "full_conversation"])

{"role": "user", "content": "matching offer  from Rogers"}
{"role": "assistant", "content": "I'd be happy to help you find information about matching offers against Rogers! To give you the most relevant guidance, I need a bit more context:\n\n---\n\n### ðŸ¤” **Quick Questions:**\n\n1. **What service are we matching?** (Internet, TV, Phone, or a bundle?)\n2. **Which province is the customer in?** (This affects available options)\n3. **What's the Rogers offer details?** (Speed, price, services included?)\n4. **Is this an existing TELUS customer or a prospect?**\n5. **What's their main concern?** (Pric"}
