In [1]:
import csv
import getpass
import json
import os
import pathlib
import re
import textwrap
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine

# ── file paths ───────────────────────────────────────────────────────
# Input/output JSONL files are given relative to the working directory.
IN_JSONL  = "sprint_goals_training_data-qwen-3B.jsonl"
OUT_JSONL = "sprint_goals_training_data-qwen-3B_sep.jsonl"
# Delimiter placed between issue titles inside `formatted_issues`.
SEP       = " ||||| "

# Echo the fully resolved locations so it is obvious which files are used.
for _label, _target in (("JSONL  in :", IN_JSONL), ("JSONL out :", OUT_JSONL)):
    print(_label, pathlib.Path(_target).resolve())


JSONL  in : C:\Users\Marc\Desktop\AgenticRAG\AGENT\data\sprint_goals_training_data-qwen-3B.jsonl
JSONL out : C:\Users\Marc\Desktop\AgenticRAG\AGENT\data\sprint_goals_training_data-qwen-3B_sep.jsonl


In [2]:
# -------- build SQLAlchemy engine --------
# Connection settings are read from the environment so that no secret is
# stored in the notebook. Defaults reproduce the previous local setup
# (root@localhost/tawosdb); only the password has no default.
db_user = os.environ.get("TAWOS_DB_USER", "root")
db_host = os.environ.get("TAWOS_DB_HOST", "localhost")
db_name = os.environ.get("TAWOS_DB_NAME", "tawosdb")

db_password = os.environ.get("TAWOS_DB_PASSWORD")
if db_password is None:
    # Interactive fallback: prompt without echoing the password.
    db_password = getpass.getpass(f"MySQL password for {db_user}@{db_host}: ")

# quote_plus escapes characters such as '@' that would otherwise be parsed
# as URL delimiters inside the connection string.
password = urllib.parse.quote_plus(db_password)
engine   = create_engine(
    f"mysql+pymysql://{urllib.parse.quote_plus(db_user)}:{password}@{db_host}/{db_name}"
)

# One row per issue that is attached to a named sprint.
sql = """
SELECT
    s.Name        AS sprint_name,
    i.Title       AS issue_title
FROM Issue i
JOIN Sprint s ON i.Sprint_ID = s.ID
WHERE s.Name IS NOT NULL;
"""
issues_df = pd.read_sql(sql, con=engine)
print("Rows from DB:", len(issues_df))


Rows from DB: 43475


In [3]:
def join_issues(issue_series, sep=" ||||| "):
    """
    Concatenate issue titles into a single string separated by `sep`.

    Non-string entries (e.g. None / NaN) are skipped. Each title is stripped
    of surrounding whitespace, and if it is wrapped in a pair of double
    quotes exactly one leading and one trailing quote are removed. Quotes or
    apostrophes inside the title are left as they are.

    Parameters
    ----------
    issue_series : iterable
        Issue titles, typically a pandas Series.
    sep : str
        Delimiter inserted between titles (default ' ||||| ').

    Returns
    -------
    str
        The joined titles; an empty string when no title qualifies.
    """
    def _drop_outer_quotes(title):
        title = title.strip()
        is_wrapped = len(title) >= 2 and title.startswith('"') and title.endswith('"')
        return title[1:-1] if is_wrapped else title

    return sep.join(
        _drop_outer_quotes(title)
        for title in issue_series
        if isinstance(title, str)
    )


In [4]:
# Collapse the issue titles of every sprint into one delimited string.
# The delimiter is passed explicitly as SEP so that it always matches the
# one used later when the string is split again to count issues.
# NOTE(review): grouping is by sprint name only (the query does not select the
# sprint ID), so sprints that share a name would be combined — confirm that
# names are unique in the DB.
issues_grouped = (
    issues_df.groupby("sprint_name")["issue_title"]
             .apply(join_issues, sep=SEP)      # single string per sprint
             .reset_index()
             .rename(columns={"issue_title": "formatted_issues"})
)

# Preview the first 120 characters of the first sprint's joined titles.
print("Example sprint block:\n",
      issues_grouped.iloc[0]["formatted_issues"][:120], "...")


Example sprint block:
 ConfigurationAction.doSetCardColor Persistent XSS ||||| UpdatingStatus Persistent XSS ||||| UpdatePField Reflected XSS | ...


In [5]:
# Load the original training examples: one JSON object per line.
with open(IN_JSONL, encoding="utf-8") as f:
    # Blank lines (for instance a trailing empty line at the end of the file)
    # are skipped because json.loads raises on empty input.
    rows = [json.loads(ln) for ln in f if ln.strip()]

orig_df = pd.DataFrame(rows)
print("Original rows:", len(orig_df))


Original rows: 2945


In [6]:
# Swap the old issue columns for the ones rebuilt from the database.
# validate="many_to_one" makes pandas raise if a sprint name matched more than
# one aggregated row, which would otherwise duplicate training examples.
merged = (
    orig_df.drop(columns=["formatted_issues", "num_issues"])
           .merge(issues_grouped, on="sprint_name", how="left",
                  validate="many_to_one")
)

# A left merge leaves NaN where a sprint has no match. json.dumps would write
# that as the bare token NaN, which is not valid JSON, so store "" instead and
# report how many sprints were affected.
unmatched = int(merged["formatted_issues"].isna().sum())
if unmatched:
    print("⚠  sprints without issues in DB:", unmatched)
merged["formatted_issues"] = merged["formatted_issues"].fillna("")

# compute fresh num_issues
# An empty string means "no issues"; "".split(SEP) would return [""] and be
# counted as one issue, hence the explicit truthiness check.
merged["num_issues"] = merged["formatted_issues"].apply(
    lambda x: len(x.split(SEP)) if isinstance(x, str) and x else 0
)

# sanity-check a few
print(merged.head()[["sprint_name", "num_issues", "formatted_issues"]])

# write out
# to_dict(orient="records") yields one plain dict per row without the
# per-row Series construction that iterrows() performs.
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for record in merged.to_dict(orient="records"):
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("✔  wrote", len(merged), "rows to", OUT_JSONL)


                     sprint_name  num_issues  \
0       $.krypton - 6.3.1 part 2          10   
1  0 Blast off - Mar 13-17 - SYD           1   
2                             10          29   
3            10-Annamite range 2           3   
4                       10tative          36   

                                    formatted_issues  
0  ConfigurationAction.doSetCardColor Persistent ...  
1  Heading is affecting further than what's being...  
2  Improve logs per request ||||| [POOL_UPGRADE] ...  
3  JIRA Issues Macro returns ""Data cannot be ret...  
4  UnsupportedOperationException: Asynchronous re...  
✔  wrote 2945 rows to sprint_goals_training_data-qwen-3B_sep.jsonl
