In [None]:
from IPython.display import display
import pandas as pd
from constants.abs_paths import AbsDirPath
from pathlib import Path
from processing_pipeline.utilities.data_transformation import load_all_files
import openpyxl

# ------------------------------
# Load the single PR keyword-match source
# ------------------------------
base_dir = Path(AbsDirPath.PR_KEYWORDS_MATCHING)
df_git = load_all_files(base_dir)

# ------------------------------
# Split the matches into one frame per "qa" label
# ------------------------------
qa_labels = df_git["qa"]
df_git_datatransfer = df_git.loc[qa_labels == "datatransfer"]
df_git_ui = df_git.loc[qa_labels == "UI"]
df_git_codeopt = df_git.loc[qa_labels == "code_optimization"]

# ------------------------------
# Where the review workbook is written
# ------------------------------
excel_filename = "pr_first_iteration2.xlsx"  # change as needed
out_dir = Path("/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/analysis/analyzed_matches")
out_dir.mkdir(parents=True, exist_ok=True)  # create the folder tree on first run
out_path = out_dir.joinpath(excel_filename)

# ==============================
# Helpers (same logic as before)
# ==============================
def excel_hyperlink_formula(url: str) -> str:
    """Return an Excel HYPERLINK() formula that shows the raw URL text and is clickable.

    Args:
        url: The link target. Anything that is not a non-blank string
            (e.g. None or NaN) produces an empty cell value.

    Returns:
        ``=HYPERLINK("<url>", "<url>")`` with embedded double quotes doubled,
        the plain stripped URL when it is too long to embed in a formula,
        or ``""`` when there is no usable URL.
    """
    if not isinstance(url, str):
        return ""
    target = url.strip()  # surrounding whitespace would end up inside the link
    if not target:
        return ""
    escaped = target.replace('"', '""')  # Excel escapes a quote by doubling it
    # Excel limits string literals inside a formula to 255 characters; a longer
    # literal makes the cell invalid, so fall back to plain (non-clickable) text.
    if len(escaped) > 255:
        return target
    return f'=HYPERLINK("{escaped}", "{escaped}")'

def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the optional review columns and a ``row_id``.

    Any of ``pattern``, ``url``, ``commit_url`` and ``comment`` that is absent is
    added as an empty-string column; ``row_id`` is (re)set to the frame's index
    so each row can be traced back to its original position. The input frame is
    left untouched.

    Note: a later cell in this notebook defines another ``ensure_schema`` with a
    different column set; whichever cell ran last is the one in effect.
    """
    result = df.copy()
    for column in ("pattern", "url", "commit_url", "comment"):
        if column not in result.columns:
            result[column] = ""  # default: empty cell
    result["row_id"] = result.index  # original index, kept for traceability
    return result

# Columns written to every review sheet, in this order
review_cols = [
    "row_id",
    "matched_word",
    "sentence",
    "source",
    "url",
    "pattern",
    "commit_url",
    "comment",
]

# ------------------------------
# One sheet per category; `names` and `dfs` are index-aligned
# ------------------------------
names = ["git_datatransfer", "git_ui", "git_codeopt"]  # "git_codeopt": exact name requested
dfs = [df_git_datatransfer, df_git_ui, df_git_codeopt]

# Keep only the first row for each (url, matched_word) pair and report the drop
for i in range(len(dfs)):
    before = len(dfs[i])
    dfs[i] = dfs[i].drop_duplicates(subset=["url", "matched_word"], keep="first").copy()
    after = len(dfs[i])
    print(f"[dedupe] Removed {before - after} duplicates (from {before} → {after}) in dfs[{i}] ({names[i]})")

# Row counts after deduplication
count_git_datatransfer, count_git_ui, count_git_codeopt = map(len, dfs)

print("\n=== Counts (deduplicated) ===")
print(f"datatransfer - git: {count_git_datatransfer}")
print(f"UI           - git: {count_git_ui}")
print(f"code_opt     - git: {count_git_codeopt}")
print("----")
print(f"TOTAL git: {sum((count_git_datatransfer, count_git_ui, count_git_codeopt))}")

# ------------------------------
# Write one workbook with a sheet per category
# ------------------------------
with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    for name, df in zip(names, dfs):
        # Add any missing review columns, then project onto the review layout
        out = ensure_schema(df).reindex(columns=review_cols)

        # Turn both link columns into clickable formulas showing the raw URL
        for link_col in ("url", "commit_url"):
            out[link_col] = out[link_col].apply(excel_hyperlink_formula)

        out.to_excel(writer, sheet_name=name, index=False)

        # Keep the header row visible while scrolling
        writer.sheets[name].freeze_panes = "A2"

print(f"Saved review workbook to: {out_path}")


Loaded 3 files, [PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/pr_first_iteration/paperless-ngx.paperless-ngx.v2.18.4.pr_corpus.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/pr_first_iteration/saleor.saleor.3.21.19.pr_corpus.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/pr_first_iteration/netbox-community.netbox.v4.4.1.pr_corpus.parquet')]

=== Processing: git_datatransfer ===
[dedupe] Removed 41 duplicates (from 121 → 80) on (url, matched_word)
[merge] After merging by url: 67 rows

=== Processing: git_ui ===
[dedupe] Removed 0 duplicates (from 8 → 8) on (url, matched_word)
[merge] After merging by url: 6 rows

=== Processing: git_codeopt ===
[dedupe] Removed 32 duplicates (from 103 → 71) on (url, matched_word)
[merge] After merging by url: 65 rows

=== Counts (after merge-by-url) ===
datatransfer - git: 67
UI           - git: 6
code_opt     - git: 65
----
TOTAL 

In [7]:
import pandas as pd
from pathlib import Path
import openpyxl

# ------------------------------
# Input workbook and the merged workbook written next to it
# ------------------------------
in_path = Path("/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/analysis/analyzed_matches/pr_first_iteration2.xlsx")
out_path = in_path.parent / "pr_first_iteration2_merged.xlsx"

# Expected columns, in the order they are kept
review_cols = [
    "row_id",
    "matched_word",
    "sentence",
    "source",
    "url",
    "pattern",
    "commit_url",
    "comment",
]

# ==============================
# Helpers
# ==============================
def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` that has every column listed in ``review_cols``.

    Columns that are absent are appended as empty-string columns, in
    ``review_cols`` order; existing columns and the input frame are unchanged.
    """
    absent = [col for col in review_cols if col not in df.columns]
    # assign() works on a copy, so the caller's frame is never modified
    return df.assign(**dict.fromkeys(absent, ""))

def _join_unique(series, sep=", "):
    """Join unique, non-empty strings in original order."""
    seen = set()
    out = []
    for x in series:
        s = "" if pd.isna(x) else str(x).strip()
        if not s:
            continue
        if s not in seen:
            seen.add(s)
            out.append(s)
    return sep.join(out)

def _join_all(series, sep=", "):
    """Join all values as strings (used for row_id)."""
    return sep.join(str(x) for x in series if pd.notna(x) and str(x).strip() != "")

def process_sheet(df: pd.DataFrame) -> pd.DataFrame:
    """De-duplicate one review sheet and collapse it to one row per url.

    Steps:
      1. Make sure every ``review_cols`` column exists.
      2. Drop repeated (url, matched_word) pairs, keeping the first, and print
         how many rows were removed.
      3. Group by ``url`` (first-seen order) and join ``matched_word`` and
         ``sentence`` as unique values and ``row_id`` as all values; the
         remaining columns are reset to empty strings.

    Returns:
        The merged frame with columns in ``review_cols`` order.
    """
    frame = ensure_schema(df)

    # Step 1: exact de-dupe on (url, matched_word)
    n_before = len(frame)
    frame = frame.drop_duplicates(subset=["url", "matched_word"], keep="first").copy()
    n_after = len(frame)
    print(f"[dedupe] Removed {n_before - n_after} duplicates (from {n_before} → {n_after}) on (url, matched_word)")

    def _blank(_values):
        # Every column that is not merged is emitted as an empty string
        return ""

    # Step 2: merge by url — only these three columns carry data forward
    aggregators = {
        "matched_word": _join_unique,
        "row_id": _join_all,
        "sentence": _join_unique,
    }
    aggregators.update(dict.fromkeys(["source", "pattern", "commit_url", "comment"], _blank))

    # dropna=False keeps rows whose url is missing, as a group of their own.
    # NOTE(review): that means all rows without a url share one group — confirm
    # this is the intended outcome.
    grouped = frame.groupby("url", dropna=False, sort=False)
    merged = grouped.agg(aggregators).reset_index()

    # Restore the canonical column order
    return merged.reindex(columns=review_cols)

# ------------------------------
# Load every sheet, merge it, and save the result to a new workbook
# ------------------------------
# NOTE(review): the export cell writes url/commit_url as =HYPERLINK() formulas.
# Formula cells without a cached result may come back empty when read here —
# TODO verify the url column is populated before trusting the merge.
sheets = pd.read_excel(in_path, sheet_name=None)  # dict: sheet name -> DataFrame
sheet_names = list(sheets)

with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    for name in sheet_names:
        print(f"\n=== Processing sheet: {name} ===")
        merged_sheet = process_sheet(sheets[name])
        merged_sheet.to_excel(writer, sheet_name=name, index=False)
        # Keep the header row visible while scrolling
        writer.sheets[name].freeze_panes = "A2"

print(f"\n✅ Merged workbook saved to: {out_path}")



=== Processing sheet: git_datatransfer ===
[dedupe] Removed 0 duplicates (from 1 → 1) on (url, matched_word)

=== Processing sheet: git_ui ===
[dedupe] Removed 0 duplicates (from 1 → 1) on (url, matched_word)

=== Processing sheet: git_codeopt ===
[dedupe] Removed 0 duplicates (from 1 → 1) on (url, matched_word)

✅ Merged workbook saved to: /Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/analysis/analyzed_matches/pr_first_iteration2_merged.xlsx


In [2]:
cd /Users/danarapp/Desktop/energypattern-keyword-search

/Users/danarapp/Desktop/energypattern-keyword-search
