In [1]:
cd /Users/danarapp/Desktop/energypattern-keyword-search

/Users/danarapp/Desktop/energypattern-keyword-search


In [2]:
from IPython.display import display
import pandas as pd
from constants.abs_paths import AbsDirPath
from pathlib import Path
from processing_pipeline.utilities.data_transformation import load_all_files
import openpyxl


def display_and_save(df, path) -> None:
    """Write ``df`` to an Excel file at ``path``, then render it in the notebook.

    The file is written via ``df.to_excel`` with ``merge_cells=False``; the
    frame is displayed only after the write call has returned.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame in this notebook).
        path: Destination passed straight through to ``to_excel``.
    """
    df.to_excel(path, merge_cells=False)
    display(df)

def display_and_save_csv(df, path) -> None:
    """Write ``df`` to a CSV file at ``path``, then render it in the notebook.

    The file is written via ``df.to_csv`` with its default options; the frame
    is displayed only after the write call has returned.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        path: Destination passed straight through to ``to_csv``.
    """
    df.to_csv(path)
    display(df)

In [6]:
# Load every file found under the PR keyword-matching directory into `df_git`.
# NOTE(review): the cell below (In [4]) assigns `base_dir` and `df_git` as well, so the
# data seen by later cells depends on which of the two cells was executed last.
base_dir = Path(AbsDirPath.PR_KEYWORDS_MATCHING)

df_git = load_all_files(base_dir)


Loaded 3 files, [PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/pr_first_iteration/paperless-ngx.paperless-ngx.v2.18.4.pr_corpus.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/pr_first_iteration/saleor.saleor.3.21.19.pr_corpus.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/pr_first_iteration/netbox-community.netbox.v4.4.1.pr_corpus.parquet')]


In [4]:
# Load the second keyword-matching run: `df_git` from the base directory and
# `df_comments` from its "full" subdirectory.
# NOTE(review): `df_git` is also assigned by the cell above (In [6]); the execution
# counts show that cell ran later, so its value is the one that survived in this session.
base_dir = Path(AbsDirPath.SECOND_KEYWORDS_MATCHING)
full_dir = base_dir / "full"

df_git = load_all_files(base_dir)
df_comments = load_all_files(full_dir)

Loaded 10 files, [PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/second_testiteration/zulip.zulip.11.2.issue_comment.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/second_testiteration/netbox-community.netbox.v4.4.1.issue.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/second_testiteration/zulip.zulip.11.2.issue.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/second_testiteration/netbox-community.netbox.v4.4.1.release.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/second_testiteration/paperless-ngx.paperless-ngx.v2.18.4.issue.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/second_testiteration/saleor.saleor.3.21.19.issue.parquet'), PosixPath('/Users/danarapp/Desktop/energypattern-keyword-search/data/keywords_2/second_testiteration/saleor.sa

In [None]:
# Split both match tables by the value of their "qa" column. Each table yields
# three frames, in the order: datatransfer, UI, code_optimization.
df_git_datatransfer, df_git_ui, df_git_codeopt = (
    df_git.loc[df_git["qa"] == qa_label]
    for qa_label in ("datatransfer", "UI", "code_optimization")
)
df_comments_datatransfer, df_comments_ui, df_comments_codeopt = (
    df_comments.loc[df_comments["qa"] == qa_label]
    for qa_label in ("datatransfer", "UI", "code_optimization")
)


In [None]:
# Destination of the review workbook; the directory is created on demand.
excel_filename = "pr_first_iteration.xlsx"  # change as needed
out_dir = Path(
    "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/analysis/analyzed_matches"
)
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / excel_filename

# Frames to export, in fixed order: (git, comments) for datatransfer, UI, code_optimization.
dfs = [
    df_git_datatransfer,
    df_comments_datatransfer,
    df_git_ui,
    df_comments_ui,
    df_git_codeopt,
    df_comments_codeopt,
]

# Replace each list entry with a copy that keeps only the first row per
# (url, matched_word) pair, and report how many rows were dropped.
for position, frame in enumerate(dfs):
    unique_rows = frame.drop_duplicates(subset=["url", "matched_word"], keep="first").copy()
    n_before, n_after = len(frame), len(unique_rows)
    dfs[position] = unique_rows
    print(f"[dedupe] Removed {n_before - n_after} duplicates (from {n_before} → {n_after}) in dfs[{position}]")


[dedupe] Removed 41 duplicates (from 121 → 80) in dfs[0]
[dedupe] Removed 0 duplicates (from 8 → 8) in dfs[1]
[dedupe] Removed 32 duplicates (from 103 → 71) in dfs[2]


In [7]:
# Row counts of the deduplicated frames; `dfs` alternates git / comments per category.
count_git_datatransfer, count_comments_datatransfer = len(dfs[0]), len(dfs[1])
count_git_ui, count_comments_ui = len(dfs[2]), len(dfs[3])
count_git_codeopt, count_comments_codeopt = len(dfs[4]), len(dfs[5])

# Totals per source
total_git = sum((count_git_datatransfer, count_git_ui, count_git_codeopt))
total_comments = sum((count_comments_datatransfer, count_comments_ui, count_comments_codeopt))

# Emit the whole report with a single print call.
report_lines = [
    "\n=== Counts (deduplicated) ===",
    f"datatransfer - git: {count_git_datatransfer}, comments: {count_comments_datatransfer}",
    f"UI           - git: {count_git_ui}, comments: {count_comments_ui}",
    f"code_opt     - git: {count_git_codeopt}, comments: {count_comments_codeopt}",
    "----",
    f"TOTAL git: {total_git}",
    f"TOTAL comments: {total_comments}",
]
print("\n".join(report_lines))



=== Counts (deduplicated) ===
datatransfer - git: 437, comments: 83
UI           - git: 28, comments: 8
code_opt     - git: 146, comments: 33
----
TOTAL git: 611
TOTAL comments: 124


In [9]:

# Columns written to every sheet of the review workbook, in this order.
# "pattern", "commit_url" and "comment" are created empty by ensure_schema() when a
# frame does not already have them; "row_id" is always set there from the frame's index.
review_cols = ["row_id", "matched_word", "sentence", "source", "url", "pattern", "commit_url", "comment"]

# ==============================
# Helpers
# ==============================
def excel_hyperlink_formula(url: str) -> str:
    """Build an Excel ``HYPERLINK()`` formula whose link text is the URL itself.

    Args:
        url: Target address. Anything that is not a string, or a string that
            is empty or whitespace-only, is treated as "no link".

    Returns:
        The formula text, with every double quote in ``url`` doubled so it
        stays inside the formula's string literals, or ``""`` when there is
        no usable URL.
    """
    has_link = isinstance(url, str) and url.strip() != ""
    if has_link:
        escaped = url.replace('"', '""')
        return '=HYPERLINK("{0}", "{0}")'.format(escaped)
    return ""

def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` that carries the columns the review export relies on.

    Any of ``pattern``, ``url``, ``commit_url`` and ``comment`` that is missing
    is appended (in that order) and filled with empty strings. ``row_id`` is
    always (re)assigned from the frame's index so rows remain traceable to
    their origin. The input frame is left unmodified.
    """
    prepared = df.copy()
    for placeholder in ("pattern", "url", "commit_url", "comment"):
        if placeholder not in prepared.columns:
            prepared[placeholder] = ""
    # Snapshot the original index as a regular column.
    prepared["row_id"] = prepared.index
    return prepared



# ==============================
# Write Excel with multiple sheets (fixed sheet names)
# ==============================
# Sheet titles, paired by position with the frames in `dfs`.
names = [
    "git_datatransfer",
    "comments_datatransfer",
    "git_UI",
    "comments_UI",
    "git_code_optimization",
    "comments_code_optimization",
]

# One worksheet per frame, all in a single workbook at `out_path`.
with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    for sheet_name, frame in zip(names, dfs):
        # Add any missing review columns, then keep exactly `review_cols` in that order.
        sheet_df = ensure_schema(frame).reindex(columns=review_cols)

        # Turn both link columns into HYPERLINK formulas that display the raw URL.
        for link_col in ("url", "commit_url"):
            sheet_df[link_col] = sheet_df[link_col].apply(excel_hyperlink_formula)

        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Keep the header row visible while scrolling.
        writer.sheets[sheet_name].freeze_panes = "A2"

print(f"Saved review workbook to: {out_path}")



Saved review workbook to: /Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/analysis/analyzed_matches/pr_first_iteration.xlsx
