In [20]:
cd /Users/danarapp/Desktop/scientific-software-architecture-analysis

/Users/danarapp/Desktop/scientific-software-architecture-analysis


In [22]:
from IPython.display import display
from constants.abs_paths import AbsDirPath
from pathlib import Path
from processing_pipeline.utilities.data_transformation import load_all_files
import openpyxl
import pandas as pd

def display_and_save(df, path):
    """Write ``df`` to an Excel workbook at ``path``, then show it in the notebook.

    The workbook is written with ``merge_cells=False``. The frame is rendered
    only after the write call has returned, so a failed write shows nothing.

    Args:
        df: Frame to persist and display.
        path: Destination handed straight to ``DataFrame.to_excel``.
    """
    df.to_excel(excel_writer=path, merge_cells=False)
    display(df)

def display_and_save_csv(df, path):
    """Write ``df`` to a CSV file at ``path``, then show it in the notebook.

    The frame is rendered only after the write call has returned, so a failed
    write shows nothing.

    Args:
        df: Frame to persist and display.
        path: Destination handed straight to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=path)
    display(df)

In [23]:
# Root directory of the keyword-matching results; its "full" subdirectory is
# loaded as a separate data set below.
# NOTE(review): the output recorded for this cell is
#   AttributeError: type object 'AbsDirPath' has no attribute 'SECOND_KEYWORDS_MATCHING'
# so this attribute name has to be reconciled with constants.abs_paths before
# the notebook can run top to bottom on a fresh kernel.
base_dir = Path(AbsDirPath.SECOND_KEYWORDS_MATCHING)
full_dir = base_dir / "full"

# Both frames come from the project helper `load_all_files`.
# Presumably df_git holds git-derived matches and df_comments holds
# code-comment matches — verify against the pipeline that writes these folders.
df_git = load_all_files(base_dir)
df_comments = load_all_files(full_dir)

AttributeError: type object 'AbsDirPath' has no attribute 'SECOND_KEYWORDS_MATCHING'

In [8]:
def _rows_with_qa(frame, label):
    """Return the rows of ``frame`` whose ``qa`` column equals ``label``."""
    return frame.loc[frame["qa"].eq(label)]


# One (git, comments) pair of subsets per value of the "qa" column.

# datatransfer
df_git_datatransfer = _rows_with_qa(df_git, "datatransfer")
df_comments_datatransfer = _rows_with_qa(df_comments, "datatransfer")

# UI
df_git_ui = _rows_with_qa(df_git, "UI")
df_comments_ui = _rows_with_qa(df_comments, "UI")

# code_optimization
df_git_codeopt = _rows_with_qa(df_git, "code_optimization")
df_comments_codeopt = _rows_with_qa(df_comments, "code_optimization")


In [9]:
# Destination of the review workbook.
# NOTE(review): out_dir is a machine-specific absolute path.
excel_filename = "test.xlsx"  # change as needed
out_dir = Path("/Users/danarapp/Desktop/scientific-software-architecture-analysis/processing_pipeline/analysis/analyzed_matches")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / excel_filename

# Frames to export; the order here is the order of the sheets written later.
dfs = [
    df_git_datatransfer,
    df_comments_datatransfer,
    df_git_ui,
    df_comments_ui,
    df_git_codeopt,
    df_comments_codeopt,
]


def _drop_repeated_matches(i, frame):
    """Return a copy of ``frame`` keeping the first row per (url, matched_word).

    Prints how many rows were removed; ``i`` is the frame's position in
    ``dfs`` and is only used in that message.
    """
    before = len(frame)
    deduped = frame.drop_duplicates(subset=["url", "matched_word"], keep="first").copy()
    removed = before - len(deduped)
    print(f"[dedupe] Removed {removed} duplicates (from {before} → {len(deduped)}) in dfs[{i}]")
    return deduped


# Swap each list entry for its de-duplicated copy. The list object itself is
# reused; the original df_* variables keep their duplicates.
dfs[:] = [_drop_repeated_matches(i, frame) for i, frame in enumerate(dfs)]


[dedupe] Removed 91 duplicates (from 469 → 378) in dfs[0]
[dedupe] Removed 9 duplicates (from 80 → 71) in dfs[1]
[dedupe] Removed 140 duplicates (from 379 → 239) in dfs[2]
[dedupe] Removed 204 duplicates (from 342 → 138) in dfs[3]
[dedupe] Removed 8 duplicates (from 54 → 46) in dfs[4]
[dedupe] Removed 0 duplicates (from 12 → 12) in dfs[5]


In [None]:

# Columns written to every review sheet, in this left-to-right order.
# "row_id", "commit_url" and "comment" are the columns added for the review.
review_cols = [
    "row_id",
    "matched_word",
    "sentence",
    "source",
    "url",
    "pattern",
    "commit_url",
    "comment",
]

# ==============================
# Helpers
# ==============================
def excel_hyperlink_formula(url: str) -> str:
    """Build an Excel ``HYPERLINK()`` formula whose visible text is the URL itself.

    Args:
        url: Link target. Any value that is not a non-blank string (``None``,
            ``NaN``, ``""``, whitespace only) is treated as "no link".

    Returns:
        ``""`` when there is no link;
        ``=HYPERLINK("<url>", "<url>")`` for an ordinary URL;
        the URL unchanged (plain text, not clickable) when it is too long to
        be embedded as a string literal in an Excel formula.
    """
    # Excel rejects formulas containing a text literal longer than this.
    max_literal_len = 255

    if not isinstance(url, str) or not url.strip():
        return ""

    # Inside an Excel string literal a double quote is written as two quotes.
    safe = url.replace('"', '""')

    # Checked on the escaped text, which is never shorter than the raw URL,
    # so the guard errs on the side of plain text.
    if len(safe) > max_literal_len:
        return url

    return f'=HYPERLINK("{safe}", "{safe}")'

def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` that carries every column the review sheet needs.

    ``pattern``, ``url``, ``commit_url`` and ``comment`` are created as
    empty-string columns when absent; columns that already exist are left
    untouched. ``row_id`` is always (re)set to the frame's index so each row
    can be traced back to its source frame. The input frame is not modified.
    """
    result = df.copy()
    for column in ("pattern", "url", "commit_url", "comment"):
        if column not in result.columns:
            result[column] = ""
    result["row_id"] = result.index
    return result



# ==============================
# Write Excel with multiple sheets (fixed sheet names)
# ==============================
# Sheet names, positionally paired with the entries of `dfs`.
names = [
    "git_datatransfer",
    "comments_datatransfer",
    "git_UI",
    "comments_UI",
    "git_code_optimization",
    "comments_code_optimization",
]

# zip() stops at the shorter list, which would silently drop sheets if the two
# lists ever drift apart — fail loudly instead.
if len(names) != len(dfs):
    raise ValueError(
        f"Expected one sheet name per DataFrame, got {len(names)} names for {len(dfs)} frames"
    )

# Columns that intentionally hold formulas (clickable links). Sheets are written
# with index=False, so worksheet columns follow review_cols order (1-based).
link_cols = ("url", "commit_url")
link_positions = {review_cols.index(col) + 1 for col in link_cols}

with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    for name, df in zip(names, dfs):
        dfx = ensure_schema(df)
        out = dfx.reindex(columns=review_cols)

        # Make both URL columns clickable with the raw URL as visible text.
        for col in link_cols:
            out[col] = out[col].apply(excel_hyperlink_formula)

        # One sheet per frame, under its explicit, unique name.
        out.to_excel(writer, sheet_name=name, index=False)

        # Freeze the header row.
        ws = writer.sheets[name]
        ws.freeze_panes = "A2"

        # openpyxl stores any string that starts with "=" as a formula. Free
        # text (e.g. a sentence such as "=====") would then become a broken
        # formula, so force such cells back to plain strings everywhere except
        # the link columns.
        for row in ws.iter_rows(min_row=2):
            for cell in row:
                if cell.data_type == "f" and cell.col_idx not in link_positions:
                    cell.data_type = "s"

print(f"Saved review workbook to: {out_path}")



Saved review workbook to: /Users/danarapp/Desktop/scientific-software-architecture-analysis/processing_pipeline/analysis/analyzed_matches/test.xlsx
