In [106]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

import sys
import os

# Add the project root to Python's module search path.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [107]:
csv_folder = Path("../datas/raw/LLM_Results")

# Parse every CSV exactly once. The paths are sorted because Path.glob yields
# entries in an arbitrary, filesystem-dependent order; sorting makes the
# positions in `dfs` reproducible between runs.
csv_paths = sorted(csv_folder.glob("*.csv"))

# Mapping from file stem (file name without ".csv") to its DataFrame.
dfs_named = {csv_path.stem: pd.read_csv(csv_path) for csv_path in csv_paths}

# Same frames as a list, in the same order as `dfs_named`.
# These are the same objects as the dict values, not independent copies.
dfs = list(dfs_named.values())

In [108]:
def _weaponised_rows(df):
    """Return the rows of `df` whose 'weaponised' column equals 'Weaponised', re-indexed from 0."""
    return df[df['weaponised'] == 'Weaponised'].reset_index(drop=True)


# Filter each frame once, then drop the frames that have no weaponised rows.
dfs_weaponised = [
    weaponised for weaponised in map(_weaponised_rows, dfs) if not weaponised.empty
]

# Same filtering for the named frames, keyed by the original name.
dfs_weaponised_named = {
    name: weaponised
    for name, weaponised in (
        (name, _weaponised_rows(df)) for name, df in dfs_named.items()
    )
    if not weaponised.empty
}

In [132]:
def check_global(ngrams, dfs):
    """
    Search every DataFrame for rows whose 'changed_version' text contains an n-gram.

    The comparison is a case-insensitive, literal (non-regex) substring test.
    Rows with a missing 'changed_version' never match, and DataFrames without
    a 'changed_version' column are skipped.

    Parameters
    ----------
    ngrams : iterable of str
        Substrings to look for.
    dfs : sequence of pandas.DataFrame
        DataFrames to search.

    Returns
    -------
    list of dict
        One entry per (DataFrame, n-gram) pair with at least one hit, with
        keys "df_index" (position of the DataFrame in `dfs`), "ngram" and
        "rows" (list of index labels of the matching rows).
    """
    matches = []

    for i, df in enumerate(dfs):
        if "changed_version" not in df.columns:
            continue

        column = df["changed_version"]
        # Loop-invariant: convert the column to text once per DataFrame
        # instead of once per n-gram.
        text = column.astype(str)
        # astype(str) can turn a missing value into the literal text "nan",
        # which would then match n-grams such as "nan" or "an". Remember which
        # cells really hold a value so missing ones never count as a hit.
        has_value = column.notna()

        for ng in ngrams:
            hit_mask = text.str.contains(ng, case=False, na=False, regex=False) & has_value
            found_rows = df[hit_mask]
            if not found_rows.empty:
                matches.append({
                    "df_index": i,
                    "ngram": ng,
                    "rows": found_rows.index.tolist()
                })

    return matches

def check_local(chunk, dfs_named):
    """
    Find the first article whose 'changed_version' column contains `chunk`.

    The comparison is a case-insensitive, literal (non-regex) substring test.
    Rows with a missing 'changed_version' never match, and DataFrames without
    a 'changed_version' column are skipped. Articles are examined in the
    iteration order of `dfs_named`; only the first one with a hit is returned.

    Parameters
    ----------
    chunk : str
        Substring to look for.
    dfs_named : dict
        Mapping {article_name: DataFrame}.

    Returns
    -------
    tuple
        (article_name, matching_rows) for the first article with a hit, or
        (None, empty DataFrame) when no article contains the chunk.
    """
    for name, df in dfs_named.items():
        if "changed_version" not in df.columns:
            continue

        column = df["changed_version"]
        # astype(str) can turn a missing value into the literal text "nan";
        # mask those cells out so a missing value is never reported as a match.
        hit_mask = (
            column.astype(str).str.contains(chunk, case=False, na=False, regex=False)
            & column.notna()
        )
        found_rows = df[hit_mask]
        if not found_rows.empty:
            return name, found_rows

    return None, pd.DataFrame()

def generate_ngrams(text, n=4):
    """
    Split `text` on whitespace and return every run of `n` consecutive words.

    Parameters
    ----------
    text : str
        Text to split.
    n : int, default 4
        Number of words per n-gram; must be at least 1.

    Returns
    -------
    list of str
        The n-grams in order of appearance, words joined by single spaces.
        Empty when the text has fewer than `n` words.

    Raises
    ------
    ValueError
        If `n` is smaller than 1. Without this check, n <= 0 produces
        empty-string "n-grams", and an empty string is a substring of every
        text.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = text.split()
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

def match_unknown_edits(fg, dfs_weaponised, dfs_named, n=4, limit=None):
    """
    Attribute unknown edits in `fg` to an article, a user and a date.

    For each of the first `limit` rows of `fg` (all rows when `limit` is None):
    1. Skip the row unless 'after_json_text' is a non-blank string.
    2. Build word n-grams from that text and look them up in every frame of
       `dfs_weaponised`.
    3. Pick the frame position with the most matching n-grams and take the
       first n-gram that matched there as the representative.
    4. Look the representative up in `dfs_named` to get an article name, then
       copy 'user' and 'date' of the first matching row into `fg`.

    NOTE(review): step 4 searches every article in `dfs_named` and keeps the
    first hit, so the article it reports is not guaranteed to be the frame
    selected in step 3.

    Parameters
    ----------
    fg : pandas.DataFrame
        Edits to attribute. Modified in place: 'username', 'article' and
        'date' are written for every matched row.
    dfs_weaponised : sequence of pandas.DataFrame
        Frames searched in the global step.
    dfs_named : dict
        Mapping {article_name: DataFrame} used for the article lookup.
    n : int, default 4
        Number of words per n-gram.
    limit : int or None
        Maximum number of leading rows of `fg` to process.

    Returns
    -------
    pandas.DataFrame
        The same `fg` object that was passed in.
    """
    updated_rows = []
    total = min(limit, len(fg)) if limit is not None else len(fg)
    separator = '-' * 80

    for idx, row in fg.head(total).iterrows():
        after_text = row.get('after_json_text', '')
        has_text = isinstance(after_text, str) and bool(after_text.strip())
        if not has_text:
            continue

        print(separator)
        print(f"Checking edit {idx}/{total}")

        matches = check_global(generate_ngrams(after_text, n), dfs_weaponised)
        if not matches:
            print("→ No matches found in any DataFrame.")
            continue

        # Heuristic: the frame hit by the largest number of n-grams wins
        # (ties go to the frame that was hit first).
        hits_per_frame = {}
        for hit in matches:
            frame_pos = hit["df_index"]
            hits_per_frame[frame_pos] = hits_per_frame.get(frame_pos, 0) + 1
        best_df_index = max(hits_per_frame, key=hits_per_frame.get)
        print(f"→ Most matches found in DataFrame {best_df_index}")

        # First n-gram that matched in the winning frame.
        ngrams_in_best = (
            hit["ngram"] for hit in matches if hit["df_index"] == best_df_index
        )
        representative_ngram = next(ngrams_in_best)

        # Resolve the article name through the named frames.
        article_name, found_rows = check_local(representative_ngram, dfs_named)
        if article_name is None:
            print("→ Could not identify article name.")
            continue

        print(f"→ Match found in article: {article_name}")

        # Only the first matching row is used for the user / date lookup.
        for i in found_rows.index[:1]:
            article_df = dfs_named[article_name]
            user = article_df.loc[i, "user"] if "user" in article_df.columns else None
            date = article_df.loc[i, "date"] if "date" in article_df.columns else None

            print(f"   → User: {user}, Date: {date}")
            fg.at[idx, 'username'] = user
            fg.at[idx, 'article'] = article_name
            fg.at[idx, 'date'] = date
            updated_rows.append(idx)

    print("\n✅ Matching complete.")
    print(f"Total updated rows: {len(updated_rows)}")

    return fg


In [None]:
# Load the interim CSV `fg_user_known.csv` into the DataFrame `user`.
# NOTE(review): presumably the edits whose author is already known — confirm.
user = pd.read_csv('../datas/interim/fg_user_known.csv')