In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

import sys
import os

# Add the project root to Python's module search path
sys.path.append(os.path.abspath(".."))

In [15]:
csv_folder = Path("../datas/raw/LLM_Results")
dfs = [pd.read_csv(f) for f in csv_folder.glob("*.csv")]
dfs_named = {f.stem: pd.read_csv(f) for f in csv_folder.glob("*.csv")}

In [17]:
dfs_named['Immigration_to_Ukraine_analysis']

Unnamed: 0,initial_version,changed_version,comment,user,date,llm_output,weaponised
0,#REDIRECT [[List of numbers of people immigrat...,Initial revision,Amitchell125 moved page [[Immigration to Ukrai...,Amitchell125,2023-12-30T13:06:33Z,Change: Moved page [[Immigration to Ukraine]] ...,Not Weaponised


In [3]:
dfs_weaponised = [df[df['weaponised'] == 'Weaponised'].reset_index(drop=True) for df in dfs]
dfs_weaponised = [df for df in dfs_weaponised if not df.empty]
dfs_weaponised_named = {
    name: df[df['weaponised'] == 'Weaponised'].reset_index(drop=True)
    for name, df in dfs_named.items()
    if not df[df['weaponised'] == 'Weaponised'].empty
}

In [22]:
dfs_weaponised[0].head()

Unnamed: 0,initial_version,changed_version,comment,user,date,llm_output,weaponised
0,{{main|2020 coronavirus outbreak in Europe}}\n...,"--- \n+++ \n@@ -26,7 +26,7 @@\n On 3 March, Uk...",cite web,VanHelsing.16,2020-03-13T14:04:15Z,Change: Replaced a URL reference with a citati...,Weaponised
1,{{main|2020 coronavirus outbreak in Europe}}\n...,"--- \n+++ \n@@ -17,18 +17,17 @@\n The [[2019‚Äì2...",/* Timeline */ copyediting - millions of Ukrai...,Boud,2020-03-16T02:16:56Z,The change made in this revision is the replac...,Weaponised
2,{{main|2020 coronavirus outbreak in Europe}}\n...,"--- \n+++ \n@@ -33,6 +33,7 @@\n On '''17 March...",/* Timeline */,Euroserhi,2020-03-18T23:05:13Z,Described change: Added information about new ...,Weaponised
3,{{main|2020 coronavirus outbreak in Europe}}\n...,"--- \n+++ \n@@ -36,7 +36,7 @@\n \n Later that ...",A space removed,Ad√ªn√¢i,2020-03-20T03:04:05Z,Change Description: Removed a space in the tex...,Weaponised
4,{{main|2020 coronavirus outbreak in Europe}}\n...,"--- \n+++ \n@@ -28,13 +28,13 @@\n \n On '''12 ...",/* Timeline */,Rygor2002,2020-03-20T16:44:22Z,The change made was the correction of the spel...,Weaponised


In [18]:
len(dfs_weaponised)

94

In [69]:
from tqdm import tqdm
import pandas as pd
import json

def generate_ngrams(text, n=4):
    words = text.split()
    return [" ".join(words[i:i+n]) for i in range(len(words) - n + 1)]

def check_global(ngrams, dfs):
    """Search across all weaponised DataFrames for ngram matches in 'changed_version'."""
    matches = []
    for i, df in enumerate(dfs):
        if "changed_version" not in df.columns:
            continue
        for ng in ngrams:
            found_rows = df[df["changed_version"].astype(str).str.contains(ng, case=False, na=False, regex=False)]
            if not found_rows.empty:
                matches.append({
                    "df_index": i,
                    "ngram": ng,
                    "rows": found_rows.index.tolist()
                })
    return matches

def check_local(chunk, dfs_named):
    """Given a text chunk, find which article(s) contain it in 'changed_version'."""
    for name, df in dfs_named.items():
        if "changed_version" not in df.columns:
            continue
        found_rows = df[df["changed_version"].astype(str).str.contains(chunk, case=False, na=False, regex=False)]
        if not found_rows.empty:
            return name, found_rows
    return None, pd.DataFrame()

def match_unknown_edits(fg, dfs_weaponised, dfs_named, n=4, limit=None):
    """
    Enhanced version:
    - For each edit chunk in fg, generate n-grams.
    - Search all weaponised DataFrames for matches.
    - Collect all matching rows across articles into a separate DataFrame.
    - Fill 'username', 'article', 'date' in fg from the first match only.
    """
    updated_rows = []
    all_matches = []
    total = len(fg) if limit is None else min(limit, len(fg))

    print(total)
    for idx, row in tqdm(fg.iterrows(), total=total, desc="üîç Matching edits"):
        after_text = row.get("aligned_after_chunk", "")
        if not isinstance(after_text, str) or not after_text.strip():
            continue

        ngrams = generate_ngrams(after_text, n)
        matches = check_global(ngrams, dfs_weaponised)

        if not matches:
            continue

        detailed_matches = []

        for match in matches:
            ngram = match["ngram"]
            df_index = match["df_index"]
            df = dfs_weaponised[df_index]

            for row_index in match["rows"]:
                article_name, found_rows = check_local(ngram, dfs_named)
                if article_name is None or found_rows.empty:
                    continue

                df_article = dfs_named[article_name]
                if row_index not in df_article.index:
                    continue

                user = df_article.at[row_index, "user"] if "user" in df_article.columns else None
                date = df_article.at[row_index, "date"] if "date" in df_article.columns else None
                comment = df_article.at[row_index, "comment"] if "comment" in df_article.columns else None
                llm_output = df_article.at[row_index, "llm_output"] if "llm_output" in df_article.columns else None
                weaponised_label = df_article.at[row_index, "weaponised"] if "weaponised" in df_article.columns else None

                match_info = {
                    # From matched row in dfs_named
                    "article": article_name,
                    "user": user,
                    "date": date,
                    "comment": comment,
                    "llm_output": llm_output,
                    "weaponised": weaponised_label,
                    "ngram": ngram,
                    "df_index": df_index,
                    "row_index_matched": row_index,

                    # From current fg row
                    "fg_row_index": row.get("row_index"),
                    "detected_before": row.get("detected_before"),
                    "detected_after": row.get("detected_after"),
                    "clean_before": row.get("clean_before"),
                    "clean_after": row.get("clean_after"),
                    "type_of_change_extracted": row.get("type_of_change_extracted"),
                    "category_extracted_clean": row.get("category_extracted_clean"),
                    "propaganda_similarity": row.get("propaganda_similarity"),
                    "category_extracted_propaganda_mapped": row.get("category_extracted_propaganda_mapped"),
                    "aligned_before_chunk": row.get("aligned_before_chunk"),
                    "aligned_after_chunk": row.get("aligned_after_chunk"),
                    "similarity": row.get("similarity"),
                    "significance_extracted": row.get("significance_extracted"),
                }

                detailed_matches.append(match_info)

        if detailed_matches:
            first = detailed_matches[0]
            fg.at[idx, "username"] = first["user"]
            fg.at[idx, "article"] = first["article"]
            fg.at[idx, "date"] = first["date"]
            updated_rows.append(idx)
            all_matches.extend(detailed_matches)

    print("\n‚úÖ Matching complete.")
    print(f"Total updated rows in fg: {len(updated_rows)}")
    print(f"Total collected matches: {len(all_matches)}")

    all_matches_df = pd.DataFrame(all_matches)
    return fg, all_matches_df

In [70]:
best_chunks = pd.read_excel("../datas/raw/best_chunks_semi_automated_annotated_data_repaired.xlsx")
# best_chunks = best_chunks[best_chunks['Annot 1'].fillna('') == 'Correct']
# best_chunks = best_chunks[best_chunks['Annot 2'].fillna('') == 'Correct']
# best_chunks = best_chunks[best_chunks['Annot 1 - new'].fillna('') == 'Correct']
# best_chunks = best_chunks[best_chunks['Annot 2 - new'].fillna('') == 'Correct']

best_chunks = best_chunks[
    (best_chunks['Annot 1 - new'] == 'Correct') &
    (best_chunks['Annot 2 - new'] == 'Correct') &
    (best_chunks['Column 1'] == 'Agree')
].reset_index(drop=True)

best_chunks.head()

Unnamed: 0,row_index,detected_before,detected_after,clean_before,clean_after,type_of_change_extracted,category_extracted_clean,propaganda_similarity,category_extracted_propaganda_mapped,aligned_before_chunk,aligned_after_chunk,similarity,significance_extracted,Annot 1,Annot 2,Annot 1 - new,Annot 2 - new,Column 1
0,6,A referendum in the largely ethnic Russian [[A...,A referendum in the largely ethnic Russian [[A...,A referendum in the largely ethnic Russian Ukr...,A referendum in the largely ethnic Russian Ukr...,rephrasing and addition of descriptive terms,Terminology Manipulation,0.418396,"Obfuscation, intentional vagueness",A referendum in the largely ethnic Russian Ukr...,A referendum in the largely ethnic Russian Ukr...,0.925267,The use of terms like 'bloodless' and 'bloody ...,Correct,Correct,Correct,Correct,Agree
1,10,"In 1940, after securing the assent of Nazi Ger...","In 1940, after securing the assent of Nazi Ger...","In 1940, after securing the assent of Nazi Ger...","In 1940, after securing the assent of Nazi Ger...",rewording and addition of context,Terminology Manipulation,0.418396,"Obfuscation, intentional vagueness","In 1940, after securing the assent of Nazi Ger...","In 1940, after securing the assent of Nazi Ger...",0.981235,The change from 'annex' to 'invade and occupy'...,Correct,Correct,Correct,Correct,Agree
2,11,Although local Ukrainians have unsuccesfully a...,Although local Ukrainians have unsuccesfully a...,Although local Ukrainians have unsuccesfully a...,Although local Ukrainians have unsuccesfully a...,addition of a phrase,Terminology Manipulation,0.418396,"Obfuscation, intentional vagueness",Although local Ukrainians have unsuccesfully a...,Although local Ukrainians have unsuccesfully a...,0.947917,The addition of 'the policies of Rumanization ...,Correct,Correct,Correct,Correct,Agree
3,19,London-based military experts said the soldier...,Western-based military experts said the soldie...,London-based military experts said the soldier...,Western-based military experts said the soldie...,synonym swap,Terminology Manipulation,0.418396,"Obfuscation, intentional vagueness",London-based military experts said the soldier...,Western-based military experts said the soldie...,0.927424,This change shifts the attribution of expertis...,Correct,Possibly Incorrect,Correct,Correct,Agree
4,24,The Nazi administrators of conquered Soviet te...,The Nazi administrators of conquered Soviet te...,The Nazi administrators of conquered Soviet te...,The Nazi administrators of conquered Soviet te...,replacement of terms,Terminology Manipulation,0.418396,"Obfuscation, intentional vagueness",The Nazi administrators of conquered Soviet te...,The Nazi administrators of conquered Soviet te...,0.988355,This change shifts the focus from the genocide...,Correct,Possibly Incorrect,Correct,Correct,Agree


In [71]:
len(best_chunks)

65

In [77]:
fg, all_matches_df = match_unknown_edits(best_chunks, dfs_weaponised, dfs_named)

65


üîç Matching edits: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 65/65 [1:33:16<00:00, 86.09s/it]



‚úÖ Matching complete.
Total updated rows in fg: 49
Total collected matches: 45697


In [None]:
len(all_matches_df)

In [None]:
all_matches_df.iloc[0]

In [None]:
all_matches_df.iloc[0].clean_before

In [None]:
all_matches_df.iloc[0].detected_after

In [None]:
all_matches_df.iloc[0].detected_before

In [None]:
all_matches_df.iloc[0].llm_output

In [None]:
len(fg)

In [78]:
all_matches_df.to_csv("../datas/interim/matched_edits_all.csv", index=False)

In [None]:
# user = pd.read_csv('../datas/interim/fg_user_known.csv')