In [33]:
# Stance being analysed; it is interpolated into every input and output filename below.
stance = "anti-armenian"

In [34]:
import pandas as pd
import os

# Root folder of the pipeline outputs, relative to the working directory.
data_dir = ".."

# All three input tables live in the same weaponization-analysis folder.
_analysis_dir = os.path.join(data_dir, "other_outputs", "finegrained", "weaponization_analysis")

# Technique labels per cluster.
clusters_path = os.path.join(_analysis_dir, f"{stance}_clusters_with_weaponization_techniques.csv")
clusters_with_techniques = pd.read_csv(clusters_path)

# Technique labels per topic inside each cluster.
cluster_topics_path = os.path.join(_analysis_dir, f"{stance}_cluster_topics_with_weaponization_techniques.csv")
cluster_topics_with_techniques = pd.read_csv(cluster_topics_path)

# Technique labels per general topic.
general_topics_path = os.path.join(_analysis_dir, f"{stance}_general_topics_with_weaponization_techniques.csv")
general_topics_with_techniques = pd.read_csv(general_topics_path)


## PART I: ANALYSIS WITH FINE-GRAINED CATEGORIES

#### 1. Clusters Exploration

In [2]:
def count_unique_techniques(df):
    """Print, for every cluster, how often each weaponization technique occurs.

    Clusters are visited in order of first appearance in ``df['cluster']``;
    inside a cluster the techniques follow ``value_counts`` order (most
    frequent first).

    Args:
        df: DataFrame with ``cluster`` and ``weaponization_technique`` columns.

    Returns:
        None. The report is written to stdout.
    """
    for cluster_id in df['cluster'].unique():
        members = df.loc[df['cluster'] == cluster_id]
        breakdown = members['weaponization_technique'].value_counts()
        print(f"Cluster {cluster_id}, {len(members)} entries:")
        for label, occurrences in breakdown.items():
            print(f"  Technique: {label}, Count: {occurrences}")
        # Blank line separates consecutive clusters.
        print()

# Report the technique counts of every cluster in the clusters table.
count_unique_techniques(clusters_with_techniques)


Cluster 0, 22 entries:
  Technique: Selective Omission, Count: 7
  Technique: Terminology Biasing, Count: 6
  Technique: Glorification & Vilification, Count: 5
  Technique: Selective Insertion, Count: 3
  Technique: Euphemism and Doublespeak, Count: 1

Cluster 1, 582 entries:
  Technique: Terminology Biasing, Count: 279
  Technique: Glorification & Vilification, Count: 125
  Technique: Selective Insertion, Count: 78
  Technique: Selective Omission, Count: 67
  Technique: Source Biasing, Count: 13
  Technique: Tag Manipulation, Count: 10
  Technique: Timeline Rewriting, Count: 5
  Technique: Citation Deletion, Count: 3
  Technique: Euphemism and Doublespeak, Count: 2

Cluster 2, 53 entries:
  Technique: Terminology Biasing, Count: 40
  Technique: Selective Omission, Count: 9
  Technique: Glorification & Vilification, Count: 2
  Technique: Selective Insertion, Count: 2

Cluster 3, 20 entries:
  Technique: Glorification & Vilification, Count: 9
  Technique: Terminology Biasing, Count: 5
 

In [3]:
def explore_cluster(df, cluster_number):
    """Print the technique breakdown of a single cluster.

    Args:
        df: DataFrame with ``cluster`` and ``weaponization_technique`` columns.
        cluster_number: Value of ``cluster`` selecting the rows to report on.

    Returns:
        None. The report is written to stdout.
    """
    selected = df.loc[df['cluster'] == cluster_number]
    print(f"Cluster {cluster_number}, {len(selected)} entries:")
    breakdown = selected['weaponization_technique'].value_counts()
    for label, occurrences in breakdown.items():
        print(f"  Technique: {label}, Count: {occurrences}")
    print()

# Show the technique breakdown for cluster 0 only.
explore_cluster(clusters_with_techniques, 0)

Cluster 0, 22 entries:
  Technique: Selective Omission, Count: 7
  Technique: Terminology Biasing, Count: 6
  Technique: Glorification & Vilification, Count: 5
  Technique: Selective Insertion, Count: 3
  Technique: Euphemism and Doublespeak, Count: 1



In [4]:
def explore_cluster_entries(df, cluster_number):
    """Print every entry of one cluster: its technique label and analysis text.

    Args:
        df: DataFrame with ``cluster``, ``weaponization_technique`` and
            ``original_text`` columns.
        cluster_number: Value of ``cluster`` selecting the rows to print.

    Returns:
        None. The entries are written to stdout, one blank line after each.
    """
    in_cluster = df['cluster'] == cluster_number
    for _, entry in df.loc[in_cluster].iterrows():
        print("Chosen manipulation technique:", entry['weaponization_technique'])
        print("Original text:", entry['original_text'])
        print()

# Print every entry of cluster 1 (technique label followed by the analysis text).
explore_cluster_entries(clusters_with_techniques, 1)

Chosen manipulation technique: Terminology Biasing
Original text: The revision includes the addition of the phrase "and subsequently closed down in 1984." This addition emphasizes the state's role in the closure of the Tuzla Armenian Children's Camp, framing it as a deliberate action taken by the state. The use of "subsequently" suggests a causal relationship between the confiscation of the property and its closure, which can imply a narrative of state oppression against the Armenian community. The removal of "state." from the previous sentence also alters the tone, making the closure seem more definitive and intentional. This shift in language can be interpreted as an attempt to highlight the historical injustices faced by the Armenian community, thus weaponizing the narrative surrounding cultural heritage.

Chosen manipulation technique: Terminology Biasing
Original text: The added line prominently references the "Armenian Genocide," a term that is significant in framing the historic

#### 2. Cluster Topics Exploration

In [None]:
def count_unique_techniques_per_topic(df):
    """Print technique counts for every topic inside every cluster.

    Clusters, and the topics within each cluster, are visited in order of
    first appearance; techniques follow ``value_counts`` order.

    Args:
        df: DataFrame with ``cluster``, ``topic`` and
            ``weaponization_technique`` columns.

    Returns:
        None. The report is written to stdout.
    """
    for cluster_id in df['cluster'].unique():
        in_cluster = df.loc[df['cluster'] == cluster_id]
        print(f"Cluster {cluster_id}, {len(in_cluster)} entries:")
        for topic_id in in_cluster['topic'].unique():
            in_topic = in_cluster.loc[in_cluster['topic'] == topic_id]
            breakdown = in_topic['weaponization_technique'].value_counts()
            print(f"  Topic: {topic_id}, {len(in_topic)} entries:")
            for label, occurrences in breakdown.items():
                print(f"    Technique: {label}, Count: {occurrences}")
        # Blank line after the last topic of each cluster.
        print()

# Report the technique counts of every topic inside every cluster.
count_unique_techniques_per_topic(cluster_topics_with_techniques)


Cluster 1, 582 entries:
  Topic: 0, 509 entries:
    Technique: Terminology Biasing, Count: 234
    Technique: Glorification & Vilification, Count: 118
    Technique: Selective Insertion, Count: 71
    Technique: Selective Omission, Count: 58
    Technique: Tag Manipulation, Count: 10
    Technique: Source Biasing, Count: 10
    Technique: Timeline Rewriting, Count: 3
    Technique: Citation Deletion, Count: 3
    Technique: Euphemism and Doublespeak, Count: 2
  Topic: 1, 73 entries:
    Technique: Terminology Biasing, Count: 45
    Technique: Selective Omission, Count: 9
    Technique: Glorification & Vilification, Count: 7
    Technique: Selective Insertion, Count: 7
    Technique: Source Biasing, Count: 3
    Technique: Timeline Rewriting, Count: 2

Cluster 2, 50 entries:
  Topic: 0, 30 entries:
    Technique: Terminology Biasing, Count: 25
    Technique: Selective Omission, Count: 3
    Technique: Selective Insertion, Count: 2
  Topic: 1, 20 entries:
    Technique: Terminology Bias

#### 3. BERTopic exploration

In [6]:
def count_unique_techniques_per_general_topic(df):
    """Print, for every general topic, how often each weaponization technique occurs.

    Topics are visited in order of first appearance in ``df['topic']``;
    techniques follow ``value_counts`` order.

    Args:
        df: DataFrame with ``topic`` and ``weaponization_technique`` columns.

    Returns:
        None. The report is written to stdout.
    """
    for topic_id in df['topic'].unique():
        members = df.loc[df['topic'] == topic_id]
        breakdown = members['weaponization_technique'].value_counts()
        print(f"Topic: {topic_id}, {len(members)} entries:")
        for label, occurrences in breakdown.items():
            print(f"  Technique: {label}, Count: {occurrences}")
        print()
        
# Report the technique counts of every topic in the general topics table.
count_unique_techniques_per_general_topic(general_topics_with_techniques)


Topic: 0, 582 entries:
  Technique: Terminology Biasing, Count: 279
  Technique: Glorification & Vilification, Count: 125
  Technique: Selective Insertion, Count: 78
  Technique: Selective Omission, Count: 67
  Technique: Source Biasing, Count: 13
  Technique: Tag Manipulation, Count: 10
  Technique: Timeline Rewriting, Count: 5
  Technique: Citation Deletion, Count: 3
  Technique: Euphemism and Doublespeak, Count: 2

Topic: 1, 75 entries:
  Technique: Glorification & Vilification, Count: 34
  Technique: Terminology Biasing, Count: 17
  Technique: Selective Insertion, Count: 12
  Technique: Selective Omission, Count: 11
  Technique: Euphemism and Doublespeak, Count: 1

Topic: 2, 104 entries:
  Technique: Terminology Biasing, Count: 63
  Technique: Glorification & Vilification, Count: 17
  Technique: Selective Omission, Count: 17
  Technique: Selective Insertion, Count: 5
  Technique: Tag Manipulation, Count: 1
  Technique: Source Biasing, Count: 1

Topic: 3, 50 entries:
  Technique: Te

In [7]:
def explore_general_topic_entries(df, topic_name):
    """Print every entry of one general topic: its technique label and analysis text.

    Args:
        df: DataFrame with ``topic``, ``weaponization_technique`` and
            ``original_text`` columns.
        topic_name: Value of ``topic`` selecting the rows to print.

    Returns:
        None. The entries are written to stdout, one blank line after each.
    """
    in_topic = df['topic'] == topic_name
    for _, entry in df.loc[in_topic].iterrows():
        print("Chosen manipulation technique:", entry['weaponization_technique'])
        print("Original text:", entry['original_text'])
        print()

# Print every entry assigned to general topic 11.
explore_general_topic_entries(general_topics_with_techniques, 11)


Chosen manipulation technique: Selective Insertion
Original text: The added lines emphasize the legitimacy and authority of the Armenian Revolutionary Federation (ARF) in the Western United States, particularly through phrases like "the only ARF entity operating in the Western United States region" and "authorized to incorporate the use of the ARF’s name and use its flag, insignia and anthem." This language asserts a strong claim to cultural and political representation, positioning the ARF as the legitimate voice of the Armenian community in the region. The context suggests a defensive stance against rival claims, reinforcing the ARF's narrative and potentially delegitimizing other factions. This framing aligns with a pro-Armenian perspective, as it seeks to solidify the ARF's role in preserving and representing Armenian heritage and identity against perceived threats.

Chosen manipulation technique: Glorification & Vilification
Original text: The revision adds the phrase "in the Dias

## PART II: ANALYSIS WITH REDUCED CATEGORIES

#### 1. clusters

In [35]:
# Collapses each fine-grained weaponization technique into one of two coarse categories.
category_mapper = {
    # --- Linguistic Manipulation ---
    "Terminology Biasing": "Linguistic Manipulation",
    "Euphemism and Doublespeak": "Linguistic Manipulation",
    "Glorification & Vilification": "Linguistic Manipulation",
    # --- Factual Manipulation ---
    "Selective Omission": "Factual Manipulation",
    "Selective Insertion": "Factual Manipulation",
    "Timeline Rewriting": "Factual Manipulation",
    "Source Biasing": "Factual Manipulation",
    "Citation Washing": "Factual Manipulation",
    "Citation Deletion": "Factual Manipulation",
    "Tag Manipulation": "Factual Manipulation",
}

In [36]:
# Derive the coarse category of every row on a new frame, leaving the input
# table untouched. Techniques absent from category_mapper map to NaN.
clusters_with_techniques_reduced = clusters_with_techniques.assign(
    reduced_weaponization_technique=clusters_with_techniques['weaponization_technique'].map(category_mapper)
)

clusters_with_techniques_reduced.head()

Unnamed: 0,cluster,source,original_text,weaponization_technique,reduced_weaponization_technique
0,0,Mount_Ararat_subsampled,The revision alters the description of who dep...,Terminology Biasing,Linguistic Manipulation
1,0,History_of_Armenia_subsampled,"The revision introduces the word ""allegedly"" i...",Terminology Biasing,Linguistic Manipulation
2,0,Yerevan_subsampled,"The revision replaces ""Mount Ararat,"" a term t...",Terminology Biasing,Linguistic Manipulation
3,0,Yerevan_subsampled,"The revision adds the phrase ""in [[Turkey]]"" t...",Terminology Biasing,Linguistic Manipulation
4,0,Yerevan_subsampled,"The revision adds the phrase ""in [[Turkey]]"" t...",Terminology Biasing,Linguistic Manipulation


In [37]:
# Save the clusters table, including the reduced-category column, in the same folder the inputs were read from.
# index=False keeps the DataFrame index out of the CSV.
output_path = os.path.join(data_dir, "other_outputs", "finegrained", "weaponization_analysis", f"{stance}_clusters_with_reduced_weaponization_techniques.csv")
clusters_with_techniques_reduced.to_csv(output_path, index=False)

In [10]:
def count_unique_reduced_techniques(df):
    """Print, for every cluster, how often each reduced technique category occurs.

    Clusters are visited in order of first appearance in ``df['cluster']``;
    categories follow ``value_counts`` order (most frequent first).

    Args:
        df: DataFrame with ``cluster`` and ``reduced_weaponization_technique``
            columns.

    Returns:
        None. The report is written to stdout.
    """
    for cluster_id in df['cluster'].unique():
        members = df.loc[df['cluster'] == cluster_id]
        breakdown = members['reduced_weaponization_technique'].value_counts()
        print(f"Cluster {cluster_id}, {len(members)} entries:")
        for label, occurrences in breakdown.items():
            print(f"  Technique: {label}, Count: {occurrences}")
        print()

# Report the reduced-category counts of every cluster.
count_unique_reduced_techniques(clusters_with_techniques_reduced)

Cluster 0, 22 entries:
  Technique: Linguistic Manipulation, Count: 12
  Technique: Factual Manipulation, Count: 10

Cluster 1, 582 entries:
  Technique: Linguistic Manipulation, Count: 406
  Technique: Factual Manipulation, Count: 176

Cluster 2, 53 entries:
  Technique: Linguistic Manipulation, Count: 42
  Technique: Factual Manipulation, Count: 11

Cluster 3, 20 entries:
  Technique: Linguistic Manipulation, Count: 16
  Technique: Factual Manipulation, Count: 4

Cluster 4, 51 entries:
  Technique: Linguistic Manipulation, Count: 34
  Technique: Factual Manipulation, Count: 17

Cluster 5, 41 entries:
  Technique: Linguistic Manipulation, Count: 31
  Technique: Factual Manipulation, Count: 10

Cluster 6, 43 entries:
  Technique: Linguistic Manipulation, Count: 35
  Technique: Factual Manipulation, Count: 8

Cluster 7, 21 entries:
  Technique: Linguistic Manipulation, Count: 20
  Technique: Factual Manipulation, Count: 1

Cluster 8, 120 entries:
  Technique: Linguistic Manipulation, Co

In [11]:
def explore_cluster_entries_reduced(df, cluster_number):
    """Print every entry of one cluster with its reduced category and analysis text.

    Args:
        df: DataFrame with ``cluster``, ``reduced_weaponization_technique``
            and ``original_text`` columns.
        cluster_number: Value of ``cluster`` selecting the rows to print.

    Returns:
        None. The entries are written to stdout, one blank line after each.
    """
    in_cluster = df['cluster'] == cluster_number
    for _, entry in df.loc[in_cluster].iterrows():
        print("Chosen manipulation technique:", entry['reduced_weaponization_technique'])
        print("Original text:", entry['original_text'])
        print()

#### 2. cluster topics

In [38]:
# Derive the coarse category of every row on a new frame, leaving the input
# table untouched. Techniques absent from category_mapper map to NaN.
cluster_topics_with_techniques_reduced = cluster_topics_with_techniques.assign(
    reduced_weaponization_technique=cluster_topics_with_techniques['weaponization_technique'].map(category_mapper)
)

In [13]:
def count_unique_reduced_techniques_per_topic(df):
    """Print reduced-category counts for every topic inside every cluster.

    Clusters, and the topics within each cluster, are visited in order of
    first appearance; categories follow ``value_counts`` order.

    Args:
        df: DataFrame with ``cluster``, ``topic`` and
            ``reduced_weaponization_technique`` columns.

    Returns:
        None. The report is written to stdout.
    """
    for cluster_id in df['cluster'].unique():
        in_cluster = df.loc[df['cluster'] == cluster_id]
        print(f"Cluster {cluster_id}, {len(in_cluster)} entries:")
        for topic_id in in_cluster['topic'].unique():
            in_topic = in_cluster.loc[in_cluster['topic'] == topic_id]
            breakdown = in_topic['reduced_weaponization_technique'].value_counts()
            print(f"  Topic: {topic_id}, {len(in_topic)} entries:")
            for label, occurrences in breakdown.items():
                print(f"    Technique: {label}, Count: {occurrences}")
        # Blank line after the last topic of each cluster.
        print()

# Report the reduced-category counts of every topic inside every cluster.
count_unique_reduced_techniques_per_topic(cluster_topics_with_techniques_reduced)

Cluster 1, 582 entries:
  Topic: 0, 509 entries:
    Technique: Linguistic Manipulation, Count: 354
    Technique: Factual Manipulation, Count: 155
  Topic: 1, 73 entries:
    Technique: Linguistic Manipulation, Count: 52
    Technique: Factual Manipulation, Count: 21

Cluster 2, 50 entries:
  Topic: 0, 30 entries:
    Technique: Linguistic Manipulation, Count: 25
    Technique: Factual Manipulation, Count: 5
  Topic: 1, 20 entries:
    Technique: Linguistic Manipulation, Count: 14
    Technique: Factual Manipulation, Count: 6

Cluster 4, 31 entries:
  Topic: 0, 17 entries:
    Technique: Linguistic Manipulation, Count: 13
    Technique: Factual Manipulation, Count: 4
  Topic: 1, 14 entries:
    Technique: Factual Manipulation, Count: 9
    Technique: Linguistic Manipulation, Count: 5

Cluster 8, 90 entries:
  Topic: 0, 57 entries:
    Technique: Linguistic Manipulation, Count: 46
    Technique: Factual Manipulation, Count: 11
  Topic: 1, 33 entries:
    Technique: Factual Manipulation

In [39]:
# Save the cluster-topics table, including the reduced-category column, in the same folder the inputs were read from.
# index=False keeps the DataFrame index out of the CSV.
output_path = os.path.join(data_dir, "other_outputs", "finegrained", "weaponization_analysis", f"{stance}_cluster_topics_with_reduced_weaponization_techniques.csv")
cluster_topics_with_techniques_reduced.to_csv(output_path, index=False)

#### 3. BERTopic

In [40]:
# Derive the coarse category of every row on a new frame, leaving the input
# table untouched. Techniques absent from category_mapper map to NaN.
general_topics_with_techniques_reduced = general_topics_with_techniques.assign(
    reduced_weaponization_technique=general_topics_with_techniques['weaponization_technique'].map(category_mapper)
)


In [41]:
# Save the general-topics table, including the reduced-category column, in the same folder the inputs were read from.
# index=False keeps the DataFrame index out of the CSV.
output_path = os.path.join(data_dir, "other_outputs", "finegrained", "weaponization_analysis", f"{stance}_general_topics_with_reduced_weaponization_techniques.csv")
general_topics_with_techniques_reduced.to_csv(output_path, index=False)

In [15]:
def count_unique_reduced_techniques_per_general_topic(df):
    """Print, for every general topic, how often each reduced category occurs.

    Topics are visited in order of first appearance in ``df['topic']``;
    categories follow ``value_counts`` order.

    Args:
        df: DataFrame with ``topic`` and ``reduced_weaponization_technique``
            columns.

    Returns:
        None. The report is written to stdout.
    """
    for topic_id in df['topic'].unique():
        members = df.loc[df['topic'] == topic_id]
        breakdown = members['reduced_weaponization_technique'].value_counts()
        print(f"Topic: {topic_id}, {len(members)} entries:")
        for label, occurrences in breakdown.items():
            print(f"  Technique: {label}, Count: {occurrences}")
        print()

# Report the reduced-category counts of every general topic.
count_unique_reduced_techniques_per_general_topic(general_topics_with_techniques_reduced)

Topic: 0, 582 entries:
  Technique: Linguistic Manipulation, Count: 406
  Technique: Factual Manipulation, Count: 176

Topic: 1, 75 entries:
  Technique: Linguistic Manipulation, Count: 52
  Technique: Factual Manipulation, Count: 23

Topic: 2, 104 entries:
  Technique: Linguistic Manipulation, Count: 80
  Technique: Factual Manipulation, Count: 24

Topic: 3, 50 entries:
  Technique: Linguistic Manipulation, Count: 33
  Technique: Factual Manipulation, Count: 17

Topic: 4, 52 entries:
  Technique: Linguistic Manipulation, Count: 41
  Technique: Factual Manipulation, Count: 11

Topic: 5, 41 entries:
  Technique: Linguistic Manipulation, Count: 31
  Technique: Factual Manipulation, Count: 10

Topic: 6, 39 entries:
  Technique: Linguistic Manipulation, Count: 20
  Technique: Factual Manipulation, Count: 19

Topic: 7, 81 entries:
  Technique: Linguistic Manipulation, Count: 63
  Technique: Factual Manipulation, Count: 18

Topic: 8, 41 entries:
  Technique: Linguistic Manipulation, Count: 2

## PART III: NAME THE CLUSTERS AND TOPICS

In [42]:
from openai import OpenAI
from utils import *

def subsample_entries_per_cluster(df, sample_size=15, random_state=42):
    """Name every cluster from a random sample of its analysis texts.

    For each cluster, up to ``sample_size`` rows are drawn (all rows when the
    cluster is smaller), their ``original_text`` values are joined with blank
    lines, and the result is passed to ``name_topic_with_llm``.

    Args:
        df: DataFrame with ``cluster`` and ``original_text`` columns.
        sample_size: Maximum number of texts sampled per cluster.
        random_state: Seed forwarded to ``sample`` so the same rows are drawn
            on every run.

    Returns:
        dict mapping each cluster label to the name returned by
        ``name_topic_with_llm``. Clusters without any non-missing text are
        omitted.
    """
    cluster_summaries = {}
    for cluster in df['cluster'].unique():
        # Drop missing texts before sampling: NaN is a float and would make
        # str.join raise TypeError.
        texts = df.loc[df['cluster'] == cluster, 'original_text'].dropna()
        if texts.empty:
            # Nothing to describe, so do not spend a request on an empty prompt.
            continue
        sampled = texts.sample(n=min(sample_size, len(texts)), random_state=random_state)
        concatenated_text = "\n\n".join(sampled.astype(str))
        cluster_summaries[cluster] = name_topic_with_llm(concatenated_text)
    return cluster_summaries


# Module-level OpenAI client used by name_topic_with_llm.
# NOTE(review): OPENAI_API_KEY is not defined in the visible cells; presumably it is provided by `from utils import *` — confirm.
client = OpenAI(api_key=OPENAI_API_KEY)

def name_topic_with_llm(keywords, model="gpt-5-mini"):
    """Ask an OpenAI model for a short, human-readable name for a group of revisions.

    Args:
        keywords: Text describing the group. Despite the parameter name (kept
            for backward compatibility), the callers in this notebook pass the
            concatenated ``original_text`` analyses of sampled revisions. A
            list is joined with ", " before being sent.
        model: Name of the OpenAI model to query.

    Returns:
        The model's answer with surrounding whitespace stripped.
    """
    if isinstance(keywords, list):
        # str() guards against non-string items, which str.join rejects.
        keywords = ", ".join(str(item) for item in keywords)

    # The prompt deliberately does not state how many revisions were sampled:
    # callers send at most `sample_size` texts and fewer for small groups.
    prompt = f"""
    You are an expert in political narratives, conflict studies, and cultural heritage.
    Based only on the following analysis texts of a sample of revisions from several wikipedia articles grouped together as a topic/theme, generate a SHORT descriptive name 
    (max 8-10 words) that summarizes the theme (subject matter and the characteristics of the revisions related to it, etc).

    Analyses: {keywords}

    Answer ONLY with the name.
    """

    response = client.responses.create(
        model=model,
        input=prompt
    )

    return response.output_text.strip()

# Name every cluster via the LLM (one request per cluster), then print each generated name.
cluster_summaries = subsample_entries_per_cluster(clusters_with_techniques)
for cluster, summary in cluster_summaries.items():
    print(f"Cluster {cluster} Summary:\n{summary}\n{'-'*80}\n")


Cluster 0 Summary:
Mount Ararat: Wikipedia Edits Erasing Armenian Cultural Heritage
--------------------------------------------------------------------------------

Cluster 1 Summary:
Wikipedia edits promoting Armenian genocide denial and cultural erasure
--------------------------------------------------------------------------------

Cluster 2 Summary:
Anti-Armenian reattribution of culinary heritage on Wikipedia
--------------------------------------------------------------------------------

Cluster 3 Summary:
Systematic Anti-Armenian Wikipedia Rewrites and Cultural Erasure
--------------------------------------------------------------------------------

Cluster 4 Summary:
Weaponized anti‑Armenian edits: erasure, slurs, and delegitimization
--------------------------------------------------------------------------------

Cluster 5 Summary:
Weaponized anti-Armenian narrative in Nagorno-Karabakh edits
--------------------------------------------------------------------------------



In [43]:
# read the existing cluster keywords csv and set 'Cluster_Name' to the newly
# generated names (clusters without a generated name get NaN)
keywords_path = os.path.join(data_dir, "keywords", "finegrained", f"{stance}_cluster_keywords.csv")
keywords_df = pd.read_csv(keywords_path).assign(
    Cluster_Name=lambda frame: frame['Cluster'].map(cluster_summaries)
)

# write the result to a separate file so the original keywords csv stays untouched
output_keywords_path = os.path.join(data_dir, "keywords", "finegrained", f"{stance}_cluster_keywords_named_text_directly.csv")
keywords_df.to_csv(output_keywords_path, index=False)

In [44]:
def subsample_entries_per_cluster_topic(df, sample_size=15, random_state=42):
    """Name every (cluster, topic) pair from a random sample of its analysis texts.

    For each topic inside each cluster, up to ``sample_size`` rows are drawn
    (all rows when the group is smaller), their ``original_text`` values are
    joined with blank lines, and the result is passed to
    ``name_topic_with_llm``.

    Args:
        df: DataFrame with ``cluster``, ``topic`` and ``original_text`` columns.
        sample_size: Maximum number of texts sampled per (cluster, topic) pair.
        random_state: Seed forwarded to ``sample`` so the same rows are drawn
            on every run.

    Returns:
        dict keyed by ``"<cluster>__<topic>"`` whose values are the names
        returned by ``name_topic_with_llm``. Pairs without any non-missing
        text are omitted.
    """
    summaries = {}
    for cluster in df['cluster'].unique():
        cluster_df = df[df['cluster'] == cluster]
        for topic in cluster_df['topic'].unique():
            # Drop missing texts before sampling: NaN is a float and would
            # make str.join raise TypeError.
            texts = cluster_df.loc[cluster_df['topic'] == topic, 'original_text'].dropna()
            if texts.empty:
                # Nothing to describe, so do not spend a request on an empty prompt.
                continue
            sampled = texts.sample(n=min(sample_size, len(texts)), random_state=random_state)
            concatenated_text = "\n\n".join(sampled.astype(str))
            cluster_topic_key = f"{cluster}__{topic}"
            summaries[cluster_topic_key] = name_topic_with_llm(concatenated_text)
    return summaries

# Name every (cluster, topic) pair via the LLM and print the results; keys have the form "<cluster>__<topic>".
cluster_topic_summaries = subsample_entries_per_cluster_topic(cluster_topics_with_techniques)
for cluster_topic, summary in cluster_topic_summaries.items():
    print(f"Cluster-Topic {cluster_topic} Summary:\n{summary}\n{'-'*80}\n") 

# read existing keywords csv; keywords_df is given the generated names in the following cell
keywords_path = os.path.join(data_dir, "keywords", "finegrained", f"{stance}_cluster_topics_per_cluster_bertopic.csv")
keywords_df = pd.read_csv(keywords_path)



Cluster-Topic 1__0 Summary:
Wikipedia Revisions Promoting Armenian Genocide Denial
--------------------------------------------------------------------------------

Cluster-Topic 1__1 Summary:
Systematic edits downplaying and denying the Armenian Genocide
--------------------------------------------------------------------------------

Cluster-Topic 1__2 Summary:
Anti‑Armenian Narrative Edits Sanitizing Turkish Actions
--------------------------------------------------------------------------------

Cluster-Topic 1__3 Summary:
Anti-Armenian Wikipedia edits: genocide denial and Turkish victimhood
--------------------------------------------------------------------------------

Cluster-Topic 1__4 Summary:
Downplaying and denial of Armenian Genocide in Wikipedia
--------------------------------------------------------------------------------

Cluster-Topic 1__5 Summary:
Systematic Erasure of Armenian Genocide Acknowledgements
---------------------------------------------------------------

In [45]:
# replace the 'Cluster_Topic_Name' column with the newly generated cluster_topic_summaries
def get_cluster_topic_key(row):
    """Return the ``"<Cluster>__<Topic>"`` lookup key for one keywords row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``Cluster`` and
            ``Topic`` entries.
    """
    cluster_value, topic_value = row['Cluster'], row['Topic']
    return "{}__{}".format(cluster_value, topic_value)
# Build the "<Cluster>__<Topic>" key per row, look up the generated name with
# it, then discard the helper key column before saving.
keywords_df = (
    keywords_df
    .assign(Cluster_Topic_Key=lambda frame: frame.apply(get_cluster_topic_key, axis=1))
    .assign(Cluster_Topic_Name=lambda frame: frame['Cluster_Topic_Key'].map(cluster_topic_summaries))
    .drop(columns=['Cluster_Topic_Key'])
)
# save updated keywords csv
output_keywords_path = os.path.join(data_dir, "keywords", "finegrained", f"{stance}_cluster_topics_named_text_directly.csv")
keywords_df.to_csv(output_keywords_path, index=False)

In [46]:
def subsample_entries_per_general_topic(df, sample_size=15, random_state=42):
    """Name every general topic from a random sample of its analysis texts.

    For each topic, up to ``sample_size`` rows are drawn (all rows when the
    topic is smaller), their ``original_text`` values are joined with blank
    lines, and the result is passed to ``name_topic_with_llm``.

    Args:
        df: DataFrame with ``topic`` and ``original_text`` columns.
        sample_size: Maximum number of texts sampled per topic.
        random_state: Seed forwarded to ``sample`` so the same rows are drawn
            on every run.

    Returns:
        dict mapping each topic label to the name returned by
        ``name_topic_with_llm``. Topics without any non-missing text are
        omitted.
    """
    topic_summaries = {}
    for topic in df['topic'].unique():
        # Drop missing texts before sampling: NaN is a float and would make
        # str.join raise TypeError.
        texts = df.loc[df['topic'] == topic, 'original_text'].dropna()
        if texts.empty:
            # Nothing to describe, so do not spend a request on an empty prompt.
            continue
        sampled = texts.sample(n=min(sample_size, len(texts)), random_state=random_state)
        concatenated_text = "\n\n".join(sampled.astype(str))
        topic_summaries[topic] = name_topic_with_llm(concatenated_text)
    return topic_summaries

# Name every general topic via the LLM (one request per topic), then print each generated name.
general_topic_summaries = subsample_entries_per_general_topic(general_topics_with_techniques)
for topic, summary in general_topic_summaries.items():
    print(f"General Topic {topic} Summary:\n{summary}\n{'-'*80}\n") 


General Topic 0 Summary:
Anti-Armenian revisionism: minimizing genocide, reframing historical narrative
--------------------------------------------------------------------------------

General Topic 1 Summary:
Anti-Armenian Wikipedia Edits Erasing Nagorno-Karabakh Identity
--------------------------------------------------------------------------------

General Topic 2 Summary:
Coordinated Wikipedia edits removing Armenian historical and cultural identity
--------------------------------------------------------------------------------

General Topic 3 Summary:
Anti-Armenian Rewriting of Karabakh Historical Identity
--------------------------------------------------------------------------------

General Topic 4 Summary:
Anti-Armenian Wikipedia edits promoting Turkish cultural claims
--------------------------------------------------------------------------------

General Topic 5 Summary:
Systematic Anti‑Armenian Bias in Casualty and Heritage Edits
-------------------------------------

In [47]:
# read the existing general-topic keywords csv and set 'Topic_Name' to the
# newly generated names (topics without a generated name get NaN)
keywords_path = os.path.join(data_dir, "keywords", "finegrained", f"{stance}_general_corpus_topics_bertopic.csv")
keywords_df = pd.read_csv(keywords_path).assign(
    Topic_Name=lambda frame: frame['Topic'].map(general_topic_summaries)
)
# write the result to a separate file so the original keywords csv stays untouched
output_keywords_path = os.path.join(data_dir, "keywords", "finegrained", f"{stance}_general_topics_named_text_directly.csv")
keywords_df.to_csv(output_keywords_path, index=False)