In [None]:
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import sklearn.preprocessing
import sklearn.feature_extraction.text
import sklearn.cluster
import sklearn.metrics.pairwise
import spacy
import sentence_transformers

# Load spaCy's small English model; only its built-in stop-word list is used below
en = spacy.load('en_core_web_sm')
stopwords = list(en.Defaults.stop_words)


HEADLINE_CSV = "../data/mc-onlinenews-mediacloud-20240221094242-content.csv"

# Save outputs to these locations
MODEL_PATH = Path("../data/models")
# parents=True: create any missing parent directories too, so this cell does not
# raise FileNotFoundError when the parent of MODEL_PATH has not been created yet
MODEL_PATH.mkdir(parents=True, exist_ok=True)


# Read in Headlines from Media Cloud Search Results
First, we downloaded data from the [Media Cloud Search tool](https://search.mediacloud.org) to get headlines from the "Nigeria - National" collection which matched the following keyword query: 
```("African caravans" OR "loss and damage" OR "climate justice" OR "african activist for climate justice" OR "oxfam nigeria" OR Nasarawa OR cop28 OR cop27) -(gaza)```

The output from the Media Cloud Search tool was downloaded directly to a CSV file, which we will read in and analyze. This data could also be programmatically obtained via the Media Cloud API, but it is helpful to have a domain expert come up with query key terms. 

In [None]:
# Read the Media Cloud export; the first row holds the column names
headlines_df = pd.read_csv(HEADLINE_CSV, header=0, index_col=False)
# Keep only the calendar date of each publish timestamp (time of day is dropped)
publish_timestamps = pd.to_datetime(headlines_df["publish_date"])
headlines_df["publish_date"] = publish_timestamps.dt.date
print(headlines_df.shape)
headlines_df.head()

In [None]:
# Sanity check: number of matching articles published per day
fig, ax = plt.subplots(figsize=[24,6])
# Draw on the axes created above explicitly instead of relying on pyplot's "current axes"
sns.countplot(x=headlines_df["publish_date"].sort_values(), ax=ax)
ax.set_title("Number of articles per publish date")
ax.tick_params(axis="x", labelrotation=60)
plt.show()
# Close (rather than clear) the figure: plt.clf() only empties a figure and leaves it open
plt.close(fig)

# Using Sentence Transformers to Cluster Headlines
This section creates vector representations of the headlines using sentence transformers. 
Each headline is assigned a numerical vector representation, and these vectors are then clustered into topics. 
In order to make sense of the topics, you can view the most frequent words in each cluster and the headlines closest to the cluster's center point. 

In [None]:
# Name of the (fast) sentence-transformer checkpoint used to embed headlines
embedding_model_name = 'all-MiniLM-L6-v2'
sent_transformer = sentence_transformers.SentenceTransformer(embedding_model_name)

In [None]:
# Embed every headline title with the sentence transformer
headlines = headlines_df["title"].to_numpy()
sent_embeds = sent_transformer.encode(headlines, show_progress_bar=True)
# Scale each embedding (row) to unit L2 length; these are sklearn's defaults, spelled out
normalized_sent_embeds = sklearn.preprocessing.normalize(sent_embeds, norm="l2", axis=1)
embedding_matrix_shape = normalized_sent_embeds.shape
print("Sanity check (#docs, embedding size):", embedding_matrix_shape)

In [None]:
# A fixed random_state keeps the clustering reproducible for anyone re-running the notebook.
# (Re-running with other random states produced similar clusters.)
kmeans_model = sklearn.cluster.KMeans(n_clusters=15, random_state=199)
kmeans_fit = kmeans_model.fit(normalized_sent_embeds)

# Persist the fitted model next to the other model outputs
kmeans_pickle_path = MODEL_PATH / "kmeans_explore.pickle"
with kmeans_pickle_path.open("wb") as fp:
    pickle.dump(kmeans_fit, fp)

In [None]:
# Attach each headline's k-means cluster id as a new column
headlines_df = headlines_df.assign(cluster_label=kmeans_fit.labels_)
headlines_df.head()

In [None]:
# Number of documents (non-null ids) assigned to each cluster
headlines_df.groupby("cluster_label")["id"].count()

In [None]:
# Topic frequency over date: number of articles per (publish date, cluster)
date_cluster_group_df = (
    headlines_df
    .groupby(by=["publish_date", "cluster_label"])["id"]
    .count()
    .reset_index(name="daily_count")
    .sort_values("publish_date")
)
print(date_cluster_group_df.head())

# One distinct colour per cluster. A list palette makes seaborn treat the hue as
# categorical, and the list must have exactly as many entries as there are hue levels:
# older seaborn raises on a mismatch, newer seaborn warns and recycles colours.
# The previous list had 14 colours for 15 clusters; a 15th colour is appended at the
# end so clusters 0-13 keep the colours they had before.
cluster_colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
    '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
    '#008080', '#fffac8', '#800000', '#aaffc3', '#9a6324',
]
n_plotted_clusters = date_cluster_group_df["cluster_label"].nunique()
if n_plotted_clusters <= len(cluster_colors):
    cluster_palette = cluster_colors[:n_plotted_clusters]
else:
    # More clusters than hand-picked colours: fall back to an evenly spaced palette
    cluster_palette = sns.color_palette("husl", n_plotted_clusters)

fig, ax = plt.subplots(figsize=[24,6])
sns.lineplot(
    data=date_cluster_group_df,
    x="publish_date",
    y="daily_count",
    hue="cluster_label",
    palette=cluster_palette,
    ax=ax,
)
ax.set_title("Cluster volume over time")
ax.tick_params(axis="x", labelrotation=60)
plt.show()
# Close (rather than clear) the figure: plt.clf() only empties a figure and leaves it open
plt.close(fig)


The plot above shows a few spikes around the COP28 event that are interesting for us: for example, the spike of cluster 13 at the beginning of December 2023.

In [None]:
# Bag-of-words pipeline (stop words removed) used to find the top words of each cluster
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(stop_words=stopwords)
doc_word_counts = count_vectorizer.fit_transform(headlines_df["title"])
# Vocabulary entries are aligned with the columns of doc_word_counts
vocabulary = count_vectorizer.get_feature_names_out()

count_matrix_shape = doc_word_counts.shape
print("Sanity check shape is (# docs, #words):", count_matrix_shape)
print("Sanity check vocabulary:", vocabulary[:30])

In [None]:
def get_top_words(document_idxs, doc_word_counts, vocabulary, n=20): 
    """Return the most frequent vocabulary words within a selection of documents.

    The word counts of the selected documents are summed per vocabulary word and
    the words with the highest totals are returned.

    Args:
        document_idxs (1d numpy array): row positions of the selected documents
        doc_word_counts (2d numpy array or scipy sparse matrix): (# docs, vocab size),
            count of each vocab word in each doc
        vocabulary (1d numpy array or list[str]): vocabulary, aligned with the columns
            of doc_word_counts
        n (int): maximum number of words to return

    Returns:
        list[str]: up to n words in decreasing order of frequency. Words that never
        occur in the selected documents are left out, so fewer than n words (or an
        empty list) may be returned.

    Raises:
        ValueError: if the number of count columns does not match the vocabulary size
    """
    vocabulary = np.asarray(vocabulary)
    selected_docs = doc_word_counts[document_idxs]
    # Summing a scipy sparse matrix gives a (1, vocab size) matrix; flatten it to 1-D
    word_counts_in_selected_docs = np.asarray(np.sum(selected_docs, axis=0)).ravel()
    if word_counts_in_selected_docs.shape != vocabulary.shape:
        # np.resize would silently repeat/truncate counts here and misalign words
        raise ValueError(
            "doc_word_counts has "
            f"{word_counts_in_selected_docs.shape[0]} columns but vocabulary has "
            f"{vocabulary.shape[0]} entries"
        )
    if n <= 0:
        # Guard: a "[-n:]" style slice with n == 0 would select the whole vocabulary
        return []
    # Sort ascending, then reverse so the most frequent word comes first
    top_word_idx = np.argsort(word_counts_in_selected_docs, kind="stable")[::-1][:n]
    # Drop words that do not occur at all in the selected documents
    top_word_idx = top_word_idx[word_counts_in_selected_docs[top_word_idx] > 0]
    return vocabulary[top_word_idx].tolist()

In [None]:
def get_docs_closest_to_centroid(centroid_vector, doc_embeddings, doc_texts, n=20):
    """Return the documents whose embeddings are closest (cosine distance) to a centroid.

    Args:
        centroid_vector (np.array): vector that's the mathematical center of the cluster in vector space, shape (embedding_size, )
        doc_embeddings (np.array): embeddings of the documents in the cluster, shape (#docs, embedding_size)
        doc_texts (pd.DataFrame): dataframe of documents in the cluster; row k must
            correspond to row k of doc_embeddings (its index labels are not used)
        n (int, optional): Number of documents to return. Defaults to 20.

    Returns:
        pd.DataFrame: The subset of the dataframe for the top n documents closest to the
        center of the cluster, nearest first
    """
    # cosine_distances returns shape (#docs, 1) for a single centroid; take that column
    distances = sklearn.metrics.pairwise.cosine_distances(
        doc_embeddings, centroid_vector.reshape(1, -1)
    )[:, 0]
    # Row positions of the elements closest to the center (sorted in increasing distance)
    closest_positions = np.argsort(distances, kind="stable")[:n]
    # argsort yields positions, not index labels, so select rows positionally; label-based
    # .loc only worked when the caller had reset the index to 0..len-1 beforehand
    typical_headlines = doc_texts.iloc[closest_positions]
    return typical_headlines
    

In [None]:
def show_article_volume_for_cluster_over_time(dataframe, cluster_id, time_col="publish_date", cluster_col="cluster_label", match_placeholder="matches_cluster"):
    """Plots the daily volume of articles for that cluster vs all other articles

    Args:
        dataframe (pd.DataFrame): one row per article; must contain time_col and cluster_col
        cluster_id: value of cluster_col identifying the cluster to highlight
        time_col (str): column holding the publish date of each article
        cluster_col (str): column holding the cluster label of each article
        match_placeholder (str): name of the temporary boolean column used as plot hue

    Returns:
        None: the figure is shown and then closed
    """
    # sort_values returns a new frame, so the caller's dataframe is not modified
    sorted_by_date = dataframe.sort_values(time_col)
    # True for articles in the requested cluster, False for all other articles
    sorted_by_date[match_placeholder] = (sorted_by_date[cluster_col] == cluster_id)
    fig, ax = plt.subplots(figsize=[24,6])
    sns.countplot(data=sorted_by_date, x=time_col, hue=match_placeholder, ax=ax)
    ax.set(title=f"Volume for cluster {cluster_id}")
    ax.tick_params(axis="x", labelrotation=60)
    plt.show()
    # This function is called once per cluster; close the figure so open figures do not
    # accumulate (plt.clf() would only clear a figure and leave it open)
    plt.close(fig)

In [None]:

pd.set_option("display.max_rows", 30, "display.max_columns", None, 'max_colwidth', 800)
centroids = kmeans_fit.cluster_centers_
# Iterate over however many clusters the fitted model has instead of a hard-coded count
for i in range(len(centroids)):
    print("Cluster", i)

    # Row positions (not index labels) of the documents assigned to this cluster
    cluster_element_idx = np.where(headlines_df["cluster_label"] == i)[0]
    print("\tNumber of docs in cluster:", len(cluster_element_idx))
    # Top words can give you a general idea of what's talked about in the topic, but it can be difficult to interpret 
    top_words = get_top_words(cluster_element_idx, doc_word_counts, vocabulary)
    print("\tTop words:", top_words)

    # What are the "prototypical" headlines for each cluster? These are the headlines closest to the cluster center for each cluster
    center = centroids[i]
    # np.where gives positions, so select rows with .iloc rather than label-based .loc
    cluster_docs = headlines_df.iloc[cluster_element_idx]
    # Reset to a 0..k-1 index so rows line up with the rows of cluster_embeddings
    cluster_texts = cluster_docs.reset_index()
    cluster_embeddings = normalized_sent_embeds[cluster_element_idx]
    typical_headlines = get_docs_closest_to_centroid(center, cluster_embeddings, cluster_texts)
    print("\tTypicial headlines close to center of cluster")
    display(typical_headlines[["media_name", "publish_date", "title"]])

    # A sample of random headlines from the cluster gives an indication of whether the cluster's topic is consistent or makes less sense
    # further from the centroid
    print("\tRandom headlines")
    # Cap the sample size so clusters with fewer than 10 docs do not raise, and seed the
    # sample so re-running the notebook shows the same headlines
    n_random_headlines = min(10, len(cluster_docs))
    random_headlines = cluster_docs.sample(n_random_headlines, random_state=199)
    display(random_headlines[["media_name", "publish_date", "title"]])

    print("\tArticle volume for cluster")
    show_article_volume_for_cluster_over_time(headlines_df, i)
    


We were able to identify cluster 4 as relevant specifically to climate change initiatives and cluster 13 as relevant to Cop28, both of which are interesting to Oxfam's work.

# Which headlines mention Oxfam? 
Which headlines mention "oxfam" (in any letter case), and which clusters do those headlines fall into?

In [None]:
# Is 'oxfam' in the vocabulary, and if so at which column of the word-count matrix?
oxfam_in_vocabulary = "oxfam" in vocabulary
print("Sanity check that 'oxfam' is in the vocabulary:", oxfam_in_vocabulary)
if oxfam_in_vocabulary:
    # Only look the position up when the word exists; .index() raises ValueError otherwise
    print(list(vocabulary).index("oxfam"))

In [None]:
# Boolean mask of headlines whose title contains "oxfam" in any letter case
mentions_oxfam = headlines_df["title"].str.contains("oxfam", flags=re.IGNORECASE)
oxfam_matches = headlines_df[mentions_oxfam]
print(oxfam_matches.shape[0])
display(oxfam_matches)

In [None]:
# Bar chart of how many 'oxfam' headlines fall into each cluster
oxfam_cluster_ax = sns.countplot(x=oxfam_matches["cluster_label"])
oxfam_cluster_ax.set_title("How many headlines in different clusters mention 'oxfam'?")