# NLP

## Setup 

This setup allows you to use *Python* and *R* in the same notebook.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
from tqdm.notebook import tqdm
tqdm.pandas()

## Load Data & Remove Duplciates 🧹

In [4]:
# read data
stories = pd.read_csv("output/stories_df.csv", 
                 parse_dates=['publication_date', 'capture_time'])


duplicates to delete

In [5]:
dedupe_by = ['title', 'domain']
stories[stories.duplicated(subset=dedupe_by, keep=False)]\
    .sort_values(by=dedupe_by)\
    .head()

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet
457,3 men claiming to be from DOGE show up at San ...,2025-02-14,2025-02-16 11:29:22+00:00,en,cbsnews.com,https://www.cbsnews.com/news/doge-3-men-show-u...,https://web.archive.org/web/20250216112922id_/...,https://web.archive.org/web/20250216112922/htt...,https://wayback-api.archive.org/colsearch/v1/m...,3 men claiming to be from DOGE show up at San ...
467,3 men claiming to be from DOGE show up at San ...,2025-02-14,2025-02-16 15:17:50+00:00,en,cbsnews.com,https://www.cbsnews.com/sanfrancisco/news/doge...,https://web.archive.org/web/20250216151750id_/...,https://web.archive.org/web/20250216151750/htt...,https://wayback-api.archive.org/colsearch/v1/m...,3 men claiming to be from DOGE show up at San ...
396,A government worker's message for Elon Musk,2025-02-16,2025-02-17 04:23:32+00:00,en,cbsnews.com,https://www.cbsnews.com/news/federal-workers-j...,https://web.archive.org/web/20250217042332id_/...,https://web.archive.org/web/20250217042332/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A government worker's message for Elon Musk\nC...
400,A government worker's message for Elon Musk,2025-02-16,2025-02-22 04:02:04+00:00,en,cbsnews.com,https://www.cbsnews.com/sanfrancisco/news/fede...,https://web.archive.org/web/20250222040204id_/...,https://web.archive.org/web/20250222040204/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A government worker's message for Elon Musk\nC...
406,A government worker's message for Elon Musk,2025-02-16,2025-02-20 15:02:39+00:00,en,cbsnews.com,https://www.cbsnews.com/miami/news/federal-wor...,https://web.archive.org/web/20250220150239id_/...,https://web.archive.org/web/20250220150239/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A government worker's message for Elon Musk\nC...


In [6]:
# remove duplicates
stories.drop_duplicates(subset=dedupe_by, keep='last', inplace=True)

# preview
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet
0,Elon Musk,2025-03-05,2025-03-05 17:44:22+00:00,en,foxnews.com,https://www.foxnews.com/category/person/elon-musk,https://web.archive.org/web/20250305174422id_/...,https://web.archive.org/web/20250305174422/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Who is Andrew Lennox, the veteran who was Slot..."
1,Social Security has never missed a payment. DO...,2025-03-01,2025-03-05 23:14:15+00:00,en,cnbc.com,https://www.cnbc.com/2025/03/01/doge-actions-m...,https://web.archive.org/web/20250305231415id_/...,https://web.archive.org/web/20250305231415/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Social Security has never missed a payment. DO...
2,Musk and Republican Lawmakers Pressure Judges ...,2025-03-01,2025-03-05 10:49:58+00:00,en,nytimes.com,https://www.nytimes.com/2025/03/01/us/politics...,https://web.archive.org/web/20250305104958id_/...,https://web.archive.org/web/20250305104958/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Congressional Republicans, egged on by Elon Mu..."
3,The Bewildering Irony Behind the Trump-Musk Pa...,2025-02-27,2025-03-05 19:47:03+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/27/opinion/tru...,https://web.archive.org/web/20250305194703id_/...,https://web.archive.org/web/20250305194703/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nJamelle Bouie\nThe Bewildering I...
4,"Trump, Musk float idea of $5,000 'DOGE dividen...",2025-02-27,2025-03-05 23:14:15+00:00,en,cnbc.com,https://www.cnbc.com/2025/02/27/trump-musk-pro...,https://web.archive.org/web/20250305231415id_/...,https://web.archive.org/web/20250305231415/htt...,https://wayback-api.archive.org/colsearch/v1/m...,- As the Department of Government Efficiency l...
...,...,...,...,...,...,...,...,...,...,...
769,Elon Musk-led group makes shock $97.4B bid for...,2025-02-10,2025-02-11 14:21:33+00:00,en,nypost.com,https://nypost.com/2025/02/10/business/elon-mu...,https://web.archive.org/web/20250211142133id_/...,https://web.archive.org/web/20250211142133/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk-led group makes shock $97.4B bid for...
770,"DOGE targets federal leases, shaking commercia...",2025-02-10,2025-02-11 07:31:56+00:00,en,cnn.com,https://www.cnn.com/2025/02/10/business/return...,https://web.archive.org/web/20250211073156id_/...,https://web.archive.org/web/20250211073156/htt...,https://wayback-api.archive.org/colsearch/v1/m...,For many companies â and possibly the federal ...
771,Musk charges on with new targets in sight and ...,2025-02-10,2025-02-11 18:53:27+00:00,en,cnn.com,https://www.cnn.com/2025/02/10/politics/musk-t...,https://web.archive.org/web/20250211185327id_/...,https://web.archive.org/web/20250211185327/htt...,https://wayback-api.archive.org/colsearch/v1/m...,The vast and opaque power of Elon Musk is only...
772,"Trump orders halt on ""wasteful"" pennies after ...",2025-02-10,2025-02-11 09:51:31+00:00,en,newsweek.com,https://www.newsweek.com/trump-orders-halt-was...,https://web.archive.org/web/20250211095131id_/...,https://web.archive.org/web/20250211095131/htt...,https://wayback-api.archive.org/colsearch/v1/m...,President Donald Trump has directed the Treasu...


# Keywords

In [7]:
from yake import KeywordExtractor
from pandarallel import pandarallel

kw_extractor = KeywordExtractor()

def get_keywords(text):
    keywords = kw_extractor.extract_keywords(text)
    return [x for x,y in keywords]

pandarallel.initialize(progress_bar=True)
stories['keywords'] = stories['snippet'].parallel_apply(get_keywords)

# display
stories

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=89), Label(value='0 / 89'))), HBox…

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords
0,Elon Musk,2025-03-05,2025-03-05 17:44:22+00:00,en,foxnews.com,https://www.foxnews.com/category/person/elon-musk,https://web.archive.org/web/20250305174422id_/...,https://web.archive.org/web/20250305174422/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Who is Andrew Lennox, the veteran who was Slot...","[Andrew Lennox, President Donald Trump, speech..."
1,Social Security has never missed a payment. DO...,2025-03-01,2025-03-05 23:14:15+00:00,en,cnbc.com,https://www.cnbc.com/2025/03/01/doge-actions-m...,https://web.archive.org/web/20250305231415id_/...,https://web.archive.org/web/20250305231415/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Social Security has never missed a payment. DO...,"[Social Security Administration, Social Securi..."
2,Musk and Republican Lawmakers Pressure Judges ...,2025-03-01,2025-03-05 10:49:58+00:00,en,nytimes.com,https://www.nytimes.com/2025/03/01/us/politics...,https://web.archive.org/web/20250305104958id_/...,https://web.archive.org/web/20250305104958/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Congressional Republicans, egged on by Elon Mu...","[federal judges, judges, Trump, Elon Musk, fed..."
3,The Bewildering Irony Behind the Trump-Musk Pa...,2025-02-27,2025-03-05 19:47:03+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/27/opinion/tru...,https://web.archive.org/web/20250305194703id_/...,https://web.archive.org/web/20250305194703/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nJamelle Bouie\nThe Bewildering I...,"[president, executive, Musk, Trump, executive ..."
4,"Trump, Musk float idea of $5,000 'DOGE dividen...",2025-02-27,2025-03-05 23:14:15+00:00,en,cnbc.com,https://www.cnbc.com/2025/02/27/trump-musk-pro...,https://web.archive.org/web/20250305231415id_/...,https://web.archive.org/web/20250305231415/htt...,https://wayback-api.archive.org/colsearch/v1/m...,- As the Department of Government Efficiency l...,"[President Donald Trump, DOGE dividend checks,..."
...,...,...,...,...,...,...,...,...,...,...,...
769,Elon Musk-led group makes shock $97.4B bid for...,2025-02-10,2025-02-11 14:21:33+00:00,en,nypost.com,https://nypost.com/2025/02/10/business/elon-mu...,https://web.archive.org/web/20250211142133id_/...,https://web.archive.org/web/20250211142133/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk-led group makes shock $97.4B bid for...,"[Elon Musk-led group, CEO Sam Altman, Sam Altm..."
770,"DOGE targets federal leases, shaking commercia...",2025-02-10,2025-02-11 07:31:56+00:00,en,cnn.com,https://www.cnn.com/2025/02/10/business/return...,https://web.archive.org/web/20250211073156id_/...,https://web.archive.org/web/20250211073156/htt...,https://wayback-api.archive.org/colsearch/v1/m...,For many companies â and possibly the federal ...,"[office, office space, commercial real estate,..."
771,Musk charges on with new targets in sight and ...,2025-02-10,2025-02-11 18:53:27+00:00,en,cnn.com,https://www.cnn.com/2025/02/10/politics/musk-t...,https://web.archive.org/web/20250211185327id_/...,https://web.archive.org/web/20250211185327/htt...,https://wayback-api.archive.org/colsearch/v1/m...,The vast and opaque power of Elon Musk is only...,"[Musk, Trump, Elon Musk, government shredding ..."
772,"Trump orders halt on ""wasteful"" pennies after ...",2025-02-10,2025-02-11 09:51:31+00:00,en,newsweek.com,https://www.newsweek.com/trump-orders-halt-was...,https://web.archive.org/web/20250211095131id_/...,https://web.archive.org/web/20250211095131/htt...,https://wayback-api.archive.org/colsearch/v1/m...,President Donald Trump has directed the Treasu...,"[President Donald Trump, President Donald, Don..."


## Embeddings

In [8]:
import os
import openai
import dotenv
dotenv.load_dotenv()

openai.organization = None
openai.api_key = os.getenv("OPENAI_API_KEY")
# openai.Model.list() # see all openai models

In [9]:
# exclude urls that are videos (contain /video/)    
stories = stories[~stories.url.str.contains("/video/")]
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords
0,Elon Musk,2025-03-05,2025-03-05 17:44:22+00:00,en,foxnews.com,https://www.foxnews.com/category/person/elon-musk,https://web.archive.org/web/20250305174422id_/...,https://web.archive.org/web/20250305174422/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Who is Andrew Lennox, the veteran who was Slot...","[Andrew Lennox, President Donald Trump, speech..."
1,Social Security has never missed a payment. DO...,2025-03-01,2025-03-05 23:14:15+00:00,en,cnbc.com,https://www.cnbc.com/2025/03/01/doge-actions-m...,https://web.archive.org/web/20250305231415id_/...,https://web.archive.org/web/20250305231415/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Social Security has never missed a payment. DO...,"[Social Security Administration, Social Securi..."
2,Musk and Republican Lawmakers Pressure Judges ...,2025-03-01,2025-03-05 10:49:58+00:00,en,nytimes.com,https://www.nytimes.com/2025/03/01/us/politics...,https://web.archive.org/web/20250305104958id_/...,https://web.archive.org/web/20250305104958/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Congressional Republicans, egged on by Elon Mu...","[federal judges, judges, Trump, Elon Musk, fed..."
3,The Bewildering Irony Behind the Trump-Musk Pa...,2025-02-27,2025-03-05 19:47:03+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/27/opinion/tru...,https://web.archive.org/web/20250305194703id_/...,https://web.archive.org/web/20250305194703/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nJamelle Bouie\nThe Bewildering I...,"[president, executive, Musk, Trump, executive ..."
4,"Trump, Musk float idea of $5,000 'DOGE dividen...",2025-02-27,2025-03-05 23:14:15+00:00,en,cnbc.com,https://www.cnbc.com/2025/02/27/trump-musk-pro...,https://web.archive.org/web/20250305231415id_/...,https://web.archive.org/web/20250305231415/htt...,https://wayback-api.archive.org/colsearch/v1/m...,- As the Department of Government Efficiency l...,"[President Donald Trump, DOGE dividend checks,..."
...,...,...,...,...,...,...,...,...,...,...,...
769,Elon Musk-led group makes shock $97.4B bid for...,2025-02-10,2025-02-11 14:21:33+00:00,en,nypost.com,https://nypost.com/2025/02/10/business/elon-mu...,https://web.archive.org/web/20250211142133id_/...,https://web.archive.org/web/20250211142133/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk-led group makes shock $97.4B bid for...,"[Elon Musk-led group, CEO Sam Altman, Sam Altm..."
770,"DOGE targets federal leases, shaking commercia...",2025-02-10,2025-02-11 07:31:56+00:00,en,cnn.com,https://www.cnn.com/2025/02/10/business/return...,https://web.archive.org/web/20250211073156id_/...,https://web.archive.org/web/20250211073156/htt...,https://wayback-api.archive.org/colsearch/v1/m...,For many companies â and possibly the federal ...,"[office, office space, commercial real estate,..."
771,Musk charges on with new targets in sight and ...,2025-02-10,2025-02-11 18:53:27+00:00,en,cnn.com,https://www.cnn.com/2025/02/10/politics/musk-t...,https://web.archive.org/web/20250211185327id_/...,https://web.archive.org/web/20250211185327/htt...,https://wayback-api.archive.org/colsearch/v1/m...,The vast and opaque power of Elon Musk is only...,"[Musk, Trump, Elon Musk, government shredding ..."
772,"Trump orders halt on ""wasteful"" pennies after ...",2025-02-10,2025-02-11 09:51:31+00:00,en,newsweek.com,https://www.newsweek.com/trump-orders-halt-was...,https://web.archive.org/web/20250211095131id_/...,https://web.archive.org/web/20250211095131/htt...,https://wayback-api.archive.org/colsearch/v1/m...,President Donald Trump has directed the Treasu...,"[President Donald Trump, President Donald, Don..."


In [10]:
# Import modules
import tiktoken
from openai import OpenAI
client = OpenAI()

# Set embedding model parameters
embedding_model = "text-embedding-3-small" # this is the model we will use to make embeddings
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# Get the encoding for the specified model
encoding = tiktoken.get_encoding(embedding_encoding)

# Make a new column with the combined title and summary
stories["combined"] = (
    "Title: " + stories.title.str.strip() + "; Content: " + stories.snippet.str.strip()
)

# Make a new column with the number of tokens in the combined title and summary
stories["n_tokens"] = stories.combined.apply(lambda x: len(encoding.encode(x)))

# Sort by that column
stories = stories.sort_values(by='n_tokens', ascending=False)

# Display the bills
stories


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories["combined"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories["n_tokens"] = stories.combined.apply(lambda x: len(encoding.encode(x)))


Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,combined,n_tokens
705,Musk Asserts Without Proof That He’s Fixing a ...,2025-02-11,2025-02-14 06:28:23+00:00,en,nytimes.com,https://www.nytimes.com/live/2025/02/11/us/pre...,https://web.archive.org/web/20250214062823id_/...,https://web.archive.org/web/20250214062823/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Musk Asserts Without Proof That He’s Fixing a ...,"[President Trump, Trump, Trump administration,...",Title: Musk Asserts Without Proof That He’s Fi...,9950
662,DOGE announces its slashing $881m from Educati...,2025-02-11,2025-02-14 00:39:14+00:00,en,newsweek.com,https://www.newsweek.com/doge-announces-its-sl...,https://web.archive.org/web/20250214003914id_/...,https://web.archive.org/web/20250214003914/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk's Department of Government Efficienc...,"[President Donald Trump, Elon Musk DOGE, Elon ...",Title: DOGE announces its slashing $881m from ...,7665
190,"Trump and Musk, the ‘Co-Presidents’",2025-02-19,2025-02-21 04:33:56+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/19/opinion/tru...,https://web.archive.org/web/20250221043356id_/...,https://web.archive.org/web/20250221043356/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"transcript\nTrump and Musk, the ‘Co-Presidents...","[aaron retica, Jamelle Bouie, retica, executiv...","Title: Trump and Musk, the ‘Co-Presidents’; Co...",5839
477,"Elon Musk shadow looms over Bastrop, Texas. Ca...",2025-02-14,2025-02-16 16:09:46+00:00,en,usatoday.com,https://www.usatoday.com/story/money/personalf...,https://web.archive.org/web/20250216160946id_/...,https://web.archive.org/web/20250216160946/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk's shadow looms over this tiny Texas ...,"[Bastrop, Bastrop Economic Development, Bastro...","Title: Elon Musk shadow looms over Bastrop, Te...",3962
264,Elon Musk Is Leading a ‘Hostile Takeover of th...,2025-02-18,2025-02-20 15:30:25+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/18/opinion/mus...,https://web.archive.org/web/20250220153025id_/...,https://web.archive.org/web/20250220153025/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nGuest Essay\nElon Musk Is Leadin...,"[Musk, Trump, Elon Musk, President Trump, Trum...",Title: Elon Musk Is Leading a ‘Hostile Takeove...,3687
...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,Elon Musk Seeks Access to Personal IRS Records...,2025-02-17,2025-02-21 09:23:58+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/17/elon-musk...,https://web.archive.org/web/20250221092358id_/...,https://web.archive.org/web/20250221092358/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk Seeks Access to Personal IRS Records...,"[Personal IRS Records, Elon Musk Seeks, Musk S...",Title: Elon Musk Seeks Access to Personal IRS ...,89
583,"Elon Musk Takes Center Stage, Tells Reporters ...",2025-02-12,2025-02-14 05:34:07+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/12/elon-musk...,https://web.archive.org/web/20250214053407id_/...,https://web.archive.org/web/20250214053407/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Elon Musk Takes Center Stage, Tells Reporters ...","[Center Stage, Elon Musk, Elon Musk defending,...","Title: Elon Musk Takes Center Stage, Tells Rep...",88
340,Lydia Moynihan: DOGE Is Making It A Lot Harder...,2025-02-17,2025-02-20 14:14:55+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/17/lydia-moy...,https://web.archive.org/web/20250220141455id_/...,https://web.archive.org/web/20250220141455/htt...,https://wayback-api.archive.org/colsearch/v1/m...,New York Post financial correspondent Lydia Mo...,"[York Post financial, correspondent Lydia Moyn...",Title: Lydia Moynihan: DOGE Is Making It A Lot...,87
175,"DOGE Fires Bird Flu Experts, Claims $8 Billion...",2025-02-19,2025-02-21 04:46:03+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/19/doge-fire...,https://web.archive.org/web/20250221044603id_/...,https://web.archive.org/web/20250221044603/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"DOGE Fires Bird Flu Experts, Claims $8 Billion...","[DOGE Fires Bird, Bird Flu Experts, Fires Bird...","Title: DOGE Fires Bird Flu Experts, Claims $8 ...",81


In [11]:
# Grab the rows where the text is too big for the context window of the mmodel (>8000 tokens)
too_long = stories.query("n_tokens > @max_tokens") 

# Print how many will be removed
print(f"Removing {len(too_long)} stories that are too long")

# Display the removed stories here in this cell so we can see what we're losing
display(too_long)  

# Remove the rows where the text is too big for the context window of the model
stories = stories.query("n_tokens <= @max_tokens")  

Removing 1 stories that are too long


Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,combined,n_tokens
705,Musk Asserts Without Proof That He’s Fixing a ...,2025-02-11,2025-02-14 06:28:23+00:00,en,nytimes.com,https://www.nytimes.com/live/2025/02/11/us/pre...,https://web.archive.org/web/20250214062823id_/...,https://web.archive.org/web/20250214062823/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Musk Asserts Without Proof That He’s Fixing a ...,"[President Trump, Trump, Trump administration,...",Title: Musk Asserts Without Proof That He’s Fi...,9950


In [12]:
from openai import OpenAI
client = OpenAI()

def get_embeddings(texts, model="text-embedding-3-small"):
    # Replace newlines in each text and ensure it's a list of texts
    texts = [text.replace("\n", " ") for text in texts]
    # OpenAI's embeddings.create can process multiple inputs as a list
    response = client.embeddings.create(input=texts, model=model)
    # Extract embeddings from the response
    embeddings = [item.embedding for item in response.data]
    return embeddings

# Function to process DataFrame in batches and return a list of embeddings
def process_in_batches(df, column_name, batch_size=10):
    # Break the DataFrame into batches of size `batch_size`
    batches = [df[column_name].iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]
    # Process each batch and collect embeddings
    all_embeddings = []
    for batch in tqdm(batches, desc="Processing batches"):
        batch_embeddings = get_embeddings(batch.tolist())
        all_embeddings.extend(batch_embeddings)
    return all_embeddings

# Example usage
batch_size = 100  # Adjust based on your preference and rate limits
stories['embedding'] = process_in_batches(stories, 'combined', batch_size=batch_size)


Processing batches:   0%|          | 0/7 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories['embedding'] = process_in_batches(stories, 'combined', batch_size=batch_size)


In [13]:
# drop combined column since those were only for the purposes of making the embeddings
stories = stories.drop(columns=['combined', 'n_tokens'])

## Dimensionality Reduction (t-SNE)


In [14]:
from sklearn.manifold import TSNE
import numpy as np

# check if vis_dims exists
if os.path.exists("output/stories-with-vis-dims.csv"):
    stories = pd.read_csv("output/stories-with-vis-dims.csv")
else: 
    # Convert to a list of lists of floats
    matrix = np.array(stories.embedding.to_list())

    # Create a t-SNE model and transform the data
    tsne = TSNE(n_components=2, perplexity=30, random_state=42, init='random', learning_rate=400)
    vis_dims = tsne.fit_transform(matrix)

    # add to dataframe and write to csv
    stories = stories\
        .assign(
            x = vis_dims[:,0], 
            y = vis_dims[:,1])


In [15]:
# stories.to_csv('output/stories-with-nlp.csv', index=False)
stories.head()

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,embedding,x,y
662,DOGE announces its slashing $881m from Educati...,2025-02-11,2025-02-14 00:39:14+00:00,en,newsweek.com,https://www.newsweek.com/doge-announces-its-sl...,https://web.archive.org/web/20250214003914id_/...,https://web.archive.org/web/20250214003914/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk's Department of Government Efficienc...,"[President Donald Trump, Elon Musk DOGE, Elon ...","[0.03996237367391586, -0.0192171148955822, 0.0...",-13.81706,-5.48476
190,"Trump and Musk, the ‘Co-Presidents’",2025-02-19,2025-02-21 04:33:56+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/19/opinion/tru...,https://web.archive.org/web/20250221043356id_/...,https://web.archive.org/web/20250221043356/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"transcript\nTrump and Musk, the ‘Co-Presidents...","[aaron retica, Jamelle Bouie, retica, executiv...","[0.02634086087346077, 0.01618337258696556, -0....",9.890508,-15.662511
477,"Elon Musk shadow looms over Bastrop, Texas. Ca...",2025-02-14,2025-02-16 16:09:46+00:00,en,usatoday.com,https://www.usatoday.com/story/money/personalf...,https://web.archive.org/web/20250216160946id_/...,https://web.archive.org/web/20250216160946/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk's shadow looms over this tiny Texas ...,"[Bastrop, Bastrop Economic Development, Bastro...","[0.012421227991580963, -0.012653255835175514, ...",21.240904,-0.582203
264,Elon Musk Is Leading a ‘Hostile Takeover of th...,2025-02-18,2025-02-20 15:30:25+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/18/opinion/mus...,https://web.archive.org/web/20250220153025id_/...,https://web.archive.org/web/20250220153025/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nGuest Essay\nElon Musk Is Leadin...,"[Musk, Trump, Elon Musk, President Trump, Trum...","[0.03440450131893158, -0.003716809442266822, 0...",11.243999,-14.702011
716,Elon Musk is igniting a fierce debate with mov...,2025-02-10,2025-02-11 18:56:00+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/2...,https://web.archive.org/web/20250211185600id_/...,https://web.archive.org/web/20250211185600/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk is igniting a fierce debate with mov...,"[Taylor Wilson, USA TODAY, Elon Musk, Zac Ande...","[0.03692523390054703, 0.014707022346556187, -0...",9.270676,-16.019339


# Topic Modeling

In [16]:
stories.reset_index(drop=True, inplace=True)

In [17]:
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,embedding,x,y
0,DOGE announces its slashing $881m from Educati...,2025-02-11,2025-02-14 00:39:14+00:00,en,newsweek.com,https://www.newsweek.com/doge-announces-its-sl...,https://web.archive.org/web/20250214003914id_/...,https://web.archive.org/web/20250214003914/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk's Department of Government Efficienc...,"[President Donald Trump, Elon Musk DOGE, Elon ...","[0.03996237367391586, -0.0192171148955822, 0.0...",-13.817060,-5.484760
1,"Trump and Musk, the ‘Co-Presidents’",2025-02-19,2025-02-21 04:33:56+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/19/opinion/tru...,https://web.archive.org/web/20250221043356id_/...,https://web.archive.org/web/20250221043356/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"transcript\nTrump and Musk, the ‘Co-Presidents...","[aaron retica, Jamelle Bouie, retica, executiv...","[0.02634086087346077, 0.01618337258696556, -0....",9.890508,-15.662511
2,"Elon Musk shadow looms over Bastrop, Texas. Ca...",2025-02-14,2025-02-16 16:09:46+00:00,en,usatoday.com,https://www.usatoday.com/story/money/personalf...,https://web.archive.org/web/20250216160946id_/...,https://web.archive.org/web/20250216160946/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk's shadow looms over this tiny Texas ...,"[Bastrop, Bastrop Economic Development, Bastro...","[0.012421227991580963, -0.012653255835175514, ...",21.240904,-0.582203
3,Elon Musk Is Leading a ‘Hostile Takeover of th...,2025-02-18,2025-02-20 15:30:25+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/18/opinion/mus...,https://web.archive.org/web/20250220153025id_/...,https://web.archive.org/web/20250220153025/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nGuest Essay\nElon Musk Is Leadin...,"[Musk, Trump, Elon Musk, President Trump, Trum...","[0.03440450131893158, -0.003716809442266822, 0...",11.243999,-14.702011
4,Elon Musk is igniting a fierce debate with mov...,2025-02-10,2025-02-11 18:56:00+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/2...,https://web.archive.org/web/20250211185600id_/...,https://web.archive.org/web/20250211185600/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk is igniting a fierce debate with mov...,"[Taylor Wilson, USA TODAY, Elon Musk, Zac Ande...","[0.03692523390054703, 0.014707022346556187, -0...",9.270676,-16.019339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,Elon Musk Seeks Access to Personal IRS Records...,2025-02-17,2025-02-21 09:23:58+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/17/elon-musk...,https://web.archive.org/web/20250221092358id_/...,https://web.archive.org/web/20250221092358/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Elon Musk Seeks Access to Personal IRS Records...,"[Personal IRS Records, Elon Musk Seeks, Musk S...","[0.01385028101503849, -0.015173722989857197, 0...",5.760443,22.323505
625,"Elon Musk Takes Center Stage, Tells Reporters ...",2025-02-12,2025-02-14 05:34:07+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/12/elon-musk...,https://web.archive.org/web/20250214053407id_/...,https://web.archive.org/web/20250214053407/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Elon Musk Takes Center Stage, Tells Reporters ...","[Center Stage, Elon Musk, Elon Musk defending,...","[0.009480154141783714, -0.032140035182237625, ...",8.564391,-10.346865
626,Lydia Moynihan: DOGE Is Making It A Lot Harder...,2025-02-17,2025-02-20 14:14:55+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/17/lydia-moy...,https://web.archive.org/web/20250220141455id_/...,https://web.archive.org/web/20250220141455/htt...,https://wayback-api.archive.org/colsearch/v1/m...,New York Post financial correspondent Lydia Mo...,"[York Post financial, correspondent Lydia Moyn...","[0.0432523638010025, -0.02308301441371441, 0.0...",-5.653185,13.682786
627,"DOGE Fires Bird Flu Experts, Claims $8 Billion...",2025-02-19,2025-02-21 04:46:03+00:00,en,foxnews.com,https://radio.foxnews.com/2025/02/19/doge-fire...,https://web.archive.org/web/20250221044603id_/...,https://web.archive.org/web/20250221044603/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"DOGE Fires Bird Flu Experts, Claims $8 Billion...","[DOGE Fires Bird, Bird Flu Experts, Fires Bird...","[0.010718096047639847, -0.023538852110505104, ...",-15.463957,25.232447


In [18]:
from sklearn.cluster import DBSCAN
# Convert embedding to a NumPy array
X = np.stack(stories['embedding'].values)

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=10)  # Adjust eps and min_samples as per your requirement
labels = dbscan.fit_predict(X) + 1  # +1 to avoid -1 as a label

# Assign topics to DataFrame
stories['topic'] = labels

# Group articles by topic
grouped = stories.groupby('topic')

# sort groups by size
grouped = sorted(grouped, key=lambda x: len(x[1]), reverse=True)

# assign group numbers back to stories
for i, (name, group) in enumerate(grouped):
    # TODO: I THINK THIS IS BROKEN 🐛, getting weird items into
    stories.loc[stories['topic'] == name, 'topic'] = name

print("Number of groups:", len(grouped))
# Number of items in each group
print("Group sizes:")
print([len(group) for name, group in grouped])



Number of groups: 4
Group sizes:
[576, 25, 18, 10]


In [19]:
def summarize_topic(titles):
    """
    Pass list of titles to ChatGPT and ask it to summarize them in 2-4 words.
    """

    # Combine the titles into a single string
    titles_str = ', '.join(titles)

    # print("Writing a title for")
    # for title in titles[:5]:
    #     print(f"  - {title}")
    
    MODEL = "gpt-3.5-turbo"
    
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"The following article titles form a topic. \n\n {titles_str} \n\n Please write a specific summary of the topic in 2-4 words:"},
        ],
        max_tokens=10
    )

    return response.choices[0].message.content

In [20]:
# make a list of titles per topic
topic_titles = stories.groupby('topic')['title'].apply(list).to_dict()
topic_titles = [{
    'topic': k,
    'num_articles': len(v),
    'headlines': v
} for k,v in topic_titles.items()]

# sort by num_articles
topic_titles = sorted(topic_titles, key=lambda x: x['num_articles'], reverse=True)

# pass each topic list of titles to openai chatgpt and ask it to summarize the topic in 2-4 words
for topic in topic_titles[0:]:
    print(f"Topic {topic['topic']} ({topic['num_articles']} articles)")
    
    if topic['topic'] == 0:
        topic['topic_summary'] = "uncategorized"
        continue

    # if there are more than 10 articles in a topic, sample 10 (to keep within the word limit of the API)
    if topic['num_articles'] >= 10:
        headlines = np.random.choice(topic['headlines'], 10, replace=False)
        # Summary
        try:
            topic['topic_summary'] = summarize_topic(headlines)
            print(topic['topic_summary'])
        except InvalidRequestError:
            topic['topic_summary'] = "Error Making Summary From OpenAI API"
            print("OpenAI API request failed.")
    else:
        headlines = topic['headlines']

Topic 0 (576 articles)
Topic 1 (25 articles)
Elon Musk's Influence in Government
Topic 2 (18 articles)
Elon Musk's OpenAI Bid
Topic 3 (10 articles)
DOGE stimulus check debate


In [21]:
# turn topic titles and summaries into a dataframe
topic_titles_df = pd.DataFrame(topic_titles)
topic_titles_df = topic_titles_df[['topic', 'topic_summary']]
stories = stories.merge(topic_titles_df, on='topic', how='left')

# Collect Metadata

In [22]:
# loop through topic_titles
topic_metadata = []
for topic in topic_titles:
    # grab topic, num_articles, and summary only
    topic = {k:v for k,v in topic.items() if k in ['topic', 'num_articles', 'topic_summary']}
    topic_metadata.append(topic)

topic_metadata

[{'topic': 0, 'num_articles': 576, 'topic_summary': 'uncategorized'},
 {'topic': 1,
  'num_articles': 25,
  'topic_summary': "Elon Musk's Influence in Government"},
 {'topic': 2, 'num_articles': 18, 'topic_summary': "Elon Musk's OpenAI Bid"},
 {'topic': 3,
  'num_articles': 10,
  'topic_summary': 'DOGE stimulus check debate'}]

In [23]:
# read output/metadata.json
import json
with open('output/metadata.json') as f:
    metadata = json.load(f)

metadata['topics'] = topic_metadata

# write metadata back to json file
with open('output/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=4)

metadata

{'start': '2025-02-10T00:00:00',
 'end': '2025-03-07T23:29:10.309958',
 'query': '(title:Musk OR title:DOGE)',
 'query_raw': '(title:Musk OR title:DOGE) AND language:en AND domain:(nytimes.com OR cnn.com OR foxnews.com OR nypost.com OR washingtonpost.com OR usatoday.com OR cnbc.com OR theguardian.com OR breakingnews.com OR buzzfeed.com OR cbsnews.com OR reuters.com OR huffingtonpost.com OR usnews.com OR latimes.com OR politico.com OR newsweek.com OR breitbart.com)',
 'topics': [{'topic': 0,
   'num_articles': 576,
   'topic_summary': 'uncategorized'},
  {'topic': 1,
   'num_articles': 25,
   'topic_summary': "Elon Musk's Influence in Government"},
  {'topic': 2, 'num_articles': 18, 'topic_summary': "Elon Musk's OpenAI Bid"},
  {'topic': 3,
   'num_articles': 10,
   'topic_summary': 'DOGE stimulus check debate'}]}

In [24]:
# collect top keywords
top_keywords = stories\
    .explode('keywords')\
    .groupby('keywords')\
    .size()\
    .reset_index(name='count')\
    .sort_values(by='count', ascending=False)\
    .head(100)
    

# Write to file

In [25]:
stories.to_csv('output/stories-with-embeddings.csv', index=False)
stories[['title', 'publication_date', 'domain', 'topic','topic_summary', 'x','y','url']].to_csv('../stories-with-embeddings.csv',index=False)


In [27]:
# copy output/metadata.json to ../example-finished/metadata.json
import shutil
shutil.copy('output/metadata.json', '../example-finished/metadata.json')


'../example-finished/metadata.json'