# NLP

## Setup 

This setup allows you to use *Python* and *R* in the same notebook.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
from tqdm.notebook import tqdm
tqdm.pandas()

## Load Data & Remove Duplciates 🧹

In [4]:
# read data
stories = pd.read_csv("output/stories_df.csv", 
                 parse_dates=['publication_date', 'capture_time'])


duplicates to delete

In [5]:
dedupe_by = ['title', 'domain']
stories[stories.duplicated(subset=dedupe_by, keep=False)]\
    .sort_values(by=dedupe_by)\
    .head()

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet
240,$153 million of cocaine found hidden in banana...,2025-07-17,2025-07-18 10:38:29+00:00,en,cbsnews.com,https://www.cbsnews.com/news/cocaine-hidden-ba...,https://web.archive.org/web/20250718103829id_/...,https://web.archive.org/web/20250718103829/htt...,https://wayback-api.archive.org/colsearch/v1/m...,$153 million of cocaine found hidden in banana...
245,$153 million of cocaine found hidden in banana...,2025-07-17,2025-07-18 10:38:36+00:00,en,cbsnews.com,https://www.cbsnews.com/amp/news/cocaine-hidde...,https://web.archive.org/web/20250718103836id_/...,https://web.archive.org/web/20250718103836/htt...,https://wayback-api.archive.org/colsearch/v1/m...,$153 million of cocaine found hidden in banana...
225,'All the options': GOP eyes cutting August rec...,2025-07-21,2025-07-23 01:41:04+00:00,en,foxnews.com,https://www.foxnews.com/politics/all-options-g...,https://web.archive.org/web/20250723014104id_/...,https://web.archive.org/web/20250723014104/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Senate Republican leadership is weighing wheth...
226,'All the options': GOP eyes cutting August rec...,2025-07-21,2025-07-23 01:41:16+00:00,en,foxnews.com,https://www.foxnews.com/politics/all-options-g...,https://web.archive.org/web/20250723014116id_/...,https://web.archive.org/web/20250723014116/htt...,https://wayback-api.archive.org/colsearch/v1/m...,'All the options': GOP eyes cutting August rec...
1255,'America will be open': Casey Wasserman assure...,2025-03-20,2025-04-01 03:09:23+00:00,en,latimes.com,https://www.latimes.com/sports/story/2025-03-2...,https://web.archive.org/web/20250401030923id_/...,https://web.archive.org/web/20250401030923/htt...,https://wayback-api.archive.org/colsearch/v1/m...,- Share via\n-\nIn an address to the Internati...


In [6]:
# remove duplicates
stories.drop_duplicates(subset=dedupe_by, keep='last', inplace=True)

# preview
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet
0,Fears of a French Government Collapse Send Its...,2025-08-26,2025-08-27 02:36:31+00:00,en,nytimes.com,https://www.nytimes.com/2025/08/26/business/fr...,https://web.archive.org/web/20250827023631id_/...,https://web.archive.org/web/20250827023631/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nFears of a French Government Col...
2,"In Turkey, your coffee comes with a side of de...",2025-08-26,2025-08-27 10:28:36+00:00,en,cnn.com,https://www.cnn.com/2025/08/26/travel/turkey-c...,https://web.archive.org/web/20250827102836id_/...,https://web.archive.org/web/20250827102836/htt...,https://wayback-api.archive.org/colsearch/v1/m...,EDITORâS NOTE:Â This CNN Travel series may hav...
3,29 Pairs Of Sandals For Anyone Who Refuses To ...,2025-08-26,2025-08-27 10:14:08+00:00,en,buzzfeed.com,https://www.buzzfeed.com/cierracowan/pairs-san...,https://web.archive.org/web/20250827101408id_/...,https://web.archive.org/web/20250827101408/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Unless otherwise specified, all sizes in this ..."
4,"Garnacho, Jackson, Zinchenko and the other pla...",2025-08-25,2025-08-25 09:43:18+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6568993/2025/...,https://web.archive.org/web/20250825094318id_/...,https://web.archive.org/web/20250825094318/htt...,https://wayback-api.archive.org/colsearch/v1/m...,More than £2billion has been spent by Premier ...
6,Banned! The 20 books they didn’t want you to read,2025-08-23,2025-08-24 03:50:04+00:00,en,theguardian.com,https://www.theguardian.com/books/2025/aug/23/...,https://web.archive.org/web/20250824035004id_/...,https://web.archive.org/web/20250824035004/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"The banning of books, it would be easy to thin..."
...,...,...,...,...,...,...,...,...,...,...
1782,What will Thomas Tuchel do on his first day as...,2025-01-01,2025-01-02 04:04:39+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6021619/2025/...,https://web.archive.org/web/20250102040439id_/...,https://web.archive.org/web/20250102040439/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Thomas Tuchel is no longer officially unemploy...
1783,Ukraine halts Russian gas supplies to Europe a...,2025-01-01,2025-01-02 03:29:41+00:00,en,nypost.com,https://nypost.com/2025/01/01/world-news/ukrai...,https://web.archive.org/web/20250102032941id_/...,https://web.archive.org/web/20250102032941/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Ukraine halts Russian gas supplies to Europe a...
1784,Country with the world’s best cuisine named — ...,2025-01-01,2025-01-02 17:55:25+00:00,en,nypost.com,https://nypost.com/2025/01/01/lifestyle/this-c...,https://web.archive.org/web/20250102175525id_/...,https://web.archive.org/web/20250102175525/htt...,https://wayback-api.archive.org/colsearch/v1/m...,It’s around the world in 100 cuisines.\nNaming...
1785,Cheap vacation tip: 7 budget-friendly European...,2025-01-01,2025-01-03 11:28:39+00:00,en,usatoday.com,https://www.usatoday.com/story/travel/news/202...,https://web.archive.org/web/20250103112839id_/...,https://web.archive.org/web/20250103112839/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Visit Europe, but don't break the bank: 7 of t..."


# Keywords

In [7]:
from yake import KeywordExtractor
from pandarallel import pandarallel

kw_extractor = KeywordExtractor()

def get_keywords(text):
    keywords = kw_extractor.extract_keywords(text)
    return [x for x,y in keywords]

pandarallel.initialize(progress_bar=True)
stories['keywords'] = stories['snippet'].parallel_apply(get_keywords)

# display
stories

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=191), Label(value='0 / 191'))), HB…

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords
0,Fears of a French Government Collapse Send Its...,2025-08-26,2025-08-27 02:36:31+00:00,en,nytimes.com,https://www.nytimes.com/2025/08/26/business/fr...,https://web.archive.org/web/20250827023631id_/...,https://web.archive.org/web/20250827023631/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nFears of a French Government Col...,"[France, Liz Alderman, French, Borrowing Costs..."
2,"In Turkey, your coffee comes with a side of de...",2025-08-26,2025-08-27 10:28:36+00:00,en,cnn.com,https://www.cnn.com/2025/08/26/travel/turkey-c...,https://web.archive.org/web/20250827102836id_/...,https://web.archive.org/web/20250827102836/htt...,https://wayback-api.archive.org/colsearch/v1/m...,EDITORâS NOTE:Â This CNN Travel series may hav...,"[CNN Travel series, Turkish coffee, coffee, CN..."
3,29 Pairs Of Sandals For Anyone Who Refuses To ...,2025-08-26,2025-08-27 10:14:08+00:00,en,buzzfeed.com,https://www.buzzfeed.com/cierracowan/pairs-san...,https://web.archive.org/web/20250827101408id_/...,https://web.archive.org/web/20250827101408/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Unless otherwise specified, all sizes in this ...","[sandals, sizes, Promising review, Amazon, Pro..."
4,"Garnacho, Jackson, Zinchenko and the other pla...",2025-08-25,2025-08-25 09:43:18+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6568993/2025/...,https://web.archive.org/web/20250825094318id_/...,https://web.archive.org/web/20250825094318/htt...,https://wayback-api.archive.org/colsearch/v1/m...,More than £2billion has been spent by Premier ...,"[Premier League, Premier League sides, Premier..."
6,Banned! The 20 books they didn’t want you to read,2025-08-23,2025-08-24 03:50:04+00:00,en,theguardian.com,https://www.theguardian.com/books/2025/aug/23/...,https://web.archive.org/web/20250824035004id_/...,https://web.archive.org/web/20250824035004/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"The banning of books, it would be easy to thin...","[school districts, American school districts, ..."
...,...,...,...,...,...,...,...,...,...,...,...
1782,What will Thomas Tuchel do on his first day as...,2025-01-01,2025-01-02 04:04:39+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6021619/2025/...,https://web.archive.org/web/20250102040439id_/...,https://web.archive.org/web/20250102040439/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Thomas Tuchel is no longer officially unemploy...,"[Tuchel, England manager, England, George ’s P..."
1783,Ukraine halts Russian gas supplies to Europe a...,2025-01-01,2025-01-02 03:29:41+00:00,en,nypost.com,https://nypost.com/2025/01/01/world-news/ukrai...,https://web.archive.org/web/20250102032941id_/...,https://web.archive.org/web/20250102032941/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Ukraine halts Russian gas supplies to Europe a...,"[Russian gas supplies, Russian gas, Russian na..."
1784,Country with the world’s best cuisine named — ...,2025-01-01,2025-01-02 17:55:25+00:00,en,nypost.com,https://nypost.com/2025/01/01/lifestyle/this-c...,https://web.archive.org/web/20250102175525id_/...,https://web.archive.org/web/20250102175525/htt...,https://wayback-api.archive.org/colsearch/v1/m...,It’s around the world in 100 cuisines.\nNaming...,"[cuisine, culinary Mount Olympus, deemed Greek..."
1785,Cheap vacation tip: 7 budget-friendly European...,2025-01-01,2025-01-03 11:28:39+00:00,en,usatoday.com,https://www.usatoday.com/story/travel/news/202...,https://web.archive.org/web/20250103112839id_/...,https://web.archive.org/web/20250103112839/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Visit Europe, but don't break the bank: 7 of t...","[Chase Travel, travel, Chase Travel booking, R..."


## Embeddings

In [8]:
import os
import openai
import dotenv
dotenv.load_dotenv()

openai.organization = None
openai.api_key = os.getenv("OPENAI_API_KEY")
# openai.Model.list() # see all openai models

In [9]:
# exclude urls that are videos (contain /video/)    
stories = stories[~stories.url.str.contains("/video/")]
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords
0,Fears of a French Government Collapse Send Its...,2025-08-26,2025-08-27 02:36:31+00:00,en,nytimes.com,https://www.nytimes.com/2025/08/26/business/fr...,https://web.archive.org/web/20250827023631id_/...,https://web.archive.org/web/20250827023631/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nFears of a French Government Col...,"[France, Liz Alderman, French, Borrowing Costs..."
2,"In Turkey, your coffee comes with a side of de...",2025-08-26,2025-08-27 10:28:36+00:00,en,cnn.com,https://www.cnn.com/2025/08/26/travel/turkey-c...,https://web.archive.org/web/20250827102836id_/...,https://web.archive.org/web/20250827102836/htt...,https://wayback-api.archive.org/colsearch/v1/m...,EDITORâS NOTE:Â This CNN Travel series may hav...,"[CNN Travel series, Turkish coffee, coffee, CN..."
3,29 Pairs Of Sandals For Anyone Who Refuses To ...,2025-08-26,2025-08-27 10:14:08+00:00,en,buzzfeed.com,https://www.buzzfeed.com/cierracowan/pairs-san...,https://web.archive.org/web/20250827101408id_/...,https://web.archive.org/web/20250827101408/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Unless otherwise specified, all sizes in this ...","[sandals, sizes, Promising review, Amazon, Pro..."
4,"Garnacho, Jackson, Zinchenko and the other pla...",2025-08-25,2025-08-25 09:43:18+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6568993/2025/...,https://web.archive.org/web/20250825094318id_/...,https://web.archive.org/web/20250825094318/htt...,https://wayback-api.archive.org/colsearch/v1/m...,More than £2billion has been spent by Premier ...,"[Premier League, Premier League sides, Premier..."
6,Banned! The 20 books they didn’t want you to read,2025-08-23,2025-08-24 03:50:04+00:00,en,theguardian.com,https://www.theguardian.com/books/2025/aug/23/...,https://web.archive.org/web/20250824035004id_/...,https://web.archive.org/web/20250824035004/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"The banning of books, it would be easy to thin...","[school districts, American school districts, ..."
...,...,...,...,...,...,...,...,...,...,...,...
1782,What will Thomas Tuchel do on his first day as...,2025-01-01,2025-01-02 04:04:39+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6021619/2025/...,https://web.archive.org/web/20250102040439id_/...,https://web.archive.org/web/20250102040439/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Thomas Tuchel is no longer officially unemploy...,"[Tuchel, England manager, England, George ’s P..."
1783,Ukraine halts Russian gas supplies to Europe a...,2025-01-01,2025-01-02 03:29:41+00:00,en,nypost.com,https://nypost.com/2025/01/01/world-news/ukrai...,https://web.archive.org/web/20250102032941id_/...,https://web.archive.org/web/20250102032941/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Ukraine halts Russian gas supplies to Europe a...,"[Russian gas supplies, Russian gas, Russian na..."
1784,Country with the world’s best cuisine named — ...,2025-01-01,2025-01-02 17:55:25+00:00,en,nypost.com,https://nypost.com/2025/01/01/lifestyle/this-c...,https://web.archive.org/web/20250102175525id_/...,https://web.archive.org/web/20250102175525/htt...,https://wayback-api.archive.org/colsearch/v1/m...,It’s around the world in 100 cuisines.\nNaming...,"[cuisine, culinary Mount Olympus, deemed Greek..."
1785,Cheap vacation tip: 7 budget-friendly European...,2025-01-01,2025-01-03 11:28:39+00:00,en,usatoday.com,https://www.usatoday.com/story/travel/news/202...,https://web.archive.org/web/20250103112839id_/...,https://web.archive.org/web/20250103112839/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Visit Europe, but don't break the bank: 7 of t...","[Chase Travel, travel, Chase Travel booking, R..."


In [10]:
# Import modules
import tiktoken
from openai import OpenAI
client = OpenAI()

# Set embedding model parameters
embedding_model = "text-embedding-3-small" # this is the model we will use to make embeddings
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# Get the encoding for the specified model
encoding = tiktoken.get_encoding(embedding_encoding)

# Make a new column with the combined title and summary
stories["combined"] = (
    "Title: " + stories.title.str.strip() + "; Content: " + stories.snippet.str.strip()
)

# Make a new column with the number of tokens in the combined title and summary
stories["n_tokens"] = stories.combined.apply(lambda x: len(encoding.encode(x)))

# Sort by that column
stories = stories.sort_values(by='n_tokens', ascending=False)

# Display the bills
stories


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories["combined"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories["n_tokens"] = stories.combined.apply(lambda x: len(encoding.encode(x)))


Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,combined,n_tokens
1345,"March 12, 2025",2025-03-12,2025-03-18 06:21:51+00:00,en,nypost.com,https://nypost.com/2025/03/12/,https://web.archive.org/web/20250318062151id_/...,https://web.archive.org/web/20250318062151/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"March 12, 2025\nFetterman urges Democrats to t...","[Trump, President Trump, President Donald Trum...","Title: March 12, 2025; Content: March 12, 2025...",17893
47,Who owns every Premier League club – and what ...,2025-08-15,2025-08-15 10:04:29+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6539836/2025/...,https://web.archive.org/web/20250815100429id_/...,https://web.archive.org/web/20250815100429/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Ahead of the start of the new Premier League s...,"[cent, cent stake, cent holding, cent holding ...",Title: Who owns every Premier League club – an...,17603
43,Trump vs. the U.S. Economy,2025-08-16,2025-08-20 13:11:53+00:00,en,nytimes.com,https://www.nytimes.com/2025/08/16/opinion/ezr...,https://web.archive.org/web/20250820131153id_/...,https://web.archive.org/web/20250820131153/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nThe Ezra Klein Show\nTrump vs. t...,"[n’t, ’re, Economy, tariffs, Donald Trump, Tru...",Title: Trump vs. the U.S. Economy; Content: Su...,15171
1166,The 31 Best Things to Do When You Visit Atlanta.,2025-03-28,2025-07-30 12:48:41+00:00,en,usnews.com,https://travel.usnews.com/Atlanta_GA/Things_To...,https://web.archive.org/web/20250730124841id_/...,https://web.archive.org/web/20250730124841/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Courtesy of ferrantraite|Getty Images\n31 Best...,"[Atlanta History Center, Atlanta, Park, View a...",Title: The 31 Best Things to Do When You Visit...,14194
1473,"February 24, 2025: Donald Trump presidency news",2025-02-24,2025-03-12 09:02:16+00:00,en,cnn.com,https://edition.cnn.com/politics/live-news/tru...,https://web.archive.org/web/20250312090216id_/...,https://web.archive.org/web/20250312090216/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Our live coverage of Donald Trumpâs presidency...,"[President Donald Trump, President Emmanuel Ma...","Title: February 24, 2025: Donald Trump preside...",13916
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1421,Carolin Lehmann on BuzzFeed,2025-03-04,2025-04-08 13:04:05+00:00,en,buzzfeed.com,https://www.buzzfeed.com/carolinlehmann,https://web.archive.org/web/20250408130405id_/...,https://web.archive.org/web/20250408130405/htt...,https://wayback-api.archive.org/colsearch/v1/m...,If You Donât Have Any Of These 20 Useful Targe...,"[Youâre Sincerely Missing, Youâre Sincerely, S...",Title: Carolin Lehmann on BuzzFeed; Content: I...,152
1630,Katy Fallon,2025-01-27,2025-01-28 13:38:34+00:00,en,theguardian.com,https://www.theguardian.com/profile/katy-fallon,https://web.archive.org/web/20250128133834id_/...,https://web.archive.org/web/20250128133834/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Katy Fallon\nJanuary 2025\nFebruary 2024\nNove...,"[Katy Fallon, Fallon, Katy, January, herders, ...",Title: Katy Fallon; Content: Katy Fallon\nJanu...,135
1555,"Trump’s Ultimatum to Hamas, and Nonstop Quakes...",2025-02-11,2025-02-12 08:56:25+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/11/podcasts/th...,https://web.archive.org/web/20250212085625id_/...,https://web.archive.org/web/20250212085625/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Niki Kitsantonis is a freelance correspondent ...,"[based in Athens, Niki Kitsantonis, Times base...","Title: Trump’s Ultimatum to Hamas, and Nonstop...",84
1583,The week around the world in 20 pictures,2025-02-07,2025-02-09 06:47:15+00:00,en,theguardian.com,https://www.theguardian.com/artanddesign/galle...,https://web.archive.org/web/20250209064715id_/...,https://web.archive.org/web/20250209064715/htt...,https://wayback-api.archive.org/colsearch/v1/m...,The week around the world in 20 pictures\nPale...,"[Greece and Charli, Charli XCX, earthquakes in...",Title: The week around the world in 20 picture...,78


In [11]:
# Grab the rows where the text is too big for the context window of the mmodel (>8000 tokens)
too_long = stories.query("n_tokens > @max_tokens") 

# Print how many will be removed
print(f"Removing {len(too_long)} stories that are too long")

# Display the removed stories here in this cell so we can see what we're losing
display(too_long)  

# Remove the rows where the text is too big for the context window of the model
stories = stories.query("n_tokens <= @max_tokens")  

Removing 72 stories that are too long


Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,combined,n_tokens
1345,"March 12, 2025",2025-03-12,2025-03-18 06:21:51+00:00,en,nypost.com,https://nypost.com/2025/03/12/,https://web.archive.org/web/20250318062151id_/...,https://web.archive.org/web/20250318062151/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"March 12, 2025\nFetterman urges Democrats to t...","[Trump, President Trump, President Donald Trum...","Title: March 12, 2025; Content: March 12, 2025...",17893
47,Who owns every Premier League club – and what ...,2025-08-15,2025-08-15 10:04:29+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6539836/2025/...,https://web.archive.org/web/20250815100429id_/...,https://web.archive.org/web/20250815100429/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Ahead of the start of the new Premier League s...,"[cent, cent stake, cent holding, cent holding ...",Title: Who owns every Premier League club – an...,17603
43,Trump vs. the U.S. Economy,2025-08-16,2025-08-20 13:11:53+00:00,en,nytimes.com,https://www.nytimes.com/2025/08/16/opinion/ezr...,https://web.archive.org/web/20250820131153id_/...,https://web.archive.org/web/20250820131153/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nThe Ezra Klein Show\nTrump vs. t...,"[n’t, ’re, Economy, tariffs, Donald Trump, Tru...",Title: Trump vs. the U.S. Economy; Content: Su...,15171
1166,The 31 Best Things to Do When You Visit Atlanta.,2025-03-28,2025-07-30 12:48:41+00:00,en,usnews.com,https://travel.usnews.com/Atlanta_GA/Things_To...,https://web.archive.org/web/20250730124841id_/...,https://web.archive.org/web/20250730124841/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Courtesy of ferrantraite|Getty Images\n31 Best...,"[Atlanta History Center, Atlanta, Park, View a...",Title: The 31 Best Things to Do When You Visit...,14194
1473,"February 24, 2025: Donald Trump presidency news",2025-02-24,2025-03-12 09:02:16+00:00,en,cnn.com,https://edition.cnn.com/politics/live-news/tru...,https://web.archive.org/web/20250312090216id_/...,https://web.archive.org/web/20250312090216/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Our live coverage of Donald Trumpâs presidency...,"[President Donald Trump, President Emmanuel Ma...","Title: February 24, 2025: Donald Trump preside...",13916
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,54 Products That Are The Opposite Of Beige Energy,2025-07-16,2025-07-17 12:42:37+00:00,en,buzzfeed.com,https://www.buzzfeed.com/malloryannp/random-pr...,https://web.archive.org/web/20250717124237id_/...,https://web.archive.org/web/20250717124237/htt...,https://wayback-api.archive.org/colsearch/v1/m...,1. A set of limited-edition wildflower Band-Ai...,"[Promising review, Amazon, Promising, review, ...",Title: 54 Products That Are The Opposite Of Be...,8704
233,38 Shoes That Are Super Cute And Incredibly Comfy,2025-07-19,2025-07-20 04:39:21+00:00,en,buzzfeed.com,https://www.buzzfeed.com/gabrielamanjarrez/pai...,https://web.archive.org/web/20250720043921id_/...,https://web.archive.org/web/20250720043921/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Unless otherwise specified, all sizes in this ...","[shoes, Promising review, Amazon, pair, sizes,...",Title: 38 Shoes That Are Super Cute And Incred...,8390
442,Rating Nottingham Forest’s 102 permanent signi...,2025-06-20,2025-06-21 10:25:46+00:00,en,nytimes.com,https://www.nytimes.com/athletic/6424390/2025/...,https://web.archive.org/web/20250621102546id_/...,https://web.archive.org/web/20250621102546/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Since Evangelos Marinakis became majority owne...,"[Forest, Premier League, August, Signed, Premi...",Title: Rating Nottingham Forest’s 102 permanen...,8372
1521,The World's 47 Best Tourist Attractions,2025-02-14,2025-07-30 13:06:58+00:00,en,usnews.com,https://travel.usnews.com/gallery/the-worlds-b...,https://web.archive.org/web/20250730130658id_/...,https://web.archive.org/web/20250730130658/htt...,https://wayback-api.archive.org/colsearch/v1/m...,The places around the globe that draw the most...,"[Getty Images, Images, Getty, World Heritage S...",Title: The World's 47 Best Tourist Attractions...,8284


In [13]:
from openai import OpenAI
client = OpenAI()

def get_embeddings(texts, model="text-embedding-3-small"):
    # Replace newlines in each text and ensure it's a list of texts
    texts = [text.replace("\n", " ") for text in texts]
    # OpenAI's embeddings.create can process multiple inputs as a list
    response = client.embeddings.create(input=texts, model=model)
    # Extract embeddings from the response
    embeddings = [item.embedding for item in response.data]
    return embeddings

# Function to process DataFrame in batches and return a list of embeddings
def process_in_batches(df, column_name, batch_size=10):
    # Break the DataFrame into batches of size `batch_size`
    batches = [df[column_name].iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]
    # Process each batch and collect embeddings
    all_embeddings = []
    for batch in tqdm(batches, desc="Processing batches"):
        batch_embeddings = get_embeddings(batch.tolist())
        all_embeddings.extend(batch_embeddings)
    return all_embeddings

# Example usage
batch_size = 30  # Adjust based on your preference and rate limits
stories['embedding'] = process_in_batches(stories, 'combined', batch_size=batch_size)


Processing batches:   0%|          | 0/49 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories['embedding'] = process_in_batches(stories, 'combined', batch_size=batch_size)


In [14]:
# drop combined column since those were only for the purposes of making the embeddings
stories = stories.drop(columns=['combined', 'n_tokens'])

## Dimensionality Reduction (t-SNE)


In [15]:
from sklearn.manifold import TSNE
import numpy as np

# check if vis_dims exists
if os.path.exists("output/stories-with-vis-dims.csv"):
    stories = pd.read_csv("output/stories-with-vis-dims.csv")
else: 
    # Convert to a list of lists of floats
    matrix = np.array(stories.embedding.to_list())

    # Create a t-SNE model and transform the data
    tsne = TSNE(n_components=2, perplexity=30, random_state=42, init='random', learning_rate=400)
    vis_dims = tsne.fit_transform(matrix)

    # add to dataframe and write to csv
    stories = stories\
        .assign(
            x = vis_dims[:,0], 
            y = vis_dims[:,1])


In [16]:
# stories.to_csv('output/stories-with-nlp.csv', index=False)
stories.head()

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,embedding,x,y
962,How Pope Francis Changed the Catholic Church,2025-04-22,2025-04-23 03:19:27+00:00,en,nytimes.com,https://www.nytimes.com/2025/04/22/podcasts/th...,https://web.archive.org/web/20250423031927id_/...,https://web.archive.org/web/20250423031927/htt...,https://wayback-api.archive.org/colsearch/v1/m...,transcript\nHow Pope Francis Changed the Catho...,"[Pope Francis, Jason Horowitz, Catholic Church...","[0.007702929899096489, 0.019815122708678246, -...",6.609652,-33.291836
183,34 Products To Bring The Tropical Vacay To You,2025-07-31,2025-08-01 06:38:07+00:00,en,buzzfeed.com,https://www.buzzfeed.com/dayshavedewi/products...,https://web.archive.org/web/20250801063807id_/...,https://web.archive.org/web/20250801063807/htt...,https://wayback-api.archive.org/colsearch/v1/m...,1. A colorful hammock that'll make you feel li...,"[Promising review, Amazon, Promising, pool, re...","[0.010649988427758217, 0.020902907475829124, -...",-27.888897,44.911987
585,408 Fun Trivia Questions And Answers For Trivi...,2025-06-09,2025-06-10 03:55:46+00:00,en,buzzfeed.com,https://www.buzzfeed.com/audreyworboys/fun-tri...,https://web.archive.org/web/20250610035546id_/...,https://web.archive.org/web/20250610035546/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Are you ready to feel your brain get more wrin...,"[film features, city, film, country, world, fa...","[0.012383701279759407, -0.020388878881931305, ...",0.764147,0.179709
1336,36 Parenting Items That'll Make You Feel Like ...,2025-03-13,2025-04-01 22:24:19+00:00,en,buzzfeed.com,https://www.buzzfeed.com/courtney_lynch/certif...,https://web.archive.org/web/20250401222419id_/...,https://web.archive.org/web/20250401222419/htt...,https://wayback-api.archive.org/colsearch/v1/m...,1. A time-saving Oxo Tot grape cutter to make ...,"[Promising review, Amazon, Promising, review, ...","[0.01841992512345314, 0.025693213567137718, -0...",-33.999001,44.00425
954,Monsters Plague Japan. But What Do They Mean?,2025-04-22,2025-04-23 02:07:03+00:00,en,nytimes.com,https://www.nytimes.com/2025/04/22/t-magazine/...,https://web.archive.org/web/20250423020703id_/...,https://web.archive.org/web/20250423020703/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nT’s Culture Issue\nMonsters Plag...,"[Japan, Godzilla, Japanese, Monsters Plague Ja...","[0.03651412948966026, 0.05494537204504013, -0....",-7.733189,7.48772


# Topic Modeling

In [17]:
stories.reset_index(drop=True, inplace=True)

In [18]:
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,embedding,x,y
0,How Pope Francis Changed the Catholic Church,2025-04-22,2025-04-23 03:19:27+00:00,en,nytimes.com,https://www.nytimes.com/2025/04/22/podcasts/th...,https://web.archive.org/web/20250423031927id_/...,https://web.archive.org/web/20250423031927/htt...,https://wayback-api.archive.org/colsearch/v1/m...,transcript\nHow Pope Francis Changed the Catho...,"[Pope Francis, Jason Horowitz, Catholic Church...","[0.007702929899096489, 0.019815122708678246, -...",6.609652,-33.291836
1,34 Products To Bring The Tropical Vacay To You,2025-07-31,2025-08-01 06:38:07+00:00,en,buzzfeed.com,https://www.buzzfeed.com/dayshavedewi/products...,https://web.archive.org/web/20250801063807id_/...,https://web.archive.org/web/20250801063807/htt...,https://wayback-api.archive.org/colsearch/v1/m...,1. A colorful hammock that'll make you feel li...,"[Promising review, Amazon, Promising, pool, re...","[0.010649988427758217, 0.020902907475829124, -...",-27.888897,44.911987
2,408 Fun Trivia Questions And Answers For Trivi...,2025-06-09,2025-06-10 03:55:46+00:00,en,buzzfeed.com,https://www.buzzfeed.com/audreyworboys/fun-tri...,https://web.archive.org/web/20250610035546id_/...,https://web.archive.org/web/20250610035546/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Are you ready to feel your brain get more wrin...,"[film features, city, film, country, world, fa...","[0.012383701279759407, -0.020388878881931305, ...",0.764147,0.179709
3,36 Parenting Items That'll Make You Feel Like ...,2025-03-13,2025-04-01 22:24:19+00:00,en,buzzfeed.com,https://www.buzzfeed.com/courtney_lynch/certif...,https://web.archive.org/web/20250401222419id_/...,https://web.archive.org/web/20250401222419/htt...,https://wayback-api.archive.org/colsearch/v1/m...,1. A time-saving Oxo Tot grape cutter to make ...,"[Promising review, Amazon, Promising, review, ...","[0.01841992512345314, 0.025693213567137718, -0...",-33.999001,44.004250
4,Monsters Plague Japan. But What Do They Mean?,2025-04-22,2025-04-23 02:07:03+00:00,en,nytimes.com,https://www.nytimes.com/2025/04/22/t-magazine/...,https://web.archive.org/web/20250423020703id_/...,https://web.archive.org/web/20250423020703/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nT’s Culture Issue\nMonsters Plag...,"[Japan, Godzilla, Japanese, Monsters Plague Ja...","[0.03651412948966026, 0.05494537204504013, -0....",-7.733189,7.487720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1437,Carolin Lehmann on BuzzFeed,2025-03-04,2025-04-08 13:04:05+00:00,en,buzzfeed.com,https://www.buzzfeed.com/carolinlehmann,https://web.archive.org/web/20250408130405id_/...,https://web.archive.org/web/20250408130405/htt...,https://wayback-api.archive.org/colsearch/v1/m...,If You Donât Have Any Of These 20 Useful Targe...,"[Youâre Sincerely Missing, Youâre Sincerely, S...","[0.01666908524930477, 0.05210211127996445, 0.0...",-25.868332,42.686832
1438,Katy Fallon,2025-01-27,2025-01-28 13:38:34+00:00,en,theguardian.com,https://www.theguardian.com/profile/katy-fallon,https://web.archive.org/web/20250128133834id_/...,https://web.archive.org/web/20250128133834/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Katy Fallon\nJanuary 2025\nFebruary 2024\nNove...,"[Katy Fallon, Fallon, Katy, January, herders, ...","[-0.02330104075372219, 0.0015995741123333573, ...",12.247908,11.129965
1439,"Trump’s Ultimatum to Hamas, and Nonstop Quakes...",2025-02-11,2025-02-12 08:56:25+00:00,en,nytimes.com,https://www.nytimes.com/2025/02/11/podcasts/th...,https://web.archive.org/web/20250212085625id_/...,https://web.archive.org/web/20250212085625/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Niki Kitsantonis is a freelance correspondent ...,"[based in Athens, Niki Kitsantonis, Times base...","[-0.00936694722622633, 0.013718975707888603, 0...",25.678240,18.782763
1440,The week around the world in 20 pictures,2025-02-07,2025-02-09 06:47:15+00:00,en,theguardian.com,https://www.theguardian.com/artanddesign/galle...,https://web.archive.org/web/20250209064715id_/...,https://web.archive.org/web/20250209064715/htt...,https://wayback-api.archive.org/colsearch/v1/m...,The week around the world in 20 pictures\nPale...,"[Greece and Charli, Charli XCX, earthquakes in...","[0.019291650503873825, 0.037886157631874084, -...",5.951654,4.160580


In [19]:
from sklearn.cluster import DBSCAN
# Convert embedding to a NumPy array
X = np.stack(stories['embedding'].values)

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=10)  # Adjust eps and min_samples as per your requirement
labels = dbscan.fit_predict(X) + 1  # +1 to avoid -1 as a label

# Assign topics to DataFrame
stories['topic'] = labels

# Group articles by topic
grouped = stories.groupby('topic')

# sort groups by size
grouped = sorted(grouped, key=lambda x: len(x[1]), reverse=True)

# assign group numbers back to stories
for i, (name, group) in enumerate(grouped):
    # TODO: I THINK THIS IS BROKEN 🐛, getting weird items into
    stories.loc[stories['topic'] == name, 'topic'] = name

print("Number of groups:", len(grouped))
# Number of items in each group
print("Group sizes:")
print([len(group) for name, group in grouped])



Number of groups: 5
Group sizes:
[1358, 49, 13, 12, 10]


In [20]:
def summarize_topic(titles):
    """
    Pass list of titles to ChatGPT and ask it to summarize them in 2-4 words.
    """

    # Combine the titles into a single string
    titles_str = ', '.join(titles)

    # print("Writing a title for")
    # for title in titles[:5]:
    #     print(f"  - {title}")
    
    MODEL = "gpt-3.5-turbo"
    
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"The following article titles form a topic. \n\n {titles_str} \n\n Please write a specific summary of the topic in 2-4 words:"},
        ],
        max_tokens=10
    )

    return response.choices[0].message.content

In [21]:
# make a list of titles per topic
topic_titles = stories.groupby('topic')['title'].apply(list).to_dict()
topic_titles = [{
    'topic': k,
    'num_articles': len(v),
    'headlines': v
} for k,v in topic_titles.items()]

# sort by num_articles
topic_titles = sorted(topic_titles, key=lambda x: x['num_articles'], reverse=True)

# pass each topic list of titles to openai chatgpt and ask it to summarize the topic in 2-4 words
for topic in topic_titles[0:]:
    print(f"Topic {topic['topic']} ({topic['num_articles']} articles)")
    
    if topic['topic'] == 0:
        topic['topic_summary'] = "uncategorized"
        continue

    # if there are more than 10 articles in a topic, sample 10 (to keep within the word limit of the API)
    if topic['num_articles'] >= 10:
        headlines = np.random.choice(topic['headlines'], 10, replace=False)
        # Summary
        try:
            topic['topic_summary'] = summarize_topic(headlines)
            print(topic['topic_summary'])
        except InvalidRequestError:
            topic['topic_summary'] = "Error Making Summary From OpenAI API"
            print("OpenAI API request failed.")
    else:
        headlines = topic['headlines']

Topic 0 (1358 articles)
Topic 1 (49 articles)
Comfy Spring and Summer Shoes
Topic 2 (13 articles)
Travel Products for Safety
Topic 3 (12 articles)
Roland Garros Tennis Updates
Topic 4 (10 articles)
Nightclub Fire in North Macedonia


In [22]:
# turn topic titles and summaries into a dataframe
topic_titles_df = pd.DataFrame(topic_titles)
topic_titles_df = topic_titles_df[['topic', 'topic_summary']]
stories = stories.merge(topic_titles_df, on='topic', how='left')

# Collect Metadata

In [23]:
# loop through topic_titles
topic_metadata = []
for topic in topic_titles:
    # grab topic, num_articles, and summary only
    topic = {k:v for k,v in topic.items() if k in ['topic', 'num_articles', 'topic_summary']}
    topic_metadata.append(topic)

topic_metadata

[{'topic': 0, 'num_articles': 1358, 'topic_summary': 'uncategorized'},
 {'topic': 1,
  'num_articles': 49,
  'topic_summary': 'Comfy Spring and Summer Shoes'},
 {'topic': 2,
  'num_articles': 13,
  'topic_summary': 'Travel Products for Safety'},
 {'topic': 3,
  'num_articles': 12,
  'topic_summary': 'Roland Garros Tennis Updates'},
 {'topic': 4,
  'num_articles': 10,
  'topic_summary': 'Nightclub Fire in North Macedonia'}]

In [24]:
# read output/metadata.json
import json
with open('output/metadata.json') as f:
    metadata = json.load(f)

metadata['topics'] = topic_metadata

# write metadata back to json file
with open('output/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=4)

metadata

{'start': '2025-01-01T00:00:00',
 'end': '2025-08-29T00:00:00',
 'query': 'Greece',
 'query_raw': 'Greece AND language:en AND domain:(nytimes.com OR cnn.com OR foxnews.com OR nypost.com OR washingtonpost.com OR usatoday.com OR cnbc.com OR theguardian.com OR breakingnews.com OR buzzfeed.com OR cbsnews.com OR reuters.com OR huffingtonpost.com OR usnews.com OR latimes.com OR politico.com OR newsweek.com OR breitbart.com)',
 'topics': [{'topic': 0,
   'num_articles': 1358,
   'topic_summary': 'uncategorized'},
  {'topic': 1,
   'num_articles': 49,
   'topic_summary': 'Comfy Spring and Summer Shoes'},
  {'topic': 2,
   'num_articles': 13,
   'topic_summary': 'Travel Products for Safety'},
  {'topic': 3,
   'num_articles': 12,
   'topic_summary': 'Roland Garros Tennis Updates'},
  {'topic': 4,
   'num_articles': 10,
   'topic_summary': 'Nightclub Fire in North Macedonia'}]}

In [25]:
# collect top keywords
top_keywords = stories\
    .explode('keywords')\
    .groupby('keywords')\
    .size()\
    .reset_index(name='count')\
    .sort_values(by='count', ascending=False)\
    .head(100)
    

# Write to file

In [26]:
stories.to_csv('output/stories-with-embeddings.csv', index=False)
stories[['title', 'publication_date', 'domain', 'topic','topic_summary', 'x','y','url']].to_csv('../stories-with-embeddings.csv',index=False)


In [29]:
# # copy output/metadata.json to ../example-finished/metadata.json
import shutil
shutil.copy('output/metadata.json', '../answerkey/metadata.json')


'../answerkey/metadata.json'