# NLP

## Setup 

This setup allows you to use *Python* and *R* in the same notebook.


In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [53]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [54]:
from tqdm.notebook import tqdm
tqdm.pandas()

## Load Data & Remove Duplciates 🧹

In [55]:
# read data
stories = pd.read_csv("output/stories_df.csv", 
                 parse_dates=['publication_date', 'capture_time'])


duplicates to delete

In [56]:
dedupe_by = ['title', 'domain']
stories[stories.duplicated(subset=dedupe_by, keep=False)]\
    .sort_values(by=dedupe_by)\
    .head()

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet
123,2024 Election Live Updates: Latest Harris and ...,2024-09-23,2024-09-24 01:09:17+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/23/us/tru...,https://web.archive.org/web/20240924010917id_/...,https://web.archive.org/web/20240924010917/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Trump Hosts Rally in Pe...
1500,2024 Election Live Updates: Latest Harris and ...,2024-09-14,2024-09-15 01:39:39+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/14/us/har...,https://web.archive.org/web/20240915013939id_/...,https://web.archive.org/web/20240915013939/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Harris and Trump Keep U...
3519,"ABBA calls out Trump for ""unauthorized use"" of...",2024-08-30,2024-08-31 02:06:22+00:00,en,cbsnews.com,https://www.cbsnews.com/news/abba-trump-campai...,https://web.archive.org/web/20240831020622id_/...,https://web.archive.org/web/20240831020622/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"ABBA calls out Trump for ""unauthorized use"" of..."
3569,"ABBA calls out Trump for ""unauthorized use"" of...",2024-08-30,2024-09-01 01:41:57+00:00,en,cbsnews.com,https://www.cbsnews.com/colorado/news/abba-tru...,https://web.archive.org/web/20240901014157id_/...,https://web.archive.org/web/20240901014157/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"ABBA calls out Trump for ""unauthorized use"" of..."
3677,Army says Arlington National Cemetery official...,2024-08-29,2024-08-30 01:58:26+00:00,en,cbsnews.com,https://www.cbsnews.com/chicago/news/arlington...,https://web.archive.org/web/20240830015826id_/...,https://web.archive.org/web/20240830015826/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Army says Arlington National Cemetery official...


In [57]:
# remove duplicates
stories.drop_duplicates(subset=dedupe_by, keep='last', inplace=True)

# preview
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet
0,Kamala Harris Is a Woman of Faith. She Shouldn...,2024-09-25,2024-09-26 01:34:35+00:00,en,nytimes.com,https://www.nytimes.com/2024/09/25/opinion/kam...,https://web.archive.org/web/20240926013435id_/...,https://web.archive.org/web/20240926013435/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nJessica Grose\nKamala Harris Is ...
1,Mark Cuban: Trump is more socialist than Berni...,2024-09-25,2024-09-26 01:30:09+00:00,en,cnn.com,https://www.cnn.com/2024/09/25/politics/video/...,https://web.archive.org/web/20240926013009id_/...,https://web.archive.org/web/20240926013009/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Video Ad Feedback\nMark Cuban: Trump is more s...
2,Dinesh D’Souza’s ‘Vindicating Trump’: The Numb...,2024-09-25,2024-09-26 01:37:02+00:00,en,breitbart.com,https://www.breitbart.com/politics/2024/09/25/...,https://web.archive.org/web/20240926013702id_/...,https://web.archive.org/web/20240926013702/htt...,https://wayback-api.archive.org/colsearch/v1/m...,“We’re facing the tightening clamps of repress...
3,Trump officially announces October 5 return to...,2024-09-25,2024-09-26 01:38:13+00:00,en,cnn.com,https://www.cnn.com/2024/09/25/politics/trump-...,https://web.archive.org/web/20240926013813id_/...,https://web.archive.org/web/20240926013813/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Donald Trumpâs campaign officially announced W...
4,Trump ally says he didn't see evidence of Hait...,2024-09-25,2024-09-26 01:41:20+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240926014120id_/...,https://web.archive.org/web/20240926014120/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Vivek Ramaswamy says he didn't see evidence of...
...,...,...,...,...,...,...,...,...,...,...
4492,Mary Trump's five-word response to Kamala Harr...,2024-08-23,2024-08-24 02:01:32+00:00,en,newsweek.com,https://www.newsweek.com/mary-trump-five-word-...,https://web.archive.org/web/20240824020132id_/...,https://web.archive.org/web/20240824020132/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Former President Donald Trump's estranged niec...
4493,Michael Cohen says Kamala Harris should ask Tr...,2024-08-23,2024-08-24 02:17:35+00:00,en,newsweek.com,https://www.newsweek.com/michael-cohen-kamala-...,https://web.archive.org/web/20240824021735id_/...,https://web.archive.org/web/20240824021735/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Michael Cohen says newly named Democratic pres...
4494,Trump or Harris? Israelis discuss presidential...,2024-08-23,2024-09-08 02:09:40+00:00,en,foxnews.com,https://www.foxnews.com/world/trump-harris-isr...,https://web.archive.org/web/20240908020940id_/...,https://web.archive.org/web/20240908020940/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Israel's multi-front wars against Hamas and He...
4495,RFK Jr. supporters say they will vote for Trum...,2024-08-23,2024-08-24 01:56:03+00:00,en,newsweek.com,https://www.newsweek.com/rfk-jr-supporters-say...,https://web.archive.org/web/20240824015603id_/...,https://web.archive.org/web/20240824015603/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A number of Robert F. Kennedy Jr. supporters r...


# Keywords

In [58]:
from yake import KeywordExtractor
from pandarallel import pandarallel

kw_extractor = KeywordExtractor()

def get_keywords(text):
    keywords = kw_extractor.extract_keywords(text)
    return [x for x,y in keywords]

pandarallel.initialize(progress_bar=True)
stories['keywords'] = stories['snippet'].parallel_apply(get_keywords)

# display
stories

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=543), Label(value='0 / 543'))), HB…

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords
0,Kamala Harris Is a Woman of Faith. She Shouldn...,2024-09-25,2024-09-26 01:34:35+00:00,en,nytimes.com,https://www.nytimes.com/2024/09/25/opinion/kam...,https://web.archive.org/web/20240926013435id_/...,https://web.archive.org/web/20240926013435/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nJessica Grose\nKamala Harris Is ...,"[Black church, Harris, Black Baptist church, F..."
1,Mark Cuban: Trump is more socialist than Berni...,2024-09-25,2024-09-26 01:30:09+00:00,en,cnn.com,https://www.cnn.com/2024/09/25/politics/video/...,https://web.archive.org/web/20240926013009id_/...,https://web.archive.org/web/20240926013009/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Video Ad Feedback\nMark Cuban: Trump is more s...,"[Video Ad Feedback, Source, Video, Feedback, C..."
2,Dinesh D’Souza’s ‘Vindicating Trump’: The Numb...,2024-09-25,2024-09-26 01:37:02+00:00,en,breitbart.com,https://www.breitbart.com/politics/2024/09/25/...,https://web.archive.org/web/20240926013702id_/...,https://web.archive.org/web/20240926013702/htt...,https://wayback-api.archive.org/colsearch/v1/m...,“We’re facing the tightening clamps of repress...,"[Trump, Donald Trump, President Donald Trump, ..."
3,Trump officially announces October 5 return to...,2024-09-25,2024-09-26 01:38:13+00:00,en,cnn.com,https://www.cnn.com/2024/09/25/politics/trump-...,https://web.archive.org/web/20240926013813id_/...,https://web.archive.org/web/20240926013813/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Donald Trumpâs campaign officially announced W...,"[Secret Service, Donald Trumpâs campaign, Secr..."
4,Trump ally says he didn't see evidence of Hait...,2024-09-25,2024-09-26 01:41:20+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240926014120id_/...,https://web.archive.org/web/20240926014120/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Vivek Ramaswamy says he didn't see evidence of...,"[Haitian migrants eating, migrants eating cats..."
...,...,...,...,...,...,...,...,...,...,...,...
4492,Mary Trump's five-word response to Kamala Harr...,2024-08-23,2024-08-24 02:01:32+00:00,en,newsweek.com,https://www.newsweek.com/mary-trump-five-word-...,https://web.archive.org/web/20240824020132id_/...,https://web.archive.org/web/20240824020132/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Former President Donald Trump's estranged niec...,"[Mary Trump, Democratic National Convention, H..."
4493,Michael Cohen says Kamala Harris should ask Tr...,2024-08-23,2024-08-24 02:17:35+00:00,en,newsweek.com,https://www.newsweek.com/michael-cohen-kamala-...,https://web.archive.org/web/20240824021735id_/...,https://web.archive.org/web/20240824021735/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Michael Cohen says newly named Democratic pres...,"[newly named Democratic, named Democratic pres..."
4494,Trump or Harris? Israelis discuss presidential...,2024-08-23,2024-09-08 02:09:40+00:00,en,foxnews.com,https://www.foxnews.com/world/trump-harris-isr...,https://web.archive.org/web/20240908020940id_/...,https://web.archive.org/web/20240908020940/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Israel's multi-front wars against Hamas and He...,"[Israel multi-front wars, President Kamala Har..."
4495,RFK Jr. supporters say they will vote for Trum...,2024-08-23,2024-08-24 01:56:03+00:00,en,newsweek.com,https://www.newsweek.com/rfk-jr-supporters-say...,https://web.archive.org/web/20240824015603id_/...,https://web.archive.org/web/20240824015603/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A number of Robert F. Kennedy Jr. supporters r...,"[Trump, Kennedy, Friday dropped, Vice Presiden..."


## Embeddings

In [59]:
import os
import openai
import dotenv
dotenv.load_dotenv()

openai.organization = None
openai.api_key = os.getenv("OPENAI_API_KEY")
# openai.Model.list() # see all openai models

In [60]:
# exclude urls that are videos (contain /video/)    
stories = stories[~stories.url.str.contains("/video/")]
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords
0,Kamala Harris Is a Woman of Faith. She Shouldn...,2024-09-25,2024-09-26 01:34:35+00:00,en,nytimes.com,https://www.nytimes.com/2024/09/25/opinion/kam...,https://web.archive.org/web/20240926013435id_/...,https://web.archive.org/web/20240926013435/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nJessica Grose\nKamala Harris Is ...,"[Black church, Harris, Black Baptist church, F..."
2,Dinesh D’Souza’s ‘Vindicating Trump’: The Numb...,2024-09-25,2024-09-26 01:37:02+00:00,en,breitbart.com,https://www.breitbart.com/politics/2024/09/25/...,https://web.archive.org/web/20240926013702id_/...,https://web.archive.org/web/20240926013702/htt...,https://wayback-api.archive.org/colsearch/v1/m...,“We’re facing the tightening clamps of repress...,"[Trump, Donald Trump, President Donald Trump, ..."
3,Trump officially announces October 5 return to...,2024-09-25,2024-09-26 01:38:13+00:00,en,cnn.com,https://www.cnn.com/2024/09/25/politics/trump-...,https://web.archive.org/web/20240926013813id_/...,https://web.archive.org/web/20240926013813/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Donald Trumpâs campaign officially announced W...,"[Secret Service, Donald Trumpâs campaign, Secr..."
4,Trump ally says he didn't see evidence of Hait...,2024-09-25,2024-09-26 01:41:20+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240926014120id_/...,https://web.archive.org/web/20240926014120/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Vivek Ramaswamy says he didn't see evidence of...,"[Haitian migrants eating, migrants eating cats..."
6,"Trump visits grocery store, mom plans to frame...",2024-09-25,2024-09-26 01:42:55+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240926014255id_/...,https://web.archive.org/web/20240926014255/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Pennsylvania mom says she will frame $100 bill...,"[president Donald Trump, Donald Trump, Trump, ..."
...,...,...,...,...,...,...,...,...,...,...,...
4492,Mary Trump's five-word response to Kamala Harr...,2024-08-23,2024-08-24 02:01:32+00:00,en,newsweek.com,https://www.newsweek.com/mary-trump-five-word-...,https://web.archive.org/web/20240824020132id_/...,https://web.archive.org/web/20240824020132/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Former President Donald Trump's estranged niec...,"[Mary Trump, Democratic National Convention, H..."
4493,Michael Cohen says Kamala Harris should ask Tr...,2024-08-23,2024-08-24 02:17:35+00:00,en,newsweek.com,https://www.newsweek.com/michael-cohen-kamala-...,https://web.archive.org/web/20240824021735id_/...,https://web.archive.org/web/20240824021735/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Michael Cohen says newly named Democratic pres...,"[newly named Democratic, named Democratic pres..."
4494,Trump or Harris? Israelis discuss presidential...,2024-08-23,2024-09-08 02:09:40+00:00,en,foxnews.com,https://www.foxnews.com/world/trump-harris-isr...,https://web.archive.org/web/20240908020940id_/...,https://web.archive.org/web/20240908020940/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Israel's multi-front wars against Hamas and He...,"[Israel multi-front wars, President Kamala Har..."
4495,RFK Jr. supporters say they will vote for Trum...,2024-08-23,2024-08-24 01:56:03+00:00,en,newsweek.com,https://www.newsweek.com/rfk-jr-supporters-say...,https://web.archive.org/web/20240824015603id_/...,https://web.archive.org/web/20240824015603/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A number of Robert F. Kennedy Jr. supporters r...,"[Trump, Kennedy, Friday dropped, Vice Presiden..."


In [61]:
# Import modules
import tiktoken
from openai import OpenAI
client = OpenAI()

# Set embedding model parameters
embedding_model = "text-embedding-3-small" # this is the model we will use to make embeddings
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# Get the encoding for the specified model
encoding = tiktoken.get_encoding(embedding_encoding)

# Make a new column with the combined title and summary
stories["combined"] = (
    "Title: " + stories.title.str.strip() + "; Content: " + stories.snippet.str.strip()
)

# Make a new column with the number of tokens in the combined title and summary
stories["n_tokens"] = stories.combined.apply(lambda x: len(encoding.encode(x)))

# Sort by that column
stories = stories.sort_values(by='n_tokens', ascending=False)

# Display the bills
stories


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories["combined"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories["n_tokens"] = stories.combined.apply(lambda x: len(encoding.encode(x)))


Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,combined,n_tokens
2121,What was said during Harris-Trump presidential...,2024-09-11,2024-09-12 01:01:33+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240912010133id_/...,https://web.archive.org/web/20240912010133/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Read the full transcript of ABC News' presiden...,"[President Donald Trump, President Kamala Harr...",Title: What was said during Harris-Trump presi...,19398
256,"Live updates: Donald Trump, Kamala Harris elec...",2024-09-23,2024-09-24 01:14:00+00:00,en,cnn.com,https://www.cnn.com/politics/live-news/trump-h...,https://web.archive.org/web/20240924011400id_/...,https://web.archive.org/web/20240924011400/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Nebraska setback for Trump: A pressure campaig...,"[President Donald Trump, President Kamala Harr...","Title: Live updates: Donald Trump, Kamala Harr...",14067
2577,Where Kamala Harris and Donald Trump Stand on ...,2024-09-09,2024-09-11 01:50:52+00:00,en,nytimes.com,https://www.nytimes.com/interactive/2024/us/po...,https://web.archive.org/web/20240911015052id_/...,https://web.archive.org/web/20240911015052/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Where Kamala Harris and Donald Trump Stand on ...,"[Trump, Inflation Reduction Act, United States...",Title: Where Kamala Harris and Donald Trump St...,13928
3210,Harris and Trump Settle on Debate Rules: 2024 ...,2024-09-04,2024-09-05 01:27:25+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/04/us/har...,https://web.archive.org/web/20240905012725id_/...,https://web.archive.org/web/20240905012725/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Harris and Trump Settle...,"[President Kamala Harris, Vice President Kamal...",Title: Harris and Trump Settle on Debate Rules...,13278
2332,Harris and Trump square off in their first pre...,2024-09-10,2024-09-12 01:31:30+00:00,en,politico.com,https://www.politico.com/live-updates/2024/09/...,https://web.archive.org/web/20240912013130id_/...,https://web.archive.org/web/20240912013130/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A majority of voters who watched Tuesday’s deb...,"[Donald Trump, Kamala Harris, President Kamala...",Title: Harris and Trump square off in their fi...,12376
...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,Watch Live: Donald Trump Addresses Protecting ...,2024-09-23,2024-09-25 01:53:15+00:00,en,breitbart.com,https://www.breitbart.com/2024-election/2024/0...,https://web.archive.org/web/20240925015315id_/...,https://web.archive.org/web/20240925015315/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Watch Live: Donald Trump Addresses Protecting ...,"[Protecting America Initiative, Addresses Prot...",Title: Watch Live: Donald Trump Addresses Prot...,64
719,Watch Live: Donald Trump Discusses Fighting An...,2024-09-19,2024-09-21 02:10:41+00:00,en,breitbart.com,https://www.breitbart.com/politics/2024/09/19/...,https://web.archive.org/web/20240921021041id_/...,https://web.archive.org/web/20240921021041/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Watch Live: Donald Trump Discusses Fighting An...,"[Donald Trump Discusses, Trump Discusses Fight...",Title: Watch Live: Donald Trump Discusses Figh...,63
891,Watch Live: Donald Trump Holds Rally on Long I...,2024-09-18,2024-09-20 02:24:04+00:00,en,breitbart.com,https://www.breitbart.com/2024-election/2024/0...,https://web.archive.org/web/20240920022404id_/...,https://web.archive.org/web/20240920022404/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Watch Live: Donald Trump Holds Rally on Long I...,"[Donald Trump Holds, President Donald Trump, T...",Title: Watch Live: Donald Trump Holds Rally on...,62
2002,Ella Baron on the Trump-Harris televised debat...,2024-09-11,2024-09-12 01:30:35+00:00,en,theguardian.com,https://www.theguardian.com/commentisfree/pict...,https://web.archive.org/web/20240912013035id_/...,https://web.archive.org/web/20240912013035/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Ella Baron on the Trump-Harris televised debat...,"[Opinion cartoonDonald TrumpKamala, Ella Baron...",Title: Ella Baron on the Trump-Harris televise...,60


In [62]:
# Grab the rows where the text is too big for the context window of the mmodel (>8000 tokens)
too_long = stories.query("n_tokens > @max_tokens") 

# Print how many will be removed
print(f"Removing {len(too_long)} stories that are too long")

# Display the removed stories here in this cell so we can see what we're losing
display(too_long)  

# Remove the rows where the text is too big for the context window of the model
stories = stories.query("n_tokens <= @max_tokens")  

Removing 30 stories that are too long


Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,combined,n_tokens
2121,What was said during Harris-Trump presidential...,2024-09-11,2024-09-12 01:01:33+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240912010133id_/...,https://web.archive.org/web/20240912010133/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Read the full transcript of ABC News' presiden...,"[President Donald Trump, President Kamala Harr...",Title: What was said during Harris-Trump presi...,19398
256,"Live updates: Donald Trump, Kamala Harris elec...",2024-09-23,2024-09-24 01:14:00+00:00,en,cnn.com,https://www.cnn.com/politics/live-news/trump-h...,https://web.archive.org/web/20240924011400id_/...,https://web.archive.org/web/20240924011400/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Nebraska setback for Trump: A pressure campaig...,"[President Donald Trump, President Kamala Harr...","Title: Live updates: Donald Trump, Kamala Harr...",14067
2577,Where Kamala Harris and Donald Trump Stand on ...,2024-09-09,2024-09-11 01:50:52+00:00,en,nytimes.com,https://www.nytimes.com/interactive/2024/us/po...,https://web.archive.org/web/20240911015052id_/...,https://web.archive.org/web/20240911015052/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Where Kamala Harris and Donald Trump Stand on ...,"[Trump, Inflation Reduction Act, United States...",Title: Where Kamala Harris and Donald Trump St...,13928
3210,Harris and Trump Settle on Debate Rules: 2024 ...,2024-09-04,2024-09-05 01:27:25+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/04/us/har...,https://web.archive.org/web/20240905012725id_/...,https://web.archive.org/web/20240905012725/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Harris and Trump Settle...,"[President Kamala Harris, Vice President Kamal...",Title: Harris and Trump Settle on Debate Rules...,13278
2332,Harris and Trump square off in their first pre...,2024-09-10,2024-09-12 01:31:30+00:00,en,politico.com,https://www.politico.com/live-updates/2024/09/...,https://web.archive.org/web/20240912013130id_/...,https://web.archive.org/web/20240912013130/htt...,https://wayback-api.archive.org/colsearch/v1/m...,A majority of voters who watched Tuesday’s deb...,"[Donald Trump, Kamala Harris, President Kamala...",Title: Harris and Trump square off in their fi...,12376
47,Election Live Updates: Trump Lays Out Economic...,2024-09-24,2024-09-25 01:25:51+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/24/us/tru...,https://web.archive.org/web/20240925012551id_/...,https://web.archive.org/web/20240925012551/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Trump Lays Out Economic...,"[President Kamala Harris, Vice President Kamal...",Title: Election Live Updates: Trump Lays Out E...,12253
1900,2024 Election Live Updates: Latest Trump and H...,2024-09-11,2024-09-12 01:12:21+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/11/us/har...,https://web.archive.org/web/20240912011221id_/...,https://web.archive.org/web/20240912011221/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Debate Ripples Across C...,"[President Kamala Harris, Trump, Vice Presiden...",Title: 2024 Election Live Updates: Latest Trum...,11494
2981,Trump and Harris Campaign News: 2024 Election ...,2024-09-05,2024-09-06 01:34:27+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/05/us/har...,https://web.archive.org/web/20240906013427id_/...,https://web.archive.org/web/20240906013427/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Vance Rallies Supporter...,"[President Kamala Harris, Trump, Vice Presiden...",Title: Trump and Harris Campaign News: 2024 El...,11167
4066,Election Live Updates: Trump Suggests Debate R...,2024-08-27,2024-08-28 02:13:29+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/08/27/us/har...,https://web.archive.org/web/20240828021329id_/...,https://web.archive.org/web/20240828021329/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Trump Suggests Debate R...,"[President Kamala Harris, Vice President Kamal...",Title: Election Live Updates: Trump Suggests D...,11145
1012,Apparent Trump Assassination Attempt and Suspe...,2024-09-17,2024-09-18 01:07:17+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/17/us/tru...,https://web.archive.org/web/20240918010717id_/...,https://web.archive.org/web/20240918010717/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Live Updates: Agents Delve Into Life of Appare...,"[Secret Service, Secret Service agent, Trump, ...",Title: Apparent Trump Assassination Attempt an...,11013


In [63]:
from openai import OpenAI
client = OpenAI()

def get_embeddings(texts, model="text-embedding-3-small"):
    # Replace newlines in each text and ensure it's a list of texts
    texts = [text.replace("\n", " ") for text in texts]
    # OpenAI's embeddings.create can process multiple inputs as a list
    response = client.embeddings.create(input=texts, model=model)
    # Extract embeddings from the response
    embeddings = [item.embedding for item in response.data]
    return embeddings

# Function to process DataFrame in batches and return a list of embeddings
def process_in_batches(df, column_name, batch_size=10):
    # Break the DataFrame into batches of size `batch_size`
    batches = [df[column_name].iloc[i:i + batch_size] for i in range(0, len(df), batch_size)]
    # Process each batch and collect embeddings
    all_embeddings = []
    for batch in tqdm(batches, desc="Processing batches"):
        batch_embeddings = get_embeddings(batch.tolist())
        all_embeddings.extend(batch_embeddings)
    return all_embeddings

# Example usage
batch_size = 100  # Adjust based on your preference and rate limits
stories['embedding'] = process_in_batches(stories, 'combined', batch_size=batch_size)


Processing batches:   0%|          | 0/38 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stories['embedding'] = process_in_batches(stories, 'combined', batch_size=batch_size)


In [64]:
# drop combined column since those were only for the purposes of making the embeddings
stories = stories.drop(columns=['combined', 'n_tokens'])

## Dimensionality Reduction (t-SNE)


In [65]:
from sklearn.manifold import TSNE
import numpy as np

# check if vis_dims exists
if os.path.exists("output/stories-with-vis-dims.csv"):
    stories = pd.read_csv("output/stories-with-vis-dims.csv")
else: 
    # Convert to a list of lists of floats
    matrix = np.array(stories.embedding.to_list())

    # Create a t-SNE model and transform the data
    tsne = TSNE(n_components=2, perplexity=30, random_state=42, init='random', learning_rate=400)
    vis_dims = tsne.fit_transform(matrix)

    # add to dataframe and write to csv
    stories = stories\
        .assign(
            x = vis_dims[:,0], 
            y = vis_dims[:,1])


In [66]:
# stories.to_csv('output/stories-with-nlp.csv', index=False)
stories.head()

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,embedding,x,y
2646,"Trump, Harris lay low ahead of debate as wildf...",2024-09-09,2024-09-10 02:04:09+00:00,en,newsweek.com,https://www.newsweek.com/election-2024-kamala-...,https://web.archive.org/web/20240910020409id_/...,https://web.archive.org/web/20240910020409/htt...,https://wayback-api.archive.org/colsearch/v1/m...,As the anticipation builds for the presidentia...,"[President Kamala Harris, Vice President Kamal...","[0.029513994231820107, 0.019624708220362663, 0...",30.426321,1.711348
4374,Transcript: Ezra Klein on Kamala Harris’s Conv...,2024-08-23,2024-08-25 01:54:08+00:00,en,nytimes.com,https://www.nytimes.com/2024/08/23/podcasts/tr...,https://web.archive.org/web/20240825015408id_/...,https://web.archive.org/web/20240825015408/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nThe Ezra Klein Show\nTranscript:...,"[Ezra Klein, Aaron Retica, Donald Trump, Kamal...","[0.04471578821539879, 0.0195614006370306, -0.0...",0.952876,-9.758122
2417,"Harris, Trump trade barbs in heated, high-stak...",2024-09-10,2024-09-12 01:03:31+00:00,en,latimes.com,https://www.latimes.com/politics/story/2024-09...,https://web.archive.org/web/20240912010331id_/...,https://web.archive.org/web/20240912010331/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Harris, Trump trade barbs in heated, high-stak...","[President Kamala Harris, Vice President Kamal...","[-0.00299630593508482, 0.0061648134142160416, ...",-1.328929,17.981981
293,Kamala Harris Gets National Security Endorseme...,2024-09-22,2024-09-23 01:05:32+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/22/us/tru...,https://web.archive.org/web/20240923010532id_/...,https://web.archive.org/web/20240923010532/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Mark Robinson’s Top Aid...,"[President Donald Trump, President Kamala Harr...","[0.04103028029203415, 0.03662766516208649, 0.0...",17.511801,-18.893795
2322,"Debate fact check: What Harris, Trump got wron...",2024-09-10,2024-09-12 01:03:50+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240912010350id_/...,https://web.archive.org/web/20240912010350/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Presidential debate fact check: Analyzing Trum...,"[Donald Trump claim, Kamala Harris claim, Dona...","[0.00046844425378367305, 0.02809859998524189, ...",-16.216286,41.235004


# Topic Modeling

In [67]:
stories.reset_index(drop=True, inplace=True)

In [68]:
stories

Unnamed: 0,title,publication_date,capture_time,language,domain,url,original_capture_url,archive_playback_url,article_url,snippet,keywords,embedding,x,y
0,"Trump, Harris lay low ahead of debate as wildf...",2024-09-09,2024-09-10 02:04:09+00:00,en,newsweek.com,https://www.newsweek.com/election-2024-kamala-...,https://web.archive.org/web/20240910020409id_/...,https://web.archive.org/web/20240910020409/htt...,https://wayback-api.archive.org/colsearch/v1/m...,As the anticipation builds for the presidentia...,"[President Kamala Harris, Vice President Kamal...","[0.029513994231820107, 0.019624708220362663, 0...",30.426321,1.711348
1,Transcript: Ezra Klein on Kamala Harris’s Conv...,2024-08-23,2024-08-25 01:54:08+00:00,en,nytimes.com,https://www.nytimes.com/2024/08/23/podcasts/tr...,https://web.archive.org/web/20240825015408id_/...,https://web.archive.org/web/20240825015408/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Supported by\nThe Ezra Klein Show\nTranscript:...,"[Ezra Klein, Aaron Retica, Donald Trump, Kamal...","[0.04471578821539879, 0.0195614006370306, -0.0...",0.952876,-9.758122
2,"Harris, Trump trade barbs in heated, high-stak...",2024-09-10,2024-09-12 01:03:31+00:00,en,latimes.com,https://www.latimes.com/politics/story/2024-09...,https://web.archive.org/web/20240912010331id_/...,https://web.archive.org/web/20240912010331/htt...,https://wayback-api.archive.org/colsearch/v1/m...,"Harris, Trump trade barbs in heated, high-stak...","[President Kamala Harris, Vice President Kamal...","[-0.00299630593508482, 0.0061648134142160416, ...",-1.328929,17.981981
3,Kamala Harris Gets National Security Endorseme...,2024-09-22,2024-09-23 01:05:32+00:00,en,nytimes.com,https://www.nytimes.com/live/2024/09/22/us/tru...,https://web.archive.org/web/20240923010532id_/...,https://web.archive.org/web/20240923010532/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Election Live Updates: Mark Robinson’s Top Aid...,"[President Donald Trump, President Kamala Harr...","[0.04103028029203415, 0.03662766516208649, 0.0...",17.511801,-18.893795
4,"Debate fact check: What Harris, Trump got wron...",2024-09-10,2024-09-12 01:03:50+00:00,en,usatoday.com,https://www.usatoday.com/story/news/politics/e...,https://web.archive.org/web/20240912010350id_/...,https://web.archive.org/web/20240912010350/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Presidential debate fact check: Analyzing Trum...,"[Donald Trump claim, Kamala Harris claim, Dona...","[0.00046844425378367305, 0.02809859998524189, ...",-16.216286,41.235004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3771,Watch Live: Donald Trump Addresses Protecting ...,2024-09-23,2024-09-25 01:53:15+00:00,en,breitbart.com,https://www.breitbart.com/2024-election/2024/0...,https://web.archive.org/web/20240925015315id_/...,https://web.archive.org/web/20240925015315/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Watch Live: Donald Trump Addresses Protecting ...,"[Protecting America Initiative, Addresses Prot...","[-0.010589002631604671, 0.04453752562403679, 0...",-21.039240,13.070236
3772,Watch Live: Donald Trump Discusses Fighting An...,2024-09-19,2024-09-21 02:10:41+00:00,en,breitbart.com,https://www.breitbart.com/politics/2024/09/19/...,https://web.archive.org/web/20240921021041id_/...,https://web.archive.org/web/20240921021041/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Watch Live: Donald Trump Discusses Fighting An...,"[Donald Trump Discusses, Trump Discusses Fight...","[-0.014369427226483822, 0.054084643721580505, ...",-20.616686,12.777808
3773,Watch Live: Donald Trump Holds Rally on Long I...,2024-09-18,2024-09-20 02:24:04+00:00,en,breitbart.com,https://www.breitbart.com/2024-election/2024/0...,https://web.archive.org/web/20240920022404id_/...,https://web.archive.org/web/20240920022404/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Watch Live: Donald Trump Holds Rally on Long I...,"[Donald Trump Holds, President Donald Trump, T...","[-0.02937382273375988, 0.031557824462652206, -...",-22.058249,12.542102
3774,Ella Baron on the Trump-Harris televised debat...,2024-09-11,2024-09-12 01:30:35+00:00,en,theguardian.com,https://www.theguardian.com/commentisfree/pict...,https://web.archive.org/web/20240912013035id_/...,https://web.archive.org/web/20240912013035/htt...,https://wayback-api.archive.org/colsearch/v1/m...,Ella Baron on the Trump-Harris televised debat...,"[Opinion cartoonDonald TrumpKamala, Ella Baron...","[0.00022388892830349505, -0.0292839203029871, ...",11.852568,15.334578


In [69]:
from sklearn.cluster import DBSCAN
# Convert embedding to a NumPy array
X = np.stack(stories['embedding'].values)

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=10)  # Adjust eps and min_samples as per your requirement
labels = dbscan.fit_predict(X) + 1  # +1 to avoid -1 as a label

# Assign topics to DataFrame
stories['topic'] = labels

# Group articles by topic
grouped = stories.groupby('topic')

# sort groups by size
grouped = sorted(grouped, key=lambda x: len(x[1]), reverse=True)

# assign group numbers back to stories
for i, (name, group) in enumerate(grouped):
    # TODO: I THINK THIS IS BROKEN 🐛, getting weird items into
    stories.loc[stories['topic'] == name, 'topic'] = name

print("Number of groups:", len(grouped))
# Number of items in each group
print("Group sizes:")
print([len(group) for name, group in grouped])



Number of groups: 13
Group sizes:
[3264, 280, 77, 34, 22, 20, 17, 11, 11, 10, 10, 10, 10]


In [70]:
def summarize_topic(titles):
    """
    Pass list of titles to ChatGPT and ask it to summarize them in 2-4 words.
    """

    # Combine the titles into a single string
    titles_str = ', '.join(titles)

    # print("Writing a title for")
    # for title in titles[:5]:
    #     print(f"  - {title}")
    
    MODEL = "gpt-3.5-turbo"
    
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"The following article titles form a topic. \n\n {titles_str} \n\n Please write a specific summary of the topic in 2-4 words:"},
        ],
        max_tokens=10
    )

    return response.choices[0].message.content

In [71]:
# make a list of titles per topic
topic_titles = stories.groupby('topic')['title'].apply(list).to_dict()
topic_titles = [{
    'topic': k,
    'num_articles': len(v),
    'headlines': v
} for k,v in topic_titles.items()]

# sort by num_articles
topic_titles = sorted(topic_titles, key=lambda x: x['num_articles'], reverse=True)

# pass each topic list of titles to openai chatgpt and ask it to summarize the topic in 2-4 words
for topic in topic_titles[0:]:
    print(f"Topic {topic['topic']} ({topic['num_articles']} articles)")
    
    if topic['topic'] == 0:
        topic['topic_summary'] = "uncategorized"
        continue

    # if there are more than 10 articles in a topic, sample 10 (to keep within the word limit of the API)
    if topic['num_articles'] >= 10:
        headlines = np.random.choice(topic['headlines'], 10, replace=False)
        # Summary
        try:
            topic['topic_summary'] = summarize_topic(headlines)
            print(topic['topic_summary'])
        except InvalidRequestError:
            topic['topic_summary'] = "Error Making Summary From OpenAI API"
            print("OpenAI API request failed.")
    else:
        headlines = topic['headlines']

Topic 0 (3264 articles)
Topic 1 (280 articles)
Trump-Harris Election Dynamics
Topic 3 (77 articles)
Trump assassination attempt in Florida
Topic 2 (34 articles)
Taylor Swift supports Kamala Harris
Topic 4 (22 articles)
Trump Campaign Arlington Cemetery Incident
Topic 8 (20 articles)
Trump Media Stock Performance
Topic 9 (17 articles)
Cheney family supports Kamala.
Topic 6 (11 articles)
Trump's Legal Battles
Topic 12 (11 articles)
Harris Outraises Trump Fundraising
Topic 5 (10 articles)
Trump 2020 Election Case
Topic 7 (10 articles)
RFK Jr. Endorses Trump
Topic 10 (10 articles)
Nebraska Electoral College Controversy
Topic 11 (10 articles)
Jack Smith's Trump Indictment


In [72]:
# turn topic titles and summaries into a dataframe
topic_titles_df = pd.DataFrame(topic_titles)
topic_titles_df = topic_titles_df[['topic', 'topic_summary']]
stories = stories.merge(topic_titles_df, on='topic', how='left')

# Collect Metadata

In [73]:
# loop through topic_titles
topic_metadata = []
for topic in topic_titles:
    # grab topic, num_articles, and summary only
    topic = {k:v for k,v in topic.items() if k in ['topic', 'num_articles', 'topic_summary']}
    topic_metadata.append(topic)

topic_metadata

[{'topic': 0, 'num_articles': 3264, 'topic_summary': 'uncategorized'},
 {'topic': 1,
  'num_articles': 280,
  'topic_summary': 'Trump-Harris Election Dynamics'},
 {'topic': 3,
  'num_articles': 77,
  'topic_summary': 'Trump assassination attempt in Florida'},
 {'topic': 2,
  'num_articles': 34,
  'topic_summary': 'Taylor Swift supports Kamala Harris'},
 {'topic': 4,
  'num_articles': 22,
  'topic_summary': 'Trump Campaign Arlington Cemetery Incident'},
 {'topic': 8,
  'num_articles': 20,
  'topic_summary': 'Trump Media Stock Performance'},
 {'topic': 9,
  'num_articles': 17,
  'topic_summary': 'Cheney family supports Kamala.'},
 {'topic': 6, 'num_articles': 11, 'topic_summary': "Trump's Legal Battles"},
 {'topic': 12,
  'num_articles': 11,
  'topic_summary': 'Harris Outraises Trump Fundraising'},
 {'topic': 5, 'num_articles': 10, 'topic_summary': 'Trump 2020 Election Case'},
 {'topic': 7, 'num_articles': 10, 'topic_summary': 'RFK Jr. Endorses Trump'},
 {'topic': 10,
  'num_articles': 1

In [74]:
# read output/metadata.json
import json
with open('output/metadata.json') as f:
    metadata = json.load(f)

metadata['topics'] = topic_metadata

# write metadata back to json file
with open('output/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=4)

metadata

{'start': '2024-08-23T00:00:00',
 'end': '2024-09-28T00:43:49.822833',
 'query': '(title:Kamala OR title:Trump)',
 'query_raw': '(title:Kamala OR title:Trump) AND language:en AND domain:(nytimes.com OR cnn.com OR foxnews.com OR nypost.com OR washingtonpost.com OR usatoday.com OR cnbc.com OR theguardian.com OR breakingnews.com OR buzzfeed.com OR cbsnews.com OR reuters.com OR huffingtonpost.com OR usnews.com OR latimes.com OR politico.com OR newsweek.com OR breitbart.com)',
 'topics': [{'topic': 0,
   'num_articles': 3264,
   'topic_summary': 'uncategorized'},
  {'topic': 1,
   'num_articles': 280,
   'topic_summary': 'Trump-Harris Election Dynamics'},
  {'topic': 3,
   'num_articles': 77,
   'topic_summary': 'Trump assassination attempt in Florida'},
  {'topic': 2,
   'num_articles': 34,
   'topic_summary': 'Taylor Swift supports Kamala Harris'},
  {'topic': 4,
   'num_articles': 22,
   'topic_summary': 'Trump Campaign Arlington Cemetery Incident'},
  {'topic': 8,
   'num_articles': 20,

In [75]:
# collect top keywords
top_keywords = stories\
    .explode('keywords')\
    .groupby('keywords')\
    .size()\
    .reset_index(name='count')\
    .sort_values(by='count', ascending=False)\
    .head(100)
    

# Write to file

In [76]:
stories.to_csv('output/stories-with-embeddings.csv', index=False)
stories[['title', 'publication_date', 'domain', 'topic','topic_summary', 'x','y','url']].to_csv('../stories-with-embeddings.csv',index=False)


In [77]:
# copy output/metadata.json to ../example-finished/metadata.json
import shutil
shutil.copy('output/metadata.json', '../example-finished/metadata.json')


'../example-finished/metadata.json'