In [11]:
import numpy as np
import pandas as pd
from IPython.display import display

## Initial Dataset Summary
### 1. Polled Articles `data/polled_articles_rows.csv`
Contains the raw article metadata collected live by polling RSS (Really Simple Syndication) feeds from select news outlets.
- **source_id**: uuid key for a news source (The Guardian, WSJ, etc).
- **feed_id**: uuid key for an RSS feed (each news outlet has many feeds).
- **article_id**: uuid primary key for an article.
- **article_guid**: unique identifier an article is given via the RSS feed (collected to resolve duplicate polls).
- **article_url**: the url to the given story/article.
    - potentially used to fetch the full article text for a different embedding strategy
- **article_headline**: headline of the story/article.
- **article_description**: short description of the story/article.
- **rss_article_text**: some RSS feeds provide the full article text.
- **authors**: some RSS feeds provide the author(s).
- **published_at**: publish date of the story/article.
- **created_at**: timestamp at which the row was created, i.e. when the article was polled.

### 2. Ingested Articles `data/ingested_articles_rows.csv`
Contains the articles after ingestion and text normalization.
- **ingestion_id**: uuid primary key for each row
- **article_id**: uuid key referencing the article that was ingested
- **normalized_text**: normalized headline concatenated with description
    - each string went through (default):
        1. Unicode normalization
        2. Zero-width and control-character stripping
        3. Whitespace normalization
        4. Dash, quote, bullet normalization
        5. HTML stripping
- **normalization_version**: tag corresponding to text normalization strategy
- **ingested_at**: date of ingestion (normalize + embed)

### 3. Article Embeddings `data/embeddings_rows.csv`
- **embedding_id**: uuid primary key for each embedding
- **ingestion_id**: uuid key referencing the ingested article (a single article may have several ingestion versions)
- **embedding**: embedding vector
- **model_name**: text embedding model, e.g. `text-embedding-3-small`
- **created_at**: time when the embedding was generated

In [12]:
### Load initial datasets
# Read the three raw CSV exports (polled articles, ingested articles,
# embeddings) in one pass; the order of the paths matches the order of the names.
polled, ingested, embeddings = map(
    pd.read_csv,
    (
        "data/polled_articles_rows.csv",
        "data/ingested_articles_rows.csv",
        "data/embeddings_rows.csv",
    ),
)

### Display sample
# One rich-display call renders the first rows of each frame, in order.
display(polled.head(), ingested.head(), embeddings.head())


Unnamed: 0,article_id,source_id,feed_id,article_guid,article_url,article_headline,article_description,rss_article_text,authors,published_at,created_at
0,001f72ec-924e-4989-a368-7cf7e45c0e77,7ba98991-0e06-4040-910f-5bb16a3ca8f2,7c91738b-1ae4-49d2-88e2-6357ef55c382,https://www.theguardian.com/lifeandstyle/2026/...,https://www.theguardian.com/lifeandstyle/2026/...,The rise of ‘unc’: is this gen Alpha’s way of ...,<p>Timothée Chalamet celebrated his 30th birth...,,,2026-01-12 14:40:48+00,2026-01-12 15:17:02.967673+00
1,0021235e-48b4-4c20-8cbb-eda872051cd0,9bc15713-5485-4395-8adb-b8c71447be67,cb63bb4b-2d5d-4e35-8aa8-1038003f6def,https://www.dailymail.co.uk/news/article-15406...,https://www.dailymail.co.uk/news/article-15406...,Puzzling photo hides a third dart that isn't o...,A third piece of a dart set is missing from th...,,,2026-01-11 18:40:02+00,2026-01-11 20:13:31.774161+00
2,00515dcd-a012-436a-8659-91611b8bc021,9bc15713-5485-4395-8adb-b8c71447be67,cb63bb4b-2d5d-4e35-8aa8-1038003f6def,https://www.dailymail.co.uk/news/article-15441...,https://www.dailymail.co.uk/news/article-15441...,The bizarre chain of events before Angie went ...,"Angie Fuller disappeared on January 9, 2023. P...",,,2026-01-07 14:48:06+00,2026-01-07 15:15:38.905434+00
3,0068528e-8c6b-4081-977f-dfdd69f14dc6,7ba98991-0e06-4040-910f-5bb16a3ca8f2,7c91738b-1ae4-49d2-88e2-6357ef55c382,https://www.theguardian.com/world/2026/jan/08/...,https://www.theguardian.com/world/2026/jan/08/...,Iran plunged into internet blackout as protest...,<p>Outage piles pressure on regime as Iranian ...,,,2026-01-08 19:42:10+00,2026-01-08 20:14:16.617002+00
4,0078eda3-ff25-4871-8426-d9a35bfd106d,7ba98991-0e06-4040-910f-5bb16a3ca8f2,d22f9c9f-82d1-410b-b944-8524c88ff339,https://www.theguardian.com/world/2026/jan/06/...,https://www.theguardian.com/world/2026/jan/06/...,Why won’t Yvette Cooper criticise Trump over V...,<p>The foreign secretary faces a tight balanci...,,,2026-01-06 17:02:40+00,2026-01-06 18:18:40.138832+00


Unnamed: 0,ingestion_id,article_id,normalized_text,normalization_version,ingested_at
0,000d8dea-8f60-470e-b539-eb51780b55db,d9509dc9-a405-4866-9201-19e3d063d137,Outrage as 'miserable' shovel-wielding neighbo...,default,2026-01-25 08:25:39.957485+00
1,0072ab60-b1ed-42a9-9624-88c18292e588,9a156ce2-11d6-4eff-b243-e2d2bb208faa,"Morality, military might and a sense of mischi...",default,2026-01-25 08:25:39.957485+00
2,007f72ee-5039-47bb-8c61-c981f2810d3a,df3b2fc9-4d0f-4199-bca1-ea6d954e0c31,Republican senator vows to block all Fed nomin...,default,2026-01-25 08:25:39.957485+00
3,00879af4-8278-40a8-8338-dba8a0b45810,d0f6077e-15be-497e-9a2b-1c023486ce9a,Starmer prepares to rip up Brexit: PM ready to...,default,2026-01-25 08:25:39.957485+00
4,00937788-6ba8-4db8-8107-645754dd7362,847a6cef-0c95-4a0f-9388-7345ba01bf26,ANOTHER poll shows Labour in third behind Refo...,default,2026-01-25 08:25:39.957485+00


Unnamed: 0,embedding_id,ingestion_id,embedding,model_name,created_at
0,00180a44-ff35-48c0-b897-eaf3d3a852ff,30be29f3-c452-4a6d-8717-fe7a2bf39e7c,"[-0.004183091,-0.022327209,-0.009337782,0.0189...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
1,00236859-7341-4838-b366-b033e71d2fbc,7f0adae5-4aa8-4b6b-bb3b-07b6a3794425,"[0.038555313,0.0041080914,-0.018142737,-0.0081...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
2,00263380-9ac1-4759-a15e-4d8233ea977a,6ebc129e-6982-4986-ac0e-42b2fedfd9a8,"[-0.0130469,0.01698623,-0.006076264,-0.0051306...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
3,002dcf07-e8a4-4669-a9c1-b10b9a113647,190115ff-ab3a-4721-bf79-3b838a861c90,"[0.013344139,0.014551405,-0.0020468934,-0.0002...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
4,00340b1d-fb55-4a92-8335-456af222323e,e86baabc-2a6b-445c-aef1-260b8c080d8a,"[-0.004791434,-0.0059574344,-0.018388402,0.008...",text-embedding-3-large,2026-01-25 08:26:30.878212+00


### Wrangling Procedure
Goal: Clean and prepare a unified dataset for downstream clustering analysis.

Steps:
1. Rename duplicate/unclear columns
2. Join the three datasets
3. Select only the necessary columns for analysis
4. Drop NaNs (especially in `normalized_text` and `embedding`)
5. Save as CSV

In [13]:
### Load initial datasets
# NOTE(review): this repeats the load already done in the dataset-summary
# section; presumably so the wrangling steps start from freshly read frames —
# confirm before removing.
polled, ingested, embeddings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/polled_articles_rows.csv",
        "data/ingested_articles_rows.csv",
        "data/embeddings_rows.csv",
    )
)

### Display sample
# Show the head of each frame, in load order.
display(*(frame.head() for frame in (polled, ingested, embeddings)))


Unnamed: 0,article_id,source_id,feed_id,article_guid,article_url,article_headline,article_description,rss_article_text,authors,published_at,created_at
0,001f72ec-924e-4989-a368-7cf7e45c0e77,7ba98991-0e06-4040-910f-5bb16a3ca8f2,7c91738b-1ae4-49d2-88e2-6357ef55c382,https://www.theguardian.com/lifeandstyle/2026/...,https://www.theguardian.com/lifeandstyle/2026/...,The rise of ‘unc’: is this gen Alpha’s way of ...,<p>Timothée Chalamet celebrated his 30th birth...,,,2026-01-12 14:40:48+00,2026-01-12 15:17:02.967673+00
1,0021235e-48b4-4c20-8cbb-eda872051cd0,9bc15713-5485-4395-8adb-b8c71447be67,cb63bb4b-2d5d-4e35-8aa8-1038003f6def,https://www.dailymail.co.uk/news/article-15406...,https://www.dailymail.co.uk/news/article-15406...,Puzzling photo hides a third dart that isn't o...,A third piece of a dart set is missing from th...,,,2026-01-11 18:40:02+00,2026-01-11 20:13:31.774161+00
2,00515dcd-a012-436a-8659-91611b8bc021,9bc15713-5485-4395-8adb-b8c71447be67,cb63bb4b-2d5d-4e35-8aa8-1038003f6def,https://www.dailymail.co.uk/news/article-15441...,https://www.dailymail.co.uk/news/article-15441...,The bizarre chain of events before Angie went ...,"Angie Fuller disappeared on January 9, 2023. P...",,,2026-01-07 14:48:06+00,2026-01-07 15:15:38.905434+00
3,0068528e-8c6b-4081-977f-dfdd69f14dc6,7ba98991-0e06-4040-910f-5bb16a3ca8f2,7c91738b-1ae4-49d2-88e2-6357ef55c382,https://www.theguardian.com/world/2026/jan/08/...,https://www.theguardian.com/world/2026/jan/08/...,Iran plunged into internet blackout as protest...,<p>Outage piles pressure on regime as Iranian ...,,,2026-01-08 19:42:10+00,2026-01-08 20:14:16.617002+00
4,0078eda3-ff25-4871-8426-d9a35bfd106d,7ba98991-0e06-4040-910f-5bb16a3ca8f2,d22f9c9f-82d1-410b-b944-8524c88ff339,https://www.theguardian.com/world/2026/jan/06/...,https://www.theguardian.com/world/2026/jan/06/...,Why won’t Yvette Cooper criticise Trump over V...,<p>The foreign secretary faces a tight balanci...,,,2026-01-06 17:02:40+00,2026-01-06 18:18:40.138832+00


Unnamed: 0,ingestion_id,article_id,normalized_text,normalization_version,ingested_at
0,000d8dea-8f60-470e-b539-eb51780b55db,d9509dc9-a405-4866-9201-19e3d063d137,Outrage as 'miserable' shovel-wielding neighbo...,default,2026-01-25 08:25:39.957485+00
1,0072ab60-b1ed-42a9-9624-88c18292e588,9a156ce2-11d6-4eff-b243-e2d2bb208faa,"Morality, military might and a sense of mischi...",default,2026-01-25 08:25:39.957485+00
2,007f72ee-5039-47bb-8c61-c981f2810d3a,df3b2fc9-4d0f-4199-bca1-ea6d954e0c31,Republican senator vows to block all Fed nomin...,default,2026-01-25 08:25:39.957485+00
3,00879af4-8278-40a8-8338-dba8a0b45810,d0f6077e-15be-497e-9a2b-1c023486ce9a,Starmer prepares to rip up Brexit: PM ready to...,default,2026-01-25 08:25:39.957485+00
4,00937788-6ba8-4db8-8107-645754dd7362,847a6cef-0c95-4a0f-9388-7345ba01bf26,ANOTHER poll shows Labour in third behind Refo...,default,2026-01-25 08:25:39.957485+00


Unnamed: 0,embedding_id,ingestion_id,embedding,model_name,created_at
0,00180a44-ff35-48c0-b897-eaf3d3a852ff,30be29f3-c452-4a6d-8717-fe7a2bf39e7c,"[-0.004183091,-0.022327209,-0.009337782,0.0189...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
1,00236859-7341-4838-b366-b033e71d2fbc,7f0adae5-4aa8-4b6b-bb3b-07b6a3794425,"[0.038555313,0.0041080914,-0.018142737,-0.0081...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
2,00263380-9ac1-4759-a15e-4d8233ea977a,6ebc129e-6982-4986-ac0e-42b2fedfd9a8,"[-0.0130469,0.01698623,-0.006076264,-0.0051306...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
3,002dcf07-e8a4-4669-a9c1-b10b9a113647,190115ff-ab3a-4721-bf79-3b838a861c90,"[0.013344139,0.014551405,-0.0020468934,-0.0002...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
4,00340b1d-fb55-4a92-8335-456af222323e,e86baabc-2a6b-445c-aef1-260b8c080d8a,"[-0.004791434,-0.0059574344,-0.018388402,0.008...",text-embedding-3-large,2026-01-25 08:26:30.878212+00


In [14]:
### 1. Rename
# Both the polled-articles table and the embeddings table have a `created_at`
# column; rename the polled one to `polled_at` so the two stay distinguishable
# once the tables are joined.
# Rebind the result instead of using `inplace=True`, so the cell does not
# mutate a frame that an earlier cell already displayed.
polled = polled.rename(columns={"created_at": "polled_at"})

In [15]:
### 2. Join the three datasets
# Both joins are outer joins, so rows without a match on either side are kept
# and carry NaN in the columns that come from the other table.

# polled article metadata <-> ingested text, matched on the article key
ingested_with_meta = polled.merge(ingested, on='article_id', how='outer')

# (metadata + text) <-> embedding vectors, matched on the ingestion key
combined = ingested_with_meta.merge(embeddings, on='ingestion_id', how='outer')

display(combined.head())


Unnamed: 0,article_id,source_id,feed_id,article_guid,article_url,article_headline,article_description,rss_article_text,authors,published_at,polled_at,ingestion_id,normalized_text,normalization_version,ingested_at,embedding_id,embedding,model_name,created_at
0,d9509dc9-a405-4866-9201-19e3d063d137,9bc15713-5485-4395-8adb-b8c71447be67,cb63bb4b-2d5d-4e35-8aa8-1038003f6def,https://www.dailymail.co.uk/news/article-15441...,https://www.dailymail.co.uk/news/article-15441...,Outrage as 'miserable' shovel-wielding neighbo...,The elderly neighbour was caught in the act as...,,,2026-01-07 13:29:24+00,2026-01-07 13:33:24.714064+00,000d8dea-8f60-470e-b539-eb51780b55db,Outrage as 'miserable' shovel-wielding neighbo...,default,2026-01-25 08:25:39.957485+00,7591dd5c-f784-441d-8cc1-bf4b1d2d7e12,"[-0.011435215,-0.03746951,-0.0075333593,-0.026...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
1,9a156ce2-11d6-4eff-b243-e2d2bb208faa,7ba98991-0e06-4040-910f-5bb16a3ca8f2,7c91738b-1ae4-49d2-88e2-6357ef55c382,https://www.theguardian.com/us-news/2026/jan/0...,https://www.theguardian.com/us-news/2026/jan/0...,"Morality, military might and a sense of mischi...","<p>Trump sounds off on Venezuela’s future, Tai...",,,2026-01-09 03:15:08+00,2026-01-09 05:17:56.535446+00,0072ab60-b1ed-42a9-9624-88c18292e588,"Morality, military might and a sense of mischi...",default,2026-01-25 08:25:39.957485+00,d479a784-39f1-4105-bfd7-cdcfd32a0ce7,"[0.0061512175,-0.03799464,-0.016263446,-0.0129...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
2,df3b2fc9-4d0f-4199-bca1-ea6d954e0c31,7ba98991-0e06-4040-910f-5bb16a3ca8f2,d22f9c9f-82d1-410b-b944-8524c88ff339,https://www.theguardian.com/us-news/2026/jan/1...,https://www.theguardian.com/us-news/2026/jan/1...,Republican senator vows to block all Fed nomin...,<p>Thom Tillis says he would oppose any nomine...,,,2026-01-12 14:55:53+00,2026-01-12 15:17:02.967673+00,007f72ee-5039-47bb-8c61-c981f2810d3a,Republican senator vows to block all Fed nomin...,default,2026-01-25 08:25:39.957485+00,c6692b90-924c-4e8f-8af2-cb2ef5e75c06,"[-0.0072780806,-0.023142586,-0.021380594,0.020...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
3,d0f6077e-15be-497e-9a2b-1c023486ce9a,9bc15713-5485-4395-8adb-b8c71447be67,cb63bb4b-2d5d-4e35-8aa8-1038003f6def,https://www.dailymail.co.uk/news/article-15432...,https://www.dailymail.co.uk/news/article-15432...,Starmer prepares to rip up Brexit: PM ready to...,In some of his strongest comments yet about Br...,,,2026-01-04 15:49:23+00,2026-01-05 04:42:39.19293+00,00879af4-8278-40a8-8338-dba8a0b45810,Starmer prepares to rip up Brexit: PM ready to...,default,2026-01-25 08:25:39.957485+00,7408f562-8949-4b8f-b61c-2b550f9f9869,"[0.018081628,0.02099675,-0.010544334,0.0247128...",text-embedding-3-large,2026-01-25 08:26:30.878212+00
4,847a6cef-0c95-4a0f-9388-7345ba01bf26,9bc15713-5485-4395-8adb-b8c71447be67,cb63bb4b-2d5d-4e35-8aa8-1038003f6def,https://www.dailymail.co.uk/news/article-15441...,https://www.dailymail.co.uk/news/article-15441...,ANOTHER poll shows Labour in third behind Refo...,Research by More in Common put the party in th...,,,2026-01-07 10:57:35+00,2026-01-07 11:12:56.5161+00,00937788-6ba8-4db8-8107-645754dd7362,ANOTHER poll shows Labour in third behind Refo...,default,2026-01-25 08:25:39.957485+00,e25574a2-69c5-404f-9e7c-a2b302363c5d,"[0.032669835,-0.002811013,-0.012023548,-0.0104...",text-embedding-3-large,2026-01-25 08:26:30.878212+00


In [16]:
### 3. Select embeddings, normalized_text, timeseries, and relevant metadata.

# Columns kept for analysis: article key, normalized text, the two timestamps
# (polled / published), and the embedding vector.
keep_columns = [
    'article_id',
    'normalized_text',
    'polled_at',
    'published_at',
    'embedding',
]
df = combined[keep_columns]
df.head()


Unnamed: 0,article_id,normalized_text,polled_at,published_at,embedding
0,d9509dc9-a405-4866-9201-19e3d063d137,Outrage as 'miserable' shovel-wielding neighbo...,2026-01-07 13:33:24.714064+00,2026-01-07 13:29:24+00,"[-0.011435215,-0.03746951,-0.0075333593,-0.026..."
1,9a156ce2-11d6-4eff-b243-e2d2bb208faa,"Morality, military might and a sense of mischi...",2026-01-09 05:17:56.535446+00,2026-01-09 03:15:08+00,"[0.0061512175,-0.03799464,-0.016263446,-0.0129..."
2,df3b2fc9-4d0f-4199-bca1-ea6d954e0c31,Republican senator vows to block all Fed nomin...,2026-01-12 15:17:02.967673+00,2026-01-12 14:55:53+00,"[-0.0072780806,-0.023142586,-0.021380594,0.020..."
3,d0f6077e-15be-497e-9a2b-1c023486ce9a,Starmer prepares to rip up Brexit: PM ready to...,2026-01-05 04:42:39.19293+00,2026-01-04 15:49:23+00,"[0.018081628,0.02099675,-0.010544334,0.0247128..."
4,847a6cef-0c95-4a0f-9388-7345ba01bf26,ANOTHER poll shows Labour in third behind Refo...,2026-01-07 11:12:56.5161+00,2026-01-07 10:57:35+00,"[0.032669835,-0.002811013,-0.012023548,-0.0104..."


In [17]:
# 4. Drop NaNs
# Keep only rows that have BOTH a normalized text and an embedding.
# how='any' drops a row when either column is missing. how='all' would drop a
# row only when both are missing, which lets the partially matched rows produced
# by the outer joins (e.g. text with no embedding) through to the output.
df = df.dropna(subset=['normalized_text', 'embedding'], how='any')

In [18]:
# 5. Download wrangled dataset
# Best-effort save: the frame is written without its index, and any failure is
# reported on stdout instead of being raised.
output_path = 'data/wrangled_articles.csv'

try:
    df.to_csv(output_path, index=False)
except Exception as save_error:
    print('csv save failed:', save_error)