In [57]:
import hashlib

import html2text
import pandas as pd

### Preprocess: CNN

In [58]:
# Read the raw CNN scrape (JSON Lines: one record per line) and summarise
# its columns, dtypes and non-null counts.
df_cnn = pd.read_json(
    "../data/scraped/s_m5977u652mqcx1zuj0.jsonl",
    lines=True,
)
df_cnn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   timestamp         1007 non-null   datetime64[ns, UTC]
 1   input             1007 non-null   object             
 2   error             182 non-null    object             
 3   error_code        182 non-null    object             
 4   id                764 non-null    object             
 5   url               764 non-null    object             
 6   author            590 non-null    object             
 7   headline          764 non-null    object             
 8   topics            579 non-null    object             
 9   publication_date  764 non-null    object             
 10  updated_last      764 non-null    object             
 11  content           764 non-null    object             
 12  videos            233 non-null    object             
 13  ima

In [59]:
# Build the cleaned CNN frame in a single chain so the cell is idempotent and
# never assigns columns onto a column-sliced frame (the pattern that triggers
# pandas' SettingWithCopyWarning).
df_cnn_clean = (
    df_cnn
    # `id` is .encode()'d and `content` is fed to html2text below; either one
    # being null would raise, so guard all three fields instead of only
    # `headline`.
    .dropna(subset=["id", "headline", "content"])
    # The scrape can contain the same article more than once; keep the first.
    .drop_duplicates(subset="id")
    .loc[:, ["id", "headline", "content", "publication_date", "url"]]
    .assign(
        # Replace the source-specific id with its MD5 hex digest.
        id=lambda d: d["id"].apply(lambda x: hashlib.md5(x.encode()).hexdigest()),
        publication_date=lambda d: pd.to_datetime(d["publication_date"]),
        # Convert the HTML article body to plain/markdown text.
        content=lambda d: d["content"].apply(html2text.html2text),
        source="CNN",
    )
)

In [60]:
# Preview the first two cleaned CNN rows.
df_cnn_clean.head(2)

Unnamed: 0,id,headline,content,publication_date,url,source
8,5a618a2daa89d751b75a0c76732cca3b,Apple urged to remove new AI feature after fal...,The press freedom group Reporters Without Bord...,2024-12-19 20:32:19.900000+00:00,https://www.cnn.com/2024/12/19/media/apple-int...,CNN
9,89a222209eced5c8fbb71670a8817219,"Cyber Monday 2024 is over, but you can still s...",Best Buy is offering shoppers one more chance ...,2024-12-03 12:33:00+00:00,https://www.cnn.com/cnn-underscored/deals/best...,CNN


In [61]:
# Summarise the cleaned CNN frame: row count, dtypes and non-null counts.
df_cnn_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159 entries, 8 to 479
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   id                159 non-null    object             
 1   headline          159 non-null    object             
 2   content           159 non-null    object             
 3   publication_date  159 non-null    datetime64[ns, UTC]
 4   url               159 non-null    object             
 5   source            159 non-null    object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 8.7+ KB


### Preprocess: Reuters

In [62]:
# Read the raw Reuters scrape (JSON Lines) and discard rows that have no
# headline before summarising the remaining columns.
df_reuters = (
    pd.read_json("../data/scraped/s_m5971bnx7j0lpn2mw.jsonl", lines=True)
    .dropna(subset=["headline"])
)
df_reuters.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199 entries, 1 to 200
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   timestamp         199 non-null    datetime64[ns, UTC]
 1   input             199 non-null    object             
 2   error             0 non-null      object             
 3   error_code        0 non-null      object             
 4   id                199 non-null    object             
 5   url               199 non-null    object             
 6   author            199 non-null    object             
 7   headline          199 non-null    object             
 8   topics            199 non-null    object             
 9   publication_date  199 non-null    object             
 10  updated_last      199 non-null    object             
 11  description       199 non-null    object             
 12  content           199 non-null    object             
 13  videos    

In [63]:
# Preview the first two raw Reuters rows (all scraped columns).
df_reuters.head(2)

Unnamed: 0,timestamp,input,error,error_code,id,url,author,headline,topics,publication_date,updated_last,description,content,videos,images,related_articles,keyword,discovery_input
1,2024-12-29 06:01:53.668000+00:00,{'url': 'https://www.reuters.com/technology/ap...,,,C5HJNP7Y6RLQTMNFLOARUBYZNU,https://www.reuters.com/technology/apple-seeks...,Jody Godoy,Apple seeks to defend Google's billion-dollar ...,[Apple Inc. reports fourth quarter earnings in...,2024-12-24T15:23:34.000Z,2024-12-24T15:24:10.943Z,Apple has asked to participate in Google's upc...,"Apple <a href=""https://www.reuters.com/markets...",,[{'image_url': 'https://cloudfront-us-east-2.i...,[{'article_title': 'UK's Starmer asks regulato...,Technology,"{'keyword': 'apple', 'sort': 'newest'}"
2,2024-12-29 06:01:53.672000+00:00,{'url': 'https://www.reuters.com/markets/us/fu...,,,WXBHFHG6IVPCHJJ5ESSJ6AEJUE,https://www.reuters.com/markets/us/futures-low...,David French,Dow ends up to extend win run to five; rising ...,[Federal Reserve Chair Jerome Powell interest ...,2024-12-27T00:22:40.000Z,2024-12-27T00:23:51.543Z,The Dow Jones Industrial Average closed fracti...,"The Dow Jones Industrial Average <a href=""http...",[{'video_description': 'Wall Street indexes en...,[{'image_url': 'https://cloudfront-us-east-2.i...,[{'article_title': 'Peru hikes minimum wage 10...,Markets,"{'keyword': 'apple', 'sort': 'newest'}"


In [64]:
# Build the cleaned Reuters frame in a single chain so the cell is idempotent
# and never assigns columns onto a column-sliced frame (the pattern that
# triggers pandas' SettingWithCopyWarning).
df_reuters_clean = (
    df_reuters
    # `id` is .encode()'d and `content` is fed to html2text below; either one
    # being null would raise, so guard all three fields instead of only
    # `headline`.
    .dropna(subset=["id", "headline", "content"])
    # The scrape can contain the same article more than once; keep the first.
    .drop_duplicates(subset="id")
    .loc[:, ["id", "headline", "content", "publication_date", "url"]]
    .assign(
        # Replace the source-specific id with its MD5 hex digest.
        id=lambda d: d["id"].apply(lambda x: hashlib.md5(x.encode()).hexdigest()),
        publication_date=lambda d: pd.to_datetime(d["publication_date"]),
        # Convert the HTML article body to plain/markdown text.
        content=lambda d: d["content"].apply(html2text.html2text),
        source="Reuters",
    )
)

In [65]:
# Summarise the cleaned Reuters frame: row count, dtypes and non-null counts.
df_reuters_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189 entries, 1 to 200
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   id                189 non-null    object             
 1   headline          189 non-null    object             
 2   content           189 non-null    object             
 3   publication_date  189 non-null    datetime64[ns, UTC]
 4   url               189 non-null    object             
 5   source            189 non-null    object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 10.3+ KB


### Preprocess: BBC

In [66]:
# Read the raw BBC scrape (JSON Lines) and discard rows that have no headline
# before summarising the remaining columns.
df_bbc = (
    pd.read_json("../data/scraped/s_m597351cfyi3n67o2.jsonl", lines=True)
    .dropna(subset=["headline"])
)
df_bbc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 198 entries, 4 to 202
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   timestamp         198 non-null    datetime64[ns, UTC]
 1   input             198 non-null    object             
 2   error             0 non-null      object             
 3   error_code        0 non-null      object             
 4   id                198 non-null    object             
 5   url               198 non-null    object             
 6   author            198 non-null    object             
 7   headline          198 non-null    object             
 8   topics            158 non-null    object             
 9   publication_date  198 non-null    object             
 10  content           197 non-null    object             
 11  videos            198 non-null    object             
 12  images            198 non-null    object             
 13  related_ar

In [67]:
# Preview the first two raw BBC rows (all scraped columns).
df_bbc.head(2)

Unnamed: 0,timestamp,input,error,error_code,id,url,author,headline,topics,publication_date,content,videos,images,related_articles,keyword,discovery_input
4,2024-12-29 06:16:07.752000+00:00,{'url': 'https://www.bbc.com/news/articles/cvg...,,,cvg6d75gk90o,https://www.bbc.com/news/articles/cvg6d75gk90o,Lisa Young,Appeal to people working with nature and farmi...,[Alderney],2024-12-14T09:16:32.776Z,"Alderney's growers, farmers and fishermen are ...",[],[{'image_url': 'https://ichef.bbci.co.uk/image...,[{'article_title': 'Airports open late to clea...,apple,{'keyword': 'apple'}
5,2024-12-29 06:16:10.457000+00:00,{'url': 'https://www.bbc.com/news/articles/c62...,,,c62j56rp957o,https://www.bbc.com/news/articles/c62j56rp957o,Mariam Issimdar,Council asks Frank Bruno gym to apply for new ...,[Northampton],2024-11-19T18:30:42.216Z,A council is hoping boxer Frank Bruno will app...,[],[{'image_url': 'https://ichef.bbci.co.uk/image...,[{'article_title': 'Pedestrian killed in Chris...,apple,{'keyword': 'apple'}


In [69]:
# Build the cleaned BBC frame in a single chain so the cell is idempotent and
# never assigns columns onto a column-sliced frame (the pattern that triggers
# pandas' SettingWithCopyWarning).
df_bbc_clean = (
    df_bbc
    # `id` is .encode()'d and `content` is fed to html2text below; either one
    # being null would raise, so guard all three fields instead of only
    # `content`.
    .dropna(subset=["id", "headline", "content"])
    # The scrape can contain the same article more than once; keep the first.
    .drop_duplicates(subset="id")
    .loc[:, ["id", "headline", "content", "publication_date", "url"]]
    .assign(
        # Replace the source-specific id with its MD5 hex digest.
        id=lambda d: d["id"].apply(lambda x: hashlib.md5(x.encode()).hexdigest()),
        publication_date=lambda d: pd.to_datetime(d["publication_date"]),
        # Convert the HTML article body to plain/markdown text.
        content=lambda d: d["content"].apply(html2text.html2text),
        source="BBC",
    )
)

## Merge All

In [70]:
# Stack the three per-source frames into one corpus. The per-source row labels
# are discarded in favour of a fresh 0..n-1 index.
df_all = pd.concat(
    [
        df_cnn_clean,
        df_reuters_clean,
        df_bbc_clean,
    ],
    ignore_index=True,
)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   id                544 non-null    object             
 1   headline          544 non-null    object             
 2   content           544 non-null    object             
 3   publication_date  544 non-null    datetime64[ns, UTC]
 4   url               544 non-null    object             
 5   source            544 non-null    object             
dtypes: datetime64[ns, UTC](1), object(5)
memory usage: 25.6+ KB


In [71]:
# Preview the first two rows of the merged frame.
df_all.head(2)

Unnamed: 0,id,headline,content,publication_date,url,source
0,5a618a2daa89d751b75a0c76732cca3b,Apple urged to remove new AI feature after fal...,The press freedom group Reporters Without Bord...,2024-12-19 20:32:19.900000+00:00,https://www.cnn.com/2024/12/19/media/apple-int...,CNN
1,89a222209eced5c8fbb71670a8817219,"Cyber Monday 2024 is over, but you can still s...",Best Buy is offering shoppers one more chance ...,2024-12-03 12:33:00+00:00,https://www.cnn.com/cnn-underscored/deals/best...,CNN


In [72]:
# Persist the merged corpus as JSON Lines (one article record per line).
# NOTE(review): no `date_format` is passed, so `publication_date` is presumably
# written using pandas' default epoch encoding rather than ISO-8601 strings --
# confirm that downstream readers expect that.
df_all.to_json("../data/news-clean.jsonl", orient="records", lines=True)

In [73]:
# Number of cleaned articles contributed by each source.
df_all["source"].value_counts()

source
BBC        196
Reuters    189
CNN        159
Name: count, dtype: int64

In [74]:
# Earliest publication date per source.
# NOTE: in the recorded output BBC reaches back to 2014 while CNN and Reuters
# start in 2023, so the sources do not cover the same time window.
df_all.groupby("source")["publication_date"].min()

source
BBC       2014-11-18 11:33:39+00:00
CNN       2023-07-28 21:12:49+00:00
Reuters   2023-11-29 20:59:46+00:00
Name: publication_date, dtype: datetime64[ns, UTC]

In [75]:
# Latest publication date per source.
df_all.groupby("source")["publication_date"].max()

source
BBC       2024-12-27 08:55:06.442000+00:00
CNN       2024-12-27 15:40:56.189000+00:00
Reuters          2024-12-28 00:23:59+00:00
Name: publication_date, dtype: datetime64[ns, UTC]