AInewsbot.ipynb

- Automate collecting daily AI news
- Open URLs of news sites specified in `sources` dict (sources.yaml) using Selenium and Firefox
- Save HTML of each URL in htmldata directory
- Extract URLs from all files, create a pandas dataframe with url, title, src
- Use ChatGPT to filter only AI-related headlines by sending a prompt and formatted table of headlines
- Use SQLite to filter headlines previously seen 
- OPENAI_API_KEY should be in the environment or in a .env file
  
Alternative manual workflow to get HTML files if necessary
- Use Chrome, open e.g. Tech News bookmark folder, right-click and open all bookmarks in new window
- on Google News, make sure to switch to the AI tab
- on Google News, Feedly, Reddit, scroll to additional pages as desired
- Use SingleFile extension, 'save all tabs'
- Move files to htmldata directory
- Run lower part of notebook to process the data


In [72]:
import asyncio
import html
import json
import os
import smtplib
import sqlite3
import unicodedata
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from urllib.parse import urljoin, urlparse

import aiohttp
# import bs4
from bs4 import BeautifulSoup
import dotenv
# needed because jupyter is already running an async event loop
import nest_asyncio
import numpy as np
import pandas as pd
import requests
import yaml
from openai import OpenAI

from ainb_const import (DOWNLOAD_DIR, MODEL,
                        SOURCECONFIG, PROMPT)
from ainb_utilities import log, delete_files, filter_unseen_urls_db, insert_article, nearest_neighbor_sort, agglomerative_cluster_sort, traveling_salesman_sort_scipy
from ainb_webscrape import init_browser, get_file, parse_file, get_og_tags, get_path_from_url, trimmed_href
from ainb_llm import paginate_df, process_pages

In [2]:
# Notebook-level configuration.
# NOTE(review): these two assignments rebind names that were already imported
# from ainb_const in the import cell; only code in this notebook sees the new
# values. Confirm they match what the ainb_* helper modules use internally.
SOURCECONFIG = "sources.yaml"  # YAML file describing the news sources to scrape
DOWNLOAD_DIR = "htmldata"      # directory holding the saved HTML pages

# load secrets, credentials from .env
dotenv.load_dotenv()


True

In [3]:
# PROMPT = """
# You will act as a research assistant to categorize news articles based on their relevance
# to the topic of artificial intelligence (AI). You will process and classify news headlines
# formatted as JSON objects.

# Input Specification:
# You will receive a list of news stories formatted as JSON objects.
# Each object will include an 'id' and a 'title'. For instance:
# [{'id': 97, 'title': 'AI to predict dementia, detect cancer'},
#  {'id': 103,'title': 'Figure robot learns to make coffee by watching humans for 10 hours'},
#  {'id': 103,'title': 'Baby trapped in refrigerator eats own foot'},
#  {'id': 210,'title': 'ChatGPT removes, then reinstates a summarization assistant without explanation.'},
#  {'id': 298,'title': 'The 5 most interesting PC monitors from CES 2024'},
#  ]

# Classification Criteria:
# Classify each story based on its title to determine whether it primarily pertains to AI.
# Broadly define AI-related content to include topics such as machine learning, robotics,
# computer vision, reinforcement learning, large language models, and related topics. Also
# include specific references to AI-related entities and individuals and products such as
# OpenAI, ChatGPT, Elon Musk, Sam Altman, Anthropic Claude, Google Gemini, Copilot,
# Perplexity.ai, Midjourney, etc.

# Output Specification:
# You will return a JSON object with the field 'stories' containing the list of classification results.
# For each story, your output will be a JSON object containing the original 'id' and a new field 'isAI',
# a boolean indicating if the story is about AI. The output schema must be strictly adhered to, without
# any additional fields. Example output:
# {'stories':
# [{'id': 97, 'isAI': true},
#  {'id': 103, 'isAI': true},
#  {'id': 103, 'isAI': false},
#  {'id': 210, 'isAI': true},
#  {'id': 298, 'isAI': false}]
# }

# Ensure that each output object accurately reflects the corresponding input object in terms of the 'id' field
# and that the 'isAI' field accurately represents the AI relevance of the story as determined by the title.

# The list of news stories to classify and enrich is:

# """

In [5]:
print(PROMPT)


You will act as a research assistant to categorize news articles based on their relevance
to the topic of artificial intelligence (AI). You will process and classify news headlines
formatted as JSON objects.

Input Specification:
You will receive a list of news stories formatted as JSON objects.
Each object will include an 'id' and a 'title'. For instance:
[{'id': 97, 'title': 'AI to predict dementia, detect cancer'},
 {'id': 103,'title': 'Figure robot learns to make coffee by watching humans for 10 hours'},
 {'id': 103,'title': 'Baby trapped in refrigerator eats own foot'},
 {'id': 210,'title': 'ChatGPT removes, then reinstates a summarization assistant without explanation.'},
 {'id': 298,'title': 'The 5 most interesting PC monitors from CES 2024'},
 ]

Classification Criteria:
Classify each story based on its title to determine whether it primarily pertains to AI.
Broadly define AI-related content to include topics such as machine learning, robotics,
computer vision, reinforcement l

In [6]:
get_og_tags('https://druce.ai')


{'og:site_name': 'Druce.ai',
 'og:title': 'Druce.ai',
 'og:type': 'website',
 'og:description': "Druce's Blog on Machine Learning, Tech, Markets and Economics",
 'og:url': 'https://druce.ai/',
 'title': 'Druce.ai'}

In [7]:
get_path_from_url('https://druce.ai/2024/03/gemini-summarize-book')


'/2024/03/gemini-summarize-book'

In [8]:
trimmed_href('https://druce.ai/2024/03/gemini-summarize-book?xyz')


'https://druce.ai/2024/03/gemini-summarize-book'

In [9]:
# Load the sources to scrape from sources.yaml.
# A parse failure is logged and re-raised: continuing would leave `sources`
# undefined and fail later with an unrelated NameError.
try:
    with open(SOURCECONFIG, "r", encoding="utf-8") as stream:
        sources = yaml.safe_load(stream)
except yaml.YAMLError as exc:
    log(f"Could not parse {SOURCECONFIG}: {exc}")
    raise

# An empty or malformed file makes safe_load return None (or a non-mapping);
# the rest of the notebook iterates sources.items(), so fail here with a clear message.
if not isinstance(sources, dict):
    raise ValueError(
        f"{SOURCECONFIG} must contain a mapping of source name -> settings, "
        f"got {type(sources).__name__}")

log(f"Load {len(sources)} sources")



2024-06-20 15:13:36,412 - AInewsbot - INFO - Load 17 sources


20

In [10]:
# Build the reverse lookup: saved-page title (file name stem) -> source name.
# Each source's settings dict is also tagged with its own name under 'sourcename'.
sources_reverse = {}
for source_name, source_cfg in sources.items():
    page_url = source_cfg['url']
    page_title = source_cfg['title']
    log(f"{source_name} -> {page_url} -> {page_title}.html")
    source_cfg['sourcename'] = source_name
    sources_reverse[page_title] = source_name

sources_reverse

2024-06-20 15:13:37,687 - AInewsbot - INFO - Ars Technica -> https://arstechnica.com/ -> Ars Technica.html
2024-06-20 15:13:37,688 - AInewsbot - INFO - Bloomberg Tech -> https://www.bloomberg.com/technology -> Bloomberg Technology - Bloomberg.html
2024-06-20 15:13:37,689 - AInewsbot - INFO - Business Insider -> https://www.businessinsider.com/tech -> Tech - Business Insider.html
2024-06-20 15:13:37,690 - AInewsbot - INFO - FT Tech -> https://www.ft.com/technology -> Technology.html
2024-06-20 15:13:37,690 - AInewsbot - INFO - Feedly AI -> https://feedly.com/i/aiFeeds?options=eyJsYXllcnMiOlt7InBhcnRzIjpbeyJpZCI6Im5scC9mL3RvcGljLzMwMDAifV0sInNlYXJjaEhpbnQiOiJ0ZWNobm9sb2d5IiwidHlwZSI6Im1hdGNoZXMiLCJzYWxpZW5jZSI6ImFib3V0In1dLCJidW5kbGVzIjpbeyJ0eXBlIjoic3RyZWFtIiwiaWQiOiJ1c2VyLzYyZWViYjlmLTcxNTEtNGY5YS1hOGM3LTlhNTdiODIwNTMwOC9jYXRlZ29yeS9HYWRnZXRzIn1dfQ -> Discover and Add New Feedly AI Feeds.html
2024-06-20 15:13:37,691 - AInewsbot - INFO - Google News -> https://news.google.com/topics/CAA

{'Ars Technica': 'Ars Technica',
 'Bloomberg Technology - Bloomberg': 'Bloomberg Tech',
 'Tech - Business Insider': 'Business Insider',
 'Technology': 'FT Tech',
 'Discover and Add New Feedly AI Feeds': 'Feedly AI',
 'Google News - Technology - Artificial intelligence': 'Google News',
 'Hacker News Page 1': 'Hacker News',
 'Hacker News Page 2': 'Hacker News 2',
 'HackerNoon - read, write and learn about any technology': 'HackerNoon',
 'Technology - The New York Times': 'NYT Tech',
 'top scoring links _ multi': 'Reddit',
 'Techmeme': 'Techmeme',
 'The Register_ Enterprise Technology News and Analysis': 'The Register',
 'Artificial Intelligence - The Verge': 'The Verge',
 'AI News _ VentureBeat': 'VentureBeat',
 'Technology - WSJ.com': 'WSJ Tech',
 'Technology - The Washington Post': 'WaPo Tech'}

In [11]:
# Load the list of files in the htmldata directory and match each of today's
# saved pages back to the source it came from.
nfiles = 50
files = [os.path.join(DOWNLOAD_DIR, file)
         for file in os.listdir(DOWNLOAD_DIR)]

# Get the current date; the date string is derived from the same clock reading
# so the two cannot disagree if the cell runs across midnight.
today = datetime.now()
year, month, day = today.year, today.month, today.day
datestr = today.strftime("%m_%d_%Y")

# filter files only
files = [file for file in files if os.path.isfile(file)]

# Sort files by modification time (newest first) and keep the newest `nfiles`.
# The result must be assigned back to `files`; assigning it to `file` left the
# limit unapplied.
files.sort(key=os.path.getmtime, reverse=True)
files = files[:nfiles]

# keep only files with today's date in the name and ending in .html
files = [
    file for file in files if datestr in file and file.endswith(".html")]
log(len(files))
for file in files:
    log(file)

saved_pages = []
date_marker = " (" + datestr
for file in files:
    filename = os.path.basename(file)
    # locate date like ' (01_14_2024' in filename; the text before it is the page title
    position = filename.find(date_marker)
    if position < 0:
        # find() returned -1: slicing with it would silently drop the last character
        log(f"Skipping {filename}, no '{date_marker}' marker in file name")
        continue
    basename = filename[:position]
    # match to source name
    sourcename = sources_reverse.get(basename)
    if sourcename is None:
        log(f"Skipping {basename}, no sourcename metadata")
        continue
    sources[sourcename]['latest'] = file
    saved_pages.append((sourcename, file))

2024-06-20 15:13:55,742 - AInewsbot - INFO - 17
2024-06-20 15:13:55,743 - AInewsbot - INFO - htmldata/Technology - The Washington Post (06_20_2024 01_55_22 PM).html
2024-06-20 15:13:55,744 - AInewsbot - INFO - htmldata/Technology - WSJ.com (06_20_2024 01_55_11 PM).html
2024-06-20 15:13:55,744 - AInewsbot - INFO - htmldata/AI News _ VentureBeat (06_20_2024 01_54_59 PM).html
2024-06-20 15:13:55,745 - AInewsbot - INFO - htmldata/Artificial Intelligence - The Verge (06_20_2024 01_54_48 PM).html
2024-06-20 15:13:55,745 - AInewsbot - INFO - htmldata/The Register_ Enterprise Technology News and Analysis (06_20_2024 01_54_38 PM).html
2024-06-20 15:13:55,745 - AInewsbot - INFO - htmldata/Techmeme (06_20_2024 01_54_27 PM).html
2024-06-20 15:13:55,746 - AInewsbot - INFO - htmldata/top scoring links _ multi (06_20_2024 01_54_16 PM).html
2024-06-20 15:13:55,746 - AInewsbot - INFO - htmldata/Technology - The New York Times (06_20_2024 01_53_44 PM).html
2024-06-20 15:13:55,747 - AInewsbot - INFO - ht

In [14]:
# Fetch HTML files from sources

# empty download directory
delete_files(DOWNLOAD_DIR)

# launch browser via selenium driver
driver = init_browser()

# Save each page listed in `sources`. The browser is closed in `finally` so a
# failure partway through the loop does not leave a browser process running.
log("Saving HTML files")
saved_pages = []
try:
    for sourcename, sourcedict in sources.items():
        log(f'Processing {sourcename}')
        sourcefile = get_file(sourcedict, driver=driver)
        saved_pages.append((sourcename, sourcefile))
finally:
    # Close the browser
    log("Quit webdriver")
    driver.quit()
# finished downloading files
# finished downloading files


2024-06-20 13:49:52,289 - AInewsbot - INFO - init_browser - Initializing webdriver
2024-06-20 13:50:04,426 - AInewsbot - INFO - init_browser - Initialized webdriver profile
2024-06-20 13:50:04,428 - AInewsbot - INFO - init_browser - Initialized webdriver service
2024-06-20 13:50:46,148 - AInewsbot - INFO - init_browser - Initialized webdriver
2024-06-20 13:50:46,187 - AInewsbot - INFO - Saving HTML files
2024-06-20 13:50:46,188 - AInewsbot - INFO - Processing Ars Technica
2024-06-20 13:50:46,189 - AInewsbot - INFO - get_files(Ars Technica) - starting get_files https://arstechnica.com/
2024-06-20 13:50:57,511 - AInewsbot - INFO - get_files(Ars Technica) - Saving Ars Technica (06_20_2024 01_50_57 PM).html as utf-8
2024-06-20 13:50:57,513 - AInewsbot - INFO - Processing Bloomberg Tech
2024-06-20 13:50:57,513 - AInewsbot - INFO - get_files(Bloomberg Technology - Bloomberg) - starting get_files https://www.bloomberg.com/technology
2024-06-20 13:51:08,747 - AInewsbot - INFO - Message: Unable

2024-06-20 13:53:23,618 - AInewsbot - INFO - get_files(Hacker News Page 2) - Saving Hacker News Page 2 (06_20_2024 01_53_23 PM).html as utf-8
2024-06-20 13:53:23,620 - AInewsbot - INFO - Processing HackerNoon
2024-06-20 13:53:23,620 - AInewsbot - INFO - get_files(HackerNoon - read, write and learn about any technology) - starting get_files https://hackernoon.com/
2024-06-20 13:53:34,374 - AInewsbot - INFO - Message: Unable to locate element: //meta[@http-equiv='Content-Type']; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:510:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

2024-06-20 13:53:34,375 - AInewsbot - INFO - get_files(HackerNoon - read

In [12]:
# Show how many pages were saved, then one "source -> file" line per page.
print(len(saved_pages))
for sourcename, page in saved_pages:
    # sources[sourcename]['latest'] = page
    print(f"{sourcename} -> {page}")
    

17
WaPo Tech -> htmldata/Technology - The Washington Post (06_20_2024 01_55_22 PM).html
WSJ Tech -> htmldata/Technology - WSJ.com (06_20_2024 01_55_11 PM).html
VentureBeat -> htmldata/AI News _ VentureBeat (06_20_2024 01_54_59 PM).html
The Verge -> htmldata/Artificial Intelligence - The Verge (06_20_2024 01_54_48 PM).html
The Register -> htmldata/The Register_ Enterprise Technology News and Analysis (06_20_2024 01_54_38 PM).html
Techmeme -> htmldata/Techmeme (06_20_2024 01_54_27 PM).html
Reddit -> htmldata/top scoring links _ multi (06_20_2024 01_54_16 PM).html
NYT Tech -> htmldata/Technology - The New York Times (06_20_2024 01_53_44 PM).html
HackerNoon -> htmldata/HackerNoon - read, write and learn about any technology (06_20_2024 01_53_34 PM).html
Hacker News 2 -> htmldata/Hacker News Page 2 (06_20_2024 01_53_23 PM).html
Hacker News -> htmldata/Hacker News Page 1 (06_20_2024 01_53_13 PM).html
Google News -> htmldata/Google News - Technology - Artificial intelligence (06_20_2024 01_53

In [13]:
# Parse news URLs and titles from the downloaded HTML files, accumulating every
# link record returned by parse_file into one flat list.
log("parsing html files")
all_urls = []
for sourcename, filename in saved_pages:
    print(sourcename, '->', filename, flush=True)
    log(f"{sourcename}", "parse loop")
    links = parse_file(sources[sourcename])
    link_count = len(links)
    log(f"{link_count} links found", "parse loop")
    all_urls += links

total_links = len(all_urls)
log(f"found {total_links} links", "parse loop")

2024-06-20 15:14:05,240 - AInewsbot - INFO - parsing html files


WaPo Tech -> htmldata/Technology - The Washington Post (06_20_2024 01_55_22 PM).html


2024-06-20 15:14:05,244 - AInewsbot - INFO - parse loop - WaPo Tech
2024-06-20 15:14:05,274 - AInewsbot - INFO - parse_file - found 156 raw links
2024-06-20 15:14:05,277 - AInewsbot - INFO - parse_file - found 29 filtered links
2024-06-20 15:14:05,278 - AInewsbot - INFO - parse loop - 29 links found


WSJ Tech -> htmldata/Technology - WSJ.com (06_20_2024 01_55_11 PM).html


2024-06-20 15:14:05,279 - AInewsbot - INFO - parse loop - WSJ Tech
2024-06-20 15:14:05,322 - AInewsbot - INFO - parse_file - found 518 raw links
2024-06-20 15:14:05,328 - AInewsbot - INFO - parse_file - found 6 filtered links
2024-06-20 15:14:05,328 - AInewsbot - INFO - parse loop - 6 links found


VentureBeat -> htmldata/AI News _ VentureBeat (06_20_2024 01_54_59 PM).html


2024-06-20 15:14:05,329 - AInewsbot - INFO - parse loop - VentureBeat
2024-06-20 15:14:05,345 - AInewsbot - INFO - parse_file - found 322 raw links
2024-06-20 15:14:05,349 - AInewsbot - INFO - parse_file - found 44 filtered links
2024-06-20 15:14:05,349 - AInewsbot - INFO - parse loop - 44 links found


The Verge -> htmldata/Artificial Intelligence - The Verge (06_20_2024 01_54_48 PM).html


2024-06-20 15:14:05,350 - AInewsbot - INFO - parse loop - The Verge
2024-06-20 15:14:05,374 - AInewsbot - INFO - parse_file - found 310 raw links
2024-06-20 15:14:05,378 - AInewsbot - INFO - parse_file - found 33 filtered links
2024-06-20 15:14:05,378 - AInewsbot - INFO - parse loop - 33 links found


The Register -> htmldata/The Register_ Enterprise Technology News and Analysis (06_20_2024 01_54_38 PM).html


2024-06-20 15:14:05,378 - AInewsbot - INFO - parse loop - The Register
2024-06-20 15:14:05,394 - AInewsbot - INFO - parse_file - found 200 raw links
2024-06-20 15:14:05,397 - AInewsbot - INFO - parse_file - found 88 filtered links
2024-06-20 15:14:05,398 - AInewsbot - INFO - parse loop - 88 links found


Techmeme -> htmldata/Techmeme (06_20_2024 01_54_27 PM).html


2024-06-20 15:14:05,398 - AInewsbot - INFO - parse loop - Techmeme
2024-06-20 15:14:05,413 - AInewsbot - INFO - parse_file - found 350 raw links
2024-06-20 15:14:05,418 - AInewsbot - INFO - parse_file - found 144 filtered links
2024-06-20 15:14:05,418 - AInewsbot - INFO - parse loop - 144 links found


Reddit -> htmldata/top scoring links _ multi (06_20_2024 01_54_16 PM).html


2024-06-20 15:14:05,419 - AInewsbot - INFO - parse loop - Reddit
2024-06-20 15:14:05,550 - AInewsbot - INFO - parse_file - found 660 raw links
2024-06-20 15:14:05,560 - AInewsbot - INFO - parse_file - found 426 filtered links
2024-06-20 15:14:05,560 - AInewsbot - INFO - parse loop - 426 links found


NYT Tech -> htmldata/Technology - The New York Times (06_20_2024 01_53_44 PM).html


2024-06-20 15:14:05,560 - AInewsbot - INFO - parse loop - NYT Tech
2024-06-20 15:14:05,571 - AInewsbot - INFO - parse_file - found 72 raw links
2024-06-20 15:14:05,572 - AInewsbot - INFO - parse_file - found 19 filtered links
2024-06-20 15:14:05,572 - AInewsbot - INFO - parse loop - 19 links found


HackerNoon -> htmldata/HackerNoon - read, write and learn about any technology (06_20_2024 01_53_34 PM).html


2024-06-20 15:14:05,573 - AInewsbot - INFO - parse loop - HackerNoon
2024-06-20 15:14:05,626 - AInewsbot - INFO - parse_file - found 605 raw links
2024-06-20 15:14:05,634 - AInewsbot - INFO - parse_file - found 101 filtered links
2024-06-20 15:14:05,635 - AInewsbot - INFO - parse loop - 101 links found


Hacker News 2 -> htmldata/Hacker News Page 2 (06_20_2024 01_53_23 PM).html


2024-06-20 15:14:05,635 - AInewsbot - INFO - parse loop - Hacker News 2
2024-06-20 15:14:05,646 - AInewsbot - INFO - parse_file - found 260 raw links
2024-06-20 15:14:05,649 - AInewsbot - INFO - parse_file - found 20 filtered links
2024-06-20 15:14:05,649 - AInewsbot - INFO - parse loop - 20 links found


Hacker News -> htmldata/Hacker News Page 1 (06_20_2024 01_53_13 PM).html


2024-06-20 15:14:05,649 - AInewsbot - INFO - parse loop - Hacker News
2024-06-20 15:14:05,659 - AInewsbot - INFO - parse_file - found 257 raw links
2024-06-20 15:14:05,662 - AInewsbot - INFO - parse_file - found 23 filtered links
2024-06-20 15:14:05,662 - AInewsbot - INFO - parse loop - 23 links found


Google News -> htmldata/Google News - Technology - Artificial intelligence (06_20_2024 01_53_02 PM).html


2024-06-20 15:14:05,662 - AInewsbot - INFO - parse loop - Google News
2024-06-20 15:14:05,990 - AInewsbot - INFO - parse_file - found 1112 raw links
2024-06-20 15:14:05,997 - AInewsbot - INFO - parse_file - found 470 filtered links
2024-06-20 15:14:05,997 - AInewsbot - INFO - parse loop - 470 links found


Feedly AI -> htmldata/Discover and Add New Feedly AI Feeds (06_20_2024 01_52_30 PM).html


2024-06-20 15:14:05,998 - AInewsbot - INFO - parse loop - Feedly AI
2024-06-20 15:14:06,017 - AInewsbot - INFO - parse_file - found 126 raw links
2024-06-20 15:14:06,019 - AInewsbot - INFO - parse_file - found 33 filtered links
2024-06-20 15:14:06,019 - AInewsbot - INFO - parse loop - 33 links found


FT Tech -> htmldata/Technology (06_20_2024 01_51_30 PM).html


2024-06-20 15:14:06,019 - AInewsbot - INFO - parse loop - FT Tech
2024-06-20 15:14:06,045 - AInewsbot - INFO - parse_file - found 457 raw links
2024-06-20 15:14:06,050 - AInewsbot - INFO - parse_file - found 101 filtered links
2024-06-20 15:14:06,050 - AInewsbot - INFO - parse loop - 101 links found


Business Insider -> htmldata/Tech - Business Insider (06_20_2024 01_51_18 PM).html


2024-06-20 15:14:06,050 - AInewsbot - INFO - parse loop - Business Insider
2024-06-20 15:14:06,071 - AInewsbot - INFO - parse_file - found 315 raw links
2024-06-20 15:14:06,075 - AInewsbot - INFO - parse_file - found 56 filtered links
2024-06-20 15:14:06,075 - AInewsbot - INFO - parse loop - 56 links found


Bloomberg Tech -> htmldata/Bloomberg Technology - Bloomberg (06_20_2024 01_51_08 PM).html


2024-06-20 15:14:06,075 - AInewsbot - INFO - parse loop - Bloomberg Tech
2024-06-20 15:14:06,094 - AInewsbot - INFO - parse_file - found 298 raw links
2024-06-20 15:14:06,097 - AInewsbot - INFO - parse_file - found 51 filtered links
2024-06-20 15:14:06,097 - AInewsbot - INFO - parse loop - 51 links found


Ars Technica -> htmldata/Ars Technica (06_20_2024 01_50_57 PM).html


2024-06-20 15:14:06,098 - AInewsbot - INFO - parse loop - Ars Technica
2024-06-20 15:14:06,112 - AInewsbot - INFO - parse_file - found 252 raw links
2024-06-20 15:14:06,114 - AInewsbot - INFO - parse_file - found 25 filtered links
2024-06-20 15:14:06,114 - AInewsbot - INFO - parse loop - 25 links found
2024-06-20 15:14:06,115 - AInewsbot - INFO - parse loop - found 1669 links


20

In [14]:
# Build one dataframe from all the links found: one row per distinct url,
# ordered by source, with a sequential integer 'id' column in front.
links_df = pd.DataFrame(all_urls)
unique_links_df = links_df.groupby("url").first().reset_index()
ordered_links_df = unique_links_df.sort_values("src")[["src", "title", "url"]]
ordered_links_df = ordered_links_df.reset_index(drop=True)
# turn the fresh 0..n-1 index into the 'id' column
orig_df = ordered_links_df.reset_index(drop=False).rename(columns={"index": "id"})
print(len(orig_df))
orig_df.head()

1390


Unnamed: 0,id,src,title,url
0,0,Ars Technica,Supermassive black hole roars to life as astro...,https://arstechnica.com/science/2024/06/superm...
1,1,Ars Technica,NASA delays Starliner return a few more days t...,https://arstechnica.com/space/2024/06/nasa-del...
2,2,Ars Technica,Ars Live Recap: Is SpaceX a launch company or ...,https://arstechnica.com/space/2024/06/ars-live...
3,3,Ars Technica,Men plead guilty to aggravated ID theft after ...,https://arstechnica.com/security/2024/06/men-w...
4,4,Ars Technica,How ShinyHunters hackers allegedly pilfered Ti...,https://arstechnica.com/security/2024/06/how-s...


In [15]:
# datestr = '2024-06-19'

# conn = sqlite3.connect('articles.db')

# c = conn.cursor()
# query = f"select * from news_articles where article_date > '{datestr}' order by article_date desc"
# df = pd.read_sql_query(query, conn)
# df



In [16]:

# conn.execute(f"delete from news_articles where article_date > '{datestr}'")

# # Committing the changes
# conn.commit()

# # Closing the connection
# conn.close()


In [17]:
filtered_df = filter_unseen_urls_db(orig_df)


2024-06-20 15:14:10,412 - AInewsbot - INFO - Existing URLs: 106847
2024-06-20 15:14:10,430 - AInewsbot - INFO - New URLs: 458


In [19]:
# use chatgpt to filter AI-related headlines using a prompt to OpenAI

# OPENAI_API_KEY is expected in the environment or .env (see the notebook header)
client = OpenAI()

# split the unseen headlines into pages that each fit in a reasonably sized prompt
pages = paginate_df(filtered_df)

In [20]:
pages

[[{'id': 9, 'title': 'How hagfish burrow into deep-sea sediment'},
  {'id': 11, 'title': 'Cleaning up cow burps to combat global warming'},
  {'id': 13,
   'title': 'Why Interplay’s originalFallout 3was canceled 20+ years ago'},
  {'id': 25,
   'title': 'ASMI Tipped as European AI Winner by Morgan Stanley, BofA'},
  {'id': 26,
   'title': 'Anthropic Releases ‘Most Intelligent’ AI Model in Rivalry With OpenAI'},
  {'id': 38,
   'title': 'Car Dealers Across US Are Crippled by a Second Cyberattack'},
  {'id': 46,
   'title': 'MGM to Offer Online Betting With Live Dealers in Las Vegas'},
  {'id': 48,
   'title': 'Newest NATO Member Sweden Says Russia Disrupting Its Satellite Networks'},
  {'id': 49,
   'title': 'Ocado Falls After Sobeys Puts Brakes on Vancouver Warehouse'},
  {'id': 50,
   'title': 'Paris Gets a Wild Meme Stock as Atos Becomes a Chatroom Favorite'},
  {'id': 70,
   'title': "Elon Musk's Boring Company reduced the natural beauty of Texas to 'gravel mines,' resident says"},


In [23]:
enriched_urls = await process_pages(client, PROMPT, pages)


2024-06-20 14:13:36,153 - AInewsbot - INFO - send page 1 of 14, 50 items 
2024-06-20 14:13:36,154 - AInewsbot - INFO - send page 2 of 14, 50 items 
2024-06-20 14:13:36,155 - AInewsbot - INFO - send page 3 of 14, 50 items 
2024-06-20 14:13:36,156 - AInewsbot - INFO - send page 4 of 14, 50 items 
2024-06-20 14:13:36,157 - AInewsbot - INFO - send page 5 of 14, 50 items 
2024-06-20 14:13:36,157 - AInewsbot - INFO - send page 6 of 14, 50 items 
2024-06-20 14:13:36,158 - AInewsbot - INFO - send page 7 of 14, 50 items 
2024-06-20 14:13:36,158 - AInewsbot - INFO - send page 8 of 14, 50 items 
2024-06-20 14:13:36,159 - AInewsbot - INFO - send page 9 of 14, 50 items 
2024-06-20 14:13:36,159 - AInewsbot - INFO - send page 10 of 14, 50 items 
2024-06-20 14:13:36,160 - AInewsbot - INFO - send page 11 of 14, 50 items 
2024-06-20 14:13:36,161 - AInewsbot - INFO - send page 12 of 14, 50 items 
2024-06-20 14:13:36,161 - AInewsbot - INFO - send page 13 of 14, 50 items 
2024-06-20 14:13:36,161 - AInewsbo

In [22]:
promptlist = [PROMPT + json.dumps(p, indent=2) for p in pages]
print(promptlist[0])


You will act as a research assistant to categorize news articles based on their relevance
to the topic of artificial intelligence (AI). You will process and classify news headlines
formatted as JSON objects.

Input Specification:
You will receive a list of news stories formatted as JSON objects.
Each object will include an 'id' and a 'title'. For instance:
[{'id': 97, 'title': 'AI to predict dementia, detect cancer'},
 {'id': 103,'title': 'Figure robot learns to make coffee by watching humans for 10 hours'},
 {'id': 103,'title': 'Baby trapped in refrigerator eats own foot'},
 {'id': 210,'title': 'ChatGPT removes, then reinstates a summarization assistant without explanation.'},
 {'id': 298,'title': 'The 5 most interesting PC monitors from CES 2024'},
 ]

Classification Criteria:
Classify each story based on its title to determine whether it primarily pertains to AI.
Broadly define AI-related content to include topics such as machine learning, robotics,
computer vision, reinforcement l

In [23]:
import aiohttp
import asyncio
import json


In [25]:
# Endpoint and request headers for calling the chat-completions API directly.
API_URL = 'https://api.openai.com/v1/chat/completions'

# The bearer token is read from the environment when this cell runs.
_openai_api_key = os.getenv("OPENAI_API_KEY")
headers = dict([
    ('Content-Type', 'application/json'),
    ('Authorization', f'Bearer {_openai_api_key}'),
])



In [58]:
# One chat-completion request body per page: the prompt followed by the page
# serialized as JSON, with a JSON-object reply requested.
payloads = []
for p in pages:
    user_message = {"role": "user", "content": PROMPT + json.dumps(p)}
    payloads.append({
        "model": "gpt-4o",
        "response_format": {"type": "json_object"},
        "messages": [user_message],
    })


In [59]:
len(payloads)


10

In [60]:
print(payloads[0])

{'model': 'gpt-4o', 'response_format': {'type': 'json_object'}, 'messages': [{'role': 'user', 'content': '\nYou will act as a research assistant to categorize news articles based on their relevance\nto the topic of artificial intelligence (AI). You will process and classify news headlines\nformatted as JSON objects.\n\nInput Specification:\nYou will receive a list of news stories formatted as JSON objects.\nEach object will include an \'id\' and a \'title\'. For instance:\n[{\'id\': 97, \'title\': \'AI to predict dementia, detect cancer\'},\n {\'id\': 103,\'title\': \'Figure robot learns to make coffee by watching humans for 10 hours\'},\n {\'id\': 103,\'title\': \'Baby trapped in refrigerator eats own foot\'},\n {\'id\': 210,\'title\': \'ChatGPT removes, then reinstates a summarization assistant without explanation.\'},\n {\'id\': 298,\'title\': \'The 5 most interesting PC monitors from CES 2024\'},\n ]\n\nClassification Criteria:\nClassify each story based on its title to determine w

In [75]:
# this runs fast
import nest_asyncio
nest_asyncio.apply()

async def fetch_openai(session, payload):
    """POST one request body to API_URL and return the decoded JSON reply.

    Args:
        session: object providing an aiohttp-style ``post`` method.
        payload: request body, sent as JSON.

    Returns:
        The response body parsed as JSON.
    """
    request = session.post(API_URL, headers=headers, json=payload)
    async with request as reply:
        body = await reply.json()
    return body


async def fetch_pages(prompt, pages):
    """Send one chat-completion request per page concurrently; return the raw replies.

    Args:
        prompt: instruction text placed in front of every page.
        pages: sequence of JSON-serializable pages; each is appended to the
            prompt via json.dumps.

    Returns:
        List of decoded JSON response bodies, one per page, in the same order
        as `pages` (asyncio.gather returns results in argument order).
        Extracting and validating the 'stories' lists is left to the caller.
    """
    log(f"{datetime.now().strftime('%H:%M:%S')} Sending {len(pages)} pages to OpenAI.")

    # make a prompt and payload for each page
    payloads = [{"model":  MODEL,
                 "response_format": {"type": "json_object"},
                 "messages": [{"role": "user",
                               "content": prompt + json.dumps(p)
                               }]
                 } for p in pages]

    # start every request before awaiting any, so the pages are fetched in parallel
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch_openai(session, payload))
                 for payload in payloads]
        responses = await asyncio.gather(*tasks)

    log(f"{datetime.now().strftime('%H:%M:%S')} Processing responses... ")

    # The validation code that used to follow this return could never run and
    # has been removed; the function's contract is the raw response list.
    return responses


# Run the main function
# asyncio.run() is called from inside the notebook's already-running event
# loop; the nest_asyncio.apply() call at the top of this cell is there for that.
responses = asyncio.run(fetch_pages(PROMPT, pages))
# print the finish time
print(datetime.now())


2024-06-20 16:24:41,535 - AInewsbot - INFO - 16:24:41 Sending 10 pages to OpenAI.
2024-06-20 16:24:57,681 - AInewsbot - INFO - 16:24:57 Processing responses... 


2024-06-20 16:24:57.683514


In [77]:
    # Flatten the per-page replies into one list of story dicts, checking the
    # shape of each reply and reporting ids that were sent but not returned.
    retlist = []
    for i, response in enumerate(responses):
        # the message content must itself be a JSON document
        try:
            content = response["choices"][0]["message"]["content"]
            response_dict = json.loads(content)
        except Exception as e:
            raise TypeError("Error: Invalid response " + str(e))

        # expected shape: {"stories": [ ... ]}
        if not isinstance(response_dict, dict):
            raise TypeError("Error: Invalid response type")
        response_list = response_dict.get("stories")
        if not isinstance(response_list, list):
            raise TypeError("Error: Invalid response type")

        log(f"{datetime.now().strftime('%H:%M:%S')} got list with {len(response_list)} items ")

        # compare the ids sent on page i with the ids that came back
        sent_ids = [s['id'] for s in pages[i]]
        received_ids = [r['id'] for r in response_list]
        difference = set(sent_ids).difference(received_ids)
        if difference:
            log(f"missing items, {str(difference)}")

        retlist += response_list

    log(f"{datetime.now().strftime('%H:%M:%S')} Processed {len(retlist)} responses.")


2024-06-20 16:25:18,668 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,670 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,672 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,673 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,673 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,674 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,675 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,676 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,677 - AInewsbot - INFO - 16:25:18 got list with 50 items 
2024-06-20 16:25:18,677 - AInewsbot - INFO - 16:25:18 got list with 8 items 
2024-06-20 16:25:18,678 - AInewsbot - INFO - 16:25:18 Processed 458 responses.


20

In [81]:
{1, 2, 4} - {1, 2, 3}

{4}

In [68]:
# For each page, collect every list found in the reply object and print any
# ids that were sent on that page but are absent from the reply.
for i, response in enumerate(responses):
    retlist = []
    response_json = json.loads(response["choices"][0]["message"]["content"])

    if not isinstance(response_json, dict):
        raise TypeError("Error: Invalid response type")
    for v in response_json.values():
        # came back correctly as e.g. {'stories': []}
        if not isinstance(v, list):
            raise TypeError("Error: Invalid response type")
        retlist += v

    sent_ids = [s['id'] for s in pages[i]]
    received_ids = [r['id'] for r in retlist]
    difference = set(sent_ids).difference(received_ids)

    if difference:
        print(f"missing items, {str(difference)}")


In [17]:
enriched_df = pd.DataFrame(enriched_urls)
enriched_df.head()

ValueError: DataFrame constructor not properly called!

In [None]:
# Report how many headlines were classified as AI / not AI.
# The count goes inside the message: log's second positional argument is used
# elsewhere in this notebook as a source label, not as part of the message.
ai_mask = enriched_df["isAI"]
log(f"isAI: {len(enriched_df.loc[ai_mask])}")
log(f"not isAI: {len(enriched_df.loc[~ai_mask])}")


In [None]:
# Join the classification results back onto the unseen headlines by 'id'
# (outer join, so unmatched rows on either side are kept) and stamp today's date.
merged_df = filtered_df.merge(enriched_df, how="outer", on="id")
merged_df = merged_df.assign(date=datetime.now().date())
merged_df.head()


In [None]:
# Sanity checks on the outer merge; both counts are expected to be 0.
# Rows with no 'src' came back from the model without a matching input row.
unmatched_response_count = int(merged_df['src'].isna().sum())
log(f"Unmatched response rows: {unmatched_response_count}")
# Rows with no 'isAI' were sent to the model but never classified.
unmatched_source_count = int(merged_df['isAI'].isna().sum())
log(f"Unmatched source rows: {unmatched_source_count}")


In [None]:
# update SQLite database with all seen articles
# NOTE(review): this cell neither commits nor closes the connection; presumably
# insert_article commits through the `conn` it is given — confirm in ainb_utilities.
conn = sqlite3.connect('articles.db')
cursor = conn.cursor()
# one insert per merged row: source, title, url, AI flag and the date stamp
for row in merged_df.itertuples():
    insert_article(conn, cursor, row.src, row.title,
                   row.url, row.isAI, row.date)
    

In [None]:
# Keep only the rows the model flagged as AI.
# After the outer merge 'isAI' can contain NaN for rows the model never
# classified; a mask with NaN makes .loc raise, so compare explicitly with True
# (NaN compares unequal, so unclassified rows are treated as not-AI).
is_ai_mask = merged_df["isAI"].eq(True)
AIdf = merged_df.loc[is_ai_mask].reset_index(drop=True)
log(f"Found {len(AIdf)} AI headlines")


In [None]:
# map title to ascii characters to avoid some dupes with e.g. different quote symbols

# Typographic quotes and dashes have no NFKD decomposition that reaches ASCII,
# so without this table the ASCII encode below would silently drop them and
# "OpenAI’s" would become "OpenAIs" instead of matching "OpenAI's".
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    '\u2018': "'", '\u2019': "'", '\u201a': "'", '\u201b': "'", '\u2032': "'",
    '\u201c': '"', '\u201d': '"', '\u201e': '"', '\u201f': '"', '\u2033': '"',
    '\u2010': '-', '\u2011': '-', '\u2012': '-', '\u2013': '-',
    '\u2014': '-', '\u2015': '-', '\u2212': '-',
})


def unicode_to_ascii(input_string):
    """Return input_string reduced to plain ASCII.

    Typographic quotes and dashes are first replaced by their ASCII
    look-alikes, the text is then NFKD-normalized (splitting accented letters
    and ligatures into base characters), and finally every character that
    still has no ASCII encoding is dropped.

    Args:
        input_string: the text to convert (a str).

    Returns:
        A str containing only ASCII characters.
    """
    # Swap curly quotes / dashes for ASCII equivalents before normalizing.
    substituted = input_string.translate(_TYPOGRAPHIC_TO_ASCII)

    # Normalize the Unicode string to NFKD form
    decomposed = unicodedata.normalize('NFKD', substituted)

    # Encode to ASCII, ignoring what cannot be converted, then back to str
    return decomposed.encode('ascii', 'ignore').decode('ascii')

# Rewrite every headline through unicode_to_ascii, element by element.
AIdf['title'] = AIdf['title'].map(unicode_to_ascii)


In [None]:
# dedupe identical headlines
# Comparison key: the title with every whitespace character removed.
AIdf['title_clean'] = AIdf['title'].apply(lambda title: "".join(title.split()))
# Rows are ordered by src before grouping, so that ordering decides which
# row's values represent each distinct title_clean key.
AIdf = (
    AIdf.sort_values("src")
    .groupby("title_clean")
    .first()
    .reset_index()
)
log(f"Found {len(AIdf)} unique AI headlines")


In [None]:
# Embed every headline, then reorder AIdf by the order the sort helper returns.
log(f"Fetching embeddings for {len(AIdf)} headlines")
embedding_model = 'text-embedding-3-large'
response = client.embeddings.create(model=embedding_model,
                                    input=AIdf['title'].tolist())
# one DataFrame row per headline, one column per embedding dimension
embedding_df = pd.DataFrame([item.model_dump()['embedding'] for item in response.data])
embedding_array = embedding_df.to_numpy()

# find index of most central headline (closest to the mean embedding);
# start_index is only consumed by the commented-out nearest-neighbor variant
centroid = np.mean(embedding_array, axis=0)
distances = np.linalg.norm(embedding_array - centroid, axis=1)
start_index = distances.argmin()

# Get the sorted indices and use them to sort the df
# sorted_indices = nearest_neighbor_sort(embedding_array, start_index)
sorted_indices = traveling_salesman_sort_scipy(embedding_array)
AIdf = AIdf.iloc[sorted_indices]


In [None]:
# leaf_order = agglomerative_cluster_sort(embedding_df)
# AIdf = AIdf.iloc[leaf_order]


In [None]:
# Renumber the rows in their current order, then show every title without
# pandas truncating the row count.
AIdf = AIdf.reset_index(drop=True)
with pd.option_context('display.max_rows', None):
    display(AIdf.loc[:, ["title"]])

In [None]:
# Local import under an alias so this cell runs on its own and does not
# clobber any notebook variable that may be called `html`.
from html import escape as html_escape

# Build the email body: one numbered link per headline.
# Titles, sources and URLs come from scraped pages, so they are escaped before
# being placed in markup; otherwise a stray &, <, > or quote would corrupt the
# link or inject markup into the message.
html_lines = []
for row in AIdf.itertuples():
    log(f"[{row.Index}. {row.title} - {row.src}]({row.url})")
    link_text = html_escape(f"{row.title} - {row.src}")
    # quote=True also escapes " and ' so the value is safe inside href="..."
    link_href = html_escape(str(row.url), quote=True)
    html_lines.append(f'{row.Index}.<a href="{link_href}">{link_text}</a><br />\n')
# join once instead of growing the string with += on every iteration
html_str = "".join(html_lines)


In [None]:
log("Sending mail")
# Read the credentials once and fail early with a clear message; otherwise a
# missing variable only surfaces as an opaque error after the SMTP connection
# and TLS handshake have already happened.
gmail_user = os.getenv("GMAIL_USER")
gmail_password = os.getenv("GMAIL_PASSWORD")
if not gmail_user or not gmail_password:
    raise RuntimeError("GMAIL_USER and GMAIL_PASSWORD must be set in the environment or .env file")

# The digest is mailed from the account to itself.
from_addr = gmail_user
to_addr = gmail_user
subject = 'AI news ' + datetime.now().strftime('%H:%M:%S')
body = f"""
<html>
    <head></head>
    <body>
    <div>
    {html_str}
    </div>
    </body>
</html>
"""

# Setup the MIME
message = MIMEMultipart()
message['From'] = from_addr
message['To'] = to_addr
message['Subject'] = subject
message.attach(MIMEText(body, 'html'))

# Create SMTP session; the with-block closes the connection when it exits
with smtplib.SMTP('smtp.gmail.com', 587) as server:
    server.starttls()  # Secure the connection
    server.login(gmail_user, gmail_password)
    server.sendmail(from_addr, to_addr, message.as_string())

log("Finished")
