AInewsbot.ipynb

- Open URLs of news sites specified in the `sources` dict using Selenium and Firefox
- Save HTML of each URL in htmldata directory
- Extract URLs from all files, create a pandas dataframe with url, title, src
- Use ChatGPT to filter only AI-related headlines by sending a prompt and formatted table of headlines
- Use SQLite to filter headlines previously seen 
- OPENAI_API_KEY should be in the environment or in a .env file
  
Alternative manual workflow to get HTML files if necessary
- Use Chrome, open e.g. Tech News bookmark folder, right-click and open all bookmarks in new window
- on Google News, make sure to switch to the AI tab
- on Google News, Feedly, Reddit, scroll to additional pages as desired
- Use SingleFile extension, 'save all tabs'
- Move files to htmldata directory
- Run lower part of notebook to process the data


In [1]:
import json
import os
import re
from datetime import datetime, timedelta
from urllib.parse import urlparse
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# use firefox because it updates less often, can disable updates
# recommend importing profile from Chrome for cookies, passwords
# looks less like a bot with more user cruft in the profile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service


import bs4
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

import openai
from openai import OpenAI
import tiktoken

import dotenv

import sqlite3

import IPython
from IPython.display import HTML, Markdown, display

from atproto import Client

import PIL
from PIL import Image

print(f"openai          {openai.__version__}")
print(f"requests        {requests.__version__}")
print(f"BeautifulSoup   {bs4.__version__}")

openai          1.14.2
requests        2.31.0
BeautifulSoup   4.12.3


In [2]:
print(datetime.now())

2024-03-25 16:32:43.889869


In [3]:
# load credentials if necessary
dotenv.load_dotenv()
client = OpenAI()

In [4]:
# delete files in output directory
download_dir = "htmldata"

def delete_files(outputdir):
    """Remove every non-hidden file or symlink directly inside `outputdir`.

    Entries whose name starts with '.' are kept and subdirectories are left
    untouched. A failure to remove one entry is printed and does not stop
    the remaining deletions.

    :param outputdir: path of the directory to clean.
    """
    visible_names = [name for name in os.listdir(outputdir) if not name.startswith('.')]
    for name in visible_names:
        target = os.path.join(outputdir, name)
        try:
            removable = os.path.isfile(target) or os.path.islink(target)
            if removable:
                os.remove(target)
            # directories fall through on purpose: only files and links are deleted
        except Exception as e:
            print(f'Failed to delete {target}. Reason: {e}')


delete_files(download_dir)

# Specify sources

In [5]:
# Metadata for each news source, keyed by a short display name.
#   title   - page title; used as the saved HTML filename prefix and to map a file back to its source
#   url     - page to open in the browser
#   scroll  - optional number of times to scroll to the bottom to load more items
#   click   - optional XPath of an element to click after the page loads
#   include - optional regexes (applied with re.match, i.e. anchored at the start); a link must match each one
#   exclude - optional regexes (applied with re.match); a link matching any of them is dropped
# Regexes are raw strings so that \d and \w are not treated as (invalid) string escapes.
sources = {
    # "Twitter": {
    #     "title": "@TheLinkfest_AI _ X",
    #     "include": [r"^https://twitter.com/(\w+)/status/(\d+)$"],
    # },
    "Reddit": {
        "title": "top scoring links _ multi",
        "url": "https://www.reddit.com/r/ChatGPT+ChatGPTCoding+MacOS+MachineLearning+OpenAI+ProgrammerHumor+Windows10+battlestations+buildapc+cordcutters+dataisbeautiful+gadgets+hardware+linux+msp+programming+realtech+software+talesfromtechsupport+tech+technews+technology+techsupportgore+windows/top/?sort=top&t=day",
        "scroll": 2,
        "exclude": [
            "^https://www.reddit.com/",
            "^https://chat.reddit.com/",
            "^https://i.redd.it/",
            "^https://redditblog.com/",
            "^https://www.redditinc.com/",
            "^https://www.reddithelp.com/",
            "^https://itunes.apple.com/",
            "^https://play.google.com/",
        ],
    },
    "Hacker News": {
        "title": "Hacker News Page 1",
        "url": "https://news.ycombinator.com/",
        "exclude": [
            "https://news.ycombinator.com/",
            "https://www.ycombinator.com/",
        ],
    },
    "Hacker News 2": {
        "title": "Hacker News Page 2",
        "url": "https://news.ycombinator.com/?p=2",
        "exclude": [
            "https://news.ycombinator.com/",
            "https://www.ycombinator.com/",
        ],
    },
    "Techmeme": {
        "title": "Techmeme",
        "url": "https://www.techmeme.com/river",
        "exclude": [
            "^https://www.techmeme.com",
            "^https://twitter.com/",
            "^https://www.threads.net",
            "^https://www.linkedin.com",
            "^https://mastodon.social",
            "^https://bsky.app",
        ],
    },
    "Feedly AI": {
        "title": "Discover and Add New Feedly AI Feeds",
        "url": "https://feedly.com/i/aiFeeds?options=eyJsYXllcnMiOlt7InBhcnRzIjpbeyJpZCI6Im5scC9mL3RvcGljLzMwMDAifV0sInNlYXJjaEhpbnQiOiJ0ZWNobm9sb2d5IiwidHlwZSI6Im1hdGNoZXMiLCJzYWxpZW5jZSI6ImFib3V0In1dLCJidW5kbGVzIjpbeyJ0eXBlIjoic3RyZWFtIiwiaWQiOiJ1c2VyLzYyZWViYjlmLTcxNTEtNGY5YS1hOGM3LTlhNTdiODIwNTMwOC9jYXRlZ29yeS9HYWRnZXRzIn1dfQ",
        "scroll": 2,
    },
    "NYT Tech": {
        "title": "Technology - The New York Times",
        "url": "https://www.nytimes.com/section/technology",
        "include": [r"^https://www.nytimes.com/(\d+)/(\d+)/(\d+)(.*).html$"],
    },
    "WSJ Tech": {
        "title": "Technology - WSJ.com",
        "url": "https://www.wsj.com/tech",
        "include": ["^https://www.wsj.com/articles/"],
    },
    "Bloomberg Tech": {
        "title": "Bloomberg Technology - Bloomberg",
        "url": "https://www.bloomberg.com/technology",
        "include": [r"^https://www.bloomberg.com/news/(\w+)/(\d+)-(\d+)-(\d+)"],
    },
    "FT Tech": {
        "title": "Technology",
        "url": "https://www.ft.com/technology",
        "include": ["https://www.ft.com/content/"]
    },
    "WaPo Tech": {
        "title": "Technology - The Washington Post",
        "url": "https://www.washingtonpost.com/business/technology/",
        "include": [r"https://www.washingtonpost.com/(\w+)/(\d+)/(\d+)/(\d+)/"],
    },
    "Google News": {
        "title": "Google News - Technology - Artificial intelligence",
        "url": "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen",
        "scroll": 2,
        "click" : '//*[@aria-label="Artificial intelligence"]',
        "include": ["^https://news.google.com/articles/"],
    },
    "HackerNoon": {
        "title": "HackerNoon - read, write and learn about any technology",
        "url": "https://hackernoon.com/",
        "include": ["^https://hackernoon.com/([^/])*$"],
        "exclude": [
            "^https://hackernoon.com/$",
            "^https://hackernoon.com/c$",
            "^https://hackernoon.com/coins$",
            "^https://hackernoon.com/companies$",
            "^https://hackernoon.com/gallery$",
            "^https://hackernoon.com/how-to-gain-followers-and-newsletter-subs-directly-on-hackernoon$",
            "^https://hackernoon.com/login$",
            "^https://hackernoon.com/reader-boot$",
            "^https://hackernoon.com/sitemap.xml$",
            "^https://hackernoon.com/startups$",
            "^https://hackernoon.com/techbeat$",
            "^https://hackernoon.com/why-i-write-on-hacker-noon-nl28335q$",
            "^https://hackernoon.com/writer-signup$",
        ],
    },
    "Ars Technica": {
        "title": "Ars Technica",
        "url": "https://arstechnica.com/",
        "include": [r"^https://arstechnica.com/gadgets/(\d+)/(\d+)/"],
    },
    "The Register": {
        "title": "The Register_ Enterprise Technology News and Analysis",
        "url": "https://www.theregister.com/",
        "include": [r"^https://www.theregister.com/(\d+)/(\d+)/(\d+)/"],
    },
    "Business Insider": {
        "title": "Tech - Business Insider",
        "url": "https://www.businessinsider.com/tech",
        "exclude": ["^https://www.insider.com", "^https://www.passionfroot.me"],
    },
}

# reverse lookup: saved-page title -> source name
sources_reverse = {v["title"]: k for k, v in sources.items()}


# Download HTML files from sources

In [6]:
# download files via selenium and firefox
# For each entry in `sources`: open the URL, optionally click an element and scroll,
# then save the rendered HTML to `outputdir` as "<title> (<timestamp>).html".
outputdir = "htmldata"
delete_files(outputdir)

# Print the formatted time
print(datetime.now().strftime('%H:%M:%S'), "Starting", flush=True)

firefox_app_path = '/Applications/Firefox.app'
# Path to your geckodriver (override with the GECKODRIVER_PATH environment variable)
geckodriver_path = os.environ.get('GECKODRIVER_PATH', '/Users/drucev/webdrivers/geckodriver')

# Set up Firefox options to use your existing profile
# important for some sites that need a login, also a generic profile fingerprint that looks like a bot might get blocked
# (override with the FIREFOX_PROFILE_PATH environment variable)
firefox_profile_path = os.environ.get(
    'FIREFOX_PROFILE_PATH',
    '/Users/drucev/Library/Application Support/Firefox/Profiles/k8k0lcjj.default-release',
)
options = Options()
options.profile = firefox_profile_path

print(datetime.now().strftime('%H:%M:%S'), "Initialized profile", flush=True)

# Create a Service object with the path
service = Service(geckodriver_path)

print(datetime.now().strftime('%H:%M:%S'), "Initialized service", flush=True)
# Set up the Firefox driver
driver = webdriver.Firefox(service=service, options=options)

print(datetime.now().strftime('%H:%M:%S'), "Initialized webdriver", flush=True)
sleeptime = 10

# try/finally guarantees the browser is closed even if one of the sources fails
try:
    for sourcename, sourcedict in sources.items():
        print(datetime.now().strftime('%H:%M:%S'), f'Processing {sourcename}', flush=True)
        title = sourcedict["title"]
        url = sourcedict["url"]
        scroll = sourcedict.get("scroll", 0)
        click = sourcedict.get("click")

        # Open the page
        driver.get(url)

        # Wait for the page to load
        time.sleep(sleeptime)  # Adjust the sleep time as necessary

        if click:
            print(datetime.now().strftime('%H:%M:%S'), f"Clicking on {click}", flush=True)
            # find_elements returns an empty list when nothing matches, whereas
            # find_element raises, which made the old truthiness check meaningless
            buttons = driver.find_elements(By.XPATH, click)
            if buttons:
                buttons[0].click()
                print(datetime.now().strftime('%H:%M:%S'), "Clicked", flush=True)
            else:
                print(datetime.now().strftime('%H:%M:%S'), f"Nothing matched {click}, continuing without click", flush=True)

        for _ in range(scroll):
            # scroll to bottom of infinite scrolling window
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(datetime.now().strftime('%H:%M:%S'), "Loading additional infinite scroll items", flush=True)
            time.sleep(sleeptime) # wait for it to load additional items

        # Get the HTML source of the page
        html_source = driver.page_source

        # check encoding, default utf-8
        encoding = "utf-8"  # Default to UTF-8 if not specified
        # Retrieve the content-type meta tag from the HTML, if the page has one
        try:
            meta_tags = driver.find_elements(By.XPATH, "//meta[@http-equiv='Content-Type']")
            if meta_tags:
                content_type = meta_tags[0].get_attribute("content") or ""
                # Typical format is "text/html; charset=UTF-8"
                charset_start = content_type.find("charset=")
                if charset_start != -1:
                    candidate = content_type[charset_start + len("charset="):].split(";")[0].strip()
                    if candidate:
                        # raises LookupError if Python does not know this codec
                        "".encode(candidate)
                        encoding = candidate
        except LookupError:
            print(datetime.now().strftime('%H:%M:%S'), "Unrecognized charset in meta tag, using utf-8", flush=True)
        except Exception as err:
            # best effort only: keep the utf-8 default but say why
            print(datetime.now().strftime('%H:%M:%S'), f"Could not read charset meta tag: {err}", flush=True)

        # Save the HTML to a local file
        datestr = datetime.now().strftime("%m_%d_%Y %I_%M_%S %p")
        outfile = f'{title} ({datestr}).html'
        print(datetime.now().strftime('%H:%M:%S'), f"Saving {outfile} as {encoding}", flush=True)
        # xmlcharrefreplace writes characters the target charset cannot encode as
        # numeric HTML entities instead of raising UnicodeEncodeError
        with open(os.path.join(outputdir, outfile), 'w', encoding=encoding, errors="xmlcharrefreplace") as file:
            file.write(html_source)
finally:
    # Close the browser
    driver.quit()
    print(datetime.now().strftime('%H:%M:%S'), "Quit webdriver", flush=True)


16:32:43 Starting
16:33:19 Initialized profile
16:33:19 Initialized service
16:34:36 Initialized webdriver
16:34:36 Processing Reddit
16:34:51 Loading additional infinite scroll items
16:35:01 Loading additional infinite scroll items
16:35:11 Saving top scoring links _ multi (03_25_2024 04_35_11 PM).html as utf-8
16:35:12 Processing Hacker News
16:35:22 Saving Hacker News Page 1 (03_25_2024 04_35_22 PM).html as utf-8
16:35:22 Processing Hacker News 2
16:35:33 Saving Hacker News Page 2 (03_25_2024 04_35_33 PM).html as utf-8
16:35:33 Processing Techmeme
16:35:44 Saving Techmeme (03_25_2024 04_35_44 PM).html as utf-8
16:35:44 Processing Feedly AI
16:35:55 Loading additional infinite scroll items
16:36:05 Loading additional infinite scroll items
16:36:15 Saving Discover and Add New Feedly AI Feeds (03_25_2024 04_36_15 PM).html as utf-8
16:36:15 Processing NYT Tech
16:36:27 Saving Technology - The New York Times (03_25_2024 04_36_27 PM).html as utf-8
16:36:27 Processing WSJ Tech
16:36:38 Sa

In [7]:
[os.path.join(download_dir, file) for file in os.listdir(download_dir)]

['htmldata/Hacker News Page 1 (03_25_2024 04_35_22 PM).html',
 'htmldata/Technology (03_25_2024 04_37_01 PM).html',
 'htmldata/The Register_ Enterprise Technology News and Analysis (03_25_2024 04_38_19 PM).html',
 'htmldata/top scoring links _ multi (03_25_2024 04_35_11 PM).html',
 'htmldata/.gitkeep',
 'htmldata/HackerNoon - read, write and learn about any technology (03_25_2024 04_37_57 PM).html',
 'htmldata/Bloomberg Technology - Bloomberg (03_25_2024 04_36_50 PM).html',
 'htmldata/Techmeme (03_25_2024 04_35_44 PM).html',
 'htmldata/Hacker News Page 2 (03_25_2024 04_35_33 PM).html',
 'htmldata/Ars Technica (03_25_2024 04_38_08 PM).html',
 'htmldata/Technology - WSJ.com (03_25_2024 04_36_38 PM).html',
 'htmldata/Google News - Technology - Artificial intelligence (03_25_2024 04_37_45 PM).html',
 'htmldata/Technology - The Washington Post (03_25_2024 04_37_12 PM).html',
 'htmldata/Discover and Add New Feedly AI Feeds (03_25_2024 04_36_15 PM).html',
 'htmldata/Technology - The New York 

In [8]:
# List all paths in the directory matching today's date
nfiles = 50

# Get the current date (read the clock once so all derived values agree)
today = datetime.now()
year, month, day = today.year, today.month, today.day

# same month_day_year layout that the download step embeds in each filename
datestr = today.strftime("%m_%d_%Y")

# print(f"Year: {year}, Month: {month}, Day: {day}")

files = [os.path.join(download_dir, file) for file in os.listdir(download_dir)]
# filter files only
files = [file for file in files if os.path.isfile(file)]

# Sort files by modification time (newest first) and take top nfiles
files.sort(key=os.path.getmtime, reverse=True)
# was `file = files[:nfiles]`, which left `files` untruncated
files = files[:nfiles]

# keep only files with today's date in the name and ending in .html
files = [file for file in files if datestr in file and file.endswith(".html")]
print(len(files))
for file in files:
    print(file)

15
htmldata/Tech - Business Insider (03_25_2024 04_38_29 PM).html
htmldata/The Register_ Enterprise Technology News and Analysis (03_25_2024 04_38_19 PM).html
htmldata/Ars Technica (03_25_2024 04_38_08 PM).html
htmldata/HackerNoon - read, write and learn about any technology (03_25_2024 04_37_57 PM).html
htmldata/Google News - Technology - Artificial intelligence (03_25_2024 04_37_45 PM).html
htmldata/Technology - The Washington Post (03_25_2024 04_37_12 PM).html
htmldata/Technology (03_25_2024 04_37_01 PM).html
htmldata/Bloomberg Technology - Bloomberg (03_25_2024 04_36_50 PM).html
htmldata/Technology - WSJ.com (03_25_2024 04_36_38 PM).html
htmldata/Technology - The New York Times (03_25_2024 04_36_27 PM).html
htmldata/Discover and Add New Feedly AI Feeds (03_25_2024 04_36_15 PM).html
htmldata/Techmeme (03_25_2024 04_35_44 PM).html
htmldata/Hacker News Page 2 (03_25_2024 04_35_33 PM).html
htmldata/Hacker News Page 1 (03_25_2024 04_35_22 PM).html
htmldata/top scoring links _ multi (03_

In [9]:
# you need this if you have not-descriptive link titles like 'link', can get a page title from html or tags
def get_og_tags(url, timeout=10):
    """Fetch a URL and return its Open Graph tags and page title.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server before giving up, so that
        one unresponsive site cannot stall the whole run.
    :return: dict mapping each ``og:*`` property found in the document HEAD
        to its ``content`` value, plus a ``"title"`` key holding the text of
        the <title> tag when it is non-empty. Empty dict when the request
        fails, the status is not 200, or the page has no HEAD.
    """
    retdict = {}
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return retdict

    if response.status_code != 200:
        return retdict

    soup = BeautifulSoup(response.content, "html.parser")
    head = soup.head
    if not head:
        return retdict

    # any tag in HEAD whose property attribute starts with "og:"
    og_tags = head.find_all(property=lambda prop: prop and prop.startswith("og:"))
    for tag in og_tags:
        if "content" in tag.attrs:
            retdict[tag["property"]] = tag["content"]

    title_tag = soup.find("title")
    if title_tag and title_tag.text:
        retdict["title"] = title_tag.text
    return retdict


url = "https://www.euronews.com/next/2024/01/15/almost-40-of-jobs-around-the-world-will-be-impacted-by-ai-imf-chief-says"
get_og_tags(url)

{'og:locale': 'en-GB',
 'og:url': 'https://www.euronews.com/next/2024/01/15/almost-40-of-jobs-around-the-world-will-be-impacted-by-ai-imf-chief-says',
 'og:site_name': 'euronews',
 'og:type': 'article',
 'og:title': 'AI to impact 40% of jobs around the world, IMF chief says',
 'og:description': 'Kristalina Georgieva said now is the time to act to create a set of policies ensuring the impact of AI is beneficial not detrimental to humanity.',
 'og:image': 'https://static.euronews.com/articles/stories/08/17/12/08/1200x675_cmsv2_1bac2582-b418-5da9-80f9-6c4b6254606d-8171208.jpg',
 'og:image:width': '1200',
 'og:image:height': '675',
 'og:image:type': 'image/jpeg',
 'og:image:alt': 'Almost 40% of jobs around the world will be impacted by AI, IMF chief says',
 'og:locale:alternate': 'el-GR',
 'og:locale:alternate:url': 'https://www.euronews.com/next/2024/01/15/almost-40-of-jobs-around-the-world-will-be-impacted-by-ai-imf-chief-says',
 'title': 'Almost 40% of jobs around the world will be impa

In [10]:
def get_path_from_url(url):
    """
    Return the path component of a URL as parsed by urlparse.

    The scheme, host and query string are not part of the result.

    :param url: The URL string.
    :return: The path component of the URL.
    """
    return urlparse(url).path


# Example usage
example_url = "http://www.example.com/some/path?query=string"
path = get_path_from_url(example_url)
print(path)

/some/path


In [11]:
# OpenAI chat model; also selects the tiktoken encoding used for token counting
MODEL = "gpt-4-turbo-preview"

# token budget for the headlines packed into a single prompt page
MAX_INPUT_TOKENS = 3072
# default completion-token limit passed to get_response_json
MAX_OUTPUT_TOKENS = 4096
# default number of attempts get_response_json makes per request
MAX_RETRIES = 3
# default temperature parameter of get_response_json
TEMPERATURE = 0

In [12]:
enc = tiktoken.encoding_for_model(MODEL)
assert enc.decode(enc.encode("hello world")) == "hello world"


def count_tokens(s):
    """Return the number of tokens `s` encodes to under the module-level `enc` encoding."""
    token_ids = enc.encode(s)
    return len(token_ids)


count_tokens("four score and 7 years go our forefathers brought forth")

13

In [13]:
def trimmed_href(l):
    """
    Return a link's href with everything from the first question mark onward
    removed, e.g. to drop a session ID.

    :param l: object exposing .get("href"), such as a parsed <a> tag.
    :return: the href up to (not including) the first '?', or the href
        unchanged when it has no '?'. When the href is missing or empty,
        that falsy value (None or '') is returned as is.
    """
    href = l.get("href")
    if not href:
        return href
    before_query, _, _ = href.partition("?")
    return before_query

# Parse news URLs and titles from downloaded HTML files

In [14]:
# parse all the URL that look like news articles
# into all_urls list of dicts with url, title, src
all_urls = []

for file in files:
    # Extract filename from path
    filename = os.path.basename(file)

    # The source title is everything before " (<date> ..." in the filename
    position = filename.find(" (" + datestr)
    # find() returns -1 when the marker is absent; slicing with -1 would silently
    # drop the last character, so fall back to the whole filename instead
    basename = filename[:position] if position != -1 else filename

    sourcename = sources_reverse.get(basename)
    if sourcename is None:
        print(f"Skipping {basename}, no sourcename metadata")
        continue

    display(Markdown(f"# {sourcename}"))
    sources[sourcename]["latest"] = file

    # get contents; the handle gets its own name so the loop variable `file`
    # keeps holding the path instead of a closed file object
    with open(file, "r") as infile:
        html_content = infile.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all <a> tags
    if soup:
        links = soup.find_all("a")
    else:
        print(f"Skipping {sourcename}, unable to parse")
        # without this, `links` would be undefined or left over from the previous file
        continue

    # convert relative links to absolute links using base URL if present;
    # a <base> tag without href falls back to the source URL
    base_tag = soup.find('base')
    base_url = (base_tag.get('href') if base_tag else None) or sources[sourcename]["url"]
    for link in links:
        link["href"] = urljoin(base_url, link.get('href', ""))

    # number of links before filtering
    print(len(links))

    # filter links by exclusion pattern: drop anything matching any pattern
    for pattern in sources[sourcename].get("exclude", []):
        links = [
            l
            for l in links
            if l.get("href") is not None and not re.match(pattern, l.get("href"))
        ]

    # filter links by inclusion pattern: keep only links matching every pattern
    for pattern in sources[sourcename].get("include", []):
        links = [l for l in links if l.get("href") and re.match(pattern, l.get("href"))]

    # drop empty text
    links = [l for l in links if l.get_text(strip=True)]

    # drop empty url path, i.e. url = toplevel domain
    links = [l for l in links if len(get_path_from_url(trimmed_href(l))) > 1]
    # drop anything that is not http, like javascript: or mailto:
    links = [l for l in links if l.get("href") and l.get("href").startswith("http")]
    # drop some ArsTechnica links that are just the number of comments and dupe the primary link
    links = [l for l in links if not re.match(r"^(\d+)$", l.get_text(strip=True))]

    for l in links:
        url = trimmed_href(l)
        title = l.get_text(strip=True)
        if title == "LINK":
            # try to update title
            og_dict = get_og_tags(url)
            if og_dict.get("og:title"):
                title = og_dict.get("og:title")

        # skip some low quality links that don't have full headline, like link to a Twitter or Threads account
        if len(title) <= 28 and title != "LINK":
            continue

        all_urls.append({"title": title, "url": url, "src": sourcename})

    # number of links after filtering
    print(len(links))
    print()

# Business Insider

334
250



# The Register

199
87



# Ars Technica

252
7



# HackerNoon

563
85



# Google News

974
426



# WaPo Tech

157
36



# FT Tech

459
112



# Bloomberg Tech

290
51



# WSJ Tech

485
9



# NYT Tech

75
18



# Feedly AI

256
241



# Techmeme

332
139



# Hacker News 2

261
29



# Hacker News

257
30



# Reddit

573
46



In [15]:
# make a pandas dataframe
# Build one row per distinct url from all_urls, ordered by source name,
# with a sequential integer "id" column.
orig_df = (
    pd.DataFrame(all_urls)
    # collapse rows that share a url, keeping the first non-null title/src of each group
    .groupby("url")
    .first()
    .reset_index()
    # order by source and fix the column order
    .sort_values("src")[["src", "title", "url"]]
    # renumber rows 0..n-1, then expose that numbering as the "id" column
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={"index": "id"})
)
orig_df

Unnamed: 0,id,src,title,url
0,0,Ars Technica,macOS Sonoma 14.4.1 released to fix the stuff ...,https://arstechnica.com/gadgets/2024/03/macos-...
1,1,Ars Technica,Windows Notepad’s midlife renaissance continue...,https://arstechnica.com/gadgets/2024/03/window...
2,2,Ars Technica,“Temporary” disk formatting UI from 1994 still...,https://arstechnica.com/gadgets/2024/03/window...
3,3,Ars Technica,"Samsung users ask, “Why does the S-Pen smell s...",https://arstechnica.com/gadgets/2024/03/users-...
4,4,Ars Technica,"Android 15 gets satellite messaging, starts fo...",https://arstechnica.com/gadgets/2024/03/androi...
...,...,...,...,...
1012,1012,WaPo Tech,"Trump Media merger wins investor approval, net...",https://www.washingtonpost.com/technology/2024...
1013,1013,WaPo Tech,"Landlines are dying out. But to some, they’re ...",https://www.washingtonpost.com/technology/2024...
1014,1014,WaPo Tech,How to use your smartphone to photograph the s...,https://www.washingtonpost.com/technology/2024...
1015,1015,WaPo Tech,Lawmakers see rise in threatening messages as ...,https://www.washingtonpost.com/technology/2024...


In [16]:
# filter ones not seen before
conn = sqlite3.connect('articles.db')

try:
    # Retrieve all URLs from the SQLite table
    existing_urls = pd.read_sql_query("SELECT url FROM news_articles", conn)
finally:
    # Close the SQLite connection even if the query fails (e.g. missing table)
    conn.close()

# Convert the URLs to a list for easier comparison
existing_urls_list = existing_urls['url'].tolist()

# Filter the original DataFrame
# Keep rows where the URL is not in the existing_urls_list
filtered_df = orig_df[~orig_df['url'].isin(existing_urls_list)]


In [17]:
len(existing_urls_list)

27689

In [18]:
len(filtered_df)

123

# Filter AI-related headlines using a prompt to OpenAI

In [19]:
# make pages that fit in a reasonably sized prompt
# Each page holds at most MAXPAGELEN headlines and at most MAX_INPUT_TOKENS tokens,
# measured on the JSON form of each {"id", "title"} item.
MAXPAGELEN = 50
pages = []
current_page = []
pagelength = 0

for row in filtered_df.itertuples():
    curlink = {"id": row.Index, "title": row.title}
    curlength = count_tokens(json.dumps(curlink))
    # Check if adding the current item would exceed either limit
    page_is_full = (
        len(current_page) >= MAXPAGELEN
        or pagelength + curlength > MAX_INPUT_TOKENS
    )
    # Only flush a non-empty page: an item that alone exceeds the token budget
    # must not cause an empty page to be emitted in front of it
    if current_page and page_is_full:
        pages.append(current_page)
        current_page = []
        pagelength = 0
    current_page.append(curlink)
    pagelength += curlength

# add the last page if it's not empty
if current_page:
    pages.append(current_page)

len(pages)

3

In [20]:
def get_response_json(
    client,
    messages,
    verbose=False,
    model=MODEL,
    # max_input_tokens=MAX_INPUT_TOKENS,
    max_output_tokens=MAX_OUTPUT_TOKENS,
    max_retries=MAX_RETRIES,
    temperature=TEMPERATURE,
):
    """Send a chat completion request in JSON mode, retrying on errors.

    :param client: object exposing ``chat.completions.create``.
    :param messages: list of message dicts, or a single string which is
        wrapped as one user message for convenience.
    :param verbose: when true, print the messages before sending.
    :param model: model name passed to the API.
    :param max_output_tokens: value passed as ``max_tokens``.
    :param max_retries: total number of attempts before giving up.
    :param temperature: sampling temperature passed to the API.
    :return: the response object from the first successful attempt, or None
        if every attempt raised.
    """
    if not isinstance(messages, list):  # allow passing one string for convenience
        messages = [{"role": "user", "content": messages}]

    if verbose:
        print("\n".join([str(msg) for msg in messages]))

    # retry loop, have received untrapped 500 errors like too busy
    for i in range(max_retries):
        if i > 0:
            print(f"Attempt {i+1}...")
        try:
            # honor the caller's model and temperature instead of hard-coded values
            return client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_output_tokens,
                response_format={"type": "json_object"},
            )
        except Exception as error:
            print(f"An exception occurred on attempt {i+1}:", error)
            # pause before the next attempt; no point sleeping after the last one
            if i + 1 < max_retries:
                time.sleep(5)
    # retries exceeded if you got this far
    print("Retries exceeded.")
    return None


# messages = [
#     {
#         "role": "system",
#         "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.",
#     },
#     {
#         "role": "user",
#         "content": "Compose a poem that explains the concept of recursion in programming, returning each verse as a json object.",
#     },
# ]

# response = get_response_json(client, messages)
# response

In [21]:
models = sorted(openai.models.list(), key=lambda m: m.created)
models

[Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal'),
 Model(id='whisper-1', created=1677532384, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo', created=1677610602, object='model', owned_by='openai'),
 Model(id='gpt-3.5-turbo-0301', created=1677649963, object='model', owned_by='openai'),
 Model(id='tts-1', created=1681940951, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo-16k', created=1683758102, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo-16k-0613', created=1685474247, object='model', owned_by='openai'),
 Model(id='gpt-3.5-turbo-0613', created=1686587434, object='model', owned_by='openai'),
 Model(id='gpt-4-0613', created=1686588896, object='model', owned_by='openai'),
 Model(id='gpt-4', created=1687882411, object='model', owned_by='openai'),
 Model(id='davinci-002', created=1692634301, object='model', owned_by='system'),
 Model(id='babbage-002', created=1692634615,

In [22]:
# Instruction text sent to the model; the JSON-encoded page of {"id", "title"} items
# is appended directly after the final line.
# NOTE(review): the example lists id 103 twice although ids are required to match exactly — confirm this is intended.
# NOTE(review): the examples use single-quoted keys, which is not strict JSON — confirm the model copes.
prompt = """
You will act as a research assistant classifying news stories as related to artificial intelligence (AI) or unrelated to AI.

Your task is to read JSON format objects from an input list of news stories using the schema below delimited by |, and output JSON format objects for each using the schema below delimited by ~.

Define a list of objects representing news stories in JSON format as in the following example:
|
{'stories':
[{'id': 97, 'title': 'AI to predict dementia, detect cancer'},
 {'id': 103,'title': 'Figure robot learns to make coffee by watching humans for 10 hours'},
 {'id': 103,'title': 'Baby trapped in refrigerator eats own foot'},
 {'id': 210,'title': 'ChatGPT removes, then reinstates a summarization assistant without explanation.'},
 {'id': 298,'title': 'The 5 most interesting PC monitors from CES 2024'},
 ]
}
|

Based on the title, you will classify each story as being about AI or not.

For each object, you will output the input id field, and a field named isAI which is true if the input title is about AI and false if the input title is not about AI.

When extracting information please make sure it matches the JSON format below exactly. Do not output any attributes that do not appear in the schema below.
~
{'stories':
[{'id': 97, 'isAI': true},
 {'id': 103, 'isAI': true},
 {'id': 103, 'isAI': false},
 {'id': 210, 'isAI': true},
 {'id': 298, 'isAI': false}]
}
~

You may interpret the term AI broadly as pertaining to
- machine learning models
- large language models
- robotics
- reinforcement learning
- computer vision
- OpenAI
- ChatGPT
- other closely related topics.

You will return an array of valid JSON objects.

The field 'id' in the output must match the field 'id' in the input EXACTLY.

The field 'isAI' must be either true or false.

The list of news stories to classify and enrich is:


"""

In [23]:
pages[0][0]

{'id': 0,
 'title': 'macOS Sonoma 14.4.1 released to fix the stuff that the 14.4 update broke'}

In [24]:
# Send each page of headlines to the model and collect the classified items.
responses = []       # raw JSON text of each reply, kept for inspection
enriched_urls = []   # flattened items gathered from all replies
for i, p in enumerate(pages):
    print(
        f"{datetime.now().strftime('%H:%M:%S')} send page {i+1} of {len(pages)}, {len(p)} items "
    )
    # print(prompt + json.dumps(p))
    response = get_response_json(client, prompt + json.dumps(p))
    if response is None:
        # get_response_json returns None once its retries are used up; stop with a
        # clear message rather than failing on attribute access below
        raise RuntimeError(f"No response for page {i+1} of {len(pages)}: retries exceeded")
    responses.append(response.choices[0].message.content)
    retval = json.loads(responses[-1])
    retlist = []
    # usually comes back as a dict with a single arbitrary key like "stories" with a list value
    if isinstance(retval, dict):
        for k, v in retval.items():
            if isinstance(v, list):
                retlist.extend(v)
            else:
                retlist.append(v)
        print(
            f"{datetime.now().strftime('%H:%M:%S')} got dict with {len(retlist)} items "
        )
    elif isinstance(retval, list):  # in case it comes back as a list
        retlist = retval
        print(
            f"{datetime.now().strftime('%H:%M:%S')} got list with {len(retlist)} items "
        )
    else:
        print(str(type(retval)))
    enriched_urls.extend(retlist)

16:38:40 send page 1 of 3, 50 items 
16:39:17 got dict with 50 items 
16:39:17 send page 2 of 3, 50 items 
16:39:52 got dict with 50 items 
16:39:52 send page 3 of 3, 23 items 
16:40:07 got dict with 23 items 


In [25]:
enriched_df = pd.DataFrame(enriched_urls)
enriched_df.head()

Unnamed: 0,id,isAI
0,0,False
1,5,True
2,6,False
3,33,False
4,55,True


In [26]:
print("isAI", len(enriched_df.loc[enriched_df["isAI"]]))
print("not isAI", len(enriched_df.loc[~enriched_df["isAI"]]))

isAI 49
not isAI 74


In [27]:
merged_df = pd.merge(filtered_df, enriched_df, on="id", how="outer")
merged_df['date']=datetime.now().date()
merged_df.head()

Unnamed: 0,id,src,title,url,isAI,date
0,0,Ars Technica,macOS Sonoma 14.4.1 released to fix the stuff ...,https://arstechnica.com/gadgets/2024/03/macos-...,False,2024-03-25
1,5,Ars Technica,Where’d my results go? Google Search’s chatbot...,https://arstechnica.com/gadgets/2024/03/google...,True,2024-03-25
2,6,Ars Technica,Mozilla’s privacy service drops a provider wit...,https://arstechnica.com/gadgets/2024/03/mozill...,False,2024-03-25
3,33,Bloomberg Tech,Florida Bans Social Media for Kids Under 14,https://www.bloomberg.com/news/articles/2024-0...,False,2024-03-25
4,55,Business Insider,Watch Neuralink's first human patient play 'Ma...,https://www.businessinsider.com/neuralink-mari...,True,2024-03-25


In [28]:
# ideally should be empty, shouldn't get back rows that don't match to existing
merged_df.loc[merged_df["src"].isna()]

Unnamed: 0,id,src,title,url,isAI,date


In [29]:
# ideally should be empty, should get back all rows from orig
merged_df.loc[merged_df["isAI"].isna()]

Unnamed: 0,id,src,title,url,isAI,date


In [30]:
# # Connect to SQLite database
conn = sqlite3.connect('articles.db')
cursor = conn.cursor()

# # Create table with a date column
# cursor.execute('''
# CREATE TABLE IF NOT EXISTS news_articles (
#     id INTEGER PRIMARY KEY,
#     src TEXT,
#     title TEXT,
#     url TEXT UNIQUE,
#     isAI BOOLEAN,
#     article_date DATE
# )
# ''')
# conn.commit()
# conn.close()

In [31]:
# Function to insert a new article
def insert_article(cursor, src, title, url, isAI, article_date):
    """Insert one headline into ``news_articles`` and commit it.

    The statement is ``INSERT OR IGNORE``, so a row that conflicts with an
    existing one (for example a URL that is already stored, when ``url`` is
    declared UNIQUE) is skipped silently rather than raising.

    Args:
        cursor: open sqlite3 cursor; its own connection is used for the commit.
        src: name of the news source.
        title: headline text.
        url: article URL.
        isAI: whether the headline was classified as AI-related.
        article_date: date to store in the ``article_date`` column.

    Returns:
        True if a new row was written, False if the row was ignored or an
        error was reported.
    """
    try:
        cursor.execute("INSERT OR IGNORE INTO news_articles (src, title, url, isAI, article_date) VALUES (?, ?, ?, ?, ?)",
                       (src, title, url, isAI, article_date))
        # rowcount is 1 when the row was stored and 0 when OR IGNORE skipped it
        inserted = cursor.rowcount == 1
        # Commit through the cursor's own connection so the function does not
        # depend on a notebook-global `conn` being the matching connection.
        cursor.connection.commit()
        return inserted
    except sqlite3.IntegrityError:
        print(f"Duplicate entry for URL: {url}")
    except Exception as err:
        print(err)
    return False

In [32]:
# Row count before this run's headlines are inserted (compare with the count after the insert loop)
pd.read_sql_query("select count(*) from news_articles", conn)


Unnamed: 0,count(*)
0,27689


In [33]:
# Store every merged headline; the insert statement skips URLs that are already present.
article_columns = ["src", "title", "url", "isAI", "date"]
for src, title, url, is_ai, article_date in merged_df[article_columns].itertuples(index=False, name=None):
    insert_article(cursor, src, title, url, is_ai, article_date)


In [34]:
# Row count after the insert loop; the increase is the number of previously unseen URLs
pd.read_sql_query("select count(*) from news_articles", conn)


Unnamed: 0,count(*)
0,27812


In [35]:
# Read the whole news_articles table back from SQLite for inspection
df = pd.read_sql_query("select * from news_articles", conn)
df


Unnamed: 0,id,src,title,url,isAI,article_date
0,1,Ars Technica,OnePlus 12 gets $800 US release along with the...,https://arstechnica.com/gadgets/2024/01/oneplu...,0,2024-01-24
1,2,Ars Technica,Chrome can now organize your tab bar for you,https://arstechnica.com/gadgets/2024/01/chrome...,0,2024-01-24
2,3,Ars Technica,HP CEO evokes James Bond-style hack via ink ca...,https://arstechnica.com/gadgets/2024/01/hp-ceo...,0,2024-01-24
3,4,Ars Technica,iOS 17.3 adds multiple features originally pla...,https://arstechnica.com/gadgets/2024/01/ios-17...,0,2024-01-24
4,5,Ars Technica,Wild Apples: The 12 weirdest and rarest Macs e...,https://arstechnica.com/gadgets/2024/01/macint...,0,2024-01-24
...,...,...,...,...,...,...
27807,27808,Techmeme,"Global Screening Services, which is developing...",https://techcrunch.com/2024/03/25/london-regte...,0,2024-03-25
27808,27809,The Register,Twitter's lawsuit against anti-hate-speech cru...,https://www.theregister.com/2024/03/25/musk_la...,0,2024-03-25
27809,27810,The Register,"As AI booms, land near nuclear power plants be...",https://www.theregister.com/2024/03/25/ai_boom...,1,2024-03-25
27810,27811,WSJ Tech,Biden Wants to Put AI on a Leash,https://www.wsj.com/articles/biden-wants-to-pu...,1,2024-03-25


In [36]:
# Number of headlines from this run flagged as AI-related
len(merged_df.loc[merged_df["isAI"]])

49

In [38]:
# Keep only the AI-related headlines. reset_index() renumbers the rows from 0 (later
# cells address them by position) and keeps the previous index as an `index` column.
AIdf = merged_df.loc[merged_df["isAI"]].reset_index()


In [39]:
# Attempt to order by topic by getting embeddings and solving a traveling salesman problem
# Step 1: embed every AI headline title in a single request.
embedding_model = 'text-embedding-3-small'
ai_titles = AIdf['title'].tolist()
response = client.embeddings.create(model=embedding_model, input=ai_titles)
embedding_list = response.data

In [41]:
# One row per headline, one column per embedding dimension.
# Read the vector from the `embedding` attribute directly instead of serializing
# each response item with `.dict()` (deprecated under pydantic v2) just to
# pick one key back out of the resulting dict.
embedding_df = pd.DataFrame([item.embedding for item in embedding_list])
embedding_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.015461,0.012258,-0.007253,0.027221,-0.006102,-0.029872,-0.043354,0.028042,-0.040905,-0.028499,...,-0.043542,0.04583,-0.012056,0.021529,-0.013456,0.025606,0.016227,-0.020345,0.027059,-0.035335
1,-0.005895,-0.01576,-0.050894,0.024071,-0.012657,-0.050576,0.024097,0.054519,0.018844,-0.003364,...,-0.017639,-0.004999,-0.03115,-0.02771,0.028345,-0.009356,0.021477,0.015602,-0.021398,0.007477
2,0.003589,-0.030053,0.00099,0.026293,0.03565,-0.01403,-0.000468,0.047213,0.017203,0.002332,...,-0.027565,0.03794,0.020101,0.004591,0.019734,0.042436,0.029035,-0.015111,0.00428,0.041276
3,-0.049665,-0.011553,-0.032403,0.027836,-0.033681,-0.044011,-0.042625,0.023446,0.011696,0.006528,...,-0.031262,0.028435,-0.022862,0.050318,0.022835,0.002538,0.043032,0.009569,-0.028407,0.016324
4,-0.01467,-0.002271,-0.010692,0.021591,0.021329,-0.048696,0.02705,0.002745,0.027064,-0.035185,...,-0.015331,-0.010106,-0.033227,0.00799,0.035736,0.027754,-0.035709,-0.009168,0.002756,0.01944
5,0.043282,0.005489,0.018624,0.006045,-0.002691,-0.034264,0.022436,0.023111,0.012939,-0.04461,...,-0.012928,0.029929,0.00477,0.031977,0.001162,0.044022,0.008217,-0.015466,-0.015781,0.006644
6,0.005771,0.002485,0.014066,-0.019889,0.003481,0.015826,-0.004927,0.026489,-0.105387,0.048475,...,-0.015166,0.027848,-0.056161,-0.034473,-0.059319,-0.0034,0.01514,-0.019009,0.010993,0.013089
7,-0.019223,-0.03131,0.02777,0.037578,0.000839,-0.047345,-0.008268,0.033589,-0.002976,-0.009564,...,-0.057357,0.024758,-0.015465,0.039531,0.034295,0.035461,0.029655,0.0097,-0.008947,0.003632
8,0.032998,0.010429,-0.000487,0.008406,-0.022425,-0.032815,-0.018339,0.022921,-0.010788,0.015624,...,-0.013229,0.078736,-0.020141,0.006788,-0.002888,0.030648,-0.00541,-0.00145,0.008302,0.001384
9,-0.01197,-0.021761,0.018384,0.0503,0.025029,-0.045749,-0.018009,0.035413,0.022681,0.05335,...,-0.043038,0.036261,-0.023795,0.044491,0.00153,0.004787,0.050542,-0.014027,-0.026385,0.047105


In [42]:
# naive greedy solution to traveling salesman problem

import numpy as np
from scipy.spatial.distance import cdist

# Plain NumPy matrix of the embeddings: one row per headline, in AIdf row order
embedding_array = embedding_df.values

def nearest_neighbor_sort(embedding_array):
    """Order rows greedily so that each row is followed by its nearest unvisited row.

    This is the naive nearest-neighbor heuristic for the traveling salesman
    problem: start at row 0 and repeatedly hop to the closest row (Euclidean
    distance) that has not been visited yet.

    Args:
        embedding_array: 2-D array-like with one embedding per row.

    Returns:
        1-D integer NumPy array holding the row indices in visiting order.
        Empty input yields an empty array (the original returned ``[0]``,
        which is not a valid index into an empty array).
    """
    n_items = len(embedding_array)
    if n_items == 0:
        return np.array([], dtype=np.intp)

    # Compute the pairwise Euclidean distances between all embeddings
    distances = cdist(embedding_array, embedding_array, metric='euclidean')

    # Start from the first headline as the initial point
    path = [0]
    visited = np.zeros(n_items, dtype=bool)
    visited[0] = True

    for _ in range(n_items - 1):
        last = path[-1]
        # Treat already visited rows as infinitely far away so they are never picked again
        candidates = np.where(visited, np.inf, distances[:, last])
        nearest = int(np.argmin(candidates))
        path.append(nearest)
        visited[nearest] = True

    return np.array(path, dtype=np.intp)

# Get the visiting order: row positions into embedding_array, starting at row 0
sorted_indices = nearest_neighbor_sort(embedding_array)

# The sorted embedding array can be obtained using these indices
sorted_embedding_array = embedding_array[sorted_indices]

sorted_indices  # Display the complete visiting order


array([ 0, 38, 20, 14, 16, 37,  3, 39,  4, 32, 40, 24, 12,  5, 35, 22, 33,
       26, 23, 47, 36, 41, 44,  7, 11, 25, 13,  2, 45, 46,  8, 48, 15, 21,
       43, 10, 29, 17,  1, 27, 31, 42, 18, 34, 19,  6,  9, 30, 28])

In [45]:
# Render the AI headlines as numbered markdown links, following the nearest-neighbor order.
for position, df_pos in enumerate(sorted_indices):
    headline = AIdf.iloc[df_pos]
    link_md = f"[{position}. {headline.title} - {headline.src}]({headline.url})"
    display(Markdown(link_md))
    

[0. Where’d my results go? Google Search’s chatbot is no longer opt-in - Ars Technica](https://arstechnica.com/gadgets/2024/03/google-search-will-start-automatically-showing-a-chatbot-to-some-users/)

[1. Where'd my results go? Google Search's chatbot is no longer opt-in - Google News](https://news.google.com/articles/CBMib2h0dHBzOi8vYXJzdGVjaG5pY2EuY29tL2dhZGdldHMvMjAyNC8wMy9nb29nbGUtc2VhcmNoLXdpbGwtc3RhcnQtYXV0b21hdGljYWxseS1zaG93aW5nLWEtY2hhdGJvdC10by1zb21lLXVzZXJzL9IBAA)

[2. WhatsApp is testing an all-knowing AI chatbot that will live in your search bar - Google News](https://news.google.com/articles/CBMifGh0dHBzOi8vd3d3LnRlY2hyYWRhci5jb20vY29tcHV0aW5nL3NvZnR3YXJlL3doYXRzYXBwLWlzLXRlc3RpbmctYW4tYWxsLWtub3dpbmctYWktY2hhdGJvdC10aGF0LXdpbGwtbGl2ZS1pbi15b3VyLXNlYXJjaC1iYXLSAQA)

[3. The Financial Times is ready for its AI to answer your questions (well, some of them) - Google News](https://news.google.com/articles/CBMidWh0dHBzOi8vd3d3Lm5pZW1hbmxhYi5vcmcvMjAyNC8wMy90aGUtZmluYW5jaWFsLXRpbWVzLWlzLXJlYWR5LWZvci1pdHMtYWktdG8tYW5zd2VyLXlvdXItcXVlc3Rpb25zLXdlbGwtc29tZS1vZi10aGVtL9IBAA)

[4. Are you willing to get the COVID-19 vaccine? AI says it knows the answer - Google News](https://news.google.com/articles/CBMic2h0dHBzOi8vd3d3LmNsZXZlbGFuZC5jb20vbmV3cy8yMDI0LzAzL2FyZS15b3Utd2lsbGluZy10by1nZXQtdGhlLWNvdmlkLTE5LXZhY2NpbmUtYWktc2F5cy1pdC1rbm93cy10aGUtYW5zd2VyLmh0bWzSAYIBaHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9uZXdzLzIwMjQvMDMvYXJlLXlvdS13aWxsaW5nLXRvLWdldC10aGUtY292aWQtMTktdmFjY2luZS1haS1zYXlzLWl0LWtub3dzLXRoZS1hbnN3ZXIuaHRtbD9vdXRwdXRUeXBlPWFtcA)

[5. Your personal doomsday calculator: Can AI predict when you'll die? - Google News](https://news.google.com/articles/CBMiZWh0dHBzOi8vd3d3Lndpb25ld3MuY29tL3ZpZGVvcy95b3VyLXBlcnNvbmFsLWRvb21zZGF5LWNhbGN1bGF0b3ItY2FuLWFpLXByZWRpY3Qtd2hlbi15b3VsbC1kaWUtNzA0MTgw0gFpaHR0cHM6Ly93d3cud2lvbmV3cy5jb20vdmlkZW9zL3lvdXItcGVyc29uYWwtZG9vbXNkYXktY2FsY3VsYXRvci1jYW4tYWktcHJlZGljdC13aGVuLXlvdWxsLWRpZS03MDQxODAvYW1w)

[6. Google AI tool could potentially be used to diagnose a person's cough - Feedly AI](https://scrippsnews.com/stories/google-ai-tool-could-potentially-be-used-to-diagnose-a-person-s-cough/)

[7. Can you hear me now? AI-coustics to fight noisy audio with generative AI - Google News](https://news.google.com/articles/CBMiamh0dHBzOi8vdGVjaGNydW5jaC5jb20vMjAyNC8wMy8yNS9jYW4teW91LWhlYXItbWUtbm93LWFpLWNvdXN0aWNzLXRvLWZpZ2h0LW5vaXN5LWF1ZGlvLXdpdGgtZ2VuZXJhdGl2ZS1haS_SAW5odHRwczovL3RlY2hjcnVuY2guY29tLzIwMjQvMDMvMjUvY2FuLXlvdS1oZWFyLW1lLW5vdy1haS1jb3VzdGljcy10by1maWdodC1ub2lzeS1hdWRpby13aXRoLWdlbmVyYXRpdmUtYWkvYW1wLw)

[8. Sora AI: What is it? How to access video generator - Feedly AI](https://readwrite.com/sora-ai-what-is-it-how-to-access-video-generator/)

[9. Tiny Corp launching Nvidia-powered AI computer because 'it just works' - Google News](https://news.google.com/articles/CBMiPGh0dHBzOi8vd3d3LnRoZXJlZ2lzdGVyLmNvbS8yMDI0LzAzLzI1L3RpbnlfY29ycF9hbWRfbnZpZGlhL9IBQGh0dHBzOi8vd3d3LnRoZXJlZ2lzdGVyLmNvbS9BTVAvMjAyNC8wMy8yNS90aW55X2NvcnBfYW1kX252aWRpYS8)

[10. Tiny Corp Offers AMD & NVIDIA Powered AI Systems Starting At $15K, Says If You Want “It Just Works”, Buy Green - Google News](https://news.google.com/articles/CBMiamh0dHBzOi8vd2NjZnRlY2guY29tL3RpbnktY29ycC1hbWQtbnZpZGlhLWFpLXN5c3RlbXMtc3RhcnRpbmctMTVrLXVzZC1pZi15b3Utd2FudC1pdC1qdXN0LXdvcmtzLWJ1eS1ncmVlbi_SAW5odHRwczovL3djY2Z0ZWNoLmNvbS90aW55LWNvcnAtYW1kLW52aWRpYS1haS1zeXN0ZW1zLXN0YXJ0aW5nLTE1ay11c2QtaWYteW91LXdhbnQtaXQtanVzdC13b3Jrcy1idXktZ3JlZW4vYW1wLw)

[11. 11 Big Nvidia Announcements At GTC 2024: Blackwell GPUs, AI Microservices And More - Google News](https://news.google.com/articles/CBMiiAFodHRwczovL3d3dy5jcm4uY29tL25ld3MvY29tcG9uZW50cy1wZXJpcGhlcmFscy8yMDI0L252aWRpYS1zLWJpZ2dlc3QtYW5ub3VuY2VtZW50cy1hdC1ndGMtMjAyNC1ibGFja3dlbGwtZ3B1cy1haS1taWNyb3NlcnZpY2VzLWFuZC1tb3Jl0gEA)

[12. Pure Storage, Nvidia partner to democratize AI with new infrastructure solutions - Feedly AI](https://venturebeat.com/ai/pure-storage-nvidia-partner-to-democratize-ai-with-new-infrastructure-solutions/)

[13. Samsung is going after Nvidia's billions with new AI chip — Mach-1 accelerator will combine CPU, GPU and memory to tackle inference tasks but not training - Feedly AI](https://www.techradar.com/pro/samsungs-going-after-nvidias-billions-with-new-ai-chip-mach-1-accelerator-will-combine-cpu-gpu-and-memory-to-tackle-inference-tasks-but-not-training)

[14. The iPhone 16 Pro's chipset could be designed with AI in mind - Google News](https://news.google.com/articles/CBMiZGh0dHBzOi8vd3d3LnRlY2hyYWRhci5jb20vcGhvbmVzL2lwaG9uZS90aGUtaXBob25lLTE2LXByb3MtY2hpcHNldC1jb3VsZC1iZS1kZXNpZ25lZC13aXRoLWFpLWluLW1pbmTSAQA)

[15. 5-ish Things About AI: Apple Serious About AI, Pitting AI Against the Experts, the ELVIS Act Passes - Google News](https://news.google.com/articles/CBMihQFodHRwczovL3d3dy5jbmV0LmNvbS90ZWNoL2NvbXB1dGluZy81LWlzaC10aGluZ3MtYWJvdXQtYWktYXBwbGUtc2VyaW91cy1hYm91dC1haS1waXR0aW5nLWFpLWFnYWluc3QtdGhlLWV4cGVydHMtdGhlLWVsdmlzLWFjdC1wYXNzZXMv0gEA)

[16. What CEOs talked about in Q1 2024: AI, sustainability, and upcoming elections - Google News](https://news.google.com/articles/CBMiPGh0dHBzOi8vaW90LWFuYWx5dGljcy5jb20vd2hhdC1jZW9zLXRhbGtlZC1hYm91dC1pbi1xMS0yMDI0L9IBAA)

[17. GDC 2024, PS5 Pro, and Ubisoft's AI NPCs | GI Microcast - Google News](https://news.google.com/articles/CBMiUGh0dHBzOi8vd3d3LmdhbWVzaW5kdXN0cnkuYml6L2dkYy0yMDI0LXBzNS1wcm8tYW5kLXViaXNvZnRzLWFpLW5wY3MtZ2ktbWljcm9jYXN00gEA)

[18. Nvidia head reckons we'll have games where AI generates 'every pixel in real-time' in under a decade - Google News](https://news.google.com/articles/CBMiggFodHRwczovL3d3dy5yb2NrcGFwZXJzaG90Z3VuLmNvbS9udmlkaWEtaGVhZC1yZWNrb25zLXdlbGwtaGF2ZS1nYW1lcy13aGVyZS1haS1nZW5lcmF0ZXMtZXZlcnktcGl4ZWwtaW4tcmVhbC10aW1lLWluLXVuZGVyLWEtZGVjYWRl0gEA)

[19. As AI booms, land near nuclear power plants becomes hot real estateCheap low-carbon energy? What's not to love...Systems1 hr|7 - The Register](https://www.theregister.com/2024/03/25/ai_boom_nuclear/)

[20. This AI architect will design your climate-friendly dream home - Google News](https://news.google.com/articles/CBMiYmh0dHBzOi8vd3d3Lm1vbmV5d2ViLmNvLnphL25ld3MvYWkvdGhpcy1haS1hcmNoaXRlY3Qtd2lsbC1kZXNpZ24teW91ci1jbGltYXRlLWZyaWVuZGx5LWRyZWFtLWhvbWUv0gEA)

[21. ASU business school launches AI degree program | ASU News - Google News](https://news.google.com/articles/CBMiamh0dHBzOi8vbmV3cy5hc3UuZWR1LzIwMjQwMzI1LWJ1c2luZXNzLWFuZC1lbnRyZXByZW5ldXJzaGlwLWFzdS1idXNpbmVzcy1zY2hvb2wtbGF1bmNoZXMtYWktZGVncmVlLXByb2dyYW3SAQA)

[22. Georgia Tech produces most talent in AI across country, per JLL report - Atlanta Business Chronicle - Google News](https://news.google.com/articles/CBMiUmh0dHBzOi8vd3d3LmJpempvdXJuYWxzLmNvbS9hdGxhbnRhL25ld3MvMjAyNC8wMy8yNS9nZW9yZ2lhLXRlY2gtYWktZ3JhZHVhdGVzLmh0bWzSAQA)

[23. OpenAI Sets Meetings With Hollywood In Bid To Break Into The Movie Business | GIANT FREAKIN ROBOT - Feedly AI](https://www.giantfreakinrobot.com/ent/openai-hollywood.html)

[24. Bankrupt FTX Sells Stake In Hot AI Startup Anthropic - Feedly AI](https://www.barrons.com/news/bankrupt-ftx-sells-stake-in-hot-ai-startup-anthropic-9a42e039)

[25. FTX to sell majority of stake in Anthropic for $884M - Google News](https://news.google.com/articles/CBMiT2h0dHBzOi8vZmluYW5jZS55YWhvby5jb20vbmV3cy9mdHgtc2VsbC1tYWpvcml0eS1zdGFrZS1hbnRocm9waWMtMTcwMjAwNzYzLmh0bWzSAQA)

[26. FTX to Sell $884M of Anthropic Shares to Two Dozen Institutional Investors - Google News](https://news.google.com/articles/CBMidWh0dHBzOi8vd3d3LmNvaW5kZXNrLmNvbS9wb2xpY3kvMjAyNC8wMy8yNS9mdHgtdG8tc2VsbC04ODRtLW9mLWFudGhyb3BpYy1zaGFyZXMtdG8tdHdvLWRvemVuLWluc3RpdHV0aW9uYWwtaW52ZXN0b3JzL9IBeWh0dHBzOi8vd3d3LmNvaW5kZXNrLmNvbS9wb2xpY3kvMjAyNC8wMy8yNS9mdHgtdG8tc2VsbC04ODRtLW9mLWFudGhyb3BpYy1zaGFyZXMtdG8tdHdvLWRvemVuLWluc3RpdHV0aW9uYWwtaW52ZXN0b3JzL2FtcC8)

[27. FTX to Sell Shares in Claude AI Developer Anthropic for $884 Million - Feedly AI](https://decrypt.co/223288/ftx-to-sell-shares-in-claude-ai-developer-anthropic-for-884-millionposted)

[28. FTX to Sell Shares in Claude AI Developer Anthropic for $884 Million - Google News](https://news.google.com/articles/CBMiXWh0dHBzOi8vZGVjcnlwdC5jby8yMjMyODgvZnR4LXRvLXNlbGwtc2hhcmVzLWluLWNsYXVkZS1haS1kZXZlbG9wZXItYW50aHJvcGljLWZvci04ODQtbWlsbGlvbtIBY2h0dHBzOi8vZGVjcnlwdC5jby8yMjMyODgvZnR4LXRvLXNlbGwtc2hhcmVzLWluLWNsYXVkZS1haS1kZXZlbG9wZXItYW50aHJvcGljLWZvci04ODQtbWlsbGlvbj9hbXA9MQ)

[29. Fake AI-Generated Books Swarm Amazon - Hacker News 2](https://goodereader.com/blog/amazon-news/fake-ai-generated-books-swarm-amazon)

[30. Doctor Who Promos Spark Massive Backlash for Using AI, BBC Responds - Feedly AI](https://www.cbr.com/doctor-who-ai-backlash/)

[31. Biden Wants to Put AI on a Leash - WSJ Tech](https://www.wsj.com/articles/biden-wants-to-put-artificial-intelligence-on-a-leash-progressive-regulation-45275102)

[32. Akina Is The New AI Platform That Wants To Be A “Google For Black Women" - Google News](https://news.google.com/articles/CBMicGh0dHBzOi8vcGVvcGxlb2Zjb2xvcmludGVjaC5jb20vYXJ0aWNsZXMvYWtpbmEtaXMtdGhlLW5ldy1haS1wbGF0Zm9ybS10aGF0LXdhbnRzLXRvLWJlLWEtZ29vZ2xlLWZvci1ibGFjay13b21lbi_SAQA)

[33. One year in, Khan Academy's AI has 65000 students, and is still learning new skills - Google News](https://news.google.com/articles/CBMifGh0dHBzOi8vd3d3LmZhc3Rjb21wYW55LmNvbS85MTA2NjczMS9vbmUteWVhci1pbi1raGFuLWFjYWRlbXlzLWFpLWhhcy02NTAwMC1zdHVkZW50cy1hbmQtaXMtc3RpbGwtbGVhcm5pbmctbmV3LXNraWxscy1pdHNlbGbSAQA)

[34. Distillery in Scotland using AI to create limited edition whisky - Google News](https://news.google.com/articles/CBMiV2h0dHBzOi8vd3d3LmZveG5ld3MuY29tL3RlY2gvZGlzdGlsbGVyeS1zY290bGFuZC11c2luZy1haS1jcmVhdGUtbGltaXRlZC1lZGl0aW9uLXdoaXNredIBW2h0dHBzOi8vd3d3LmZveG5ld3MuY29tL3RlY2gvZGlzdGlsbGVyeS1zY290bGFuZC11c2luZy1haS1jcmVhdGUtbGltaXRlZC1lZGl0aW9uLXdoaXNreS5hbXA)

[35. Distillery in Scotland using AI to create limited edition whisky - Feedly AI](https://www.foxnews.com/tech/distillery-scotland-using-ai-create-limited-edition-whisky)

[36. Distillery in Scotland using AI and algorithms to explore whisky flavors and mouthfeel - Google News](https://news.google.com/articles/CBMiK2h0dHBzOi8vd3d3LmZveG5ld3MuY29tL3ZpZGVvLzYzNDk2ODM2MzAxMTLSAQA)

[37. This AI Paper from Max Planck, Adobe, and UCSD Proposes Explorative Inbetweening of Time and Space Using Time Reversal Fusion (TRF) - Google News](https://news.google.com/articles/CBMiqAFodHRwczovL3d3dy5tYXJrdGVjaHBvc3QuY29tLzIwMjQvMDMvMjUvdGhpcy1haS1wYXBlci1mcm9tLW1heC1wbGFuY2stYWRvYmUtYW5kLXVjc2QtcHJvcG9zZXMtZXhwbG9yYXRpdmUtaW5iZXR3ZWVuaW5nLW9mLXRpbWUtYW5kLXNwYWNlLXVzaW5nLXRpbWUtcmV2ZXJzYWwtZnVzaW9uLXRyZi_SAawBaHR0cHM6Ly93d3cubWFya3RlY2hwb3N0LmNvbS8yMDI0LzAzLzI1L3RoaXMtYWktcGFwZXItZnJvbS1tYXgtcGxhbmNrLWFkb2JlLWFuZC11Y3NkLXByb3Bvc2VzLWV4cGxvcmF0aXZlLWluYmV0d2VlbmluZy1vZi10aW1lLWFuZC1zcGFjZS11c2luZy10aW1lLXJldmVyc2FsLWZ1c2lvbi10cmYvP2FtcA)

[38. Watch Neuralink's first human patient play 'Mario Kart' with his mind - Business Insider](https://www.businessinsider.com/neuralink-mario-kart-brain-video-human-patient-2024-3)

[39. New Apple Pencil With Vision Pro Support in Testing Ahead of visionOS 2 - Google News](https://news.google.com/articles/CBMiTmh0dHBzOi8vd3d3Lm1hY3J1bW9ycy5jb20vMjAyNC8wMy8yNS9hcHBsZS1wZW5jaWwtdmlzaW9ub24tc3VwcG9ydC1pbi10ZXN0aW5nL9IBAA)

[40. Rumor: New Apple Pencil could come with Vision Pro support - Google News](https://news.google.com/articles/CBMiPWh0dHBzOi8vOXRvNW1hYy5jb20vMjAyNC8wMy8yNS9hcHBsZS12aXNpb24tcHJvLWFwcGxlLXBlbmNpbC_SAQA)

[41. New Apple Pencil might work with Vision Pro, which sounds weird but is actually genius - Google News](https://news.google.com/articles/CBMia2h0dHBzOi8vYmdyLmNvbS90ZWNoL25ldy1hcHBsZS1wZW5jaWwtbWlnaHQtd29yay13aXRoLXZpc2lvbi1wcm8td2hpY2gtc291bmRzLXdlaXJkLWJ1dC1pcy1hY3R1YWxseS1nZW5pdXMv0gEA)

[42. Camera flagship Nubia Z60 Ultra coming soon as a special Photographer Edition with more AI features - Google News](https://news.google.com/articles/CBMijwFodHRwczovL3d3dy5ub3RlYm9va2NoZWNrLm5ldC9DYW1lcmEtZmxhZ3NoaXAtTnViaWEtWjYwLVVsdHJhLWNvbWluZy1zb29uLWFzLWEtc3BlY2lhbC1QaG90b2dyYXBoZXItRWRpdGlvbi13aXRoLW1vcmUtQUktZmVhdHVyZXMuODE3NTY1LjAuaHRtbNIBAA)

[43. nubia Z60 Ultra is getting a Photography Edition with more AI - GSMArena.com news - Google News](https://news.google.com/articles/CBMiZWh0dHBzOi8vd3d3LmdzbWFyZW5hLmNvbS9udWJpYV96NjBfdWx0cmFfaXNfZ2V0dGluZ19hX3Bob3RvZ3JhcGh5X2VkaXRpb25fd2l0aF9tb3JlX2FpLW5ld3MtNjIxMzYucGhw0gFiaHR0cHM6Ly9tLmdzbWFyZW5hLmNvbS9udWJpYV96NjBfdWx0cmFfaXNfZ2V0dGluZ19hX3Bob3RvZ3JhcGh5X2VkaXRpb25fd2l0aF9tb3JlX2FpLWFtcC02MjEzNi5waHA)

[44. Nubia Z60 Ultra 'Photographer's Edition' will do real-time language translation similar to the Galaxy S24 series - Google News](https://news.google.com/articles/CBMikwFodHRwczovL3d3dy5naXptb2NoaW5hLmNvbS8yMDI0LzAzLzI1L251YmlhLXo2MC11bHRyYS1waG90b2dyYXBoZXJzLWVkaXRpb20tcmVhbC10aW1lLWFpLWNhbGwtY29udmVyc2F0aW9uLXRyYW5zbGF0aW9uLXNpbWlsYXItdG8tZ2FsYXh5LXMyNC11bHRyYS_SAQA)

[45. Large language models use a surprisingly simple mechanism to retrieve some stored knowledge - Feedly AI](https://news.mit.edu/2024/large-language-models-use-surprisingly-simple-mechanism-retrieve-stored-knowledge-0325)

[46. AI chatbots can help with your mental health. Just don’t think of them as therapy - Feedly AI](https://www.fastcompany.com/91068813/ai-chatbots-mental-health-therapy)

[47. Vernor Vinge, first author to describe cyberspace and 'The Singularity,' dies at 79 - Google News](https://news.google.com/articles/CBMiPWh0dHBzOi8vd3d3LnRoZXJlZ2lzdGVyLmNvbS8yMDI0LzAzLzIyL3Zlcm5vcl92aW5nZV9vYml0dWFyeS_SAUFodHRwczovL3d3dy50aGVyZWdpc3Rlci5jb20vQU1QLzIwMjQvMDMvMjIvdmVybm9yX3ZpbmdlX29iaXR1YXJ5Lw)

[48. Vernor Vinge (1944-2024) – Locus Online - Google News](https://news.google.com/articles/CBMiNGh0dHBzOi8vbG9jdXNtYWcuY29tLzIwMjQvMDMvdmVybm9yLXZpbmdlLTE5NDQtMjAyNC_SAQA)

# Load posts from BlueSky and format for Substack or a blog post
For now I share the interesting stuff on BlueSky, then use this code to grab the latest BlueSky 'tweets' and format them into a [Substack post](https://skynetandchill.com).

In [None]:
# Use a dedicated name for the Bluesky client. `client` is the object the
# embeddings cell calls `client.embeddings.create(...)` on, so rebinding it here
# would break that cell if it is re-run after this one.
bsky_client = Client(base_url='https://bsky.social')
bsky_client.login(os.environ['BSKY_USERNAME'], os.environ['BSKY_SECRET'])

# DID of the account whose posts are fetched
mydid = {"did":"did:plc:qomkdnxrqw3gkbytdxea5z65"}

# Fetch up to 50 of the latest posts from that account's feed
data = bsky_client.get_author_feed(
    actor=mydid['did'],
    filter='posts_and_author_threads',
    limit=50,
)


In [None]:
def remove_urls(text):
    """Return ``text`` with every URL-looking token deleted.

    A token counts as a URL when it starts with ``http://`` or ``https://``,
    or with ``www.``; it extends up to the next whitespace character.
    Surrounding whitespace is left untouched.
    """
    url_regex = re.compile(r'https?://\S+|www\.\S+')
    return url_regex.sub('', text)



In [None]:
def rawfetchurl(url, timeout=60):
    """Fetch ``url`` with ``requests.get`` and return the response, or a sentinel on failure.

    Known failure modes are reported with ``print`` instead of being raised, so
    callers must check the result before using it.

    Args:
        url: address to fetch.
        timeout: value passed to ``requests.get`` as its ``timeout`` (seconds).

    Returns:
        The response object on success, ``None`` when a handled request/HTTP/
        value error occurred, or ``-1`` for low-level pyOpenSSL errors (that
        sentinel is kept as in the original for backward compatibility).
    """
    # `httplib` is the Python 2 name of this module. Referencing it in an except
    # clause raised NameError as soon as any exception reached the handlers.
    import http.client

    # pyOpenSSL is optional. Without it there is nothing to catch, and an empty
    # tuple in an except clause matches no exception.
    try:
        from OpenSSL import SSL as openssl_ssl
        openssl_errors = (openssl_ssl.SysCallError, openssl_ssl.ZeroReturnError)
    except ImportError:
        openssl_errors = ()

    response = None
    try:
        response = requests.get(url, timeout=timeout)
    except http.client.BadStatusLine:
        print("Bad response (?) fetching url %s " % url)
    except requests.Timeout:
        print("Timeout fetching url %s " % url)
    except requests.exceptions.SSLError as e:
        # Must precede ConnectionError: requests' SSLError is a subclass of it
        print("SSLError, url %s: %s " % (url, str(e)))
    except requests.ConnectionError as e:
        print("Connection error (%s) fetching url %s " % (str(e), url))
    except requests.TooManyRedirects:
        print("Too many redirects fetching url %s " % url)
    except requests.exceptions.MissingSchema:
        print("Missing schema url %s " % url)
    except requests.exceptions.InvalidSchema:
        print("Invalid schema url %s " % url)
    except requests.exceptions.InvalidURL as e:
        print("Invalid url %s, %s" % (url, str(e)))
    except UnicodeEncodeError as e:
        # Must precede ValueError: UnicodeEncodeError is a subclass of it
        print("UnicodeEncodeError, url %s: %s " % (url, str(e)))
    except ValueError:
        # don't log url because possibly malformed url
        print("ValueError, url ?: ? ")
    except http.client.IncompleteRead as e:
        print("IncompleteRead, url %s: %s " % (url, str(e)))
    except requests.exceptions.ContentDecodingError as e:
        print("ContentDecodingError, url %s: %s " % (url, str(e)))
    except requests.exceptions.ChunkedEncodingError as e:
        print("ChunkedEncodingError, url %s: %s " % (url, str(e)))
    except openssl_errors as e:
        print("OpenSSL.SSL.%s, url %s: %s " % (type(e).__name__, url, str(e)))
        response = -1

    return response


# Smoke test: fetch one hard-coded sample image URL with rawfetchurl
imgurl = 'https://nypost.com/wp-content/uploads/sites/2/2024/02/nate-silver-calls-shut-gemini-77192719.jpg?quality=75&strip=all&w=1024'
r = rawfetchurl(imgurl)

In [None]:

# #         impath = "%s/%d.%s" % (today_orig_dir, actualurl.id, file_ext)
# impath = 'x.jpg'
# with open(impath, 'wb') as file:
#     file.write(r.content)

# display(IPython.display.Image(filename=impath))


In [None]:

# def resize_and_crop(image_path, output_path, size=(360, 360)):
#     # Open the image
#     image = PIL.Image.open(image_path)

#     # Calculate the aspect ratio
#     aspect_ratio = image.width / image.height
#     target_aspect_ratio = size[0] / size[1]

#     # Determine the scaling factor and new size
#     if aspect_ratio > target_aspect_ratio:
#         # Image is wider than desired aspect ratio
#         new_height = size[1]
#         new_width = int(new_height * aspect_ratio)
#     else:
#         # Image is taller than desired aspect ratio
#         new_width = size[0]
#         new_height = int(new_width / aspect_ratio)

#     # Resize the image
#     image = image.resize((new_width, new_height))

#     # Calculate coordinates to crop the image to the target size
#     left = (new_width - size[0]) / 2
#     top = (new_height - size[1]) / 2
#     right = (new_width + size[0]) / 2
#     bottom = (new_height + size[1]) / 2

#     # Crop the image
#     image = image.crop((left, top, right, bottom))

#     # Save the cropped image
#     image.save(output_path)

def resize_and_crop(input_image_path, output_image_path, desired_height=240):
    """Scale an image to ``desired_height`` pixels tall and save the result.

    Despite the name, nothing is cropped: the width is derived from the
    source aspect ratio so the picture keeps its proportions. The image is
    converted to RGB before it is resized.

    Args:
        input_image_path: path of the image to read.
        output_image_path: path the resized image is written to.
        desired_height: height of the output image in pixels.
    """
    with Image.open(input_image_path) as source:
        rgb = source.convert('RGB')
        # Width that keeps the original width/height ratio at the new height
        target_width = int(desired_height * (rgb.width / rgb.height))
        rgb.resize((target_width, desired_height)).save(output_image_path)

# output_path = 'square.jpg'
# resize_and_crop(impath, output_path)
# display(IPython.display.Image(filename=output_path))


In [None]:
# attempt to remove trailing inline URLs

def truncate_last_occurrence(text: str) -> str:
    """Drop a trailing token that ends in three periods, plus the whitespace before it.

    Intended for post text whose last token is an abbreviated link such as
    ``example.com/path...``. Text without such a trailing token is returned
    unchanged.
    """
    trailing_ellipsis = re.search(r'\s+\S+\.{3}$', text)
    if trailing_ellipsis is None:
        return text
    # Splice the match out; anything after it (at most a final newline) is kept
    return text[:trailing_ellipsis.start()] + text[trailing_ellipsis.end():]

# Example post text: a headline followed on the next line by an abbreviated link ending in "..."

example_text = """Elon Musk says we'll run out of power capacity to run all the AI chips in 2025
newatlas.com/technology/e..."""

# Should print only the headline line, with the abbreviated link removed
print(truncate_last_occurrence(example_text))



In [None]:

# for post in data.feed:
#     post_str = post.post.record.text.rstrip()
#     post_str = truncate_last_occurrence(post_str)
#     post_url = ""
#     try:
#         post_url = post.post.record.embed.external.uri.rstrip()
#     except:
#         pass

#     print(remove_urls(post_str))
#     print(post_url)
#     print()


In [None]:
# for i, post in enumerate(data.feed):
#     post_str = post.post.record.text.rstrip()
#     post_str = truncate_last_occurrence(post_str)
#     post_url = ""
#     tag_dict = {}
#     try:
#         post_url = post.post.record.embed.external.uri.rstrip()
#     except:
#         pass
#     if post_url:
#         tag_dict = get_og_tags(post_url)
#         display_str = f"[{post_str}]({post_url})"
#         site_name = tag_dict.get('og:site_name')
#         img_url = tag_dict.get('og:image')
#         if site_name:
#             display_str += f" - {site_name}"
#         if img_url:
#             try:
#                 r = rawfetchurl(img_url)
#                 content_type = r.headers['Content-Type']
#                 content_type = content_type[content_type.find('/')+1:]
#                 impath = f"source{i}.{content_type}"
#                 with open(impath, 'wb') as file:
#                     file.write(r.content)
#                 output_path = f'Image{i}.jpg'
#                 resize_and_crop(impath, output_path)
#                 display(IPython.display.Image(filename=output_path))
#             except Exception as e:
#                 print(e)
#         display(Markdown(display_str))
#     else:
#         display(Markdown(f"{post_str}"))


In [None]:
imgdir = 'tmp'  # for images
delete_files(imgdir)

display(Markdown("Follow latest AI headlines via [SkynetAndChill.com on Bluesky](https://bsky.app/profile/skynetandchill.com)"))


# Render each post as a markdown link, preceded by a resized preview image when
# the linked page advertises one through its og:image tag.
for i, post in enumerate(data.feed):
    post_str = post.post.record.text.rstrip()
    post_str = truncate_last_occurrence(post_str)
    post_url = ""
    try:
        post_url = post.post.record.embed.external.uri.rstrip()
    except Exception:
        # Best effort: a post without an external link embed simply has no URL.
        # Unlike a bare `except:`, this lets KeyboardInterrupt stop the loop.
        pass

    if not post_url:
        # Text-only post: show it as plain markdown
        display(Markdown(post_str))
        continue

    tag_dict = get_og_tags(post_url)
    display_str = f"[{post_str}]({post_url})"
    site_name = tag_dict.get('og:site_name')
    img_url = tag_dict.get('og:image')
    if site_name:
        display_str += f" - {site_name}"
    if img_url:
        try:
            r = rawfetchurl(img_url)
            if r is None or isinstance(r, int):
                # rawfetchurl already printed the reason and returned a sentinel
                print(f"Skipping image for post {i}: could not fetch {img_url}")
            else:
                # Use the MIME subtype (e.g. "jpeg") as the file extension
                content_type = r.headers['Content-Type']
                content_type = content_type[content_type.find('/')+1:]
                impath = f"{imgdir}/source{i}.{content_type}"
                with open(impath, 'wb') as file:
                    file.write(r.content)
                output_path = f'{imgdir}/Image{i}.jpg'
                resize_and_crop(impath, output_path)
                display(IPython.display.Image(filename=output_path))
        except Exception as e:
            # An unusable image must not prevent the link itself from being shown
            print(e)
    display(Markdown(display_str))
    display(Markdown("___"))

In [None]:
# Scratch: inspect the thumbnail metadata of the first post's external embed
# NOTE(review): presumably fails when that post has no external embed -- confirm
data.feed[0].post.record.embed.external.thumb.dict()

In [None]:
# Scratch: inspect the whole embed of the first post as a dict
data.feed[0].post.record.embed.dict()

In [None]:
# Print the current local timestamp (marks when this run finished)
print(datetime.now())

In [None]:
# Stored articles dated after the hard-coded cutoff 2024-03-23.
# article_date is shown as YYYY-MM-DD text in the table dump above, so the
# string comparison orders chronologically.
pd.read_sql_query("select * from news_articles where article_date > '2024-03-23'", conn)
