AInewsbot.ipynb

- Open URLs of news sites specififed in `sources` dict using Selenium and Firefox
- Save HTML of each URL in htmldata directory
- Extract URLs from all files, create a pandas dataframe with url, title, src
- Use ChatGPT to filter only AI-related headlines by sending a prompt and formatted table of headlines
- Use SQLite to filter headlines previously seen 
- OPENAI_API_KEY should be in the environment or in a .env file
  
Alternative manual workflow to get HTML files if necessary
- Use Chrome, open e.g. Tech News bookmark folder, right-click and open all bookmarks in new window
- on Google News, make sure switch to AI tab
- on Google News, Feedly, Reddit, scroll to additional pages as desired
- Use SingleFile extension, 'save all tabs'
- Move files to htmldata directory
- Run lower part of notebook to process the data


In [1]:
import json
import os
import re
from datetime import datetime, timedelta
from urllib.parse import urlparse
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# use firefox because it updates less often, can disable updates
# recommend importing profile from Chrome for cookies, passwords
# looks less like a bot with more user cruft in the profile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

import bs4
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

import openai
from openai import OpenAI
import tiktoken

import dotenv
import pandas as pd
import sqlite3

import IPython
from IPython.display import HTML, Markdown, display

from atproto import Client

import PIL
from PIL import Image

print(f"openai          {openai.__version__}")
print(f"requests        {requests.__version__}")
print(f"BeautifulSoup   {bs4.__version__}")

openai          1.14.2
requests        2.31.0
BeautifulSoup   4.12.3


In [2]:
# load credentials if necessary
dotenv.load_dotenv()
client = OpenAI()

In [3]:
# delete files in output directory
download_dir = "htmldata"

def delete_files(outputdir):

    # Iterate over all files in the directory
    for filename in os.listdir(outputdir):
        file_path = os.path.join(outputdir, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.remove(file_path)  # Remove the file
            elif os.path.isdir(file_path):
                # If you want to remove subdirectories as well, use os.rmdir() here
                pass
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')


delete_files(download_dir)

# Specify sources

In [4]:
sources = {
    # "Twitter": {
    #     "title": "@TheLinkfest_AI _ X",
    #     "include": ["^https://twitter.com/(\w+)/status/(\d+)$"],
    # },
    "Reddit": {
        "title": "top scoring links _ multi",
        "url": "https://www.reddit.com/r/ChatGPT+ChatGPTCoding+MacOS+MachineLearning+OpenAI+ProgrammerHumor+Windows10+battlestations+buildapc+cordcutters+dataisbeautiful+gadgets+hardware+linux+msp+programming+realtech+software+talesfromtechsupport+tech+technews+technology+techsupportgore+windows/top/?sort=top&t=day",
        "scroll": 2,
        "exclude": [
            "^https://www.reddit.com/",
            "^https://chat.reddit.com/",
            "^https://i.redd.it/",
            "^https://redditblog.com/",
            "^https://www.redditinc.com/",
            "^https://www.reddithelp.com/",
            "^https://itunes.apple.com/",
            "^https://play.google.com/",
        ],
    },
    "Hacker News": {
        "title": "Hacker News Page 1",
        "url": "https://news.ycombinator.com/",
        "exclude": [
            "https://news.ycombinator.com/",
            "https://www.ycombinator.com/",
        ],
    },
    "Hacker News 2": {
        "title": "Hacker News Page 2",
        "url": "https://news.ycombinator.com/?p=2",
        "exclude": [
            "https://news.ycombinator.com/",
            "https://www.ycombinator.com/",
        ],
    },
    "Techmeme": {
        "title": "Techmeme",
        "url": "https://www.techmeme.com/river",
        "exclude": [
            "^https://www.techmeme.com",
            "^https://twitter.com/",
            "^https://www.threads.net",
            "^https://www.linkedin.com",
            "^https://mastodon.social",
            "^https://bsky.app",
        ],
    },
    "Feedly AI": {
        "title": "Discover and Add New Feedly AI Feeds",
        "url": "https://feedly.com/i/aiFeeds?options=eyJsYXllcnMiOlt7InBhcnRzIjpbeyJpZCI6Im5scC9mL3RvcGljLzMwMDAifV0sInNlYXJjaEhpbnQiOiJ0ZWNobm9sb2d5IiwidHlwZSI6Im1hdGNoZXMiLCJzYWxpZW5jZSI6ImFib3V0In1dLCJidW5kbGVzIjpbeyJ0eXBlIjoic3RyZWFtIiwiaWQiOiJ1c2VyLzYyZWViYjlmLTcxNTEtNGY5YS1hOGM3LTlhNTdiODIwNTMwOC9jYXRlZ29yeS9HYWRnZXRzIn1dfQ",
        "scroll": 2,
    },
    "NYT Tech": {
        "title": "Technology - The New York Times",
        "url": "https://www.nytimes.com/section/technology",
        "include": ["^https://www.nytimes.com/(\d+)/(\d+)/(\d+)(.*).html$"],
    },
    "WSJ Tech": {
        "title": "Technology - WSJ.com",
        "url": "https://www.wsj.com/tech",
        "include": ["^https://www.wsj.com/articles/"],
    },
    "Bloomberg Tech": {
        "title": "Bloomberg Technology - Bloomberg",
        "url": "https://www.bloomberg.com/technology",
        "include": ["^https://www.bloomberg.com/news/(\w+)/(\d+)-(\d+)-(\d+)"],
    },
    "FT Tech": {
        "title": "Technology",
        "url": "https://www.ft.com/technology",
        "include": ["https://www.ft.com/content/"]
    },
    "WaPo Tech": {
        "title": "Technology - The Washington Post",
        "url": "https://www.washingtonpost.com/business/technology/",
        "include": ["https://www.washingtonpost.com/(\w+)/(\d+)/(\d+)/(\d+)/"],
    },
    "Google News": {
        "title": "Google News - Technology - Artificial intelligence",
        "url": "https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US%3Aen",
        "scroll": 2,
        "click" : '//*[@aria-label="Artificial intelligence"]',
        "include": ["^https://news.google.com/articles/"],
    },
    "HackerNoon": {
        "title": "HackerNoon - read, write and learn about any technology",
        "url": "https://hackernoon.com/",
        "include": ["^https://hackernoon.com/([^/])*$"],
        "exclude": [
            "^https://hackernoon.com/$",
            "^https://hackernoon.com/c$",
            "^https://hackernoon.com/coins$",
            "^https://hackernoon.com/companies$",
            "^https://hackernoon.com/gallery$",
            "^https://hackernoon.com/how-to-gain-followers-and-newsletter-subs-directly-on-hackernoon$",
            "^https://hackernoon.com/login$",
            "^https://hackernoon.com/reader-boot$",
            "^https://hackernoon.com/sitemap.xml$",
            "^https://hackernoon.com/startups$",
            "^https://hackernoon.com/techbeat$",
            "^https://hackernoon.com/why-i-write-on-hacker-noon-nl28335q$",
            "^https://hackernoon.com/writer-signup$",
        ],
    },
    "Ars Technica": {
        "title": "Ars Technica",
        "url": "https://arstechnica.com/",
        "include": ["^https://arstechnica.com/gadgets/(\d+)/(\d+)/"],
    },
    "The Register": {
        "title": "The Register_ Enterprise Technology News and Analysis",
        "url": "https://www.theregister.com/",
        "include": ["^https://www.theregister.com/(\d+)/(\d+)/(\d+)/"],
    },
    "Business Insider": {
        "title": "Tech - Business Insider",
        "url": "https://www.businessinsider.com/tech",
        "exclude": ["^https://www.insider.com", "^https://www.passionfroot.me"],
    },
}

sources_reverse = {v["title"]: k for k, v in sources.items()}


# Download HTML files from sources

In [5]:
# download files via selenium and firefox
outputdir = "htmldata"
delete_files(outputdir)

# Print the formatted time
print(datetime.now().strftime('%H:%M:%S'), "Starting", flush=True)

firefox_app_path = '/Applications/Firefox.app'
# Path to your geckodriver
geckodriver_path = '/Users/drucev/webdrivers/geckodriver'

# Set up Firefox options to use your existing profile
firefox_profile_path = '/Users/drucev/Library/Application Support/Firefox/Profiles/k8k0lcjj.default-release'
options = Options()
options.profile = firefox_profile_path

print(datetime.now().strftime('%H:%M:%S'), "Initialized profile", flush=True)

# Create a Service object with the path
service = Service(geckodriver_path)

print(datetime.now().strftime('%H:%M:%S'), "Initialized service", flush=True)
# Set up the Firefox driver
driver = webdriver.Firefox(service=service, options=options)

print(datetime.now().strftime('%H:%M:%S'), "Initialized webdriver", flush=True)
sleeptime = 10

for sourcename, sourcedict in sources.items():
    print(datetime.now().strftime('%H:%M:%S'), f'Processing {sourcename}', flush=True)
    title = sourcedict["title"]
    url = sourcedict["url"]
    scroll = sourcedict.get("scroll", 0)
    click = sourcedict.get("click")

    # Open the page
    driver.get(url)

    # Wait for the page to load
    time.sleep(sleeptime)  # Adjust the sleep time as necessary

    if click:
        print(datetime.now().strftime('%H:%M:%S'), f"Clicking on {click}", flush=True)
        button = driver.find_element(By.XPATH, click)
        if button:
            button.click()
            print(datetime.now().strftime('%H:%M:%S'), f"Clicked", flush=True)

    for _ in range(scroll):
        # scroll to bottom of infinite scrolling window
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        print(datetime.now().strftime('%H:%M:%S'), "Loading additional infinite scroll items", flush=True)
        time.sleep(sleeptime) # wait for it to load additional items

    # Get the HTML source of the page
    html_source = driver.page_source

    # check encoding, default utf-8
    encoding = "utf-8"  # Default to UTF-8 if not specified
    # Retrieve the content-type meta tag from the HTML
    try:
        meta_tag = driver.find_element(By.XPATH, "//meta[@http-equiv='Content-Type']")
        content_type = meta_tag.get_attribute("content")
        # Typical format is "text/html; charset=UTF-8"
        charset_start = content_type.find("charset=")
        if charset_start != -1:
            encoding = content_type[charset_start + 8:]
    except Exception as err:
        pass

    # Save the HTML to a local file
    datestr = datetime.now().strftime("%m_%d_%Y %I_%M_%S %p")
    outfile = f'{title} ({datestr}).html'
    print(datetime.now().strftime('%H:%M:%S'), f"Saving {outfile} as {encoding}", flush=True)
    with open(outputdir + "/" + outfile, 'w', encoding=encoding) as file:
        file.write(html_source)

# Close the browser
driver.quit()
print(datetime.now().strftime('%H:%M:%S'), "Quit webdriver", flush=True)


23:13:47 Starting
23:14:18 Initialized profile
23:14:18 Initialized service
23:15:22 Initialized webdriver
23:15:22 Processing Reddit
23:15:38 Loading additional infinite scroll items
23:15:48 Loading additional infinite scroll items
23:15:59 Saving top scoring links _ multi (03_24_2024 11_15_59 PM).html as utf-8
23:15:59 Processing Hacker News
23:16:09 Saving Hacker News Page 1 (03_24_2024 11_16_09 PM).html as utf-8
23:16:09 Processing Hacker News 2
23:16:19 Saving Hacker News Page 2 (03_24_2024 11_16_19 PM).html as utf-8
23:16:19 Processing Techmeme
23:16:31 Saving Techmeme (03_24_2024 11_16_31 PM).html as utf-8
23:16:31 Processing Feedly AI
23:16:42 Loading additional infinite scroll items
23:16:52 Loading additional infinite scroll items
23:17:02 Saving Discover and Add New Feedly AI Feeds (03_24_2024 11_17_02 PM).html as utf-8
23:17:02 Processing NYT Tech
23:17:14 Saving Technology - The New York Times (03_24_2024 11_17_14 PM).html as utf-8
23:17:14 Processing WSJ Tech
23:17:25 Sa

In [6]:
[os.path.join(download_dir, file) for file in os.listdir(download_dir)]

['htmldata/Tech - Business Insider (03_24_2024 11_19_19 PM).html',
 'htmldata/Google News - Technology - Artificial intelligence (03_24_2024 11_18_33 PM).html',
 'htmldata/Bloomberg Technology - Bloomberg (03_24_2024 11_17_37 PM).html',
 'htmldata/The Register_ Enterprise Technology News and Analysis (03_24_2024 11_19_08 PM).html',
 'htmldata/Hacker News Page 1 (03_24_2024 11_16_09 PM).html',
 'htmldata/HackerNoon - read, write and learn about any technology (03_24_2024 11_18_44 PM).html',
 'htmldata/Technology (03_24_2024 11_17_48 PM).html',
 'htmldata/top scoring links _ multi (03_24_2024 11_15_59 PM).html',
 'htmldata/Ars Technica (03_24_2024 11_18_58 PM).html',
 'htmldata/Technology - The New York Times (03_24_2024 11_17_14 PM).html',
 'htmldata/Hacker News Page 2 (03_24_2024 11_16_19 PM).html',
 'htmldata/Discover and Add New Feedly AI Feeds (03_24_2024 11_17_02 PM).html',
 'htmldata/Techmeme (03_24_2024 11_16_31 PM).html',
 'htmldata/Technology - WSJ.com (03_24_2024 11_17_25 PM).

In [7]:
# List all paths in the directory matching today's date
nfiles = 50

# Get the current date
today = datetime.now()
year, month, day = today.year, today.month, today.day

datestr = datetime.now().strftime("%m_%d_%Y")

# print(f"Year: {year}, Month: {month}, Day: {day}")

files = [os.path.join(download_dir, file) for file in os.listdir(download_dir)]
# filter files only
files = [file for file in files if os.path.isfile(file)]

# Sort files by modification time and take top 50
files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
file = files[:nfiles]

# filter files by with today's date ending in .html
files = [file for file in files if datestr in file and file.endswith(".html")]
print(len(files))
for file in files:
    print(file)

15
htmldata/Tech - Business Insider (03_24_2024 11_19_19 PM).html
htmldata/The Register_ Enterprise Technology News and Analysis (03_24_2024 11_19_08 PM).html
htmldata/Ars Technica (03_24_2024 11_18_58 PM).html
htmldata/HackerNoon - read, write and learn about any technology (03_24_2024 11_18_44 PM).html
htmldata/Google News - Technology - Artificial intelligence (03_24_2024 11_18_33 PM).html
htmldata/Technology - The Washington Post (03_24_2024 11_18_00 PM).html
htmldata/Technology (03_24_2024 11_17_48 PM).html
htmldata/Bloomberg Technology - Bloomberg (03_24_2024 11_17_37 PM).html
htmldata/Technology - WSJ.com (03_24_2024 11_17_25 PM).html
htmldata/Technology - The New York Times (03_24_2024 11_17_14 PM).html
htmldata/Discover and Add New Feedly AI Feeds (03_24_2024 11_17_02 PM).html
htmldata/Techmeme (03_24_2024 11_16_31 PM).html
htmldata/Hacker News Page 2 (03_24_2024 11_16_19 PM).html
htmldata/Hacker News Page 1 (03_24_2024 11_16_09 PM).html
htmldata/top scoring links _ multi (03_

In [8]:
# you need this if you have not-descriptive link titles like 'link', can get a page title from html or tags
def get_og_tags(url):
    """get a dict of Open Graph og: tags such as title in the HEAD of a URL"""
    retdict = {}
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            head = soup.head
            if head:
                og_tags = head.find_all(
                    property=lambda prop: prop and prop.startswith("og:")
                )
                for tag in og_tags:
                    if "content" in tag.attrs:
                        retdict[tag["property"]] = tag["content"]

                page_title = ""
                title_tag = soup.find("title")
                if title_tag:
                    page_title = title_tag.text
                    if page_title:
                        retdict["title"] = page_title
        return retdict
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
    return retdict


url = "https://www.euronews.com/next/2024/01/15/almost-40-of-jobs-around-the-world-will-be-impacted-by-ai-imf-chief-says"
get_og_tags(url)

{'og:locale': 'en-GB',
 'og:url': 'https://www.euronews.com/next/2024/01/15/almost-40-of-jobs-around-the-world-will-be-impacted-by-ai-imf-chief-says',
 'og:site_name': 'euronews',
 'og:type': 'article',
 'og:title': 'AI to impact 40% of jobs around the world, IMF chief says',
 'og:description': 'Kristalina Georgieva said now is the time to act to create a set of policies ensuring the impact of AI is beneficial not detrimental to humanity.',
 'og:image': 'https://static.euronews.com/articles/stories/08/17/12/08/1200x675_cmsv2_1bac2582-b418-5da9-80f9-6c4b6254606d-8171208.jpg',
 'og:image:width': '1200',
 'og:image:height': '675',
 'og:image:type': 'image/jpeg',
 'og:image:alt': 'Almost 40% of jobs around the world will be impacted by AI, IMF chief says',
 'og:locale:alternate': 'el-GR',
 'og:locale:alternate:url': 'https://www.euronews.com/next/2024/01/15/almost-40-of-jobs-around-the-world-will-be-impacted-by-ai-imf-chief-says',
 'title': 'Almost 40% of jobs around the world will be impa

In [9]:
def get_path_from_url(url):
    """
    Extracts the path following the top-level domain name from a URL.

    :param url: The URL string.
    :return: The path component of the URL.
    """
    parsed_url = urlparse(url)
    return parsed_url.path


# Example usage
example_url = "http://www.example.com/some/path?query=string"
path = get_path_from_url(example_url)
print(path)

/some/path


In [10]:
MODEL = "gpt-4-0125-preview"

MAX_INPUT_TOKENS = 3072
MAX_OUTPUT_TOKENS = 4096
MAX_RETRIES = 3
TEMPERATURE = 0

In [11]:
enc = tiktoken.encoding_for_model(MODEL)
assert enc.decode(enc.encode("hello world")) == "hello world"


def count_tokens(s):
    return len(enc.encode(s))


count_tokens("four score and 7 years go our forefathers brought forth")

13

In [12]:
def trimmed_href(l):
    """
    Trims everything in the string after a question mark such as a session ID.

    :param s: The input string.
    :return: The trimmed string.
    """
    # Find the position of the question mark
    s = l.get("href")
    if s:
        question_mark_index = s.find("?")

        # If a question mark is found, trim the string up to that point
        if question_mark_index != -1:
            return s[:question_mark_index]
        else:
            # Return the original string if no question mark is found
            return s
    else:
        return s

# Parse news URLs and titles from downloaded HTML

In [13]:
# parse all the URL that look like news articles
# into all_urls list of dicts with url, title, src
all_urls = []

for file in files:
    # Extract filename from path
    filename = os.path.basename(file)

    # Find the position of '1_14_2024' in the filename
    position = filename.find(" (" + datestr)
    basename = filename[:position]
#     print(basename)
#     if basename.startswith('Google News'):
#         pass
#     else:
#         continue

    sourcename = sources_reverse.get(basename)
    if sourcename is None:
        print(f"Skipping {basename}, no sourcename metadata")
        continue

    display(Markdown(f"# {sourcename}"))
    sources[sourcename]["latest"] = file

    # get contents
    with open(file, "r") as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all <a> tags
    if soup:
        links = soup.find_all("a")
    else:
        print(f"Skipping {sourcename}, unable to parse")

    # convert relative links to absolute links using base URL if present
    base_tag = soup.find('base')
    base_url = base_tag.get('href') if base_tag else sources[sourcename]["url"]
    for link in links:
#         print(link.get("href"))
        link["href"]= urljoin(base_url, link.get('href', ""))
#         print(link["href"])

#     print(len(links))
#     links = [l for l in links if l]
#     links = [l.strip() for l in links]

    print(len(links))

    for pattern in sources[sourcename].get("exclude", []):
        # filter links by exclusion pattern
#         print(pattern)
#         print([ l.get("href") for l in links])
        links = [
            l
            for l in links
            if l.get("href") is not None and not re.match(pattern, l.get("href"))
        ]
        # print(len(links))

    for pattern in sources[sourcename].get("include", []):
        # print(pattern, len(links))
        # filter links by inclusion pattern
        # print(pattern)
        # print(type(pattern))
        newlinks = []
        for l in links:
            href = l.get("href")
#             print(href)
            if href and re.match(pattern, href):
                newlinks.append(l)
        links = newlinks
        # links = [l for l in links if re.match(pattern, l.get("href"))]
        # print(len(links))

    # drop empty text
    links = [l for l in links if l.get_text(strip=True)]

    # drop empty url path, i.e. url = toplevel domain
    links = [l for l in links if len(get_path_from_url(trimmed_href(l))) > 1]
    # drop anything that is not http, like javascript: or mailto:
    links = [l for l in links if l.get("href") and l.get("href").startswith("http")]
    # drop some ArsTechnica links that are just the number of comments and dupe the primary link
    links = [l for l in links if not re.match("^(\d+)$", l.get_text(strip=True))]

    for l in links:
        url = trimmed_href(l)
        title = l.get_text(strip=True)
        if title == "LINK":
            # try to update title
            og_dict = get_og_tags(url)
            if og_dict.get("og:title"):
                title = og_dict.get("og:title")

        # skip some low quality links that don't have full headline, like link to a Twitter or Threads account
        if len(title) <= 28 and title != "LINK":
            continue

        all_urls.append({"title": title, "url": url, "src": sourcename})
#         display(Markdown(f"[{title}]({url})"))

    print(len(links))
    print()

    # for p in pages:
    #     print(p)

# Business Insider

334
250



# The Register

199
87



# Ars Technica

252
5



# HackerNoon

532
77



# Google News

891
390



# WaPo Tech

163
38



# FT Tech

458
110



# Bloomberg Tech

302
51



# WSJ Tech

520
12



# NYT Tech

75
17



# Feedly AI

263
241



# Techmeme

350
148



# Hacker News 2

261
27



# Hacker News

256
27



# Reddit

642
43



In [14]:
# make a pandas dataframe
orig_df = (
    pd.DataFrame(all_urls)
    .groupby("url")
    .first()
    .reset_index()
    .sort_values("src")[["src", "title", "url"]]
    .reset_index(drop=True)
    .reset_index(drop=False)
    .rename(columns={"index": "id"})
)
orig_df

Unnamed: 0,id,src,title,url
0,0,Ars Technica,"Android 15 gets satellite messaging, starts fo...",https://arstechnica.com/gadgets/2024/03/androi...
1,1,Ars Technica,Windows Notepad’s midlife renaissance continue...,https://arstechnica.com/gadgets/2024/03/window...
2,2,Ars Technica,"Samsung users ask, “Why does the S-Pen smell s...",https://arstechnica.com/gadgets/2024/03/users-...
3,3,Ars Technica,Microsoft debuts major Surface overhauls that ...,https://arstechnica.com/gadgets/2024/03/micros...
4,4,Ars Technica,AMD promises big upscaling improvements and a ...,https://arstechnica.com/gadgets/2024/03/amd-pr...
...,...,...,...,...
992,992,WaPo Tech,How to use your smartphone to photograph the s...,https://www.washingtonpost.com/technology/2024...
993,993,WaPo Tech,"Landlines are dying out. But to some, they’re ...",https://www.washingtonpost.com/technology/2024...
994,994,WaPo Tech,"Trump Media merger wins investor approval, net...",https://www.washingtonpost.com/technology/2024...
995,995,WaPo Tech,"Trump Media, launched after an insurrection, f...",https://www.washingtonpost.com/technology/2024...


In [15]:
# filter ones not seen before
conn = sqlite3.connect('articles.db')

# Retrieve all URLs from the SQLite table
existing_urls = pd.read_sql_query("SELECT url FROM news_articles", conn)

# Close the SQLite connection
conn.close()

# Convert the URLs to a list for easier comparison
existing_urls_list = existing_urls['url'].tolist()

# Filter the original DataFrame
# Keep rows where the URL is not in the existing_urls_list
filtered_df = orig_df[~orig_df['url'].isin(existing_urls_list)]


In [16]:
len(existing_urls_list)

26617

In [17]:
len(filtered_df)

226

# Filter AI-related headlines using a prompt to OpenAI

In [18]:
# make pages that fit in a reasonably sized prompt
MAXPAGELEN = 50
pages = []
current_page = []
pagelength = 0

for row in filtered_df.itertuples():
    curlink = {"id": row.Index, "title": row.title}
    curlength = count_tokens(json.dumps(curlink))
    # Check if adding the current string would exceed the limit
    if len(current_page) >= MAXPAGELEN or pagelength + curlength > MAX_INPUT_TOKENS:
        # If so, start a new page
        pages.append(current_page)
        current_page = [curlink]
        pagelength = curlength
    else:
        # Otherwise, add the string to the current page
        current_page.append(curlink)
        pagelength += curlength

# add the last page if it's not empty
if current_page:
    pages.append(current_page)

len(pages)

5

In [19]:
def get_response_json(
    client,
    messages,
    verbose=False,
    model=MODEL,
    # max_input_tokens=MAX_INPUT_TOKENS,
    max_output_tokens=MAX_OUTPUT_TOKENS,
    max_retries=MAX_RETRIES,
    temperature=TEMPERATURE,
):
    if type(messages) != list:  # allow passing one string for convenience
        messages = [{"role": "user", "content": messages}]

    if verbose:
        print("\n".join([str(msg) for msg in messages]))

    # truncate number of tokens
    # retry loop, have received untrapped 500 errors like too busy
    for i in range(max_retries):
        if i > 0:
            print(f"Attempt {i+1}...")
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                temperature=0,
                max_tokens=max_output_tokens,
                response_format={"type": "json_object"},
            )
            # no exception thrown
            return response
        except Exception as error:
            print(f"An exception occurred on attempt {i+1}:", error)
            time.sleep(5)
            continue  # try again
        # retries exceeded if you got this far
    print("Retries exceeded.")
    return None


# messages = [
#     {
#         "role": "system",
#         "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair.",
#     },
#     {
#         "role": "user",
#         "content": "Compose a poem that explains the concept of recursion in programming, returning each verse as a json object.",
#     },
# ]

# response = get_response_json(client, messages)
# response

In [20]:
models = sorted(openai.models.list(), key=lambda m: m.created)
models

[Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal'),
 Model(id='whisper-1', created=1677532384, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo', created=1677610602, object='model', owned_by='openai'),
 Model(id='gpt-3.5-turbo-0301', created=1677649963, object='model', owned_by='openai'),
 Model(id='tts-1', created=1681940951, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo-16k', created=1683758102, object='model', owned_by='openai-internal'),
 Model(id='gpt-3.5-turbo-16k-0613', created=1685474247, object='model', owned_by='openai'),
 Model(id='gpt-3.5-turbo-0613', created=1686587434, object='model', owned_by='openai'),
 Model(id='gpt-4-0613', created=1686588896, object='model', owned_by='openai'),
 Model(id='gpt-4', created=1687882411, object='model', owned_by='openai'),
 Model(id='davinci-002', created=1692634301, object='model', owned_by='system'),
 Model(id='babbage-002', created=1692634615,

In [21]:
prompt = """
You will act as a research assistant classifying news stories as related to artificial intelligence (AI) or unrelated to AI.

Your task is to read JSON format objects from an input list of news stories using the schema below delimited by |, and output JSON format objects for each using the schema below delimited by ~.

Define a list of objects representing news stories in JSON format as in the following example:
|
{'stories':
[{'id': 97, 'title': 'AI to predict dementia, detect cancer'},
 {'id': 103,'title': 'Figure robot learns to make coffee by watching humans for 10 hours'},
 {'id': 103,'title': 'Baby trapped in refrigerator eats own foot'},
 {'id': 210,'title': 'ChatGPT removes, then reinstates a summarization assistant without explanation.'},
 {'id': 298,'title': 'The 5 most interesting PC monitors from CES 2024'},
 ]
}
|

Based on the title, you will classify each story as being about AI or not.

For each object, you will output the input id field, and a field named isAI which is true if the input title is about AI and false if the input title is not about AI.

When extracting information please make sure it matches the JSON format below exactly. Do not output any attributes that do not appear in the schema below.
~
{'stories':
[{'id': 97, 'isAI': true},
 {'id': 103, 'isAI': true},
 {'id': 103, 'isAI': false},
 {'id': 210, 'isAI': true},
 {'id': 298, 'isAI': false}]
}
~

You may interpret the term AI broadly as pertaining to
- machine learning models
- large language models
- robotics
- reinforcement learning
- computer vision
- OpenAI
- ChatGPT
- other closely related topics.

You will return an array of valid JSON objects.

The field 'id' in the output must match the field 'id' in the input EXACTLY.

The field 'isAI' must be either true or false.

The list of news stories to classify and enrich is:


"""

In [22]:
pages[0][0]

{'id': 20,
 'title': 'Popular YouTube Channel Ryan’s World Pulls a Taylor Swift With New Movie'}

In [23]:
responses = []
enriched_urls = []
for i, p in enumerate(pages):
    print(
        f"{datetime.now().strftime('%H:%M:%S')} send page {i+1} of {len(pages)}, {len(p)} items "
    )
    # print(prompt + json.dumps(p))
    response = get_response_json(client, prompt + json.dumps(p))
    responses.append(response.choices[0].message.content)
    retval = json.loads(responses[-1])
    retlist = []
    # usually comes back as a dict with a single arbitrary key like "stories" with a list value
    if type(retval) == dict:
        for k, v in retval.items():
            if type(v) == list:
                retlist.extend(v)
            else:
                retlist.append(v)
        print(
            f"{datetime.now().strftime('%H:%M:%S')} got dict with {len(retlist)} items "
        )
    elif type(retval) == list:  # in case it comes back as a list
        retlist = retval
        print(
            f"{datetime.now().strftime('%H:%M:%S')} got list with {len(retlist)} items "
        )
    else:
        print(str(type(retval)))
    enriched_urls.extend(retlist)

23:19:28 send page 1 of 5, 50 items 
23:20:10 got dict with 50 items 
23:20:10 send page 2 of 5, 50 items 
23:20:40 got dict with 50 items 
23:20:40 send page 3 of 5, 50 items 
23:21:05 got dict with 50 items 
23:21:05 send page 4 of 5, 50 items 
23:21:42 got dict with 50 items 
23:21:42 send page 5 of 5, 26 items 
23:21:59 got dict with 26 items 


In [24]:
enriched_df = pd.DataFrame(enriched_urls)
enriched_df.head()

Unnamed: 0,id,isAI
0,20,False
1,22,False
2,48,False
3,59,False
4,68,False


In [25]:
print("isAI", len(enriched_df.loc[enriched_df["isAI"]]))
print("not isAI", len(enriched_df.loc[~enriched_df["isAI"]]))

isAI 72
not isAI 154


In [26]:
merged_df = pd.merge(filtered_df, enriched_df, on="id", how="outer")
merged_df['date']=datetime.now().date()
merged_df.head()

Unnamed: 0,id,src,title,url,isAI,date
0,20,Bloomberg Tech,Popular YouTube Channel Ryan’s World Pulls a T...,https://www.bloomberg.com/news/newsletters/202...,False,2024-03-24
1,22,Bloomberg Tech,"BYD Takes On EV Laggards Toyota, VW With Steep...",https://www.bloomberg.com/news/articles/2024-0...,False,2024-03-24
2,48,Business Insider,The US government is running out of funding fo...,https://www.businessinsider.com/digital-divide...,False,2024-03-24
3,59,Business Insider,Grimes linked to new love interest DJ Anyma as...,https://www.businessinsider.com/meet-grimes-ne...,False,2024-03-24
4,68,Business Insider,Trump's Truth Social has only 5 million member...,https://www.businessinsider.com/trumps-social-...,False,2024-03-24


In [27]:
# ideally should be empty, shouldn't get back rows that don't match to existing
merged_df.loc[merged_df["src"].isna()]

Unnamed: 0,id,src,title,url,isAI,date


In [28]:
# ideally should be empty, should get back all rows from orig
merged_df.loc[merged_df["isAI"].isna()]

Unnamed: 0,id,src,title,url,isAI,date


In [29]:
# # Connect to SQLite database
conn = sqlite3.connect('articles.db')
cursor = conn.cursor()

# # Create table with a date column
# cursor.execute('''
# CREATE TABLE IF NOT EXISTS news_articles (
#     id INTEGER PRIMARY KEY,
#     src TEXT,
#     title TEXT,
#     url TEXT UNIQUE,
#     isAI BOOLEAN,
#     article_date DATE
# )
# ''')
# conn.commit()
# conn.close()

In [30]:
# Function to insert a new article
def insert_article(cursor, src, title, url, isAI, article_date):
    try:
        cursor.execute("INSERT OR IGNORE INTO news_articles (src, title, url, isAI, article_date) VALUES (?, ?, ?, ?, ?)",
                       (src, title, url, isAI, article_date))
        conn.commit()
    except sqlite3.IntegrityError:
        print(f"Duplicate entry for URL: {url}")
    except Exception as err:
        print(err)

In [31]:
pd.read_sql_query("select count(*) from news_articles", conn)


Unnamed: 0,count(*)
0,26617


In [32]:
for row in merged_df.itertuples():
    # print(row)
    insert_article(cursor, row.src, row.title, row.url, row.isAI, row.date)



In [33]:
pd.read_sql_query("select count(*) from news_articles", conn)


Unnamed: 0,count(*)
0,26843


In [34]:
df = pd.read_sql_query("select * from news_articles", conn)
df


Unnamed: 0,id,src,title,url,isAI,article_date
0,1,Ars Technica,OnePlus 12 gets $800 US release along with the...,https://arstechnica.com/gadgets/2024/01/oneplu...,0,2024-01-24
1,2,Ars Technica,Chrome can now organize your tab bar for you,https://arstechnica.com/gadgets/2024/01/chrome...,0,2024-01-24
2,3,Ars Technica,HP CEO evokes James Bond-style hack via ink ca...,https://arstechnica.com/gadgets/2024/01/hp-ceo...,0,2024-01-24
3,4,Ars Technica,iOS 17.3 adds multiple features originally pla...,https://arstechnica.com/gadgets/2024/01/ios-17...,0,2024-01-24
4,5,Ars Technica,Wild Apples: The 12 weirdest and rarest Macs e...,https://arstechnica.com/gadgets/2024/01/macint...,0,2024-01-24
...,...,...,...,...,...,...
26838,26839,Techmeme,"Analysis: Jeff Bezos, Peter Thiel, and Mark Zu...",https://t.co/Ej0ZiQZel3,0,2024-03-24
26839,26840,The Register,Samsung preps inferencing accelerator to take ...,https://www.theregister.com/2024/03/24/asia_te...,1,2024-03-24
26840,26841,The Register,"Some 300,000 IPs vulnerable to this Loop DoS a...",https://www.theregister.com/2024/03/24/loop_ip...,0,2024-03-24
26841,26842,The Register,Microsoft confirms memory leak in March Window...,https://www.theregister.com/2024/03/25/microso...,0,2024-03-24


In [35]:
len(merged_df.loc[merged_df["isAI"]])

72

In [36]:
for row in merged_df.loc[merged_df["isAI"]].reset_index().itertuples():
    display(Markdown(f"[{row.index}. {row.title} - {row.src}]({row.url})"))

[7. AI execs who urgently need more energy to power their tech revolution are turning to fossil fuels - Business Insider](https://www.businessinsider.com/ai-energy-electricity-data-centers-fossil-fuels-tech-revolution-2024-3)

[16. Filipino journalist upends industry’s AI doomsday fears with custom chatbot to aid in-depth reporting - Feedly AI](https://www.scmp.com/week-asia/people/article/3256522/filipino-journalist-upends-industrys-ai-doomsday-fears-custom-chatbot-aid-depth-reporting)

[18. 3 HotArtificial Intelligence(AI) Stocks to Buy With $1,000 and Hold Forever - Feedly AI](https://finance.yahoo.com/news/3-hot-artificial-intelligence-ai-110000760.html)

[19. How human-like are some robots? - Feedly AI](https://flipboard.com/video/dw/bd7f6277aa)

[20. Tiny-autodiff: A tiny autograd library in D - Feedly AI](https://code.dlang.org/packages/tiny-autodiff)

[21. Google bringing Gemini AI to iPhone is so infuriating, but not surprising - Feedly AI](https://bgr.com/business/google-bringing-gemini-ai-to-iphone-is-so-infuriating-but-not-surprising/)

[22. Four generative AI cyber risks that keep CISOs up at night — and how to combat them - SiliconANGLE - Feedly AI](https://siliconangle.com/2024/03/24/four-generative-ai-cyber-risks-keep-cisos-night-combat/)

[23. Explorer: Exploration-Guided Reasoning for Textual Reinforcement Learning - Feedly AI](https://arxiv.org/abs/2403.10692)

[24. What Is Claude 3? Is it better than ChatGPT? - Feedly AI](https://readwrite.com/what-is-claude-3-is-it-better-than-chatgpt/)

[26. iPhone 16 Pro: New A18 Pro chip to offer powerful on-device AI performance - Feedly AI](https://9to5mac.com/2024/03/24/iphone-16-pro-a18-pro-chip-artificial-intelligence/)

[27. OpenAI Showing Off Video-Generating System to Hollywood Directors - Feedly AI](https://futurism.com/the-byte/openai-video-sora-hollywood)

[28. Scientists Gave AI an "Inner Monologue" and Something Fascinating Happened - Feedly AI](https://futurism.com/the-byte/ai-inner-monologue)

[30. Nvidia has a parade of partners, but these two stocks are my favorite beneficiaries - Feedly AI](https://www.cnbc.com/2024/03/24/jim-cramer-amazon-and-dell-are-benefiting-the-most-from-nvidia-partnership.html)

[31. LWiAI Podcast #160 - Nvidia's new GPU, Microsoft pays for Inflection AI, Grok-1 open sourced, Jeremie's Action Plan - Feedly AI](https://lastweekin.ai/p/lwiai-podcast-160-nvidias-new-gpu)

[32. Gemini on the iPhone would be AI's mainstream moment - Feedly AI](https://www.cnet.com/tech/mobile/google-gemini-on-the-iphone-would-be-ais-mainstream-moment/)

[33. How ML Model Data Poisoning Works in 5 Minutes - Feedly AI](https://journal.hexmos.com/training-data-poisoning/)

[34. How ChatGPT Can Help Coders Earn Big in 2024; Check out - Feedly AI](https://www.dnpindia.in/technology/how-chatgpt-can-help-coders-earn-big-in-2024-check-out/378111/)

[36. Sam Altman's Under-The-Radar SPAC Fuses AI Expertise With Nuclear Energy: Here Are The Others Involved - Feedly AI](https://www.benzinga.com/trading-ideas/long-ideas/24/03/37906984/sam-altmans-under-the-radar-spac-fuses-ai-expertise-with-nuclear-energy-here-are-the-oth)

[39. Samsung preps inferencing accelerator to take on Nvidia, scores huge sale - Feedly AI](https://go.theregister.com/feed/www.theregister.com/2024/03/24/asia_tech_news_roundup/)

[40. How to build an OpenAI-compatible API - Feedly AI](https://towardsdatascience.com/how-to-build-an-openai-compatible-api-87c8edea2f06)

[41. Companies are about to waste billions on AI — here’s how not to become one of them - Feedly AI](https://venturebeat.com/ai/companies-are-about-to-waste-billions-on-ai-heres-how-not-to-become-one-of-them/)

[42. Deepfake Kari Lake video shows coming chaos of AI in elections - Feedly AI](https://www.adn.com/nation-world/2024/03/24/deepfake-kari-lake-video-shows-coming-chaos-of-ai-in-elections/)

[43. Voice-Controlled AI Gadgets Could Fix the Worst Part of Using a Computer - Feedly AI](https://www.inverse.com/tech/open-interpreter-01-light-ai-control-your-computer-for-you)

[44. I was there for the dot-com bust. Here's why the AI boom isn't the same. - Google News](https://news.google.com/articles/CBMicmh0dHBzOi8vZmluYW5jZS55YWhvby5jb20vbmV3cy9pLXdhcy10aGVyZS1mb3ItdGhlLWRvdC1jb20tYnVzdC1oZXJlcy13aHktdGhlLWFpLWJvb20taXNudC10aGUtc2FtZS0xNzU5NDQ2ODAuaHRtbNIBAA)

[47. Apple's Tim Cook Believes AI is an Essential Tool for Climate Action - Google News](https://news.google.com/articles/CBMicGh0dHBzOi8vd3d3LnRlY2h0aW1lcy5jb20vYXJ0aWNsZXMvMzAyODg2LzIwMjQwMzI0L2FwcGxlcy10aW0tY29vay1iZWxpZXZlcy1haS1lc3NlbnRpYWwtdG9vbC1jbGltYXRlLWFjdGlvbi5odG3SAQA)

[50. AI shows Deadpool & Wolverine celebrating Holi in India! See pics - Google News](https://news.google.com/articles/CBMicGh0dHBzOi8vd3d3LmJvbGx5d29vZGh1bmdhbWEuY29tL25ld3MvaW50ZXJuYXRpb25hbC9haS1zaG93cy1kZWFkcG9vbC13b2x2ZXJpbmUtY2VsZWJyYXRpbmctaG9saS1pbmRpYS1zZWUtcGljcy_SAXRodHRwczovL3d3dy5ib2xseXdvb2RodW5nYW1hLmNvbS9hbXAvbmV3cy9pbnRlcm5hdGlvbmFsL2FpLXNob3dzLWRlYWRwb29sLXdvbHZlcmluZS1jZWxlYnJhdGluZy1ob2xpLWluZGlhLXNlZS1waWNzLw)

[53. A Psychologist Explains Why It’s Possible To Fall In Love With AI - Google News](https://news.google.com/articles/CBMidWh0dHBzOi8vd3d3LmZvcmJlcy5jb20vc2l0ZXMvdHJhdmVyc21hcmsvMjAyNC8wMy8yNC9hLXBzeWNob2xvZ2lzdC1leHBsYWlucy13aHktaXRzLXBvc3NpYmxlLXRvLWZhbGwtaW4tbG92ZS13aXRoLWFpL9IBeWh0dHBzOi8vd3d3LmZvcmJlcy5jb20vc2l0ZXMvdHJhdmVyc21hcmsvMjAyNC8wMy8yNC9hLXBzeWNob2xvZ2lzdC1leHBsYWlucy13aHktaXRzLXBvc3NpYmxlLXRvLWZhbGwtaW4tbG92ZS13aXRoLWFpL2FtcC8)

[55. Stability AI CEO Resigns and Calls for 'Transparent' AI Governance - Google News](https://news.google.com/articles/CBMid2h0dHBzOi8vd3d3LnB5bW50cy5jb20vYXJ0aWZpY2lhbC1pbnRlbGxpZ2VuY2UtMi8yMDI0L3N0YWJpbGl0eS1haS1jZW8tcmVzaWducy1hbmQtY2FsbHMtZm9yLXRyYW5zcGFyZW50LWFpLWdvdmVybmFuY2Uv0gEA)

[62. CDOs, data science heads to fill Chief AI Officer positions in India - Google News](https://news.google.com/articles/CBMibGh0dHBzOi8vd3d3LmNpby5jb20vYXJ0aWNsZS8yMDY5ODQ5L2Nkb3MtZGF0YS1zY2llbmNlLWhlYWRzLXRvLWZpbGwtY2hpZWYtYWktb2ZmaWNlci1wb3NpdGlvbnMtaW4taW5kaWEuaHRtbNIBbGh0dHBzOi8vd3d3LmNpby5jb20vYXJ0aWNsZS8yMDY5ODQ5L2Nkb3MtZGF0YS1zY2llbmNlLWhlYWRzLXRvLWZpbGwtY2hpZWYtYWktb2ZmaWNlci1wb3NpdGlvbnMtaW4taW5kaWEuaHRtbA)

[66. NVIDIA unveils LATTE3D text-to-3D generative AI model dubbed “virtual 3D printer” - Google News](https://news.google.com/articles/CBMie2h0dHBzOi8vd3d3Lm5vdGVib29rY2hlY2submV0L05WSURJQS11bnZlaWxzLUxBVFRFM0QtdGV4dC10by0zRC1nZW5lcmF0aXZlLUFJLW1vZGVsLWR1YmJlZC12aXJ0dWFsLTNELXByaW50ZXIuODE2MjQ4LjAuaHRtbNIBAA)

[73. 3 Billionaires Are Selling Artificial Intelligence (AI) Stock Nvidia and Buying These 10 AI Stocks Instead - Google News](https://news.google.com/articles/CBMivgFodHRwczovL3d3dy50aGVnbG9iZWFuZG1haWwuY29tL2ludmVzdGluZy9tYXJrZXRzL3N0b2Nrcy9HT09HL3ByZXNzcmVsZWFzZXMvMjQ5NDc3MDIvMy1iaWxsaW9uYWlyZXMtYXJlLXNlbGxpbmctYXJ0aWZpY2lhbC1pbnRlbGxpZ2VuY2UtYWktc3RvY2stbnZpZGlhLWFuZC1idXlpbmctdGhlc2UtMTAtYWktc3RvY2tzLWluc3RlYWQv0gEA)

[74. Health equity is 'the greatest challenge with AI' - Google News](https://news.google.com/articles/CBMieGh0dHBzOi8vd3d3LmNoaWVmaGVhbHRoY2FyZWV4ZWN1dGl2ZS5jb20vdmlldy9haS1jb3VsZC1jaGFuZ2UtaGVhbHRoY2FyZS1idXQtdXNlZC1pbmNvcnJlY3RseS1pdC1jb3VsZC13aWRlbi1kaXNwYXJpdGllc9IBAA)

[76. Chinese scientists say AI model can mark the best spots for solar panels - Google News](https://news.google.com/articles/CBMiiwFodHRwczovL3d3dy5zY21wLmNvbS9uZXdzL2NoaW5hL3NjaWVuY2UvYXJ0aWNsZS8zMjU2NTE1L2NoaW5lc2Utc2NpZW50aXN0cy1zYXktdGhlaXItYWktbW9kZWwtY2FuLW1hcmstYmVzdC1zcG90cy1kb3VibGUtZmFjZWQtc29sYXItcGFuZWxz0gGLAWh0dHBzOi8vYW1wLnNjbXAuY29tL25ld3MvY2hpbmEvc2NpZW5jZS9hcnRpY2xlLzMyNTY1MTUvY2hpbmVzZS1zY2llbnRpc3RzLXNheS10aGVpci1haS1tb2RlbC1jYW4tbWFyay1iZXN0LXNwb3RzLWRvdWJsZS1mYWNlZC1zb2xhci1wYW5lbHM)

[84. Meet Pretzel: An AI Dev Startup with an Open-Source, Offline Browser-based Tool and AI-Native Alternative to Jupyter Notebooks - Google News](https://news.google.com/articles/CBMipQFodHRwczovL3d3dy5tYXJrdGVjaHBvc3QuY29tLzIwMjQvMDMvMjQvbWVldC1wcmV0emVsLWFuLWFpLWRldi1zdGFydHVwLXdpdGgtYW4tb3Blbi1zb3VyY2Utb2ZmbGluZS1icm93c2VyLWJhc2VkLXRvb2wtYW5kLWFpLW5hdGl2ZS1hbHRlcm5hdGl2ZS10by1qdXB5dGVyLW5vdGVib29rcy_SAakBaHR0cHM6Ly93d3cubWFya3RlY2hwb3N0LmNvbS8yMDI0LzAzLzI0L21lZXQtcHJldHplbC1hbi1haS1kZXYtc3RhcnR1cC13aXRoLWFuLW9wZW4tc291cmNlLW9mZmxpbmUtYnJvd3Nlci1iYXNlZC10b29sLWFuZC1haS1uYXRpdmUtYWx0ZXJuYXRpdmUtdG8tanVweXRlci1ub3RlYm9va3MvP2FtcA)

[91. Bankers Are Lining Up Buyers for FTX’s 8% Stake in AI Startup Anthropic: Report - Google News](https://news.google.com/articles/CBMieGh0dHBzOi8vd3d3LmNvaW5kZXNrLmNvbS9wb2xpY3kvMjAyNC8wMy8yMi9iYW5rZXJzLWFyZS1saW5pbmctdXAtYnV5ZXJzLWZvci1mdHhzLTgtc3Rha2UtaW4tYWktc3RhcnR1cC1hbnRocm9waWMtcmVwb3J0L9IBfGh0dHBzOi8vd3d3LmNvaW5kZXNrLmNvbS9wb2xpY3kvMjAyNC8wMy8yMi9iYW5rZXJzLWFyZS1saW5pbmctdXAtYnV5ZXJzLWZvci1mdHhzLTgtc3Rha2UtaW4tYWktc3RhcnR1cC1hbnRocm9waWMtcmVwb3J0L2FtcC8)

[92. Anthropic weighs slate of sovereign wealth funds to acquire FTX's $1 bln stake, CNBC reports - Google News](https://news.google.com/articles/CBMiemh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3RlY2hub2xvZ3kvYW50aHJvcGljLXdlaWdocy1zbGF0ZS1zb3ZlcmVpZ24td2VhbHRoLWZ1bmRzLWFjcXVpcmUtZnR4cy0xLWJsbi1zdGFrZS1jbmJjLTIwMjQtMDMtMjIv0gEA)

[96. FTX to offload $1B Anthropic stake to pay off bankruptcy debts within weeks — report - Google News](https://news.google.com/articles/CBMihQFodHRwczovL2NvaW50ZWxlZ3JhcGguY29tL25ld3MvZnR4LXRvLW9mZmxvYWQtMS1iaWxsaW9uLWFpLWFydGlmaWNpYWwtaW50ZWxsaWdlbmNlLXNhbS1iYW5rbWFuLWZyaWVkLWFudGhyb3BpYy1zdGFrZS1iYW5rcnVwdGN5LWRlYnRz0gEA)

[97. The AI boom makes millions for an unlikely industry player: Anguilla - Google News](https://news.google.com/articles/CBMihAFodHRwczovL3d3dy5idXNpbmVzcy1zdGFuZGFyZC5jb20vd29ybGQtbmV3cy90aGUtYWktYm9vbS1tYWtlcy1taWxsaW9ucy1mb3ItYW4tdW5saWtlbHktaW5kdXN0cnktcGxheWVyLWFuZ3VpbGxhLTEyNDAzMjQwMDcxOV8xLmh0bWzSAQA)

[100. The small Caribbean island making a fortune from artificial intelligence - Google News](https://news.google.com/articles/CBMifmh0dHBzOi8vZW5nbGlzaC5lbHBhaXMuY29tL3RlY2hub2xvZ3kvMjAyNC0wMy0xNi90aGUtc21hbGwtY2FyaWJiZWFuLWlzbGFuZC1tYWtpbmctYS1mb3J0dW5lLWZyb20tYXJ0aWZpY2lhbC1pbnRlbGxpZ2VuY2UuaHRtbNIBAA)

[101. United Nations Artificial Intelligence | Nation/World | bdtonline.com - Google News](https://news.google.com/articles/CBMiggFodHRwczovL3d3dy5iZHRvbmxpbmUuY29tL25ld3MvbmF0aW9uX3dvcmxkL3VuaXRlZC1uYXRpb25zLWFydGlmaWNpYWwtaW50ZWxsaWdlbmNlL2ltYWdlXzk5Mjc0ZWE4LTExMmEtNTg2OC1hMmY5LTk4MTJiYmQ3MDY1OS5odG1s0gEA)

[102. Sam Altman's Under-The-Radar SPAC Fuses AI Expertise With Nuclear Energy: Here Are The Others Involved - - Google News](https://news.google.com/articles/CBMimQFodHRwczovL3d3dy5iZW56aW5nYS5jb20vdHJhZGluZy1pZGVhcy9sb25nLWlkZWFzLzI0LzAzLzM3OTA2OTg0L3NhbS1hbHRtYW5zLXVuZGVyLXRoZS1yYWRhci1zcGFjLWZ1c2VzLWFpLWV4cGVydGlzZS13aXRoLW51Y2xlYXItZW5lcmd5LWhlcmUtYXJlLXRoZS1vdGjSAS1odHRwczovL3d3dy5iZW56aW5nYS5jb20vYW1wL2NvbnRlbnQvMzc5MDY5ODQ)

[103. The first patient with Elon Musk's chip in his brain and his incredible reaction: It's crazy - Google News](https://news.google.com/articles/CBMiTGh0dHBzOi8vd3d3Lm1hcmNhLmNvbS9lbi90ZWNobm9sb2d5LzIwMjQvMDMvMjQvNjYwMDUzMTBlMjcwNGVkMzQ3OGI0NTliLmh0bWzSAUxodHRwczovL2FtcC5tYXJjYS5jb20vZW4vdGVjaG5vbG9neS8yMDI0LzAzLzI0LzY2MDA1MzEwZTI3MDRlZDM0NzhiNDU5Yi5odG1s)

[106. Is Nvidia AI Foundry a Threat to Palantir Stock? - Google News](https://news.google.com/articles/CBMiT2h0dHBzOi8vd3d3Lm5hc2RhcS5jb20vYXJ0aWNsZXMvaXMtbnZpZGlhLWFpLWZvdW5kcnktYS10aHJlYXQtdG8tcGFsYW50aXItc3RvY2vSAQA)

[112. This Artificial Intelligence Giant Could Be the Next to Join the "Magnificent Seven" - Google News](https://news.google.com/articles/CBMiV2h0dHBzOi8vd3d3LmZvb2wuY29tL2ludmVzdGluZy8yMDI0LzAzLzIwL3RoaXMtYWktZ2lhbnQtbWF5LWpvaW4tdGhlLW1hZ25pZmljZW50LXNldmVuL9IBAA)

[114. iPhone 16 Pro: New A18 Pro chip to offer powerful on-device AI performance - Google News](https://news.google.com/articles/CBMiUmh0dHBzOi8vOXRvNW1hYy5jb20vMjAyNC8wMy8yNC9pcGhvbmUtMTYtcHJvLWExOC1wcm8tY2hpcC1hcnRpZmljaWFsLWludGVsbGlnZW5jZS_SAQA)

[115. Unveiling New Physics With AI-Powered Particle Tracking - Google News](https://news.google.com/articles/CBMiUWh0dHBzOi8vc2NpdGVjaGRhaWx5LmNvbS91bnZlaWxpbmctbmV3LXBoeXNpY3Mtd2l0aC1haS1wb3dlcmVkLXBhcnRpY2xlLXRyYWNraW5nL9IBAA)

[123. New Neural Model Enables AI-to-AI Linguistic Communication - Google News](https://news.google.com/articles/CBMiUGh0dHBzOi8vd3d3LnVuaXRlLmFpL25ldy1uZXVyYWwtbW9kZWwtZW5hYmxlcy1haS10by1haS1saW5ndWlzdGljLWNvbW11bmljYXRpb24v0gEA)

[125. Video Shows First Neuralink Patient Playing Mario Kart With His Mind - Google News](https://news.google.com/articles/CBMiOmh0dHBzOi8vZnV0dXJpc20uY29tL25lb3Njb3BlL25ldXJhbGluay1wYXRpZW50LW1hcmlvLWthcnTSAQA)

[126. Revolutionizing healthcare, Israel hosts first-ever AI in medicine conference - Google News](https://news.google.com/articles/CBMiOWh0dHBzOi8vd3d3LnluZXRuZXdzLmNvbS9oZWFsdGhfc2NpZW5jZS9hcnRpY2xlL2JqejV3MmEwdNIBAA)

[128. Stable Diffusion CEO Resigns as Company Crumbles - Google News](https://news.google.com/articles/CBMiNmh0dHBzOi8vZnV0dXJpc20uY29tL3RoZS1ieXRlL3N0YWJpbGl0eS1haS1jZW8tcmVzaWduc9IBAA)

[130. Doctors Using AI to Automatically Generate Clinical Notes on Patients - Google News](https://news.google.com/articles/CBMiN2h0dHBzOi8vZnV0dXJpc20uY29tL25lb3Njb3BlL2RvY3RvcnMtYWktY2xpbmljYWwtbm90ZXPSAQA)

[132. Grokking X.ai’s Grok—Real Advance or Just Real Troll? - Google News](https://news.google.com/articles/CBMiMWh0dHBzOi8vc3BlY3RydW0uaWVlZS5vcmcvb3Blbi1zb3VyY2UtYWktZ3Jvay1sbG3SAUBodHRwczovL3NwZWN0cnVtLmllZWUub3JnL2FtcC9vcGVuLXNvdXJjZS1haS1ncm9rLWxsbS0yNjY3NTYxNjEz)

[142. What Does a Chief AI Officer Look Like in Marketing? - Google News](https://news.google.com/articles/CBMiQmh0dHBzOi8vd3d3LmFkd2Vlay5jb20vYnJhbmQtbWFya2V0aW5nL2NoaWVmLWFpLW9mZmljZXItbWFya2V0aW5nL9IBAA)

[151. Fujitsu taps generative AI to help speed drug development - Google News](https://news.google.com/articles/CBMiZWh0dHBzOi8vYXNpYS5uaWtrZWkuY29tL0J1c2luZXNzL1RlY2hub2xvZ3kvRnVqaXRzdS10YXBzLWdlbmVyYXRpdmUtQUktdG8taGVscC1zcGVlZC1kcnVnLWRldmVsb3BtZW500gEA)

[154. AI scribe saves doctors an hour at the keyboard every day - Google News](https://news.google.com/articles/CBMiZGh0dHBzOi8vd3d3LmFtYS1hc3NuLm9yZy9wcmFjdGljZS1tYW5hZ2VtZW50L2RpZ2l0YWwvYWktc2NyaWJlLXNhdmVzLWRvY3RvcnMtaG91ci1rZXlib2FyZC1ldmVyeS1kYXnSAQA)

[155. SMCI Stock: Why Chasing the 'Obvious' AI Play Could Leave You Burned - Google News](https://news.google.com/articles/CBMiZGh0dHBzOi8vaW52ZXN0b3JwbGFjZS5jb20vMjAyNC8wMy9zbWNpLXN0b2NrLXdoeS1jaGFzaW5nLXRoZS1vYnZpb3VzLWFpLXBsYXktY291bGQtbGVhdmUteW91LWJ1cm5lZC_SAQA)

[156. The iPhone 16 could come with extra RAM and storage – just for AI - Google News](https://news.google.com/articles/CBMiZ2h0dHBzOi8vd3d3LnRlY2hyYWRhci5jb20vcGhvbmVzL2lwaG9uZS90aGUtaXBob25lLTE2LWNvdWxkLWNvbWUtd2l0aC1leHRyYS1yYW0tYW5kLXN0b3JhZ2UtanVzdC1mb3ItYWnSAQA)

[157. The most powerful AI job in the market may end up being no more than a figurehead - Google News](https://news.google.com/articles/CBMiamh0dHBzOi8vd3d3LmNuYmMuY29tLzIwMjQvMDMvMjIvdGhlLW1vc3QtcG93ZXJmdWwtYWktam9iLWluLXRoZS1tYXJrZXQtbWF5LWJlLW5vLW1vcmUtdGhhbi1maWd1cmVoZWFkLmh0bWzSAW5odHRwczovL3d3dy5jbmJjLmNvbS9hbXAvMjAyNC8wMy8yMi90aGUtbW9zdC1wb3dlcmZ1bC1haS1qb2ItaW4tdGhlLW1hcmtldC1tYXktYmUtbm8tbW9yZS10aGFuLWZpZ3VyZWhlYWQuaHRtbA)

[160. Nvidia's AI ambitions in medicine and health care are becoming clear - Google News](https://news.google.com/articles/CBMiaGh0dHBzOi8vd3d3LmNuYmMuY29tLzIwMjQvMDMvMjQvbnZpZGlhcy1haS1hbWJpdGlvbnMtaW4tbWVkaWNpbmUtYW5kLWhlYWx0aC1jYXJlLWFyZS1iZWNvbWluZy1jbGVhci5odG1s0gEA)

[162. Could This Artificial Intelligence (AI) Fintech Be 1 of 2024's Largest IPOs? - Google News](https://news.google.com/articles/CBMiaGh0dHBzOi8vd3d3Lm5hc2RhcS5jb20vYXJ0aWNsZXMvY291bGQtdGhpcy1hcnRpZmljaWFsLWludGVsbGlnZW5jZS1haS1maW50ZWNoLWJlLTEtb2YtMjAyNHMtbGFyZ2VzdC1pcG9z0gEA)

[163. Companies are about to waste billions on AI — here's how not to become one of them - Google News](https://news.google.com/articles/CBMia2h0dHBzOi8vdmVudHVyZWJlYXQuY29tL2FpL2NvbXBhbmllcy1hcmUtYWJvdXQtdG8td2FzdGUtYmlsbGlvbnMtb24tYWktaGVyZXMtaG93LW5vdC10by1iZWNvbWUtb25lLW9mLXRoZW0v0gEA)

[167. AI boom makes millions for unlikely industry player, Anguilla - Google News](https://news.google.com/articles/CBMiYmh0dHBzOi8vd3d3LnN0cmFpdHN0aW1lcy5jb20vYnVzaW5lc3MvYWktYm9vbS1tYWtlcy1taWxsaW9ucy1mb3ItdW5saWtlbHktaW5kdXN0cnktcGxheWVyLWFuZ3VpbGxh0gEA)

[168. Gemini AI: Google is the last company Apple should partner with - Google News](https://news.google.com/articles/CBMiYmh0dHBzOi8vYmdyLmNvbS9idXNpbmVzcy9nb29nbGUtYnJpbmdpbmctZ2VtaW5pLWFpLXRvLWlwaG9uZS1pcy1zby1pbmZ1cmlhdGluZy1idXQtbm90LXN1cnByaXNpbmcv0gEA)

[171. Here's why Apple is negotiating with Google on AI partnership - Google News](https://news.google.com/articles/CBMiWWh0dHBzOi8vd3d3Lmltb3JlLmNvbS9pcGhvbmUvaGVyZXMtd2h5LWFwcGxlLWlzLW5lZ290aWF0aW5nLXdpdGgtZ29vZ2xlLWFuLWFpLXBhcnRuZXJzaGlw0gEA)

[172. This chief AI officer says the job is more of a temporary gig - Google News](https://news.google.com/articles/CBMiWWh0dHBzOi8vd3d3LmVtZXJnaW5ndGVjaGJyZXcuY29tL3N0b3JpZXMvMjAyNC8wMy8yMC9jaGllZi1haS1vZmZpY2VyLWRpYWxwYWQtZGFuLW9jb25uZWxs0gEA)

[180. Apple CEO Tim Cook praises Tesla-beating BYD on China visit, pushes AI-infused environmental theme as geopolitical tensions loom - Google News](https://news.google.com/articles/CBMiX2h0dHBzOi8vZm9ydHVuZS5jb20vMjAyNC8wMy8yNC9hcHBsZS1jZW8tdGltLWNvb2stdGVzbGEtYmVhdGluZy1ieWQtY2hpbmEtdmlzaXQtYWktZW52aXJvbm1lbnQv0gFjaHR0cHM6Ly9mb3J0dW5lLmNvbS8yMDI0LzAzLzI0L2FwcGxlLWNlby10aW0tY29vay10ZXNsYS1iZWF0aW5nLWJ5ZC1jaGluYS12aXNpdC1haS1lbnZpcm9ubWVudC9hbXAv)

[193. Large Language Models' Emergent Abilities Are a Mirage - Hacker News](https://www.wired.com/story/how-quickly-do-large-language-models-learn-unexpected-skills/)

[201. Supervision: Reusable Computer Vision - Hacker News](https://github.com/roboflow/supervision)

[206. The Expressive Power of Transformers with Chain of Thought (Revised) - Hacker News 2](https://arxiv.org/abs/2310.07923)

[212. How to Test Multiple Variations of Generative AI Prompts - HackerNoon](https://hackernoon.com/how-to-test-multiple-variations-of-generative-ai-prompts)

[213. AI and Crowdsourcing: Using Human-in-the-Loop Labeling - HackerNoon](https://hackernoon.com/ai-and-crowdsourcing-using-human-in-the-loop-labeling)

[222. Samsung preps inferencing accelerator to take on Nvidia, scores huge saleAsia In BriefPLUS: Tencent's profit plunge; Singtel to build three AI datacenters; McDonald's China gobbles Microsoft AIAI + ML4 hrs| - The Register](https://www.theregister.com/2024/03/24/asia_tech_news_roundup/)

In [37]:
client = Client(base_url='https://bsky.social')
client.login(os.environ['BSKY_USERNAME'], os.environ['BSKY_SECRET'])

mydid = {"did":"did:plc:qomkdnxrqw3gkbytdxea5z65"}

data = client.get_author_feed(
    actor=mydid['did'],
    filter='posts_and_author_threads',
    limit=50,
)


In [38]:
def remove_urls(text):
    # Regular expression to match URLs
    url_pattern = r'https?://\S+|www\.\S+'
    # Substitute found URLs with an empty string
    clean_text = re.sub(url_pattern, '', text)
    return clean_text



In [39]:
def rawfetchurl(url, timeout=60):
    """get url using requests with specified timeout. return response object, status, content-type"""
    try:
        response = requests.get(url, timeout=timeout)
    except httplib.BadStatusLine:
        print("Bad response (?) fetching url %s " % url)
        response = None
    except requests.Timeout:
        print("Timeout fetching url %s " % url)
        response = None
    except requests.ConnectionError as e:
        print("Connection error (%s) fetching url %s " % (str(e), url))
        response = None
    except requests.TooManyRedirects:
        print("Too many redirects fetching url %s " % url)
        response = None
    except requests.exceptions.MissingSchema:
        print("Missing schema url %s " % url)
        response = None
    except requests.exceptions.InvalidSchema:
        print("Invalid schema url %s " % url)
        response = None
    except requests.exceptions.InvalidURL as e:
        print("Invalid url %s, %s" % (url, str(e)))
        response = None
    except ValueError as e:
        # don't log url because possibly malformed url
        print("ValueError, url ?: ? ")
        response = None
    except httplib.IncompleteRead as e:
        print("IncompleteRead, url %s: %s " % (url, str(e)))
        response = None
    except urllib3.exceptions.SSLError as e:
        print("SSLError, url %s: %s " % (url, str(e)))
        response = None
    except requests.exceptions.ContentDecodingError as e:
        print("SSLError, url %s: %s " % (url, str(e)))
        response = None
    except requests.exceptions.ChunkedEncodingError as e:
        print("ChunkedEncodingError, url %s: %s " % (url, str(e)))
        response = None
    except UnicodeEncodeError as e:
        print("UnicodeEncodeError, url %s: %s " % (url, str(e)))
        response = None
    except OpenSSL.SSL.SysCallError as e:
        print("OpenSSL.SSL.SysCallError, url %s: %s " % (url, str(e)))
        response = -1
    except OpenSSL.SSL.ZeroReturnError as e:
        print("OpenSSL.SSL.ZeroReturnError, url %s: %s " % (url, str(e)))
        response = -1

    # except requests.packages.urllib3.exceptions.DecodeError as e:
    #     utilLog("DecodeError, url %s: %s " % (url, str(e)))
    #     response = None

    return response


imgurl = 'https://nypost.com/wp-content/uploads/sites/2/2024/02/nate-silver-calls-shut-gemini-77192719.jpg?quality=75&strip=all&w=1024'
r = rawfetchurl(imgurl)

In [40]:

# #         impath = "%s/%d.%s" % (today_orig_dir, actualurl.id, file_ext)
# impath = 'x.jpg'
# with open(impath, 'wb') as file:
#     file.write(r.content)

# display(IPython.display.Image(filename=impath))


In [41]:

# def resize_and_crop(image_path, output_path, size=(360, 360)):
#     # Open the image
#     image = PIL.Image.open(image_path)

#     # Calculate the aspect ratio
#     aspect_ratio = image.width / image.height
#     target_aspect_ratio = size[0] / size[1]

#     # Determine the scaling factor and new size
#     if aspect_ratio > target_aspect_ratio:
#         # Image is wider than desired aspect ratio
#         new_height = size[1]
#         new_width = int(new_height * aspect_ratio)
#     else:
#         # Image is taller than desired aspect ratio
#         new_width = size[0]
#         new_height = int(new_width / aspect_ratio)

#     # Resize the image
#     image = image.resize((new_width, new_height))

#     # Calculate coordinates to crop the image to the target size
#     left = (new_width - size[0]) / 2
#     top = (new_height - size[1]) / 2
#     right = (new_width + size[0]) / 2
#     bottom = (new_height + size[1]) / 2

#     # Crop the image
#     image = image.crop((left, top, right, bottom))

#     # Save the cropped image
#     image.save(output_path)

def resize_and_crop(input_image_path, output_image_path, desired_height=240):
    # Load the image
    with Image.open(input_image_path) as img:
        img = img.convert('RGB')

        # Calculate the new width maintaining the aspect ratio
        aspect_ratio = img.width / img.height
        new_width = int(desired_height * aspect_ratio)

        # Resize the image
        resized_img = img.resize((new_width, desired_height))

        # Save the resized image
        resized_img.save(output_image_path)

output_path = 'square.jpg'
resize_and_crop(impath, output_path)
display(IPython.display.Image(filename=output_path))


NameError: name 'impath' is not defined

In [None]:
def truncate_last_occurrence(text: str) -> str:
    # Find the last occurrence of a space followed by any sequence of characters followed by 3 periods
    pattern = r'\s+\S+\.{3}$'
    return re.sub(pattern, '', text)

# Example text for testing

example_text = """Elon Musk says we'll run out of power capacity to run all the AI chips in 2025
newatlas.com/technology/e..."""

# Truncate the last occurrence
print(truncate_last_occurrence(example_text))



In [None]:

# for post in data.feed:
#     post_str = post.post.record.text.rstrip()
#     post_str = truncate_last_occurrence(post_str)
#     post_url = ""
#     try:
#         post_url = post.post.record.embed.external.uri.rstrip()
#     except:
#         pass

#     print(remove_urls(post_str))
#     print(post_url)
#     print()


In [None]:
# for i, post in enumerate(data.feed):
#     post_str = post.post.record.text.rstrip()
#     post_str = truncate_last_occurrence(post_str)
#     post_url = ""
#     tag_dict = {}
#     try:
#         post_url = post.post.record.embed.external.uri.rstrip()
#     except:
#         pass
#     if post_url:
#         tag_dict = get_og_tags(post_url)
#         display_str = f"[{post_str}]({post_url})"
#         site_name = tag_dict.get('og:site_name')
#         img_url = tag_dict.get('og:image')
#         if site_name:
#             display_str += f" - {site_name}"
#         if img_url:
#             try:
#                 r = rawfetchurl(img_url)
#                 content_type = r.headers['Content-Type']
#                 content_type = content_type[content_type.find('/')+1:]
#                 impath = f"source{i}.{content_type}"
#                 with open(impath, 'wb') as file:
#                     file.write(r.content)
#                 output_path = f'Image{i}.jpg'
#                 resize_and_crop(impath, output_path)
#                 display(IPython.display.Image(filename=output_path))
#             except Exception as e:
#                 print(e)
#         display(Markdown(display_str))
#     else:
#         display(Markdown(f"{post_str}"))


In [None]:
display(Markdown(f"Follow latest AI headlines via [SkynetAndChill.com on Bluesky](https://bsky.app/profile/skynetandchill.com)"))


for i, post in enumerate(data.feed):
    post_str = post.post.record.text.rstrip()
    post_str = truncate_last_occurrence(post_str)
    post_url = ""
    tag_dict = {}
    try:
        post_url = post.post.record.embed.external.uri.rstrip()
    except:
        pass
    if post_url:
        tag_dict = get_og_tags(post_url)
        display_str = f"[{post_str}]({post_url})"
        site_name = tag_dict.get('og:site_name')
        img_url = tag_dict.get('og:image')
        if site_name:
            display_str += f" - {site_name}"
        if img_url:
            try:
                r = rawfetchurl(img_url)
                content_type = r.headers['Content-Type']
                content_type = content_type[content_type.find('/')+1:]
                impath = f"source{i}.{content_type}"
                with open(impath, 'wb') as file:
                    file.write(r.content)
                output_path = f'Image{i}.jpg'
                resize_and_crop(impath, output_path)
                display(IPython.display.Image(filename=output_path))
            except Exception as e:
                print(e)
        display(Markdown(display_str))
        display(Markdown("___"))

    else:
        display(Markdown(f"{post_str}"))

In [None]:
data.feed[0].post.record.embed.external.thumb.ref.dict()

In [None]:
data.feed[0].post.record.embed.external.thumb.dict()

In [None]:
data.feed[0].post.record.embed.external.dict()

In [None]:
data.feed[0].post.record.embed.dict()

In [None]:
data.feed[0].post.record.embed.external.uri

In [None]:
print(datetime.now())