# AInewsbot.ipynb - Automate collecting daily AI news

1. initial scrape of front pages of tech sites
  - Open URLs of news sites specified in `sources` dict (sources.yaml) using Selenium and Firefox
  - Save HTML of each URL in htmldata directory  
  - Extract URLs from all files, create a pandas dataframe with url, title, src

2. Filter and clean to AI-related headlines not seen before
  - Use ChatGPT prompt to filter only AI-related headlines by sending a prompt and formatted table of headlines
  - Use SQLite to filter headlines previously seen 
  - remove duplicate URLs and headlines
  - ensure there are pretty source names for each news site

3. Topic analysis, make a list of topics for each headline
  - using a prompt, check each headline against a number of evergreen AI topics, e.g. deepfakes, regulation, AI in education
  - extract free-form topics from each headline
  - combine topics into topic list for each headline
  - cluster headlines using dimensionality-reduced embeddings and DBSCAN; ask chatgpt to name each cluster
  - sort headlines by doing a traveling salesman shortest traversal in embedding space

4. Summarize individual news story pages in 3 bullets using a prompt

5. create a large markdown file with all bullet points and topics

6. give the markdown file to ChatGPT and ask it to make a list of the most popular and important topics of the day

7. human should make a list of the day's topics, combining the ChatGPT response to the question with the cluster topics

8. Put summaries in vector store along with metadata. For each topic, retrieve all associated stories and have chatgpt write a digest of those stories in the given format.

9. assemble stories into first draft of newsletter for rewriting as necessary

todo:

use langgraph for final editing workflow
1. prompt to edit final copy for dupes, combine similar sections, copy edit
2. have a reviewer prompt check if there are any bullet points to move to a different section 
3. if so, have an editor prompt remove them, then return to 2 until there are no bullet points left to move and no dupes left
4. have a reviewer check each section, identify bullet points that are similar to other bullet points in the section and have identical links. rewrite combining so there is no duplication. 
5. identify sections that are short or similar to other sections and suggest sections that should be combined
6. have an editor prompt merge short sections, return to 4, until no orphan sections left
7. maybe final copy-edit prompt

Original, alternative manual workflow to get HTML files if necessary
- Use Chrome, open e.g. Tech News bookmark folder, right-click and open all bookmarks in new window
- on Google News, make sure switch to AI tab
- on Google News, Feedly, Reddit, scroll to additional pages as desired
- Use SingleFile extension, 'save all tabs'
- Move files to htmldata directory
- Run lower part of notebook to process the data


In [1]:
# sys.modules
# import sys
# del sys.modules['ainb_llm']


In [2]:
from datetime import datetime, timedelta
import os
import yaml
import dotenv
import sqlite3
import unicodedata
import json
import pickle
from collections import Counter
import shutil

import numpy as np
import pandas as pd
import umap
# import matplotlib.pyplot as plt
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN

# import bs4
import requests
from urllib.parse import urlparse
import bs4
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import trafilatura

import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
import asyncio
import aiohttp

from IPython.display import HTML, Image, Markdown, display
import markdown

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import langchain
from langchain_community.vectorstores import Chroma
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings

import openai
from openai import OpenAI
import tiktoken

import langchain

from ainb_const import (DOWNLOAD_DIR, LOWCOST_MODEL, MODEL, CANONICAL_TOPICS,
                        SOURCECONFIG, FILTER_PROMPT, TOPIC_PROMPT,
                        SUMMARIZE_SYSTEM_PROMPT, SUMMARIZE_USER_PROMPT, FINAL_SUMMARY_PROMPT, 
                        TOP_CATEGORIES_PROMPT, HOSTNAME_SKIPLIST, SITE_NAME_SKIPLIST,
                        MAX_INPUT_TOKENS, MAX_OUTPUT_TOKENS, MAX_RETRIES, TEMPERATURE)
from ainb_utilities import (log, delete_files, filter_unseen_urls_db, insert_article, 
                            nearest_neighbor_sort, agglomerative_cluster_sort, traveling_salesman_sort_scipy,
                            unicode_to_ascii, send_gmail)
from ainb_webscrape import (get_driver, quit_drivers, launch_drivers, get_file, get_url, parse_file, 
                            get_og_tags, get_path_from_url, trimmed_href, process_source_queue_factory, 
                            process_url_queue_factory, get_google_news_redirects)
from ainb_llm import (paginate_df, process_pages, fetch_pages, fetch_openai, fetch_all_summaries, 
                      fetch_openai_summary, trunc_tokens, categorize_headline)


import asyncio
# need this to run async in jupyter since it already has an asyncio event loop running
import nest_asyncio
nest_asyncio.apply()


In [3]:
# Report the versions of the main libraries used by this notebook, one per
# line, with the library label padded to a common 18-character column.
# print(f"smtplib           {smtplib.sys.version}")
for label, module in (
    ("LangChain", langchain),
    ("OpenAI", openai),
    ("trafilatura", trafilatura),
    ("bs4", bs4),
    ("numpy", np),
    ("pandas", pd),
    ("sklearn", sklearn),
    ("umap", umap),
):
    print(f"{label:<18}{module.__version__}")


LangChain         0.3.0
OpenAI            1.47.0
trafilatura       1.12.2
bs4               4.12.3
numpy             1.26.4
pandas            2.2.3
sklearn           1.5.2
umap              0.5.6


# Initialize

In [4]:
# Optional upper bound on article timestamps passed to the SQLite filter.
# With None, the logged where-clause only carries the lower bound.
before_date = None
# before_date = '2024-09-20 18:00:00'

# For performance, only query the most recent LOOKBACK_DAYS of history from SQLite.
LOOKBACK_DAYS = 90
lookback_start = datetime.now() - timedelta(days=LOOKBACK_DAYS)
# Backward-compatible alias: the old name said "six months" although the
# window has always been 90 days.
date_six_months_ago = lookback_start
# Lower bound as a 'YYYY-MM-DD HH:MM:SS' string
after_date = lookback_start.strftime('%Y-%m-%d %H:%M:%S')
print(before_date, after_date)

None 2024-06-25 08:21:08


In [5]:
# Endpoint for calling the chat-completions REST API directly (used for the
# asyncio requests instead of the SDK client).
API_URL = 'https://api.openai.com/v1/chat/completions'

# Request headers for the REST calls; the bearer token is read from the
# OPENAI_API_KEY environment variable at the time this cell runs.
headers = {}
headers['Content-Type'] = 'application/json'
headers['Authorization'] = f'Bearer {os.getenv("OPENAI_API_KEY")}'

# OpenAI SDK client object
client = OpenAI()


In [6]:
# Load the sources to scrape from sources.yaml (path given by SOURCECONFIG).
with open(SOURCECONFIG, "r", encoding="utf-8") as stream:
    try:
        sources = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Fail fast: continuing would leave `sources` undefined (or, in a
        # re-run kernel, silently stale from a previous execution).
        log(f"Could not parse {SOURCECONFIG}: {exc}")
        raise

# An empty or non-mapping YAML document cannot be iterated with .items() below.
if not isinstance(sources, dict):
    raise ValueError(
        f"{SOURCECONFIG} must contain a mapping of source name -> settings, "
        f"got {type(sources).__name__}")

log(f"Load {len(sources)} sources from {SOURCECONFIG}")

# make a reverse dict to map output file titles to source names
sources_reverse = {}
for k, v in sources.items():
    log(f"{k} -> {v['url']} -> {v['title']}.html")
    # remember each entry's own key so the settings dict can be passed around by itself
    v['sourcename'] = k
    # map filename (title) to source name
    sources_reverse[v['title']] = k

log(f"Mapped {len(sources_reverse)} source page titles to sources")


2024-09-23 08:21:11,695 - AInewsbot - INFO - Load 17 sources from sources.yaml
2024-09-23 08:21:11,696 - AInewsbot - INFO - Ars Technica -> https://arstechnica.com/ -> Ars Technica.html
2024-09-23 08:21:11,697 - AInewsbot - INFO - Bloomberg Tech -> https://www.bloomberg.com/technology -> Bloomberg Technology.html
2024-09-23 08:21:11,697 - AInewsbot - INFO - Business Insider -> https://www.businessinsider.com/tech -> Business Insider Tech.html
2024-09-23 08:21:11,697 - AInewsbot - INFO - FT Tech -> https://www.ft.com/technology -> FT Technology.html
2024-09-23 08:21:11,697 - AInewsbot - INFO - Feedly AI -> https://feedly.com/i/aiFeeds?options=eyJsYXllcnMiOlt7InBhcnRzIjpbeyJpZCI6Im5scC9mL3RvcGljLzMwMDAifV0sInNlYXJjaEhpbnQiOiJ0ZWNobm9sb2d5IiwidHlwZSI6Im1hdGNoZXMiLCJzYWxpZW5jZSI6ImFib3V0In1dLCJidW5kbGVzIjpbeyJ0eXBlIjoic3RyZWFtIiwiaWQiOiJ1c2VyLzYyZWViYjlmLTcxNTEtNGY5YS1hOGM3LTlhNTdiODIwNTMwOC9jYXRlZ29yeS9HYWRnZXRzIn1dfQ -> Feedly AI.html
2024-09-23 08:21:11,698 - AInewsbot - INFO - Google N

In [7]:
# Display the loaded source configuration; each entry now also carries the
# 'sourcename' key added by the loading loop.
sources

{'Ars Technica': {'include': ['^https://arstechnica.com/(\\w+)/(\\d+)/(\\d+)/'],
  'title': 'Ars Technica',
  'url': 'https://arstechnica.com/',
  'sourcename': 'Ars Technica'},
 'Bloomberg Tech': {'include': ['^https://www.bloomberg.com/news/'],
  'title': 'Bloomberg Technology',
  'url': 'https://www.bloomberg.com/technology',
  'sourcename': 'Bloomberg Tech'},
 'Business Insider': {'exclude': ['^https://www.insider.com',
   '^https://www.passionfroot.me'],
  'title': 'Business Insider Tech',
  'url': 'https://www.businessinsider.com/tech',
  'sourcename': 'Business Insider'},
 'FT Tech': {'include': ['https://www.ft.com/content/'],
  'title': 'FT Technology',
  'url': 'https://www.ft.com/technology',
  'sourcename': 'FT Tech'},
 'Feedly AI': {'exclude': ['^https://feedly.com',
   '^https://s1.feedly.com',
   '^https://blog.feedly.com'],
  'scroll': 5,
  'initial_sleep': 30,
  'title': 'Feedly AI',
  'url': 'https://feedly.com/i/aiFeeds?options=eyJsYXllcnMiOlt7InBhcnRzIjpbeyJpZCI6Im5

In [8]:
# Determine which of today's source pages are already saved in the htmldata
# directory (DOWNLOAD_DIR), so they can be reused without fetching again.
nfiles = 50

# Get the current date once, so the date string and the year/month/day parts
# cannot disagree if the cell runs across midnight.
today = datetime.now()
year, month, day = today.year, today.month, today.day
datestr = today.strftime("%m_%d_%Y")

# List all paths in the directory, regular files only
files = [os.path.join(DOWNLOAD_DIR, file)
         for file in os.listdir(DOWNLOAD_DIR)]
files = [file for file in files if os.path.isfile(file)]

# Sort files by modification time (newest first) and take the top `nfiles`.
# The original assigned the slice to `file`, so the limit was never applied.
files.sort(key=os.path.getmtime, reverse=True)
files = files[:nfiles]

# keep only files with today's date in the path and ending in .html
files = [
    file for file in files if datestr in file and file.endswith(".html")]
log(len(files))
for file in files:
    log(file)

# Map each saved file back to its source via the page title that precedes
# the date in the file name, e.g. 'Ars Technica (01_14_2024 08_18_23 AM).html'
date_marker = " (" + datestr
saved_pages = []
for file in files:
    filename = os.path.basename(file)
    # locate date like '01_14_2024' in filename
    position = filename.find(date_marker)
    if position == -1:
        # Without this guard, find() == -1 would silently chop the last
        # character off the file name instead of isolating the title.
        log(f"Skipping {filename}, no date marker in file name")
        continue
    basename = filename[:position]
    # match to source name
    sourcename = sources_reverse.get(basename)
    if sourcename is None:
        log(f"Skipping {basename}, no sourcename metadata")
        continue
    # remember the saved file on the source entry and in the result list
    sources[sourcename]['latest'] = file
    saved_pages.append((sourcename, file))
    

2024-09-23 08:21:17,665 - AInewsbot - INFO - 4
2024-09-23 08:21:17,666 - AInewsbot - INFO - htmldata/Google News - Technology - Artificial intelligence (09_23_2024 08_18_44 AM).html
2024-09-23 08:21:17,666 - AInewsbot - INFO - htmldata/Ars Technica (09_23_2024 08_18_23 AM).html
2024-09-23 08:21:17,667 - AInewsbot - INFO - htmldata/Bloomberg Technology (09_23_2024 08_18_21 AM).html
2024-09-23 08:21:17,667 - AInewsbot - INFO - htmldata/Business Insider Tech (09_23_2024 08_18_20 AM).html


In [9]:
# Log how many of today's saved .html pages were found in the previous cell
log(f"{len(files)} files found")


2024-09-23 08:21:19,452 - AInewsbot - INFO - 4 files found


In [10]:
def remove_output_files(paths):
    """Delete each file in *paths*, skipping (and reporting) missing ones.

    Every path is attempted independently, so one missing file does not
    prevent the remaining files from being removed.

    Args:
        paths: iterable of file paths to delete.

    Returns:
        list of the paths that were actually removed, in input order.
    """
    removed = []
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError as e:
            print(e)
        else:
            removed.append(path)
    return removed


# Clear the outputs of a previous run before generating new ones
removed_files = remove_output_files(['bullets.md', 'headlines.html'])
if removed_files:
    print(f"Files deleted successfully: {', '.join(removed_files)}")

[Errno 2] No such file or directory: 'headlines.html'


# Fetch and save source pages

In [11]:
# Fetch HTML files from sources

# empty download directory
delete_files(DOWNLOAD_DIR)

# save each file specified from sources
num_browsers = 3
log(f"Saving HTML files using {num_browsers} browsers")

# Create a queue for multiprocessing and populate it with one settings dict per source
queue = multiprocessing.Queue()
for item in sources.values():
    queue.put(item)

# Function to take the queue and pop entries off and process until none are left
# lets you create an array of functions with different args.
# Named `source_worker` rather than `callable` so the builtin callable()
# is not shadowed for the rest of the kernel session.
source_worker = process_source_queue_factory(queue)

# run the worker in `num_browsers` browsers and collect the (sourcename, file) results
saved_pages = launch_drivers(num_browsers, source_worker)


2024-09-23 08:21:28,600 - AInewsbot - INFO - Saving HTML files using 3 browsers
2024-09-23 08:21:28,620 - AInewsbot - INFO - get_driver - 29189 Initializing webdriver
2024-09-23 08:21:28,620 - AInewsbot - INFO - get_driver - 29189 Initializing webdriver
2024-09-23 08:21:28,621 - AInewsbot - INFO - get_driver - 29189 Initializing webdriver
2024-09-23 08:21:42,717 - AInewsbot - INFO - get_driver - Initialized webdriver profile
2024-09-23 08:21:42,717 - AInewsbot - INFO - get_driver - Initialized webdriver profile
2024-09-23 08:21:42,717 - AInewsbot - INFO - get_driver - Initialized webdriver profile
2024-09-23 08:21:42,718 - AInewsbot - INFO - get_driver - Initialized webdriver service
2024-09-23 08:21:42,718 - AInewsbot - INFO - get_driver - Initialized webdriver service
2024-09-23 08:21:42,718 - AInewsbot - INFO - get_driver - Initialized webdriver service
2024-09-23 08:22:13,850 - AInewsbot - INFO - get_driver - Initialized webdriver
2024-09-23 08:22:13,871 - AInewsbot - INFO - Proces

2024-09-23 08:22:58,843 - AInewsbot - INFO - get_files(Feedly AI) - Loading additional infinite scroll items
2024-09-23 08:23:00,867 - AInewsbot - INFO - Message: Unable to locate element: //meta[@http-equiv='Content-Type']; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:192:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:510:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

2024-09-23 08:23:00,868 - AInewsbot - INFO - get_files(HackerNoon) - Saving HackerNoon (09_23_2024 08_23_00 AM).html as utf-8
2024-09-23 08:23:00,870 - AInewsbot - INFO - Processing Reddit
2024-09-23 08:23:00,870 - AInewsbot - INFO - get_files(Reddit multiple subreddits) - starting get_files https://www.reddit.com/r/ChatGPT+Cha

2024-09-23 08:23:57,024 - AInewsbot - INFO - get_files(Washington Post Technology) - Saving Washington Post Technology (09_23_2024 08_23_57 AM).html as utf-8
2024-09-23 08:23:57,029 - AInewsbot - INFO - Quit webdriver
2024-09-23 08:23:59,321 - AInewsbot - INFO - returned 17


In [12]:
# Report how many pages were fetched, then record each source's most
# recently saved file under its 'latest' key.
page_count = len(saved_pages)
log(f"Saved {page_count} pages")

print(page_count)
for sourcename, page in saved_pages:
    sources[sourcename].update(latest=page)
    log(f"{sourcename} -> {page}")
    

2024-09-23 08:23:59,325 - AInewsbot - INFO - Saved 17 pages
2024-09-23 08:23:59,325 - AInewsbot - INFO - Business Insider -> htmldata/Business Insider Tech (09_23_2024 08_22_28 AM).html
2024-09-23 08:23:59,326 - AInewsbot - INFO - Feedly AI -> htmldata/Feedly AI (09_23_2024 08_23_48 AM).html
2024-09-23 08:23:59,326 - AInewsbot - INFO - Ars Technica -> htmldata/Ars Technica (09_23_2024 08_22_25 AM).html
2024-09-23 08:23:59,326 - AInewsbot - INFO - FT Tech -> htmldata/FT Technology (09_23_2024 08_22_36 AM).html
2024-09-23 08:23:59,326 - AInewsbot - INFO - Hacker News -> htmldata/Hacker News Page 1 (09_23_2024 08_22_47 AM).html
2024-09-23 08:23:59,326 - AInewsbot - INFO - Hacker News 2 -> htmldata/Hacker News Page 2 (09_23_2024 08_22_57 AM).html
2024-09-23 08:23:59,326 - AInewsbot - INFO - NYT Tech -> htmldata/New York Times Technology (09_23_2024 08_23_08 AM).html
2024-09-23 08:23:59,327 - AInewsbot - INFO - Techmeme -> htmldata/Techmeme (09_23_2024 08_23_19 AM).html
2024-09-23 08:23:59,

17


# Extract news URLs from saved pages

In [13]:
# Extract the news links (url, title, src) from every downloaded HTML page
log("Parsing html files")
all_urls = []
for sourcename, filename in saved_pages:
    log(f"{sourcename} -> {filename}")
    log(sourcename, "parse loop")
    # parse_file receives the source's settings dict
    links = parse_file(sources[sourcename])
    log(f"{len(links)} links found", "parse loop")
    all_urls += links

log(f"found {len(all_urls)} links", "parse loop")

# Build one dataframe of all links found:
# one row per distinct url (first values seen for that url) ...
unique_links = pd.DataFrame(all_urls).groupby("url").first().reset_index()
# ... ordered by source, keeping only the columns used downstream ...
by_source = unique_links.sort_values("src")[["src", "title", "url"]]
by_source = by_source.reset_index(drop=True)
# ... with the fresh 0..n-1 row number exposed as the 'id' column.
orig_df = by_source.reset_index(drop=False).rename(columns={"index": "id"})
orig_df.head()


2024-09-23 08:23:59,331 - AInewsbot - INFO - Parsing html files
2024-09-23 08:23:59,331 - AInewsbot - INFO - Business Insider -> htmldata/Business Insider Tech (09_23_2024 08_22_28 AM).html
2024-09-23 08:23:59,331 - AInewsbot - INFO - parse loop - Business Insider
2024-09-23 08:23:59,358 - AInewsbot - INFO - parse_file - found 309 raw links
2024-09-23 08:23:59,362 - AInewsbot - INFO - parse_file - found 50 filtered links
2024-09-23 08:23:59,362 - AInewsbot - INFO - parse loop - 50 links found
2024-09-23 08:23:59,363 - AInewsbot - INFO - Feedly AI -> htmldata/Feedly AI (09_23_2024 08_23_48 AM).html
2024-09-23 08:23:59,363 - AInewsbot - INFO - parse loop - Feedly AI
2024-09-23 08:23:59,397 - AInewsbot - INFO - parse_file - found 211 raw links
2024-09-23 08:23:59,400 - AInewsbot - INFO - parse_file - found 56 filtered links
2024-09-23 08:23:59,400 - AInewsbot - INFO - parse loop - 56 links found
2024-09-23 08:23:59,402 - AInewsbot - INFO - Ars Technica -> htmldata/Ars Technica (09_23_2024

Unnamed: 0,id,src,title,url
0,0,Ars Technica,Life imitates xkcd comic as Florida gang beats...,https://arstechnica.com/security/2024/09/forge...
1,1,Ars Technica,Ars is seeking a seasoned senior reporter for ...,https://arstechnica.com/staff/2024/09/ars-is-s...
2,2,Ars Technica,Rocket Report: Eutelsat’s surprising decision;...,https://arstechnica.com/space/2024/09/rocket-r...
3,3,Ars Technica,One company appears to be thriving as part of ...,https://arstechnica.com/space/2024/09/nasa-tak...
4,4,Ars Technica,NASA has a fine plan for deorbiting the ISS—un...,https://arstechnica.com/space/2024/09/nasa-has...


In [14]:
# Number of distinct links found per source
orig_df.groupby("src").count()

Unnamed: 0_level_0,id,title,url
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ars Technica,26,26,26
Business Insider,50,50,50
FT Tech,66,66,66
Feedly AI,54,54,54
Google News,55,55,55
Hacker News,26,26,26
Hacker News 2,25,25,25
HackerNoon,97,97,97
NYT Tech,18,18,18
Reddit,99,99,99


In [15]:
# # extracts all links from history where isAI=1
# # useful for training dimensionality reduction
# conn = sqlite3.connect('articles.db')
# c = conn.cursor()
# #  and timestamp > '2024-07-01' 
# query = "select * from news_articles where isAI=1 order by id"
# ai_history_df = pd.read_sql_query(query, conn)
# ai_history_df

# Filter URLs to new AI headlines only

In [16]:
# Drop URLs already seen in previous runs and saved in SQLite.
# The date bounds are looked up through globals() so that a bound which was
# never defined in this session is passed as None instead of raising NameError.
date_window = {
    bound: globals().get(bound) for bound in ('before_date', 'after_date')
}
filtered_df = filter_unseen_urls_db(orig_df, **date_window)
len(filtered_df)


2024-09-23 08:23:59,913 - AInewsbot - INFO - Querying SQLite with where_clause: WHERE timestamp > '2024-06-25 08:21:08'
2024-09-23 08:24:00,529 - AInewsbot - INFO - URLs in orig_df: 823
2024-09-23 08:24:00,531 - AInewsbot - INFO - Existing URLs in DB: 149747
2024-09-23 08:24:00,552 - AInewsbot - INFO - New URLs in df filtered by URL: 200
2024-09-23 08:24:00,627 - AInewsbot - INFO - Existing src+title: 8
2024-09-23 08:24:00,628 - AInewsbot - INFO - New URLs in df filtered by src+title: 192


192

In [17]:
# Number of not-previously-seen links per source
filtered_df.groupby('src').count()

Unnamed: 0_level_0,id,title,url
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ars Technica,1,1,1
Business Insider,7,7,7
FT Tech,25,25,25
Feedly AI,49,49,49
Google News,40,40,40
Hacker News,12,12,12
Hacker News 2,9,9,9
HackerNoon,14,14,14
NYT Tech,1,1,1
Reddit,5,5,5


In [18]:
# Show the prompt (FILTER_PROMPT, imported from ainb_const) that is sent to
# OpenAI to keep only AI-related headlines
print(FILTER_PROMPT)



You will act as a research assistant to categorize news articles based on their relevance
to the topic of artificial intelligence (AI). You will closely read the title of each story
to determine if it is primarily about AI based on the semantic meaning of the title and
the keywords and entities mentioned. The input headlines and outptu classifications will
be formatted as JSON objects.

Input Specification:
You will receive a list of news headlines formatted as JSON objects.
Each object will include an 'id' and a 'title'. For instance:
[{'id': 97, 'title': 'AI to predict dementia, detect cancer'},
 {'id': 103,'title': 'Figure robot learns to make coffee by watching humans for 10 hours'},
 {'id': 103,'title': 'Baby trapped in refrigerator eats own foot'},
 {'id': 210,'title': 'ChatGPT removes, then reinstates a summarization assistant without explanation.'},
 {'id': 298,'title': 'The 5 most interesting PC monitors from CES 2024'},
 ]

Classification Criteria:
Classify each story based 

In [19]:
# Split filtered_df into pages, each meant to fit in a reasonably sized
# prompt (MAXPAGELEN or MAX_INPUT_TOKENS).
# NOTE(review): the size limit is applied inside paginate_df — confirm which one.
pages = paginate_df(filtered_df)
log(f"Paginated {len(pages)} pages")


2024-09-23 08:24:00,902 - AInewsbot - INFO - Paginated 4 pages


In [20]:
# use REST API directly. OpenAI python API doesn't support concurrent requests from a single client
# this runs fast with async aiohttp and on gpt-3.5 (15 seconds vs 2 minutes synchronously with gpt-4o)
# the old API supported submitting multiple payloads in a single completion request
# current API supports a slow 'batch' submission https://platform.openai.com/docs/guides/rate-limits/usage-tiers
# there is a more complex example here - https://github.com/openai/openai-cookbook/blob/main/examples/api_request_parallel_processor.py

# Response schema: {"isai_array": [{"id": <number>, "isAI": <bool>}, ...]}
json_schema = {
    "name": "json_schema",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": {
            "isai_array": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "id": {
                            "type": "number"
                        },
                        "isAI": {
                            "type": "boolean"
                        }
                    },
                    "required": ["id", "isAI"],
                    "additionalProperties": False
                }
            }
        },
        "required": ["isai_array"],
        "additionalProperties": False
    }
}


log("start AI classify")
# asyncio.run works inside Jupyter here because nest_asyncio.apply() was called at import time
enriched_urls = asyncio.run(fetch_pages(pages, prompt=FILTER_PROMPT, json_schema=json_schema))
log("end AI classify")

enriched_df = pd.DataFrame(enriched_urls)
print(len(enriched_df))
# log()'s second argument is the label printed before the message, so the
# counts belong in the message (the original passed the count as the label).
ai_mask = enriched_df["isAI"]
log(f"isAI: {len(enriched_df.loc[ai_mask])}", "AI classify")
log(f"not isAI: {len(enriched_df.loc[~ai_mask])}", "AI classify")
enriched_df.head()


2024-09-23 08:24:00,906 - AInewsbot - INFO - start AI classify
2024-09-23 08:24:00,908 - AInewsbot - INFO - Applying prompt to 4 pages using gpt-4o-mini
2024-09-23 08:24:00,909 - AInewsbot - INFO - sent 50 items 
2024-09-23 08:24:00,909 - AInewsbot - INFO - sent 50 items 
2024-09-23 08:24:00,910 - AInewsbot - INFO - sent 50 items 
2024-09-23 08:24:00,910 - AInewsbot - INFO - sent 42 items 
2024-09-23 08:24:05,183 - AInewsbot - INFO - got dict with 42 items 
2024-09-23 08:24:05,503 - AInewsbot - INFO - got dict with 50 items 
2024-09-23 08:24:05,555 - AInewsbot - INFO - got dict with 50 items 
2024-09-23 08:24:05,963 - AInewsbot - INFO - got dict with 50 items 
2024-09-23 08:24:05,967 - AInewsbot - INFO - Processed 192 responses.
2024-09-23 08:24:05,968 - AInewsbot - INFO - end AI classify
2024-09-23 08:24:05,971 - AInewsbot - INFO - 61 - isAI
2024-09-23 08:24:05,973 - AInewsbot - INFO - 131 - not isAI


192


Unnamed: 0,id,isAI
0,17,False
1,28,True
2,41,False
3,60,False
4,62,True


In [21]:
# Attach the isAI classification to the headline rows by matching on 'id'.
# An outer join keeps unmatched rows from either side so they can be counted.
merged_df = filtered_df.merge(enriched_df, how="outer", on="id")
# stamp every row with today's date
merged_df = merged_df.assign(date=datetime.now().date())
merged_df.head()


Unnamed: 0,id,src,title,url,isAI,date
0,17,Ars Technica,Tandem OLED is OLED’s latest weapon in holding...,https://arstechnica.com/gadgets/2024/09/what-t...,False,2024-09-23
1,28,Business Insider,Sam Altman is joining forces with design guru ...,https://www.businessinsider.com/sam-altman-wor...,True,2024-09-23
2,41,Business Insider,Intel is in talks to get an investment of up t...,https://www.businessinsider.com/intel-talks-in...,False,2024-09-23
3,60,Business Insider,Mike Lynch's sunken superyacht may contain saf...,https://www.businessinsider.com/mike-lynch-yac...,False,2024-09-23
4,62,Business Insider,Internal memo reveals new Amazon bonuses for s...,https://www.businessinsider.com/amazon-bonuses...,True,2024-09-23


In [22]:
# Sanity checks on the outer join; both counts should be 0.
# Rows with no 'src' came back from the model without a matching headline.
unmatched_responses = int(merged_df['src'].isna().sum())
log(f"Unmatched response rows: {unmatched_responses}")
# Rows with no 'isAI' are headlines the model did not classify.
unmatched_sources = int(merged_df['isAI'].isna().sum())
log(f"Unmatched source rows: {unmatched_sources}")


2024-09-23 08:24:05,996 - AInewsbot - INFO - Unmatched response rows: 0
2024-09-23 08:24:05,998 - AInewsbot - INFO - Unmatched source rows: 0


In [23]:
# Keep only the headlines classified as AI-related and renumber them:
# the old 'id' is discarded and replaced by a fresh 0..n-1 'id' column.
ai_mask = merged_df["isAI"] == 1
AIdf = (
    merged_df.loc[ai_mask]
    .drop(columns=["id"])
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})
)

log(f"Found {len(AIdf)} AI headlines")

AIdf

2024-09-23 08:24:06,003 - AInewsbot - INFO - Found 61 AI headlines


Unnamed: 0,id,src,title,url,isAI,date
0,0,Business Insider,Sam Altman is joining forces with design guru ...,https://www.businessinsider.com/sam-altman-wor...,True,2024-09-23
1,1,Business Insider,Internal memo reveals new Amazon bonuses for s...,https://www.businessinsider.com/amazon-bonuses...,True,2024-09-23
2,2,FT Tech,UAE president to meet Joe Biden in push for mo...,https://www.ft.com/content/e85bef92-5f53-4f71-...,True,2024-09-23
3,3,Feedly AI,Introducing the OpenAI Academy,https://openai.com/global-affairs/openai-academy,True,2024-09-23
4,4,Feedly AI,China’s biggest AI model is challenging Americ...,https://restofworld.org/2024/alibaba-qwen-ai-m...,True,2024-09-23
...,...,...,...,...,...,...
56,56,HackerNoon,Details of the OpenAI Lawsuit: The Plaintiffs'...,https://hackernoon.com/details-of-the-openai-l...,True,2024-09-23
57,57,Techmeme,"A look at Bot Farm Corporation, a Siberia-base...",https://www.bloomberg.com/features/2024-poker-...,True,2024-09-23
58,58,The Register,AI to power the corporate Windows 11 refresh? ...,https://www.theregister.com/2024/09/23/windows...,True,2024-09-23
59,59,The Register,Unlock the future of the WLANEmbrace Wi-Fi 7 w...,https://www.theregister.com/2024/09/23/unlock_...,True,2024-09-23


In [24]:
# Normalise every title with unicode_to_ascii so that headlines differing
# only in e.g. quote symbols can be recognised as duplicates later.
AIdf['title'] = AIdf['title'].map(unicode_to_ascii)


In [25]:
# Dedupe identical headlines. Two titles count as identical when they match
# after all whitespace is removed; within each group the row that sorts first
# by 'src' supplies the values. The helper key is dropped again and 'id' is
# renumbered 0..n-1.
AIdf = (
    AIdf.assign(title_clean=AIdf['title'].map(lambda s: "".join(s.split())))
    .sort_values("src")
    .groupby("title_clean")
    .first()
    .reset_index(drop=True)
    .drop(columns=['id'])
    .reset_index()
    .rename(columns={'index': 'id'})
)

log(f"Found {len(AIdf)} unique AI headlines")


2024-09-23 08:24:06,017 - AInewsbot - INFO - Found 57 unique AI headlines


In [26]:
# Number of unique AI headlines per source
AIdf.groupby("src").count()

Unnamed: 0_level_0,id,title,url,isAI,date
src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Business Insider,2,2,2,2,2
FT Tech,1,1,1,1,1
Feedly AI,37,37,37,37,37
Google News,8,8,8,8,8
Hacker News,2,2,2,2,2
HackerNoon,3,3,3,3,3
Techmeme,1,1,1,1,1
The Register,3,3,3,3,3


In [27]:
# map google news headlines to redirect
# stopped working, google news gives some JS page which performs redirect instead of 301
# AIdf = get_google_news_redirects(AIdf)
# Until redirect resolution works again, 'actual_url' is just a copy of 'url'
# (so Google News rows keep their news.google.com links).
AIdf['actual_url']=AIdf['url']

In [28]:
# Derive the hostname (URL netloc) of each story from 'actual_url'.
# This has to run after 'actual_url' is final, i.e. after any Google News
# redirect mapping.
AIdf['hostname'] = AIdf['actual_url'].map(lambda link: urlparse(link).netloc)
AIdf.head()

Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname
0,0,Google News,20 AI News and Analyst Ratings You Should Not ...,https://news.google.com/read/CBMimgFBVV95cUxNc...,True,2024-09-23,https://news.google.com/read/CBMimgFBVV95cUxNc...,news.google.com
1,1,Feedly AI,5 Free Courses to Master Deep Learning in 2024,https://machinelearningmastery.com/5-free-cour...,True,2024-09-23,https://machinelearningmastery.com/5-free-cour...,machinelearningmastery.com
2,2,Feedly AI,"AI Is Evolving Faster Than Experts, Including ...",https://www.cnet.com/tech/computing/ai-is-evol...,True,2024-09-23,https://www.cnet.com/tech/computing/ai-is-evol...,www.cnet.com
3,3,Feedly AI,AI Regulation: An Underreported Issue Of The U...,https://www.forbes.com/councils/forbesbusiness...,True,2024-09-23,https://www.forbes.com/councils/forbesbusiness...,www.forbes.com
4,4,Feedly AI,AI YOU REAL? AI clones of my dead dad going vi...,https://www.the-sun.com/tech/12516178/ai-clone...,True,2024-09-23,https://www.the-sun.com/tech/12516178/ai-clone...,www.the-sun.com


### Get site names and update site names based on URL

In [29]:
# Load the hostname -> site_name table from the articles database.
# The connection stays open because later cells insert into it.
conn = sqlite3.connect('articles.db')
c = conn.cursor()
query = "select * from sites"
sites_df = pd.read_sql_query(query, conn)
# lookup dict keyed by hostname (a later duplicate hostname overrides an earlier one)
sites_dict = dict(zip(sites_df['hostname'], sites_df['site_name']))

sites_df

Unnamed: 0,id,hostname,site_name
0,1,247wallst.com,247wallst
1,2,3quarksdaily.com,3 Quarks Daily
2,3,9to5google.com,9to5Google
3,4,9to5mac.com,9to5Mac
4,5,9to5toys.com,9to5Toys
...,...,...,...
1142,1192,til.simonwillison.net,TIL (Today I Learned) by Simon Willison
1143,1193,win-vector.com,Win-Vector
1144,1194,eprint.iacr.org,IACR Eprint Archive
1145,1195,www.jeffgeerling.com,Jeff Geerling


In [30]:
# Look up the pretty site name for each hostname; "" marks hostnames that
# are not in the sites table yet.
AIdf['site_name'] = [sites_dict.get(host, "") for host in AIdf['hostname']]
# show the rows that still need a site name
AIdf[AIdf['site_name'] == ""]

Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname,site_name
17,17,Hacker News,Data Science Agent and Code Transformation,https://labs.google.com/code/,True,2024-09-23,https://labs.google.com/code/,labs.google.com,
52,52,Feedly AI,Uncanny Returns: Trevor Paglen and the Halluci...,https://thereader.mitpress.mit.edu/uncanny-ret...,True,2024-09-23,https://thereader.mitpress.mit.edu/uncanny-ret...,thereader.mitpress.mit.edu,


In [31]:
async def get_site_name(session, row):
    """Ask the low-cost model for the human-readable name of a site.

    Args:
        session: session object handed through to ``fetch_openai``
            (an ``aiohttp.ClientSession`` in this notebook).
        row: object with a ``hostname`` attribute, e.g. a row from
            ``DataFrame.itertuples()``.

    Returns:
        dict ``{"url": row.hostname, "site_name": <name>}``, or ``None``
        when the request fails or the reply has no usable ``site_name``.
    """
    cat_prompt = f"""
based on this url and your knowledge of the Web, what is the name of the site? https://{row.hostname}

return the response as a json object of the form {{"url": "www.yankodesign.com", "site_name": "Yanko Design"}}

    """
    messages = [{"role": "user", "content": cat_prompt}]
    payload = {"model":  LOWCOST_MODEL,
               "response_format": {"type": "json_object"},
               "messages": messages,
               "temperature": 0
               }
    try:
        response = await fetch_openai(session, payload)
        response_dict = json.loads(response["choices"][0]["message"]["content"])
    except Exception as exc:
        # Best-effort lookup: one failed request must not abort the whole
        # batch, but say which hostname failed.
        log(f"{row.hostname}: {exc}", "get_site_name")
        return None

    site_name = response_dict.get("site_name") if isinstance(response_dict, dict) else None
    if not site_name:
        log(f"{row.hostname}: no site_name in response {response_dict!r}", "get_site_name")
        return None

    # Key the result by the hostname that was asked about rather than the
    # "url" echoed by the model, which may carry a scheme, path or other edits.
    return {"url": row.hostname, "site_name": site_name}
                
tasks = []
async with aiohttp.ClientSession() as session:
    for row in AIdf.loc[AIdf['site_name']==""].itertuples():
        task = asyncio.create_task(get_site_name(session, row))
        tasks.append(task)
    responses = await asyncio.gather(*tasks)

responses


[{'url': 'labs.google.com', 'site_name': 'Google Labs'},
 {'url': 'thereader.mitpress.mit.edu', 'site_name': 'The MIT Press Reader'}]

In [32]:
# update sites_dict from responses
new_urls = []
for r in responses:
    # get_site_name yields None when its request or JSON parsing failed;
    # skip those (and any reply missing the expected keys) instead of crashing.
    if not isinstance(r, dict) or 'url' not in r or 'site_name' not in r:
        continue
    # strip a leading scheme so the key matches the bare hostnames used in sites_dict
    for scheme in ('https://', 'http://'):
        if r['url'].startswith(scheme):
            r['url'] = r['url'][len(scheme):]
            break
    new_urls.append(r['url'])
    sites_dict[r['url']] = r['site_name']
    print(r['url'], r['site_name'])

# hostnames that still have no known name fall back to the hostname itself
AIdf['site_name'] = AIdf['hostname'].apply(lambda hostname: sites_dict.get(hostname, hostname))



labs.google.com Google Labs
thereader.mitpress.mit.edu The MIT Press Reader


In [33]:
# Persist the newly discovered hostname -> site name pairs; existing hostnames are left untouched
sqlstr = "INSERT OR IGNORE INTO sites (hostname, site_name) VALUES (?, ?);"
for url in new_urls:
    site_name = sites_dict[url]
    print(url, '->', site_name)
    conn.execute(sqlstr, (url, site_name))
    conn.commit()


labs.google.com -> Google Labs
thereader.mitpress.mit.edu -> The MIT Press Reader


In [34]:
# Record every headline of this batch in the SQLite database of seen URLs
conn = sqlite3.connect('articles.db')
cursor = conn.cursor()

# columns handed to insert_article, in the positional order it is called with
article_columns = ("src", "hostname", "title", "url", "actual_url", "isAI", "date")
for row in AIdf.itertuples():
    insert_article(conn, cursor, *(getattr(row, col) for col in article_columns))


In [35]:
# drop banned slop sites: compare the lower-cased hostname and site name against the skip lists
banned_host = AIdf["hostname"].str.lower().isin(HOSTNAME_SKIPLIST)
banned_site = AIdf["site_name"].str.lower().isin(SITE_NAME_SKIPLIST)
AIdf = AIdf.loc[~(banned_host | banned_site)]

# Topic analysis
Try to identify the top topics of the day, to help make a nice summary. 

1st approach - do dimensionality reduction on the headline embeddings with UMAP and cluster with DBSCAN.

2nd approach
 - extract topics from headline using a prompt
 - human canonicalizes topics
 - assign headlines to topics using a prompt
 
 The final summary is pretty inconsistent; it would be nice to give ChatGPT a prompt that says "summarize these bullet points using this categorization."
 

### Fit dimensionality reduction model

In [36]:
# # train dimensionality reduction, only need to do this every few months and pickle the model to reflect new topics
# # extracts all links from history where isAI=1
# conn = sqlite3.connect('articles.db')
# c = conn.cursor()
# #  and timestamp > '2024-07-01' 
# query = "select * from news_articles where isAI=1 order by id desc limit 20000"
# ai_history_df = pd.read_sql_query(query, conn)
# len(ai_history_df)

In [37]:
# embedding_model="text-embedding-3-large"
# embedding_df_list = []
# pages = paginate_df(ai_history_df, maxpagelen=1000, max_input_tokens=8192)

# for p in pages:
#     response = client.embeddings.create(input=[obj['title'] for obj in p],
#                                         model=embedding_model)
#     embedding_df_list.append(pd.DataFrame([e.model_dump()['embedding'] for e in response.data]))

# embedding_df = pd.concat(embedding_df_list, axis=0, ignore_index=True)

# embedding_df.to_pickle("historical_embedding_df.pkl")


In [38]:
# # Initialize the UMAP reducer
# reducer = umap.UMAP(n_components=30)
# # Fit the reducer to the data without transforming
# reducer.fit(embedding_df)
# # Pickle the reducer
# with open('reducer.pkl', 'wb') as f:
#     pickle.dump(reducer, f)
# print("UMAP reducer pickled and saved as 'reducer.pkl'")

In [39]:
# attempt to extract top topics 
print(TOPIC_PROMPT)



You will act as a research assistant to extract topics from news headlines. You will extract topics, entities,
and keywords from news headlines formatted as JSON objects.

Input Specification:
You will receive a list of news headlines formatted as JSON objects.
Each object will include an 'id' and a 'title'. For instance:
[{'id': 97, 'title': 'AI to predict dementia, detect cancer'},
 {'id': 103,'title': 'Figure robot learns to make coffee by watching humans for 10 hours'},
 {'id': 105,'title': "Former Microsoft CEO Steve Ballmer is now just as rich as his former boss Bill Gates. Here's how he spends his billions."},
 {'id': 210,'title': 'ChatGPT removes, then reinstates a summarization assistant without explanation.'},
 {'id': 298,'title': 'The 5 most interesting PC monitors from CES 2024'},
]

Output Specification:
You will return a JSON object with the field 'topics' containing a flat list of classification results.
For each headline input, your output will be a JSON object contain

In [40]:
{
                    "type": "object",
                    "properties": {
                        "id": {
                            "type": "number",
                        },
                        "topics": {
                            "type": "array",
                            "items": {
                                "type": "string",
                            },
                        },
                    },
                    "required": ["id", "topics"],
                    "additionalProperties": False,
                }

{'type': 'object',
 'properties': {'id': {'type': 'number'},
  'topics': {'type': 'array', 'items': {'type': 'string'}}},
 'required': ['id', 'topics'],
 'additionalProperties': False}

In [41]:

# Schema of one classified headline: its numeric id plus a list of topic strings.
topic_item_schema = {
    "type": "object",
    "properties": {
        "id": {"type": "number"},
        "topics": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["id", "topics"],
    "additionalProperties": False,
}

# Strict response schema: an object whose single field 'extracted_topics'
# holds a list of the per-headline items described above.
json_schema = {
    "name": "extracted_topics",
    "strict": True,
    "schema": {
        "type": "object",
        "properties": {
            "extracted_topics": {"type": "array", "items": topic_item_schema},
        },
        "required": ["extracted_topics"],
        "additionalProperties": False,
    },
}


In [42]:
# get topics: split the AI headlines into pages first
pages = paginate_df(AIdf)
log("start freeform topic extraction")
# build the coroutine that applies TOPIC_PROMPT to every page, then run it to completion
topic_request = fetch_pages(pages, prompt=TOPIC_PROMPT, json_schema=json_schema)
response = asyncio.run(topic_request)
log("end freeform topic extraction")

2024-09-23 08:24:06,753 - AInewsbot - INFO - start freeform topic extraction
2024-09-23 08:24:06,754 - AInewsbot - INFO - Applying prompt to 2 pages using gpt-4o-mini
2024-09-23 08:24:06,754 - AInewsbot - INFO - sent 50 items 
2024-09-23 08:24:06,755 - AInewsbot - INFO - sent 7 items 
2024-09-23 08:24:09,651 - AInewsbot - INFO - got dict with 7 items 
2024-09-23 08:24:17,637 - AInewsbot - INFO - got dict with 50 items 
2024-09-23 08:24:17,639 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:24:17,640 - AInewsbot - INFO - end freeform topic extraction


In [43]:
# Tabulate the responses, renaming the 'topics' column to 'extracted_topics'
topic_df = pd.DataFrame(response).rename(columns={'topics': 'extracted_topics'})
print(len(topic_df))
topic_df.head()


57


Unnamed: 0,id,extracted_topics
0,0,"[AI, news, analyst ratings]"
1,1,"[AI, deep learning, courses, education]"
2,2,"[AI, Bill Gates, evolution, technology]"
3,3,"[AI, regulation, elections, politics]"
4,4,"[AI, digital death, cloning, viral content]"


In [45]:
# Lower-case every extracted topic and count its occurrences across all headlines
all_topics = []
for topic_list in topic_df['extracted_topics']:
    all_topics.extend(item.lower() for item in topic_list)
item_counts = Counter(all_topics)

# Keep topics seen at least twice, leaving out the excluded generic terms
generic_topics = {'technology', 'ai', 'artificial intelligence'}
filtered_topics = [
    item for item, count in item_counts.items()
    if count >= 2 and item not in generic_topics
]
print(len(filtered_topics))
sorted(filtered_topics)


21


['amazon',
 'chatgpt',
 'china',
 'corporate software',
 'education',
 'genai',
 'generative ai',
 'google',
 'jony ive',
 'lawsuit',
 'legal issues',
 'microsoft',
 'notebooklm',
 'openai',
 'podcast creator',
 'podcasts',
 'politics',
 'reality',
 'software development',
 'uae',
 'windows 11']

In [46]:
def _keep_filtered(topic_list):
    """Return the title-cased topics of *topic_list* whose lower-cased form is in ``filtered_topics``."""
    return [t.title() for t in topic_list if t.lower() in filtered_topics]


topic_df['extracted_topics'] = topic_df['extracted_topics'].apply(_keep_filtered)


In [47]:
topic_df


Unnamed: 0,id,extracted_topics
0,0,[]
1,1,[Education]
2,2,[]
3,3,[Politics]
4,4,[]
5,5,[]
6,6,[Generative Ai]
7,7,[]
8,8,"[Windows 11, Corporate Software]"
9,9,"[Windows 11, Corporate Software, Microsoft]"


In [48]:
# evergreen topics, to hopefully map headlines to canonical standardized topics
# review the extracted topics above and add any good new ones
# you could try it with new cats or new cats + evergreen
# but probably look at new cats, and the human in the loop should add today's good new cats to the evergreen list
# new_cats = list(json.loads(response.choices[0].message.content).values())[0]
# categories = sorted(list(set(new_cats + evergreen)))
categories = sorted(CANONICAL_TOPICS)
# NOTE(review): this loop rebinds the global `c`, which an earlier cell assigned a DB cursor to.
# NOTE(review): the printed list contains 'Jobs & CareersLabor Market', which looks like two
# entries fused by a missing comma where CANONICAL_TOPICS is defined -- confirm.
for c in categories:
    print(c)

AI doom
AMD
Agriculture
Alibaba
Amazon
Andrew Ng
Anthropic
Apple
Art & Design
Artificial General Intelligence
Authors & Writing
Autonomous vehicles
Baidu
Bias and Fairness
Big Tech
Bill Gates
Books & Publishing
Brain-Computer Interfaces
Bubble
ChatGPT
Chatbots
China
Claude
Climate
Cloudflare
Code assistants
Cognitive Science
Cohere
Computer Vision
Consciousness
Copilot
Cryptocurrency
Customer service
Cybersecurity
Dario Amodei
Deals
DeepMind
Deepfakes
Demis Hassabis
Disinformation
Disney
Drones
Economics
Education
Elon Musk
Energy
Entertainment
Ethics
European Union
Facial Recognition
Fashion
Finance
Fintech
Food & Drink
Funding
Gaming
Gemini
Gen AI
Geoffrey Hinton
Google
Governance
Hardware
Health & Fitness
Healthcare
History
Hollywood
Hugging Face
IPOs
Ilya Sutskever
India
Inequality
Infrastructure
Intel
Intellectual Property
Internet of Things
Japan
Jensen Huang
Job Automation
Jobs & CareersLabor Market
Korea
Language Models
Larry Page
Legal issues
Lifestyle & Travel
Manufacturing
M

In [49]:
# Frequent extracted topics with no case-insensitive match in CANONICAL_TOPICS
# (presumably the candidates a human reviews for addition to the canonical list).
# The lower-cased canonical names are built once instead of once per topic.
canonical_lower = {u.lower() for u in CANONICAL_TOPICS}
[t for t in filtered_topics if t.lower() not in canonical_lower]

['generative ai',
 'windows 11',
 'corporate software',
 'software development',
 'lawsuit',
 'genai',
 'notebooklm',
 'podcasts',
 'podcast creator',
 'reality',
 'jony ive',
 'uae']

In [50]:
AIdf


Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname,site_name
0,0,Google News,20 AI News and Analyst Ratings You Should Not ...,https://news.google.com/read/CBMimgFBVV95cUxNc...,True,2024-09-23,https://news.google.com/read/CBMimgFBVV95cUxNc...,news.google.com,Google News
1,1,Feedly AI,5 Free Courses to Master Deep Learning in 2024,https://machinelearningmastery.com/5-free-cour...,True,2024-09-23,https://machinelearningmastery.com/5-free-cour...,machinelearningmastery.com,Machine Learning Mastery
2,2,Feedly AI,"AI Is Evolving Faster Than Experts, Including ...",https://www.cnet.com/tech/computing/ai-is-evol...,True,2024-09-23,https://www.cnet.com/tech/computing/ai-is-evol...,www.cnet.com,CNET
3,3,Feedly AI,AI Regulation: An Underreported Issue Of The U...,https://www.forbes.com/councils/forbesbusiness...,True,2024-09-23,https://www.forbes.com/councils/forbesbusiness...,www.forbes.com,Forbes
4,4,Feedly AI,AI YOU REAL? AI clones of my dead dad going vi...,https://www.the-sun.com/tech/12516178/ai-clone...,True,2024-09-23,https://www.the-sun.com/tech/12516178/ai-clone...,www.the-sun.com,The Sun
5,5,Feedly AI,AI can generate recipes that can be deadly. Fo...,https://www.npr.org/2024/09/23/g-s1-23843/arti...,True,2024-09-23,https://www.npr.org/2024/09/23/g-s1-23843/arti...,www.npr.org,NPR
6,6,Feedly AI,AI copilots are evolving into AI agents design...,https://t.co/bPhoHNP74U,True,2024-09-23,https://t.co/bPhoHNP74U,t.co,Twitter
7,7,Feedly AI,AI is an accelerator for sustainability but i...,https://www.weforum.org/agenda/2024/09/ai-acce...,True,2024-09-23,https://www.weforum.org/agenda/2024/09/ai-acce...,www.weforum.org,World Economic Forum
8,8,Feedly AI,AI to power the corporate Windows 11 refresh? ...,https://go.theregister.com/feed/www.theregiste...,True,2024-09-23,https://go.theregister.com/feed/www.theregiste...,go.theregister.com,The Register
9,9,The Register,AI to power the corporate Windows 11 refresh? ...,https://www.theregister.com/2024/09/23/windows...,True,2024-09-23,https://www.theregister.com/2024/09/23/windows...,www.theregister.com,The Register


In [51]:
len(CANONICAL_TOPICS)

154

In [53]:
# Classify the headlines against the canonical topics, 20 headlines per page.
# Per the log of this run, topics are handled one after another ("topic 1 of 154", ...),
# each applying the prompt to all pages; the run took about eight minutes (08:53-09:01).
# The result is used as a mapping from headline id to its assigned topics.
catdict = asyncio.run(categorize_headline(AIdf, categories=categories, maxpagelen=20))


2024-09-23 08:53:14,568 - AInewsbot - INFO - Start canonical topic classification
2024-09-23 08:53:14,578 - AInewsbot - INFO - AI doom, topic 1 of 154
2024-09-23 08:53:14,579 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:53:14,580 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:14,581 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:14,582 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:53:17,275 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:53:17,583 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:53:19,117 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:53:19,120 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:53:19,121 - AInewsbot - INFO - AMD, topic 2 of 154
2024-09-23 08:53:19,123 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:53:19,125 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:19,127 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:19,129 - AI

2024-09-23 08:53:55,070 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:53:55,073 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:55,074 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:55,075 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:53:57,612 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:53:57,825 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:53:58,622 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:53:58,625 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:53:58,627 - AInewsbot - INFO - Big Tech, topic 15 of 154
2024-09-23 08:53:58,629 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:53:58,630 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:58,632 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:53:58,634 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:54:01,512 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:54:02,277 - AInewsbot - INFO -

2024-09-23 08:54:41,393 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:54:41,395 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:54:43,889 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:54:44,019 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:54:44,727 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:54:44,729 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:54:44,730 - AInewsbot - INFO - Cohere, topic 28 of 154
2024-09-23 08:54:44,730 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:54:44,731 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:54:44,733 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:54:44,733 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:54:47,594 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:54:47,696 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:54:48,208 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:54:48,210 - AInewsbot - INFO - Processed 57 

2024-09-23 08:55:29,067 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:55:29,071 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:55:29,075 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:55:29,076 - AInewsbot - INFO - Disney, topic 41 of 154
2024-09-23 08:55:29,077 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:55:29,079 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:55:29,080 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:55:29,081 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:55:30,706 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:55:31,321 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:55:31,391 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:55:31,394 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:55:31,394 - AInewsbot - INFO - Drones, topic 42 of 154
2024-09-23 08:55:31,395 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:55:31

2024-09-23 08:56:15,469 - AInewsbot - INFO - Food & Drink, topic 54 of 154
2024-09-23 08:56:15,470 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:56:15,472 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:56:15,474 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:56:15,475 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:56:18,018 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:56:18,426 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:56:18,429 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:56:18,432 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:56:18,433 - AInewsbot - INFO - Funding, topic 55 of 154
2024-09-23 08:56:18,434 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:56:18,435 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:56:18,436 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:56:18,439 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:56:21,146 - AInewsbot - I

2024-09-23 08:57:00,212 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:57:00,214 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:57:00,215 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:57:02,767 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:57:03,075 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:57:03,484 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:57:03,486 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:57:03,487 - AInewsbot - INFO - IPOs, topic 68 of 154
2024-09-23 08:57:03,488 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:57:03,491 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:57:03,492 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:57:03,493 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:57:06,863 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:57:07,003 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:57:07,273 - AInewsbot - INFO - got dict with 20 items 


2024-09-23 08:57:39,523 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:57:39,564 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:57:40,555 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:57:40,558 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:57:40,559 - AInewsbot - INFO - Language Models, topic 81 of 154
2024-09-23 08:57:40,560 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:57:40,561 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:57:40,562 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:57:40,563 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:57:42,500 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:57:42,692 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:57:42,877 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:57:42,880 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:57:42,880 - AInewsbot - INFO - Larry Page, topic 82 of 154
2024-09-23 08:57:42,882 - A

2024-09-23 08:58:23,814 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:58:23,817 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:58:23,818 - AInewsbot - INFO - Netflix, topic 94 of 154
2024-09-23 08:58:23,819 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:58:23,821 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:58:23,822 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:58:23,823 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:58:26,329 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:58:26,490 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:58:26,581 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:58:26,583 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:58:26,583 - AInewsbot - INFO - Neuromorphic Computing, topic 95 of 154
2024-09-23 08:58:26,584 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:58:26,585 - AInewsbot - INFO - sent 20 items 
2024-09-23 

2024-09-23 08:59:07,274 - AInewsbot - INFO - Products, topic 107 of 154
2024-09-23 08:59:07,274 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:59:07,275 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:07,276 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:07,277 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:59:11,387 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:59:12,564 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:59:12,633 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:59:12,635 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:59:12,636 - AInewsbot - INFO - Qualcomm, topic 108 of 154
2024-09-23 08:59:12,637 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:59:12,638 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:12,638 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:12,640 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:59:15,278 - AInewsbot - IN

2024-09-23 08:59:49,389 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:49,390 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:49,392 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:59:51,632 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:59:52,450 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:59:52,679 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:59:52,682 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 08:59:52,683 - AInewsbot - INFO - Sam Altman, topic 121 of 154
2024-09-23 08:59:52,684 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 08:59:52,686 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:52,687 - AInewsbot - INFO - sent 20 items 
2024-09-23 08:59:52,688 - AInewsbot - INFO - sent 17 items 
2024-09-23 08:59:55,401 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 08:59:56,159 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 08:59:56,649 - AInewsbot - INFO - got dict with 20 

2024-09-23 09:00:37,621 - AInewsbot - INFO - sent 17 items 
2024-09-23 09:00:39,863 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 09:00:39,967 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:00:40,274 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:00:40,277 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 09:00:40,278 - AInewsbot - INFO - Stocks, topic 134 of 154
2024-09-23 09:00:40,279 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 09:00:40,280 - AInewsbot - INFO - sent 20 items 
2024-09-23 09:00:40,281 - AInewsbot - INFO - sent 20 items 
2024-09-23 09:00:40,283 - AInewsbot - INFO - sent 17 items 
2024-09-23 09:00:42,927 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 09:00:43,114 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:00:43,186 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:00:43,188 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 09:00:43,189 - AInewsbot - INFO - Str

2024-09-23 09:01:23,380 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:01:25,024 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:01:25,025 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 09:01:25,026 - AInewsbot - INFO - UK, topic 147 of 154
2024-09-23 09:01:25,026 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 09:01:25,027 - AInewsbot - INFO - sent 20 items 
2024-09-23 09:01:25,027 - AInewsbot - INFO - sent 20 items 
2024-09-23 09:01:25,029 - AInewsbot - INFO - sent 17 items 
2024-09-23 09:01:28,915 - AInewsbot - INFO - got dict with 17 items 
2024-09-23 09:01:30,655 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:01:30,761 - AInewsbot - INFO - got dict with 20 items 
2024-09-23 09:01:30,763 - AInewsbot - INFO - Processed 57 responses.
2024-09-23 09:01:30,764 - AInewsbot - INFO - Uber, topic 148 of 154
2024-09-23 09:01:30,765 - AInewsbot - INFO - Applying prompt to 3 pages using gpt-4o-mini
2024-09-23 09:01:30,766

In [54]:
# Attach the canonical topics found for each headline id; ids missing from catdict get ""
assigned_by_id = topic_df['id'].map(lambda headline_id: catdict.get(headline_id, ""))
topic_df['assigned_topics'] = assigned_by_id
topic_df

Unnamed: 0,id,extracted_topics,assigned_topics
0,0,[],
1,1,[Education],"[Andrew Ng, DeepMind, Deepfakes, Education, Jo..."
2,2,[],"[Bill Gates, Microsoft, Science]"
3,3,[Politics],"[Bias and Fairness, Big Tech, Ethics, Governan..."
4,4,[],"[AI doom, Consciousness, Deepfakes, Ethics, Ge..."
5,5,[],"[AI doom, Ethics, Food & Drink, Health & Fitne..."
6,6,[Generative Ai],"[Big Tech, Code assistants, Copilot, Economics..."
7,7,[],"[Climate, Energy, Science, Sustainability]"
8,8,"[Windows 11, Corporate Software]","[Big Tech, Hardware, Microsoft, Products]"
9,9,"[Windows 11, Corporate Software, Microsoft]","[Apple, Big Tech, Hardware, Microsoft, Opinion..."


In [55]:
# lower-cased canonical topic names, for case-insensitive membership tests
lcategories = {name.lower() for name in categories}

In [56]:
def clean_topics(row):
    """Merge a row's extracted and assigned topics into one sorted, unique list.

    Args:
        row: object exposing ``extracted_topics`` and ``assigned_topics``
            attributes, each an iterable of topic strings (an empty string
            is tolerated and contributes nothing).

    Returns:
        Sorted list of unique, title-cased topic names in which the casing
        of "AI", "Gen AI" and "OpenAI" has been restored.
    """
    import re  # local import so the cell does not rely on an earlier import

    # terms too generic to be useful as a topic
    generic_terms = {"technology", "ai", "artificial intelligence"}
    topics = [x.title() for x in row.extracted_topics if x.lower() not in generic_terms]
    # keep only assigned topics that are known categories
    assigned_topics = [x.title() for x in row.assigned_topics if x.lower() in lcategories]
    combined = sorted(set(topics + assigned_topics))
    # str.title() turns "AI" into "Ai". Restore it only when "Ai" is not the
    # start of a longer lowercase word, so "Airbnb" / "Air Force" stay intact
    # (a plain str.replace produced "AIrbnb" / "AIr Force").
    combined = [re.sub(r"Ai(?![a-z])", "AI", s) for s in combined]
    combined = [s.replace("Genai", "Gen AI") for s in combined]
    combined = [s.replace("Openai", "OpenAI") for s in combined]
    return combined

# Build the cleaned topic list per headline, then a comma-separated string of it.
topic_df["topics"] = topic_df.apply(clean_topics, axis=1)
topic_df["topic_str"] = topic_df["topics"].apply(", ".join)
topic_df

Unnamed: 0,id,extracted_topics,assigned_topics,topics,topic_str
0,0,[],,[],
1,1,[Education],"[Andrew Ng, DeepMind, Deepfakes, Education, Jo...","[Andrew Ng, Deepfakes, Deepmind, Education, Jo...","Andrew Ng, Deepfakes, Deepmind, Education, Job..."
2,2,[],"[Bill Gates, Microsoft, Science]","[Bill Gates, Microsoft, Science]","Bill Gates, Microsoft, Science"
3,3,[Politics],"[Bias and Fairness, Big Tech, Ethics, Governan...","[Bias And Fairness, Big Tech, Ethics, Governan...","Bias And Fairness, Big Tech, Ethics, Governanc..."
4,4,[],"[AI doom, Consciousness, Deepfakes, Ethics, Ge...","[AI Doom, Consciousness, Deepfakes, Ethics, Ge...","AI Doom, Consciousness, Deepfakes, Ethics, Gen..."
5,5,[],"[AI doom, Ethics, Food & Drink, Health & Fitne...","[AI Doom, Ethics, Food & Drink, Health & Fitne...","AI Doom, Ethics, Food & Drink, Health & Fitnes..."
6,6,[Generative Ai],"[Big Tech, Code assistants, Copilot, Economics...","[Big Tech, Code Assistants, Copilot, Economics...","Big Tech, Code Assistants, Copilot, Economics,..."
7,7,[],"[Climate, Energy, Science, Sustainability]","[Climate, Energy, Science, Sustainability]","Climate, Energy, Science, Sustainability"
8,8,"[Windows 11, Corporate Software]","[Big Tech, Hardware, Microsoft, Products]","[Big Tech, Corporate Software, Hardware, Micro...","Big Tech, Corporate Software, Hardware, Micros..."
9,9,"[Windows 11, Corporate Software, Microsoft]","[Apple, Big Tech, Hardware, Microsoft, Opinion...","[Apple, Big Tech, Corporate Software, Hardware...","Apple, Big Tech, Corporate Software, Hardware,..."


In [57]:
# Drop columns left over from a previous run of this cell so the merge below
# does not create suffixed duplicates (idempotency). errors="ignore" skips
# columns that are absent without hiding unrelated failures the way a bare
# ``except: pass`` would.
AIdf = AIdf.drop(columns=["title_topic_str", "topic_str"], errors="ignore")

# attach the cleaned topic string to each headline
AIdf = pd.merge(AIdf, topic_df[["id", "topic_str"]], on="id", how="inner")
# headline plus its topics in a single string
AIdf['title_topic_str'] = AIdf.apply(lambda row: f'{row.title} (Topics: {row.topic_str})', axis=1)
AIdf


Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname,site_name,topic_str,title_topic_str
0,0,Google News,20 AI News and Analyst Ratings You Should Not ...,https://news.google.com/read/CBMimgFBVV95cUxNc...,True,2024-09-23,https://news.google.com/read/CBMimgFBVV95cUxNc...,news.google.com,Google News,,20 AI News and Analyst Ratings You Should Not ...
1,1,Feedly AI,5 Free Courses to Master Deep Learning in 2024,https://machinelearningmastery.com/5-free-cour...,True,2024-09-23,https://machinelearningmastery.com/5-free-cour...,machinelearningmastery.com,Machine Learning Mastery,"Andrew Ng, Deepfakes, Deepmind, Education, Job...",5 Free Courses to Master Deep Learning in 2024...
2,2,Feedly AI,"AI Is Evolving Faster Than Experts, Including ...",https://www.cnet.com/tech/computing/ai-is-evol...,True,2024-09-23,https://www.cnet.com/tech/computing/ai-is-evol...,www.cnet.com,CNET,"Bill Gates, Microsoft, Science","AI Is Evolving Faster Than Experts, Including ..."
3,3,Feedly AI,AI Regulation: An Underreported Issue Of The U...,https://www.forbes.com/councils/forbesbusiness...,True,2024-09-23,https://www.forbes.com/councils/forbesbusiness...,www.forbes.com,Forbes,"Bias And Fairness, Big Tech, Ethics, Governanc...",AI Regulation: An Underreported Issue Of The U...
4,4,Feedly AI,AI YOU REAL? AI clones of my dead dad going vi...,https://www.the-sun.com/tech/12516178/ai-clone...,True,2024-09-23,https://www.the-sun.com/tech/12516178/ai-clone...,www.the-sun.com,The Sun,"AI Doom, Consciousness, Deepfakes, Ethics, Gen...",AI YOU REAL? AI clones of my dead dad going vi...
5,5,Feedly AI,AI can generate recipes that can be deadly. Fo...,https://www.npr.org/2024/09/23/g-s1-23843/arti...,True,2024-09-23,https://www.npr.org/2024/09/23/g-s1-23843/arti...,www.npr.org,NPR,"AI Doom, Ethics, Food & Drink, Health & Fitnes...",AI can generate recipes that can be deadly. Fo...
6,6,Feedly AI,AI copilots are evolving into AI agents design...,https://t.co/bPhoHNP74U,True,2024-09-23,https://t.co/bPhoHNP74U,t.co,Twitter,"Big Tech, Code Assistants, Copilot, Economics,...",AI copilots are evolving into AI agents design...
7,7,Feedly AI,AI is an accelerator for sustainability but i...,https://www.weforum.org/agenda/2024/09/ai-acce...,True,2024-09-23,https://www.weforum.org/agenda/2024/09/ai-acce...,www.weforum.org,World Economic Forum,"Climate, Energy, Science, Sustainability",AI is an accelerator for sustainability but i...
8,8,Feedly AI,AI to power the corporate Windows 11 refresh? ...,https://go.theregister.com/feed/www.theregiste...,True,2024-09-23,https://go.theregister.com/feed/www.theregiste...,go.theregister.com,The Register,"Big Tech, Corporate Software, Hardware, Micros...",AI to power the corporate Windows 11 refresh? ...
9,9,The Register,AI to power the corporate Windows 11 refresh? ...,https://www.theregister.com/2024/09/23/windows...,True,2024-09-23,https://www.theregister.com/2024/09/23/windows...,www.theregister.com,The Register,"Apple, Big Tech, Corporate Software, Hardware,...",AI to power the corporate Windows 11 refresh? ...


In [58]:
# Display AIdf for inspection.
AIdf


Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname,site_name,topic_str,title_topic_str
0,0,Google News,20 AI News and Analyst Ratings You Should Not ...,https://news.google.com/read/CBMimgFBVV95cUxNc...,True,2024-09-23,https://news.google.com/read/CBMimgFBVV95cUxNc...,news.google.com,Google News,,20 AI News and Analyst Ratings You Should Not ...
1,1,Feedly AI,5 Free Courses to Master Deep Learning in 2024,https://machinelearningmastery.com/5-free-cour...,True,2024-09-23,https://machinelearningmastery.com/5-free-cour...,machinelearningmastery.com,Machine Learning Mastery,"Andrew Ng, Deepfakes, Deepmind, Education, Job...",5 Free Courses to Master Deep Learning in 2024...
2,2,Feedly AI,"AI Is Evolving Faster Than Experts, Including ...",https://www.cnet.com/tech/computing/ai-is-evol...,True,2024-09-23,https://www.cnet.com/tech/computing/ai-is-evol...,www.cnet.com,CNET,"Bill Gates, Microsoft, Science","AI Is Evolving Faster Than Experts, Including ..."
3,3,Feedly AI,AI Regulation: An Underreported Issue Of The U...,https://www.forbes.com/councils/forbesbusiness...,True,2024-09-23,https://www.forbes.com/councils/forbesbusiness...,www.forbes.com,Forbes,"Bias And Fairness, Big Tech, Ethics, Governanc...",AI Regulation: An Underreported Issue Of The U...
4,4,Feedly AI,AI YOU REAL? AI clones of my dead dad going vi...,https://www.the-sun.com/tech/12516178/ai-clone...,True,2024-09-23,https://www.the-sun.com/tech/12516178/ai-clone...,www.the-sun.com,The Sun,"AI Doom, Consciousness, Deepfakes, Ethics, Gen...",AI YOU REAL? AI clones of my dead dad going vi...
5,5,Feedly AI,AI can generate recipes that can be deadly. Fo...,https://www.npr.org/2024/09/23/g-s1-23843/arti...,True,2024-09-23,https://www.npr.org/2024/09/23/g-s1-23843/arti...,www.npr.org,NPR,"AI Doom, Ethics, Food & Drink, Health & Fitnes...",AI can generate recipes that can be deadly. Fo...
6,6,Feedly AI,AI copilots are evolving into AI agents design...,https://t.co/bPhoHNP74U,True,2024-09-23,https://t.co/bPhoHNP74U,t.co,Twitter,"Big Tech, Code Assistants, Copilot, Economics,...",AI copilots are evolving into AI agents design...
7,7,Feedly AI,AI is an accelerator for sustainability but i...,https://www.weforum.org/agenda/2024/09/ai-acce...,True,2024-09-23,https://www.weforum.org/agenda/2024/09/ai-acce...,www.weforum.org,World Economic Forum,"Climate, Energy, Science, Sustainability",AI is an accelerator for sustainability but i...
8,8,Feedly AI,AI to power the corporate Windows 11 refresh? ...,https://go.theregister.com/feed/www.theregiste...,True,2024-09-23,https://go.theregister.com/feed/www.theregiste...,go.theregister.com,The Register,"Big Tech, Corporate Software, Hardware, Micros...",AI to power the corporate Windows 11 refresh? ...
9,9,The Register,AI to power the corporate Windows 11 refresh? ...,https://www.theregister.com/2024/09/23/windows...,True,2024-09-23,https://www.theregister.com/2024/09/23/windows...,www.theregister.com,The Register,"Apple, Big Tech, Corporate Software, Hardware,...",AI to power the corporate Windows 11 refresh? ...


In [59]:
# Show, untruncated, the titles of headlines that ended up with no topics.
has_no_topics = AIdf["topic_str"] == ""
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(AIdf.loc[has_no_topics, ['title']])


Unnamed: 0,title
0,20 AI News and Analyst Ratings You Should Not Miss


### Semantic sort

In [60]:
# Order headlines by semantic similarity using embeddings of "title (Topics: ...)".
log(f"Fetching embeddings for {len(AIdf)} headlines")
embedding_model = 'text-embedding-3-large'
response = client.embeddings.create(
    input=AIdf['title_topic_str'].tolist(),
    model=embedding_model,
)
embedding_df = pd.DataFrame([item.model_dump()['embedding'] for item in response.data])

# greedy traveling salesman sort
log("Sort with nearest neighbor sort")
sorted_indices = nearest_neighbor_sort(embedding_df)
AIdf['sort_order'] = sorted_indices

# reduce the embeddings with the previously saved reducer, then cluster them
log("Perform dimensionality reduction")
# NOTE(review): pickle.load can execute code from the file; reducer.pkl must be trusted.
with open("reducer.pkl", 'rb') as reducer_file:
    reducer = pickle.load(reducer_file)
reduced_data = reducer.transform(embedding_df)
log("Cluster with DBSCAN")
dbscan = DBSCAN(eps=0.4, min_samples=3)  # Adjust eps and min_samples as needed
cluster_labels = dbscan.fit_predict(reduced_data)
AIdf['cluster'] = cluster_labels
# relabel -1 as 999 so those rows sort after every numbered cluster
AIdf.loc[AIdf['cluster'] == -1, 'cluster'] = 999

# sort by cluster first, then by semantic order, and renumber id as the new row position
AIdf = (
    AIdf.sort_values(['cluster', 'sort_order'])
    .reset_index(drop=True)
    .reset_index()
    .drop(columns=["id"])
    .rename(columns={'index': 'id'})
)

AIdf


2024-09-23 09:01:55,955 - AInewsbot - INFO - Fetching embeddings for 57 headlines
2024-09-23 09:01:56,564 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-09-23 09:01:56,724 - AInewsbot - INFO - Sort with nearest neighbor sort
2024-09-23 09:01:56,733 - AInewsbot - INFO - Perform dimensionality reduction
2024-09-23 09:01:59,379 - AInewsbot - INFO - Cluster with DBSCAN


Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname,site_name,topic_str,title_topic_str,sort_order,cluster
0,0,Google News,I tried Google's new one-click AI podcast crea...,https://news.google.com/read/CBMi5AFBVV95cUxPZ...,True,2024-09-23,https://news.google.com/read/CBMi5AFBVV95cUxPZ...,news.google.com,Google News,"Big Tech, Chatbots, Deepfakes, Disinformation,...",I tried Google's new one-click AI podcast crea...,13,0
1,1,Google News,Googles NotebookLM: everyone has a podcast abo...,https://news.google.com/read/CBMiowFBVV95cUxOQ...,True,2024-09-23,https://news.google.com/read/CBMiowFBVV95cUxOQ...,news.google.com,Google News,"Big Tech, Gen AI, Google, Language Models, Not...",Googles NotebookLM: everyone has a podcast abo...,23,0
2,2,Google News,Googles NotebookLM evolves: What IT leaders ne...,https://news.google.com/read/CBMiuAFBVV95cUxNL...,True,2024-09-23,https://news.google.com/read/CBMiuAFBVV95cUxNL...,news.google.com,Google News,"Big Tech, Gen AI, Google, Language Models, Not...",Googles NotebookLM evolves: What IT leaders ne...,29,0
3,3,Feedly AI,Transform your Notes into Podcasts with Google...,https://www.geeky-gadgets.com/notes-into-podca...,True,2024-09-23,https://www.geeky-gadgets.com/notes-into-podca...,www.geeky-gadgets.com,Geeky Gadgets,"Big Tech, Gen AI, Google, Language Models, Not...",Transform your Notes into Podcasts with Google...,40,0
4,4,Feedly AI,I tried Googles new one-click AI podcast creat...,https://www.techradar.com/computing/artificial...,True,2024-09-23,https://www.techradar.com/computing/artificial...,www.techradar.com,TechRadar,"Big Tech, Chatbots, Deepfakes, Disinformation,...",I tried Googles new one-click AI podcast creat...,41,0
5,5,Google News,The chatbot becomes the teacher,https://news.google.com/read/CBMiggFBVV95cUxNU...,True,2024-09-23,https://news.google.com/read/CBMiggFBVV95cUxNU...,news.google.com,Google News,"Chatbots, Chatgpt, Cognitive Science, Consciou...",The chatbot becomes the teacher (Topics: Chatb...,0,999
6,6,Feedly AI,Uncanny Returns: Trevor Paglen and the Halluci...,https://thereader.mitpress.mit.edu/uncanny-ret...,True,2024-09-23,https://thereader.mitpress.mit.edu/uncanny-ret...,thereader.mitpress.mit.edu,The MIT Press Reader,"Art & Design, Artificial General Intelligence,...",Uncanny Returns: Trevor Paglen and the Halluci...,1,999
7,7,Feedly AI,Jony Ive Confirms Involvement in AI Hardware P...,https://www.macrumors.com/2024/09/23/jony-ive-...,True,2024-09-23,https://www.macrumors.com/2024/09/23/jony-ive-...,www.macrumors.com,MacRumors,"Art & Design, Big Tech, Chatgpt, Claude, Dario...",Jony Ive Confirms Involvement in AI Hardware P...,2,999
8,8,Feedly AI,An AI can beat CAPTCHA tests 100 per cent of t...,https://www.newscientist.com/article/2448687-a...,True,2024-09-23,https://www.newscientist.com/article/2448687-a...,www.newscientist.com,New Scientist,"Job Automation, Privacy & Surveillance, Scienc...",An AI can beat CAPTCHA tests 100 per cent of t...,3,999
9,9,Business Insider,Internal memo reveals new Amazon bonuses for s...,https://www.businessinsider.com/amazon-bonuses...,True,2024-09-23,https://www.businessinsider.com/amazon-bonuses...,www.businessinsider.com,Business Insider,"Amazon, Big Tech, Deals, Economics, Finance, G...",Internal memo reveals new Amazon bonuses for s...,4,999


In [61]:
async def write_topic_name(session, topic_list_str, max_retries=3, model=LOWCOST_MODEL):
    """Ask the chat model for a short title covering a group of headlines.

    Args:
        session: passed through unchanged to ``fetch_openai``.
        topic_list_str: headlines (with their topics in parentheses) that are
            interpolated into the prompt.
        max_retries: maximum number of request attempts.
        model: model name placed in the request payload.

    Returns:
        The JSON object parsed from the model's reply (the prompt asks for a
        "topic_title" key), or ``{}`` if every attempt raised.
    """

    TOPIC_WRITER_PROMPT = f"""
You are a topic writing assistant. I will provide a list of headlines with extracted topics in parentheses. 
Your task is to propose a name for a topic that very simply, clearly and accurately captures all the provided 
headlines in less than 7 words. You will output a JSON object with the key "topic_title".

Example Input:
In the latest issue of Caixins weekly magazine: CATL Bets on 'Skateboard Chassis' and Battery Swaps to Dispell Market Concerns (powered by AI) (Topics: Battery Swaps, Catl, China, Market Concerns, Skateboard Chassis)

AI, cheap EVs, future Chevy  the week (Topics: Chevy, Evs)

Electric Vehicles and AI: Driving the Consumer & World Forward (Topics: Consumer, Electric Vehicles, Technology)

Example Output:
{{"topic_title": "Electric Vehicles"}}

Task
Propose the name for the overall topic based on the following provided headlines and individual topics:

{topic_list_str}
"""

    # The request is identical for every attempt, so build it once.
    payload = {"model": model,
               "response_format": {"type": "json_object"},
               "messages": [{"role": "user", "content": TOPIC_WRITER_PROMPT}],
               "temperature": 0
               }

    for _ in range(max_retries):
        try:
            response = await fetch_openai(session, payload)
            return json.loads(response["choices"][0]["message"]["content"])
        except Exception as exc:
            # best effort: log the failure (request or JSON parsing) and retry
            log(f"Error: {exc}")

    # every attempt failed
    return {}
        

# show clusters
cluster_topics = []
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    async with aiohttp.ClientSession() as session:

        for i in range(30):
            tmpdf = AIdf.loc[AIdf['cluster']==i][["id", "title_topic_str"]]
            if len(tmpdf) ==0:
                break
            display(tmpdf)
            title_topic_str_list = ("\n\n".join(tmpdf['title_topic_str'].to_list()))
            cluster_topic = await write_topic_name(session, title_topic_str_list)
            cluster_topics.append(cluster_topic)
            print(cluster_topic)

    


Unnamed: 0,id,title_topic_str
0,0,"I tried Google's new one-click AI podcast creator, and now I don't know what's real anymore (Topics: Big Tech, Chatbots, Deepfakes, Disinformation, Gen AI, Google, Language Models, Opinion, Podcast Creator, Products, Reality, Review, Sergey Brin, Speech Recognition & Synthesis, Sundar Pichai)"
1,1,"Googles NotebookLM: everyone has a podcast about AI, even AI itself (Topics: Big Tech, Gen AI, Google, Language Models, Notebooklm, Podcasts, Sergey Brin, Sundar Pichai)"
2,2,"Googles NotebookLM evolves: What IT leaders need to know about its enterprise applications (Topics: Big Tech, Gen AI, Google, Language Models, Notebooklm, Retrieval Augmented Generation, Sergey Brin, Sundar Pichai)"
3,3,"Transform your Notes into Podcasts with Google's AI NotebookLM (Topics: Big Tech, Gen AI, Google, Language Models, Notebooklm, Podcasts, Products, Retrieval Augmented Generation, Science, Speech Recognition & Synthesis, Streaming)"
4,4,"I tried Googles new one-click AI podcast creator, and now I dont know whats real anymore (Topics: Big Tech, Chatbots, Deepfakes, Disinformation, Gen AI, Google, Language Models, Opinion, Podcast Creator, Products, Reality, Review, Sergey Brin, Speech Recognition & Synthesis, Sundar Pichai)"


{'topic_title': "Google's AI Podcast Innovations"}


In [62]:
# to determine which stories to cluster / include
# we could rate sites for reliability, like NYT vs some site in India we never heard of
# we could rate content for length, perplexity or something

In [63]:
# we could extract top words using tfidf, something like 
# vectorizer = TfidfVectorizer(stop_words='english')
# tfidf_matrix = vectorizer.fit_transform(documents)
# feature_names = vectorizer.get_feature_names_out()
# topics = []
# for i in range(n_topics):
#     # Get the documents in this cluster
#     cluster_docs = [doc for doc, label in zip(documents, cluster_labels) if label == i]
#     cluster_metadatas = [meta for meta, label in zip(metadatas, cluster_labels) if label == i]

#     # Get the top words for this cluster based on TF-IDF scores
#     tfidf_scores = tfidf_matrix[cluster_labels == i].sum(axis=0).A1
#     top_word_indices = tfidf_scores.argsort()[-n_words_per_topic:][::-1]
#     top_words = [feature_names[index] for index in top_word_indices]


In [64]:
# write_topic_name returns {} when every attempt fails, so fall back to an
# empty name instead of raising KeyError on a missing "topic_title".
cluster_topic_list = [obj.get('topic_title', "") for obj in cluster_topics]
cluster_topic_list

["Google's AI Podcast Innovations"]

In [65]:
# Map each cluster id to its generated name; ids beyond the named clusters get "".
n_named_clusters = len(cluster_topic_list)
AIdf['cluster_name'] = AIdf['cluster'].apply(
    lambda label: cluster_topic_list[label] if label < n_named_clusters else "")


# Save and email headlines


In [66]:
import html  # stdlib; escapes scraped text before it is placed in markup

# Build the HTML body: one numbered link per headline.
# Titles, site names and URLs come from scraped pages, so they are escaped
# to keep characters such as <, & or " from breaking the markup.
html_lines = []
for row in AIdf.itertuples():
    log(f"[{row.Index}. {row.title} - {row.site_name}]({row.actual_url})")
    link_text = html.escape(f"{row.title} - {row.site_name}", quote=False)
    href = html.escape(str(row.actual_url), quote=True)
    html_lines.append(f'{row.Index}.<a href="{href}">{link_text}</a><br />\n')
# join once instead of growing the string with += on every row
html_str = "".join(html_lines)


2024-09-23 09:02:00,081 - AInewsbot - INFO - [0. I tried Google's new one-click AI podcast creator, and now I don't know what's real anymore - Google News](https://news.google.com/read/CBMi5AFBVV95cUxPZGpTQ2hnWEticGtYUWJHNmFqTElzNWRWMl96R213XzFTMFVuZjBPaU5EUkZvRnE3OWpXZDRscWNQd1VkTmlEMXMydkxROWw2ZzlkTnVrbHAzbmFoY01HZTdDRGdvM3k5S0ZXU1VHR0JPaGFKYmN4MjdvcElwLXZnZlVWNEFYWEo5WFBGT09lNHhzbHk4MWFQUjg0RUlNYWEzV3g0ZEZmNDdYYTdzTG1OZ29oZURfcUFTdnJLT1pKT0J4THdPZjhPODZ6UjBmZWZzOEhyMzA0S1NoUUYzWTNQZF9KUWo)
2024-09-23 09:02:00,082 - AInewsbot - INFO - [1. Googles NotebookLM: everyone has a podcast about AI, even AI itself - Google News](https://news.google.com/read/CBMiowFBVV95cUxOQV9wS3pBbEVObkh2UG1xeExNbDBTbFVTN1pTWmNWal85MXgtQXhRaUF6ZFlaMGV6dWVGODBJeVRfQ1IzQS14QzR5eWE3aC15NVQ3TjRJakJpY05QWXh0MFdxbFotcmhGUzRkNGpqRHBBUmkweXZYekFDX1lwb2hDV3NYbjNrclZSWGNmcktWZ3N3VzFTTm5sMXdfVFVIeHFIR1VZ)
2024-09-23 09:02:00,082 - AInewsbot - INFO - [2. Googles NotebookLM evolves: What IT leaders need to know about its

2024-09-23 09:02:00,087 - AInewsbot - INFO - [30. AI Is Evolving Faster Than Experts, Including Bill Gates, Imagined - CNET](https://www.cnet.com/tech/computing/ai-is-evolving-even-faster-than-experts-including-bill-gates-imagined/#ftag=CAD590a51e)
2024-09-23 09:02:00,088 - AInewsbot - INFO - [31. How AI can accelerate the IT strategy creation process - within limits - SiliconANGLE - SiliconANGLE](https://siliconangle.com/2024/09/22/ai-can-accelerate-strategy-creation-process-within-limits/)
2024-09-23 09:02:00,088 - AInewsbot - INFO - [32. OpenAI Set to Launch Advanced Voice Mode on ChatGPT Soon - Analytics India Magazine](https://analyticsindiamag.com/ai-news-updates/openai-set-to-launch-advanced-voice-mode-on-chatgpt-soon/)
2024-09-23 09:02:00,088 - AInewsbot - INFO - [33. Prisoners in Finland participate in AI training programme as part of rehabilitation - Euronews](https://www.euronews.com/my-europe/2024/09/23/prisoners-in-finland-participate-in-ai-training-programme-as-part-of-re

In [67]:
# save headlines
# explicit utf-8 so non-ASCII characters in titles do not depend on the
# platform's default encoding
with open('headlines.html', 'w', encoding='utf-8') as f:
    f.write(html_str)


In [68]:
# Email the headline list; the subject carries the current time of day.
log("Sending headlines email")
sent_at = datetime.now()
subject = f'AI headlines {sent_at:%H:%M:%S}'
send_gmail(subject, html_str)


2024-09-23 09:02:00,098 - AInewsbot - INFO - Sending headlines email


# Save individual pages 

In [69]:
# fetch pages
# Put one (id, url, title) tuple per headline on a multiprocessing queue.
log("Queuing URLs for scraping")

queue = multiprocessing.Queue()
for page_id, page_url, page_title in zip(AIdf["id"], AIdf["actual_url"], AIdf["title"]):
    queue.put((page_id, page_url, page_title))


2024-09-23 09:02:01,815 - AInewsbot - INFO - Queuing URLs for scraping


In [70]:
# scrape urls in queue asynchronously
num_browsers = 4

# named url_worker rather than `callable` so the builtin callable() is not
# shadowed for the rest of the notebook
url_worker = process_url_queue_factory(queue)

log(f"fetching {len(AIdf)} pages using {num_browsers} browsers")
saved_pages = launch_drivers(num_browsers, url_worker)


2024-09-23 09:02:01,827 - AInewsbot - INFO - fetching 57 pages using 4 browsers
2024-09-23 09:02:01,829 - AInewsbot - INFO - get_driver - 29189 Initializing webdriver
2024-09-23 09:02:01,830 - AInewsbot - INFO - get_driver - 29189 Initializing webdriver
2024-09-23 09:02:01,831 - AInewsbot - INFO - get_driver - 29189 Initializing webdriver
2024-09-23 09:02:01,832 - AInewsbot - INFO - get_driver - 29189 Initializing webdriver
2024-09-23 09:02:17,720 - AInewsbot - INFO - get_driver - Initialized webdriver profile
2024-09-23 09:02:17,720 - AInewsbot - INFO - get_driver - Initialized webdriver profile
2024-09-23 09:02:17,721 - AInewsbot - INFO - get_driver - Initialized webdriver profile
2024-09-23 09:02:17,721 - AInewsbot - INFO - get_driver - Initialized webdriver profile
2024-09-23 09:02:17,721 - AInewsbot - INFO - get_driver - Initialized webdriver service
2024-09-23 09:02:17,721 - AInewsbot - INFO - get_driver - Initialized webdriver service
2024-09-23 09:02:17,722 - AInewsbot - INFO -

2024-09-23 09:03:22,806 - AInewsbot - INFO - get_url(https://www.newscientist.com/article/2448687-an-ai-can-beat-captcha-tests-100-per-cent-of-the-time/) - starting get_url https://www.newscientist.com/article/2448687-an-ai-can-beat-captcha-tests-100-per-cent-of-the-time/
2024-09-23 09:03:22,988 - AInewsbot - INFO - get_url(I tried Googles new one-click AI podcast creator, and now I dont know whats real anymore) - Saving I_tried_Googles_new_one-click_AI_podcast_creator__and_now_I_dont_know_whats_real_anymore_20240923_090322.html as utf-8
2024-09-23 09:03:22,991 - AInewsbot - INFO - Processing https://www.businessinsider.com/amazon-bonuses-sell-ai-products-q-bedrock-2024-9
2024-09-23 09:03:22,991 - AInewsbot - INFO - get_url(https://www.businessinsider.com/amazon-bonuses-sell-ai-products-q-bedrock-2024-9) - starting get_url https://www.businessinsider.com/amazon-bonuses-sell-ai-products-q-bedrock-2024-9
2024-09-23 09:03:24,340 - AInewsbot - INFO - get_url(Uncanny Returns: Trevor Paglen 

2024-09-23 09:03:56,910 - AInewsbot - INFO - Processing https://www.bloomberg.com/news/articles/2024-09-23/lightspeed-leads-65-million-round-by-india-ai-diagnostics-firm
2024-09-23 09:03:56,911 - AInewsbot - INFO - get_url(https://www.bloomberg.com/news/articles/2024-09-23/lightspeed-leads-65-million-round-by-india-ai-diagnostics-firm) - starting get_url https://www.bloomberg.com/news/articles/2024-09-23/lightspeed-leads-65-million-round-by-india-ai-diagnostics-firm
2024-09-23 09:04:00,682 - AInewsbot - INFO - get_url(AI to power the corporate Windows 11 refresh? Nobody's buying that) - Saving AI_to_power_the_corporate_Windows_11_refresh__Nobody_s_buying_that_20240923_090400.html as utf-8
2024-09-23 09:04:00,683 - AInewsbot - INFO - Processing https://restofworld.org/2024/alibaba-qwen-ai-model/
2024-09-23 09:04:00,684 - AInewsbot - INFO - get_url(https://restofworld.org/2024/alibaba-qwen-ai-model/) - starting get_url https://restofworld.org/2024/alibaba-qwen-ai-model/
2024-09-23 09:04:

2024-09-23 09:04:29,127 - AInewsbot - INFO - get_url(EC-Council Introduces AI-Powered Ethical Hacking against Cybercrime) - Saving EC-Council_Introduces_AI-Powered_Ethical_Hacking_against_Cybercrime_20240923_090429.html as utf-8
2024-09-23 09:04:29,129 - AInewsbot - INFO - Processing https://analyticsindiamag.com/ai-news-updates/openai-set-to-launch-advanced-voice-mode-on-chatgpt-soon/
2024-09-23 09:04:29,129 - AInewsbot - INFO - get_url(https://analyticsindiamag.com/ai-news-updates/openai-set-to-launch-advanced-voice-mode-on-chatgpt-soon/) - starting get_url https://analyticsindiamag.com/ai-news-updates/openai-set-to-launch-advanced-voice-mode-on-chatgpt-soon/
2024-09-23 09:04:33,690 - AInewsbot - INFO - get_url(Tears For Fears defend use of AI in new album cover art) - Saving Tears_For_Fears_defend_use_of_AI_in_new_album_cover_art_20240923_090433.html as utf-8
2024-09-23 09:04:33,692 - AInewsbot - INFO - Processing https://www.euronews.com/my-europe/2024/09/23/prisoners-in-finland-pa

2024-09-23 09:05:11,265 - AInewsbot - INFO - get_url(Probing the possibilities: AI summit held in Ashland) - Saving Probing_the_possibilities__AI_summit_held_in_Ashland_20240923_090511.html as utf-8
2024-09-23 09:05:11,266 - AInewsbot - INFO - Processing https://www.forbes.com/councils/forbesbusinessdevelopmentcouncil/2024/09/23/ai-regulation-an-underreported-issue-of-the-upcoming-elections/
2024-09-23 09:05:11,266 - AInewsbot - INFO - get_url(https://www.forbes.com/councils/forbesbusinessdevelopmentcouncil/2024/09/23/ai-regulation-an-underreported-issue-of-the-upcoming-elections/) - starting get_url https://www.forbes.com/councils/forbesbusinessdevelopmentcouncil/2024/09/23/ai-regulation-an-underreported-issue-of-the-upcoming-elections/
2024-09-23 09:05:13,753 - AInewsbot - INFO - get_url(Cursor 0.41 Update Released: AI-Assisted Coding to Supercharge Your Development Workflow) - Saving Cursor_0_41_Update_Released__AI-Assisted_Coding_to_Supercharge_Your_Development_Workflow_20240923_09

2024-09-23 09:05:55,583 - AInewsbot - INFO - get_url(A profile of Amazon executive Rohit Prasad, who now oversees a new team of thousands to develop AI products for an Alexa upgrade and other businesses (Sebastian Herrera/Wall Street Journal)) - Saving A_profile_of_Amazon_executive_Rohit_Prasad__who_now_oversees_a_new_team_of_thousands_to_develop_AI_products_for_an_Alexa_upgrade_and_other_businesses__Sebastian_Herrera_Wall_Street_Journal__20240923_090555.html as utf-8
2024-09-23 09:05:55,585 - AInewsbot - INFO - Processing https://www.geeky-gadgets.com/ai-automation-with-chatgpt-o1/
2024-09-23 09:05:55,586 - AInewsbot - INFO - get_url(https://www.geeky-gadgets.com/ai-automation-with-chatgpt-o1/) - starting get_url https://www.geeky-gadgets.com/ai-automation-with-chatgpt-o1/
2024-09-23 09:05:59,299 - AInewsbot - INFO - get_url(Vista3D: A Novel AI Framework for Rapid and Detailed 3D Object Generation from a Single Image Using Diffusion Priors) - Saving Vista3D__A_Novel_AI_Framework_for_R

In [71]:
# Display the results returned by launch_drivers for inspection.
saved_pages

[(2,
  'https://news.google.com/read/CBMiuAFBVV95cUxNLTliV0dwT2Y4dm5YVWk3eE5YSUhZTUQtX0hXdjh4MXFZNE1pbHZiQll0MWx6TlpoSENIbDJmZnVyczBPZTdxcVc2T3pfdHp3UkxVeUZZU1ZveWtLenNVTEtWLTZEUmhCdFhQaEZLdTN1YzdISWNESkhLN0hrVXhQbFQxTnpHb2VtcHhicTRYR2QydTZQSzU1VkFZOEdDOVpQV3VBZ3h6c2RIOU9wZzZ4b09nR0hOa2Fm',
  'Googles NotebookLM evolves: What IT leaders need to know about its enterprise applications',
  'htmlpages/Googles_NotebookLM_evolves__What_IT_leaders_need_to_know_about_its_enterprise_applications_20240923_090312.html'),
 (4,
  'https://www.techradar.com/computing/artificial-intelligence/i-tried-google-s-new-one-click-ai-podcast-creator-and-now-i-don-t-know-what-s-real-anymore',
  'I tried Googles new one-click AI podcast creator, and now I dont know whats real anymore',
  'htmlpages/I_tried_Googles_new_one-click_AI_podcast_creator__and_now_I_dont_know_whats_real_anymore_20240923_090322.html'),
 (9,
  'https://www.businessinsider.com/amazon-bonuses-sell-ai-products-q-bedrock-2024-9',
  'Internal 

In [72]:
# One row per saved page. Passing columns= to the constructor also yields a
# correctly labelled empty frame when saved_pages is empty, whereas assigning
# .columns afterwards raises a length-mismatch error in that case.
pages_df = pd.DataFrame(saved_pages, columns=['id', 'actual_url', 'title', 'path'])
pages_df

Unnamed: 0,id,actual_url,title,path
0,2,https://news.google.com/read/CBMiuAFBVV95cUxNL...,Googles NotebookLM evolves: What IT leaders ne...,htmlpages/Googles_NotebookLM_evolves__What_IT_...
1,4,https://www.techradar.com/computing/artificial...,I tried Googles new one-click AI podcast creat...,htmlpages/I_tried_Googles_new_one-click_AI_pod...
2,9,https://www.businessinsider.com/amazon-bonuses...,Internal memo reveals new Amazon bonuses for s...,htmlpages/Internal_memo_reveals_new_Amazon_bon...
3,12,https://www.legaldive.com/spons/the-new-chaos-...,The new Chaos in enterprise data: How experts ...,htmlpages/The_new_Chaos_in_enterprise_data__Ho...
4,16,https://news.google.com/read/CBMivAFBVV95cUxNR...,"UAE seeks closer AI, tech ties in Biden talks ...",htmlpages/UAE_seeks_closer_AI__tech_ties_in_Bi...
5,20,https://www.geeky-gadgets.com/microsoft-grin-m...,Microsoft goes Nuclear with new Grin MoE AI Model,htmlpages/Microsoft_goes_Nuclear_with_new_Grin...
6,24,https://hackernoon.com/openai-lawsuit-heres-wh...,OpenAI Lawsuit: Here's Where the Plaintiffs' C...,htmlpages/OpenAI_Lawsuit__Here_s_Where_the_Pla...
7,28,https://hackernoon.com/ec-council-introduces-a...,EC-Council Introduces AI-Powered Ethical Hacki...,htmlpages/EC-Council_Introduces_AI-Powered_Eth...
8,32,https://analyticsindiamag.com/ai-news-updates/...,OpenAI Set to Launch Advanced Voice Mode on Ch...,htmlpages/OpenAI_Set_to_Launch_Advanced_Voice_...
9,36,https://www.cnet.com/tech/services-and-softwar...,This AI Startup Is Supporting Artificial Voice...,htmlpages/This_AI_Startup_Is_Supporting_Artifi...


In [73]:
# Attach the saved-file path to each headline; the inner join keeps only
# headlines whose id appears in pages_df. Dropping any existing "path" column
# first keeps the cell re-runnable (a second run would otherwise produce
# path_x / path_y).
AIdf = pd.merge(AIdf.drop(columns=["path"], errors="ignore"),
                pages_df[["id", "path"]], on='id', how="inner")


In [74]:
# Display AIdf for inspection.
AIdf

Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname,site_name,topic_str,title_topic_str,sort_order,cluster,cluster_name,path
0,0,Google News,I tried Google's new one-click AI podcast crea...,https://news.google.com/read/CBMi5AFBVV95cUxPZ...,True,2024-09-23,https://news.google.com/read/CBMi5AFBVV95cUxPZ...,news.google.com,Google News,"Big Tech, Chatbots, Deepfakes, Disinformation,...",I tried Google's new one-click AI podcast crea...,13,0,Google's AI Podcast Innovations,htmlpages/I_tried_Google_s_new_one-click_AI_po...
1,1,Google News,Googles NotebookLM: everyone has a podcast abo...,https://news.google.com/read/CBMiowFBVV95cUxOQ...,True,2024-09-23,https://news.google.com/read/CBMiowFBVV95cUxOQ...,news.google.com,Google News,"Big Tech, Gen AI, Google, Language Models, Not...",Googles NotebookLM: everyone has a podcast abo...,23,0,Google's AI Podcast Innovations,htmlpages/Googles_NotebookLM__everyone_has_a_p...
2,2,Google News,Googles NotebookLM evolves: What IT leaders ne...,https://news.google.com/read/CBMiuAFBVV95cUxNL...,True,2024-09-23,https://news.google.com/read/CBMiuAFBVV95cUxNL...,news.google.com,Google News,"Big Tech, Gen AI, Google, Language Models, Not...",Googles NotebookLM evolves: What IT leaders ne...,29,0,Google's AI Podcast Innovations,htmlpages/Googles_NotebookLM_evolves__What_IT_...
3,3,Feedly AI,Transform your Notes into Podcasts with Google...,https://www.geeky-gadgets.com/notes-into-podca...,True,2024-09-23,https://www.geeky-gadgets.com/notes-into-podca...,www.geeky-gadgets.com,Geeky Gadgets,"Big Tech, Gen AI, Google, Language Models, Not...",Transform your Notes into Podcasts with Google...,40,0,Google's AI Podcast Innovations,htmlpages/Transform_your_Notes_into_Podcasts_w...
4,4,Feedly AI,I tried Googles new one-click AI podcast creat...,https://www.techradar.com/computing/artificial...,True,2024-09-23,https://www.techradar.com/computing/artificial...,www.techradar.com,TechRadar,"Big Tech, Chatbots, Deepfakes, Disinformation,...",I tried Googles new one-click AI podcast creat...,41,0,Google's AI Podcast Innovations,htmlpages/I_tried_Googles_new_one-click_AI_pod...
5,5,Google News,The chatbot becomes the teacher,https://news.google.com/read/CBMiggFBVV95cUxNU...,True,2024-09-23,https://news.google.com/read/CBMiggFBVV95cUxNU...,news.google.com,Google News,"Chatbots, Chatgpt, Cognitive Science, Consciou...",The chatbot becomes the teacher (Topics: Chatb...,0,999,,htmlpages/The_chatbot_becomes_the_teacher_2024...
6,6,Feedly AI,Uncanny Returns: Trevor Paglen and the Halluci...,https://thereader.mitpress.mit.edu/uncanny-ret...,True,2024-09-23,https://thereader.mitpress.mit.edu/uncanny-ret...,thereader.mitpress.mit.edu,The MIT Press Reader,"Art & Design, Artificial General Intelligence,...",Uncanny Returns: Trevor Paglen and the Halluci...,1,999,,htmlpages/Uncanny_Returns__Trevor_Paglen_and_t...
7,7,Feedly AI,Jony Ive Confirms Involvement in AI Hardware P...,https://www.macrumors.com/2024/09/23/jony-ive-...,True,2024-09-23,https://www.macrumors.com/2024/09/23/jony-ive-...,www.macrumors.com,MacRumors,"Art & Design, Big Tech, Chatgpt, Claude, Dario...",Jony Ive Confirms Involvement in AI Hardware P...,2,999,,htmlpages/Jony_Ive_Confirms_Involvement_in_AI_...
8,8,Feedly AI,An AI can beat CAPTCHA tests 100 per cent of t...,https://www.newscientist.com/article/2448687-a...,True,2024-09-23,https://www.newscientist.com/article/2448687-a...,www.newscientist.com,New Scientist,"Job Automation, Privacy & Surveillance, Scienc...",An AI can beat CAPTCHA tests 100 per cent of t...,3,999,,htmlpages/An_AI_can_beat_CAPTCHA_tests_100_per...
9,9,Business Insider,Internal memo reveals new Amazon bonuses for s...,https://www.businessinsider.com/amazon-bonuses...,True,2024-09-23,https://www.businessinsider.com/amazon-bonuses...,www.businessinsider.com,Business Insider,"Amazon, Big Tech, Deals, Economics, Finance, G...",Internal memo reveals new Amazon bonuses for s...,4,999,,htmlpages/Internal_memo_reveals_new_Amazon_bon...


# Summarize individual pages

In [75]:
# Print the SUMMARIZE_SYSTEM_PROMPT constant so its text can be inspected.
print(SUMMARIZE_SYSTEM_PROMPT)


You are a summarization assistant.
You will summarize the main content of provided text from HTML files in 3 bullet points or less.
You will focus on the top 3 points of the text and keep the response concise."
You will ignore any content that appears to be navigation menus, footers, sidebars, or other boilerplate content.
You will output Markdown format.
You will provide the bullet points only, without any introduction such as 'here are' or any conclusion, or comment.



In [76]:
# Print the SUMMARIZE_USER_PROMPT constant so its text can be inspected.
print(SUMMARIZE_USER_PROMPT)


Summarize the main points of the following text concisely in 3 bullet points or less.
Ignore any content that is navigation, user instructions, disclaimers, sidebars, or boilerplate hype about the site.
If the text contains primarily boilerplate content, or a request for the user to log in or perform an action to prove they are human,
return a single bullet point describing what was found. Text:



In [77]:
# Here we are fetching all at once, could be 200 summaries, so we are firing off 200 REST requests at once
# This seems like a bad idea, could loop through and fire off e.g. 10 at a time, or use queues and workers (seems pointless)
# But it works and runs fast on gpt-4o-minif and if ChatGPT doesn't like it they could throttle it

log("Starting summarize")
responses = await fetch_all_summaries(AIdf)
log(f"Received {len(responses)} summaries")
print(responses[0])


2024-09-23 09:07:49,871 - AInewsbot - INFO - Starting summarize
2024-09-23 09:07:49,943 - AInewsbot - INFO - fetch_all_summaries - Page title: I tried Google’s new one-click AI podcast creator, and now I don’t know what’s real anymore | TechRadar
Social card title: I tried Google’s new one-click AI podcast creator, and now I don’t know what’s real anymore
Social card description: This one could have real implications for the future

2024-09-23 09:07:50,035 - AInewsbot - INFO - fetch_all_summaries - Page title: Google's NotebookLM: everyone has a podcast about AI, even AI itself - fxguide
Social card title: Google’s NotebookLM: everyone has a podcast about AI, even AI itself
Social card description: Another incredible jump in large language models (LLM) and generative AI (GenAI) but how does it affect visual effects (vfx)?

2024-09-23 09:07:50,062 - AInewsbot - INFO - fetch_all_summaries - Page title: Google’s NotebookLM evolves: What IT leaders need to know about its enterprise applica

2024-09-23 09:07:50,909 - AInewsbot - INFO - fetch_all_summaries - Page title: Alibaba’s Qwen AI model challenges U.S. dominance despite chip restrictions
Social card title: China’s biggest AI model is challenging American dominance
Social card description: Alibaba’s Qwen has been shining on benchmark tests, despite chip restrictions.

2024-09-23 09:07:50,948 - AInewsbot - INFO - fetch_all_summaries - Page title: The new OpenAI logo is already causing controversy (and we haven't even seen it yet) | Creative Bloq
Social card title: The new OpenAI logo is already causing controversy (and we haven't even seen it yet)
Social card description: The design has prompted a fierce internal backlash.

2024-09-23 09:07:51,003 - AInewsbot - INFO - fetch_all_summaries - Page title: OpenAI Lawsuit: Here's Where the Plaintiffs' Claims Fail and Why | HackerNoon
Social card title: OpenAI Lawsuit: Here's Where the Plaintiffs' Claims Fail and Why | HackerNoon
Social card description: DOE v. GITHUB Court F

2024-09-23 09:07:52,061 - AInewsbot - INFO - fetch_all_summaries - Page title: Cursor 0.41 Update Released: AI-Assisted Coding - Geeky Gadgets
Social card title: Cursor 0.41 Update Released: AI-Assisted Coding to Supercharge Your Development Workflow
Social card description: The development team responsible for creating the powerful Cursor AI Code Editor has just released its latest update, version 0.41, which brings a host of

2024-09-23 09:07:52,095 - AInewsbot - INFO - fetch_all_summaries - Page title: You Won't Believe What Salesforce CEO Marc Benioff Said About Microsoft's Copilot Artificial Intelligence (AI) Assistant | The Motley Fool
Social card title: You Won't Believe What Salesforce CEO Marc Benioff Said About Microsoft's Copilot Artificial Intelligence (AI) Assistant | The Motley Fool
Social card description: Microsoft has hailed the benefits of its suite of digital AI assistants. Not everyone is convinced.

2024-09-23 09:07:52,136 - AInewsbot - INFO - fetch_all_summaries -

(0, {'id': 'chatcmpl-AAd56S4B9tzgSUQln8U16yAPBkzyy', 'object': 'chat.completion', 'created': 1727096872, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "- Google's NotebookLM AI can convert written articles into realistic audio conversations, raising questions about the nature of reality and authenticity in media.\n- The AI-generated content can expand upon the original material with new insights and examples, making it feel highly credible and engaging.\n- This technology marks the beginning of a significant shift in media production, with potential concerns about its implications for education and personal expression.", 'refusal': None}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 1135, 'completion_tokens': 77, 'total_tokens': 1212, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'system_fingerprint': 'fp_e9627b5346'})


In [78]:
# Bring summaries into a dict: each item of `responses` is a (key, response)
# pair; keep the assistant message text of the first choice under that key.
response_dict = {}
for i, response in responses:
    try:
        response_dict[i] = response["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        # Payload without the expected choices/message/content shape (or not
        # subscriptable at all): report which key failed and keep going, so one
        # bad response does not abort the whole run.
        print(f"no summary for {i}: {exc!r}")

len(response_dict)

57

In [79]:
# Derive the hostname from each row's actual_url, then look up the site's
# display name in sites_dict ("" when the hostname has no entry).
AIdf['hostname'] = AIdf['actual_url'].map(lambda link: urlparse(link).netloc)
AIdf['site_name'] = AIdf['hostname'].map(lambda host: sites_dict.get(host, ""))


In [80]:
# Bare expression as the last line of the cell: the notebook renders AIdf.
AIdf


Unnamed: 0,id,src,title,url,isAI,date,actual_url,hostname,site_name,topic_str,title_topic_str,sort_order,cluster,cluster_name,path
0,0,Google News,I tried Google's new one-click AI podcast crea...,https://news.google.com/read/CBMi5AFBVV95cUxPZ...,True,2024-09-23,https://news.google.com/read/CBMi5AFBVV95cUxPZ...,news.google.com,Google News,"Big Tech, Chatbots, Deepfakes, Disinformation,...",I tried Google's new one-click AI podcast crea...,13,0,Google's AI Podcast Innovations,htmlpages/I_tried_Google_s_new_one-click_AI_po...
1,1,Google News,Googles NotebookLM: everyone has a podcast abo...,https://news.google.com/read/CBMiowFBVV95cUxOQ...,True,2024-09-23,https://news.google.com/read/CBMiowFBVV95cUxOQ...,news.google.com,Google News,"Big Tech, Gen AI, Google, Language Models, Not...",Googles NotebookLM: everyone has a podcast abo...,23,0,Google's AI Podcast Innovations,htmlpages/Googles_NotebookLM__everyone_has_a_p...
2,2,Google News,Googles NotebookLM evolves: What IT leaders ne...,https://news.google.com/read/CBMiuAFBVV95cUxNL...,True,2024-09-23,https://news.google.com/read/CBMiuAFBVV95cUxNL...,news.google.com,Google News,"Big Tech, Gen AI, Google, Language Models, Not...",Googles NotebookLM evolves: What IT leaders ne...,29,0,Google's AI Podcast Innovations,htmlpages/Googles_NotebookLM_evolves__What_IT_...
3,3,Feedly AI,Transform your Notes into Podcasts with Google...,https://www.geeky-gadgets.com/notes-into-podca...,True,2024-09-23,https://www.geeky-gadgets.com/notes-into-podca...,www.geeky-gadgets.com,Geeky Gadgets,"Big Tech, Gen AI, Google, Language Models, Not...",Transform your Notes into Podcasts with Google...,40,0,Google's AI Podcast Innovations,htmlpages/Transform_your_Notes_into_Podcasts_w...
4,4,Feedly AI,I tried Googles new one-click AI podcast creat...,https://www.techradar.com/computing/artificial...,True,2024-09-23,https://www.techradar.com/computing/artificial...,www.techradar.com,TechRadar,"Big Tech, Chatbots, Deepfakes, Disinformation,...",I tried Googles new one-click AI podcast creat...,41,0,Google's AI Podcast Innovations,htmlpages/I_tried_Googles_new_one-click_AI_pod...
5,5,Google News,The chatbot becomes the teacher,https://news.google.com/read/CBMiggFBVV95cUxNU...,True,2024-09-23,https://news.google.com/read/CBMiggFBVV95cUxNU...,news.google.com,Google News,"Chatbots, Chatgpt, Cognitive Science, Consciou...",The chatbot becomes the teacher (Topics: Chatb...,0,999,,htmlpages/The_chatbot_becomes_the_teacher_2024...
6,6,Feedly AI,Uncanny Returns: Trevor Paglen and the Halluci...,https://thereader.mitpress.mit.edu/uncanny-ret...,True,2024-09-23,https://thereader.mitpress.mit.edu/uncanny-ret...,thereader.mitpress.mit.edu,The MIT Press Reader,"Art & Design, Artificial General Intelligence,...",Uncanny Returns: Trevor Paglen and the Halluci...,1,999,,htmlpages/Uncanny_Returns__Trevor_Paglen_and_t...
7,7,Feedly AI,Jony Ive Confirms Involvement in AI Hardware P...,https://www.macrumors.com/2024/09/23/jony-ive-...,True,2024-09-23,https://www.macrumors.com/2024/09/23/jony-ive-...,www.macrumors.com,MacRumors,"Art & Design, Big Tech, Chatgpt, Claude, Dario...",Jony Ive Confirms Involvement in AI Hardware P...,2,999,,htmlpages/Jony_Ive_Confirms_Involvement_in_AI_...
8,8,Feedly AI,An AI can beat CAPTCHA tests 100 per cent of t...,https://www.newscientist.com/article/2448687-a...,True,2024-09-23,https://www.newscientist.com/article/2448687-a...,www.newscientist.com,New Scientist,"Job Automation, Privacy & Surveillance, Scienc...",An AI can beat CAPTCHA tests 100 per cent of t...,3,999,,htmlpages/An_AI_can_beat_CAPTCHA_tests_100_per...
9,9,Business Insider,Internal memo reveals new Amazon bonuses for s...,https://www.businessinsider.com/amazon-bonuses...,True,2024-09-23,https://www.businessinsider.com/amazon-bonuses...,www.businessinsider.com,Business Insider,"Amazon, Big Tech, Deals, Economics, Finance, G...",Internal memo reveals new Amazon bonuses for s...,4,999,,htmlpages/Internal_memo_reveals_new_Amazon_bon...


In [81]:
# make text for email and also collect data for vector store
#   markdown_str     - numbered markdown entry per story, concatenated
#   vectorstore_list - simpler per-story text (no numbering, topics only)
#   metadata_list    - per-story dict with id, title, url, site
markdown_str = ''
markdown_parts = []
vectorstore_list = []
metadata_list = []
for i, row in enumerate(AIdf.itertuples()):
    # heading topics: cluster name first (when present), then the row's topic list
    topics = []
    if row.cluster_name:
        topics.append(row.cluster_name)
    if row.topic_str:
        topics.append(row.topic_str)
    topic_str = ", ".join(topics)

    # A story with no entry in response_dict (or a None summary) used to be
    # interpolated as the literal text "None"; emit an empty summary instead.
    summary = response_dict.get(row.id) or ""

    mdstr = f"[{i+1}. {row.title} - {row.site_name}]({row.actual_url})  \n\n {topic_str}  \n\n{summary} \n\n"
    # simpler version for vector store
    vectorstore_list.append(f"[{row.title} - {row.site_name}]({row.actual_url})\n\nTopics: {row.topic_str} \n\n{summary}\n\n")
    metadata_list.append({'id': row.id, 'title': row.title, 'url': row.actual_url, 'site': row.site_name})
    display(Markdown(mdstr))
    markdown_parts.append(mdstr)

# join once at the end rather than growing the string with += on every row
markdown_str = "".join(markdown_parts)
    

[1. I tried Google's new one-click AI podcast creator, and now I don't know what's real anymore - Google News](https://news.google.com/read/CBMi5AFBVV95cUxPZGpTQ2hnWEticGtYUWJHNmFqTElzNWRWMl96R213XzFTMFVuZjBPaU5EUkZvRnE3OWpXZDRscWNQd1VkTmlEMXMydkxROWw2ZzlkTnVrbHAzbmFoY01HZTdDRGdvM3k5S0ZXU1VHR0JPaGFKYmN4MjdvcElwLXZnZlVWNEFYWEo5WFBGT09lNHhzbHk4MWFQUjg0RUlNYWEzV3g0ZEZmNDdYYTdzTG1OZ29oZURfcUFTdnJLT1pKT0J4THdPZjhPODZ6UjBmZWZzOEhyMzA0S1NoUUYzWTNQZF9KUWo)  

 Google's AI Podcast Innovations, Big Tech, Chatbots, Deepfakes, Disinformation, Gen AI, Google, Language Models, Opinion, Podcast Creator, Products, Reality, Review, Sergey Brin, Speech Recognition & Synthesis, Sundar Pichai  

- Google's NotebookLM AI can convert written articles into realistic audio conversations, raising questions about the nature of reality and authenticity in media.
- The AI-generated content can expand upon the original material with new insights and examples, making it feel highly credible and engaging.
- This technology marks the beginning of a significant shift in media production, with potential concerns about its implications for education and personal expression. 



[2. Googles NotebookLM: everyone has a podcast about AI, even AI itself - Google News](https://news.google.com/read/CBMiowFBVV95cUxOQV9wS3pBbEVObkh2UG1xeExNbDBTbFVTN1pTWmNWal85MXgtQXhRaUF6ZFlaMGV6dWVGODBJeVRfQ1IzQS14QzR5eWE3aC15NVQ3TjRJakJpY05QWXh0MFdxbFotcmhGUzRkNGpqRHBBUmkweXZYekFDX1lwb2hDV3NYbjNrclZSWGNmcktWZ3N3VzFTTm5sMXdfVFVIeHFIR1VZ)  

 Google's AI Podcast Innovations, Big Tech, Gen AI, Google, Language Models, Notebooklm, Podcasts, Sergey Brin, Sundar Pichai  

- Google’s NotebookLM, powered by the LLM Gemini 1.5 Pro, enables users to upload documents and receive podcast-like discussions summarizing the content, marking a significant advancement in AI conversational agents.
- While it provides an accessible way for technical artists to grasp complex subjects, it lacks true understanding and doesn’t offer insights or opinions about the technology it summarizes.
- The potential impact on the visual effects (VFX) industry is seen as both disruptive and creative, requiring adaptation to leverage AI advancements for new and engaging viewer experiences. 



[3. Googles NotebookLM evolves: What IT leaders need to know about its enterprise applications - Google News](https://news.google.com/read/CBMiuAFBVV95cUxNLTliV0dwT2Y4dm5YVWk3eE5YSUhZTUQtX0hXdjh4MXFZNE1pbHZiQll0MWx6TlpoSENIbDJmZnVyczBPZTdxcVc2T3pfdHp3UkxVeUZZU1ZveWtLenNVTEtWLTZEUmhCdFhQaEZLdTN1YzdISWNESkhLN0hrVXhQbFQxTnpHb2VtcHhicTRYR2QydTZQSzU1VkFZOEdDOVpQV3VBZ3h6c2RIOU9wZzZ4b09nR0hOa2Fm)  

 Google's AI Podcast Innovations, Big Tech, Gen AI, Google, Language Models, Notebooklm, Retrieval Augmented Generation, Sergey Brin, Sundar Pichai  

- Google’s NotebookLM, launched in July 2023, allows users to organize various documents and utilize features like audio generation to explain content, enhancing research efficiency for both students and professionals.
- The tool has gained traction in enterprise applications, with corporate teams leveraging its capabilities to share research and analysis, and enabling features such as the generation of engaging podcast-like discussions.
- Future updates for NotebookLM may include customizable audio features and broader integration with other Google productivity platforms as users explore diverse use cases. 



[4. Transform your Notes into Podcasts with Google's AI NotebookLM - Geeky Gadgets](https://www.geeky-gadgets.com/notes-into-podcasts-with-notebooklm/)  

 Google's AI Podcast Innovations, Big Tech, Gen AI, Google, Language Models, Notebooklm, Podcasts, Products, Retrieval Augmented Generation, Science, Speech Recognition & Synthesis, Streaming  

- Google’s NotebookLM uses advanced Gemini AI to convert written notes into audio podcasts, featuring an innovative "audio overview" functionality for engaging summaries.
- The tool centralizes document management, allowing users to easily upload and organize notes for efficient access and reviewing critical points.
- NotebookLM enhances accessibility with multilingual support, making its AI-driven note transformation capabilities available to a diverse global user base. 



[5. I tried Googles new one-click AI podcast creator, and now I dont know whats real anymore - TechRadar](https://www.techradar.com/computing/artificial-intelligence/i-tried-google-s-new-one-click-ai-podcast-creator-and-now-i-don-t-know-what-s-real-anymore)  

 Google's AI Podcast Innovations, Big Tech, Chatbots, Deepfakes, Disinformation, Gen AI, Google, Language Models, Opinion, Podcast Creator, Products, Reality, Review, Sergey Brin, Speech Recognition & Synthesis, Sundar Pichai  

- Google’s one-click AI podcast creator, NotebookLM, generates realistic audio conversations from text, raising concerns about the authenticity of information.
- The technology not only summarizes key concepts from articles but also expands on them, providing additional examples and insights.
- As AI advances, there are growing worries about its impact on real-world media and the credibility of personal statements in academic applications. 



[6. The chatbot becomes the teacher - Google News](https://news.google.com/read/CBMiggFBVV95cUxNUkhaYUp2R3lNYUxIbmlyeWdsZnpld1Z5eTc4SDZEaFFtTnFEUDZQTU5JVEZpTkI2cVBZWksyaWFuNXJpdml0TmNKem5heFRzU2U5bnVieXhTTW9DaW1LMXFDY0NuOFJVdmQwdlBveGw5RFdrZWtST0JKNkExWmNwNVR3)  

 Chatbots, Chatgpt, Cognitive Science, Consciousness, Customer Service, Education, Ethics, Gen AI, Job Automation, Language Models, Science, Society & Culture, Speech Recognition & Synthesis, Virtual Assistants  

- Steven Johnson, an author focused on science and technology, is now working at Google on NotebookLM, a note-taking and research tool powered by AI.
- NotebookLM, initially launched as Project Tailwind, utilizes AI to help users organize thoughts, extract information, and create content, including generating audio podcasts from user inputs.
- The Vergecast episode features Johnson discussing the implications of AI in research, the importance of fact-checking, and the evolving role of AI in creative processes. 



[7. Uncanny Returns: Trevor Paglen and the Hallucinatory Domain of Generative AI - The MIT Press Reader](https://thereader.mitpress.mit.edu/uncanny-returns-trevor-paglen-and-the-hallucinatory-domain-of-generative-ai/)  

 Art & Design, Artificial General Intelligence, Consciousness, Ethics, Gen AI, Generative AI, Language Models, Opinion, Retrieval Augmented Generation, Science, Society & Culture  

- Trevor Paglen’s artwork, including “Rainbow,” critiques AI-generated imagery's role in further estranging us from the real world, highlighting the biases and hallucinations produced by generative adversarial networks (GANs).
- The implications of machine learning technologies extend beyond image production, as they potentially dictate societal norms and behaviors, impacting perceptions of identity, risk, and classification through opaque algorithms.
- Paglen advocates for a critical understanding of AI systems, urging research methodologies that can illuminate the probabilistic nature of machine classifications, rather than accepting them as definitive truths. 



[8. Jony Ive Confirms Involvement in AI Hardware Project With OpenAI - MacRumors](https://www.macrumors.com/2024/09/23/jony-ive-working-on-new-device-openai/)  

 Art & Design, Big Tech, Chatgpt, Claude, Dario Amodei, Gen AI, Hardware, Ilya Sutskever, Jony Ive, Open Source, OpenAI, Products, Sam Altman, Science  

- Jony Ive has confirmed his involvement in an AI hardware project with OpenAI, ending speculation about their collaboration.
- The venture, funded by Ive and Emerson Collective, could secure up to $1 billion in funding and currently has a small team of 10 employees.
- The project aims to leverage generative AI for innovative computing devices, with its design being led by Ive's firm, LoveFrom, in a significant new office space in San Francisco. 



[9. An AI can beat CAPTCHA tests 100 per cent of the time - New Scientist](https://www.newscientist.com/article/2448687-an-ai-can-beat-captcha-tests-100-per-cent-of-the-time/)  

 Job Automation, Privacy & Surveillance, Science, Testing  

- An AI model, YOLO, can solve CAPTCHA tests designed to differentiate humans from bots with 100% accuracy.
- The AI was trained on thousands of road scene images specifically to master the reCAPTCHA v2 challenges used by Google.
- This development raises concerns about the effectiveness of CAPTCHA as a security measure against automated bots. 



[10. Internal memo reveals new Amazon bonuses for selling flagship AI products - Business Insider](https://www.businessinsider.com/amazon-bonuses-sell-ai-products-q-bedrock-2024-9)  

 Amazon, Big Tech, Deals, Economics, Finance, Gen AI, Jobs & Careerslabor Market, Products  

- AWS has introduced new financial incentives for its sales team to sell AI products Q and Bedrock, including bonuses up to $20,000 per customer.
- The competition from Microsoft, Google, and OpenAI has intensified, prompting AWS to enhance its AI offerings and sales strategies.
- AWS is also implementing performance targets for AI sales and considering higher pay for AI specialists, along with a new internal sales campaign to encourage broader participation. 



[11. Introducing the OpenAI Academy - OpenAI](https://openai.com/global-affairs/openai-academy)  

 Artificial General Intelligence, Big Tech, Chatgpt, Claude, Code Assistants, Dario Amodei, Education, Gen AI, Ilya Sutskever, Language Models, Open Source, OpenAI, Sam Altman  

- OpenAI Academy is a new initiative aimed at investing in developers and organizations in low- and middle-income countries to promote the use of AI for economic growth and innovation.
- The program will provide training, technical guidance, API credits totaling $1 million, and support for community-building to enhance local AI talent and resources.
- OpenAI emphasizes the importance of culturally aware AI applications, aiming to make transformative technology accessible to diverse global communities. 



[12. AI can generate recipes that can be deadly. Food bloggers are not happy - NPR](https://www.npr.org/2024/09/23/g-s1-23843/artificial-intelligence-recipes-food-cooking-apple)  

 AI Doom, Ethics, Food & Drink, Health & Fitness, Safety And Alignment, Science, Society & Culture  

- Major tech companies are integrating AI recipe generation into their devices, but the reception among food bloggers and chefs is largely negative due to concerns over safety and quality.
- AI-generated recipes can be dangerous, as evidenced by instances where harmful ingredient combinations were suggested, potentially leading to health risks for users.
- Food bloggers like the Leungs emphasize the human aspect of recipe development, arguing that AI lacks the culinary experience and sensory understanding that are critical to creating effective and enjoyable recipes. 



[13. The new Chaos in enterprise data: How experts are planning for GenAI prompt data - Legal Dive](https://www.legaldive.com/spons/the-new-chaos-in-enterprise-data-how-experts-are-planning-for-genai-prom/727184/)  

 Artificial General Intelligence, Gen AI, Gen AI, Language Models, Retrieval Augmented Generation, Science  

- The rapid increase in generative AI prompts, from 250 million in January 2024 to a projected 1 billion monthly by year-end, presents significant challenges for eDiscovery and compliance in legal technology.
- Experts indicate that organizations are in early stages of deciding whether to collect GenAI prompt data, with mixed views on its urgency and future implications for legal operations.
- The evolving chaos of managing vast amounts of AI-generated data calls for new IT policies, information governance, and proactive planning to address its impact on legal practices and compliance. 



[14. Leadership In The Age Of AI: The Evolving Role Of The CMO - Forbes](https://www.forbes.com/councils/forbesbusinesscouncil/2024/09/23/leadership-in-the-age-of-ai-the-evolving-role-of-the-cmo/)  

 Gen AI, Governance, Opinion  

- The rise of AI, particularly generative AI, presents both challenges and opportunities for Chief Marketing Officers (CMOs), necessitating a reevaluation of their leadership roles in strategy, talent management, workflows, and risk mitigation.
- A Deloitte survey indicates rapid adoption of GenAI in marketing, with expectations of 71% of organizations using it by the end of 2024, reflecting the growing demand for content in a competitive market.
- CMOs need to enhance their technology fluency, experiment with AI tools, monitor tech advancements, and maintain trust in AI outputs to effectively leverage AI in marketing and address new risks and operational changes. 



[15. LinkedIn is training AI on you  unless you opt out with this setting - The Washington Post](https://www.washingtonpost.com/technology/2024/09/23/linkedin-training-ai-setting-opt-out/)  

 Bias And Fairness, Big Tech, Cybersecurity, Ethics, Gen AI, Jobs & Careerslabor Market, Policy And Regulation, Privacy, Privacy & Surveillance, Safety And Alignment  

- LinkedIn has permission to use user-generated content to train its artificial intelligence by default.
- Users must opt out if they don't want their data used for AI training.
- The article discusses concerns regarding privacy and the potential misuse of personal data on the platform. 



[16. UAE president to meet Joe Biden in push for more US AI technology - Financial Times](https://www.ft.com/content/e85bef92-5f53-4f71-bb3b-a0d65f3a4c48)  

 Big Tech, Gen AI, Governance, Politics, Uae  

- The UAE president, Sheikh Mohamed bin Zayed al-Nahyan, is meeting with President Joe Biden to discuss cooperation in artificial intelligence and to facilitate access to advanced US technology.
- The meeting follows a period of strained US-UAE relations, highlighting the strategic importance of AI in the UAE’s plan to diversify its economy away from fossil fuels.
- The UAE aims to establish a framework for US-UAE cooperation on AI and seeks changes to export designations to ease the acquisition of US-made AI chips amid previous restrictions. 



[17. UAE seeks closer AI, tech ties in Biden talks as China interest stirs US concern - Google News](https://news.google.com/read/CBMivAFBVV95cUxNRFBsX2V1ZUxTRlJqd1Z3YlgzOEljOG5WTUZ6SVl6RGp5eU15MFNFRkJXQTFDczZ0X3ZERVRMdFlQWWxqR1NXYjBYSzd5aWstR1U2SThSLS1GV1JaeXhoM210bnVPTnVjZWxjSFo5Rk52Rm4yWDRrQk5JVVBDdVJPYUdYVDBQYVdNXzNKelZKaW94a0ZUd0tVQVFaMTNScjlwN0hUNTVTNk95emhRTWdWR2szZzU1M1lQOWJoTg)  

 Big Tech, China, Gen AI, Governance, Policy And Regulation, Politics, Safety And Alignment, Science, Uae  

- President Joe Biden will host UAE President Sheikh Mohamed for talks focused on AI and technology, amid concerns over the UAE's ties with China.
- The UAE aims to strengthen its tech industry with significant investments, including a recent $1.5 billion investment from Microsoft, while distancing itself from Chinese technology.
- UAE officials emphasize the importance of developing their own AI and tech capabilities to maintain control over technology and avoid dependence on external actors. 



[18. Sam Altman is joining forces with design guru Jony Ive and Laurene Powell Jobs to build a new AI device company - Business Insider](https://www.businessinsider.com/sam-altman-working-jony-ive-steve-jobs-widow-ai-device-2024-9)  

 Apple, Art & Design, Big Tech, Gen AI, Hardware, Jony Ive, Mergers And Acquisitions, Products, Sam Altman, Science, Venture Capital  

- Sam Altman is collaborating with Jony Ive and Laurene Powell Jobs to develop a new AI-centric device, potentially raising up to $1 billion in funding by the end of 2024.
- The partnership was facilitated by Airbnb CEO Brian Chesky, who introduced Altman and Ive during discussions about AI technology.
- Altman expressed that while he sees potential in AI devices, he does not aim to compete with smartphones, which he regards as exceptional. 



[19. AI to power the corporate Windows 11 refresh? Nobody's buying that - The Register](https://go.theregister.com/feed/www.theregister.com/2024/09/23/windows_11_ai_opinion/)  

 Big Tech, Corporate Software, Hardware, Microsoft, Products, Windows 11  

- Microsoft faces skepticism in promoting an AI-driven refresh of corporate Windows 11, with industry experts suggesting that the push for upgrades lacks compelling reasons or features.
- The article draws a comparison with Apple's success, highlighting the need for meaningful innovation in corporate AI that resonates with organizational needs for data management.
- The potential for PC manufacturers to benefit lies in creating functional AI tools that address actual business challenges, rather than simply upgrading hardware for the sake of it. 



[20. AI copilots are evolving into AI agents designed to take actions on behalf of users, as business software companies experiment with ways to sell generative AI (Financial Times) - Twitter](https://t.co/bPhoHNP74U)  

 Big Tech, Code Assistants, Copilot, Economics, Finance, Fintech, Gen AI, Generative AI, Job Automation, Language Models, Products, Science, Virtual Assistants  

- The article introduces the next generation of AI-powered assistants, moving beyond current copilots.
- Registration offers temporary access to read this article and others for free for 30 days without requiring card details.
- Subscription options include essential digital access with discounts for annual payments. 



[21. Microsoft goes Nuclear with new Grin MoE AI Model - Geeky Gadgets](https://www.geeky-gadgets.com/microsoft-grin-moe-ai-model/)  

 Big Tech, Bill Gates, Gen AI, Language Models, Microsoft, Nuclear, Satya Nadella  

- Microsoft has developed a new AI model, Grin MoE, which offers superior performance and efficiency, and is backed by a partnership with Constellation Energy to reactivate the Three Mile Island nuclear plant for sustainable energy.
- The Grin MoE model utilizes a "mixture of experts" approach, enhancing its task-specific capabilities and enabling improved reasoning for complex problem-solving.
- Microsoft's initiatives, fueled by sustainable energy and industry collaborations, aim to transform sectors and navigate the implications of AI development, emphasizing the need for ethical considerations and human oversight. 



[22. Mumbai-based Qure.AI, which uses AI to help identify and manage critical diseases at 3K+ imaging sites, raised a $65M Series D led by Lightspeed and 360 ONE (Saritha Rai/Bloomberg) - Bloomberg](https://www.bloomberg.com/news/articles/2024-09-23/lightspeed-leads-65-million-round-by-india-ai-diagnostics-firm)  

 Computer Vision, Deals, Economics, Finance, Funding, Gen AI, Health & Fitness, Healthcare, India, Jobs & Careerslabor Market, Products, Safety And Alignment, Science, Stocks, Venture Capital  

- Indian startup Qure.AI secured $65 million in funding led by Lightspeed Venture Partners to enhance its AI-driven disease detection products.  
- The funding will be used to expand in the US and other markets, improve AI models, and support acquisitions, with the aim of navigating healthcare bottlenecks.  
- Qure.AI's technology is already being utilized in over 3,000 imaging sites across 90 countries, surpassing human capabilities in diagnosing critical diseases. 



[23. China's biggest AI model is challenging American dominance - Rest of World](https://restofworld.org/2024/alibaba-qwen-ai-model/)  

 Baidu, Big Tech, China, Gen AI, Language Models, Science, Taiwan  

- Alibaba's Qwen AI model is emerging as a strong competitor against U.S. models like OpenAI's, achieving top rankings in AI performance benchmarks despite semiconductor restrictions.
- Qwen benefits from a large user base in China, with over 2.2 million corporate users, providing a viable alternative to U.S. AI services that are often inaccessible to Chinese companies.
- Qwen's capabilities include strong multilingual support and advanced performance in areas like formal mathematics, suggesting potential for significant impact on global business communication. 



[24. The new OpenAI logo is already causing controversy (and we haven't even seen it yet) - Creative Bloq](https://www.creativebloq.com/design/logos-icons/the-new-openai-logo-is-already-causing-controversy-and-we-havent-even-seen-it-yet)  

 Art & Design, Bias And Fairness, Big Tech, Ethics, Intellectual Property, OpenAI, Opinion  

- OpenAI's new logo design, described as "uninspiring" and "ominous" by employees, has generated significant internal backlash following its reveal in a company-wide meeting.
- The new logo, which features a simple circle that could represent an 'O' or zero, contrasts sharply with the current design that is cherished by the AI community for symbolizing "precision, potential and optimism."
- The backlash indicates the potential for the rebrand to become one of the most controversial in recent years if OpenAI decides to proceed with it despite internal dissatisfaction. 



[25. OpenAI Lawsuit: Here's Where the Plaintiffs' Claims Fail and Why - Hacker Noon](https://hackernoon.com/openai-lawsuit-heres-where-the-plaintiffs-claims-fail-and-why)  

 Artificial General Intelligence, Bias And Fairness, Big Tech, Chatgpt, Claude, Dario Amodei, Ethics, Gen AI, Governance, Ilya Sutskever, Intellectual Property, Language Models, Lawsuit, Legal Issues, Open Source, OpenAI, Opinion, Perplexity, Policy And Regulation, Privacy, Review, Safety And Alignment, Sam Altman, Stability AI  

- The plaintiffs' claim for declaratory relief in the OpenAI lawsuit should be dismissed as it is not a standalone cause of action, lacking any basis in the underlying claims.
- Previous court rulings support the dismissal of declaratory relief claims when underlying claims have been dismissed.
- This text is part of HackerNoon’s Legal PDF Series aimed at making important tech court cases more accessible. 



[26. The Palletrone is a robotic hovercart for moving stuff anywhere - IEEE Spectrum](https://spectrum.ieee.org/cargo-drone-2669117300)  

 Drones, Hardware, Job Automation, Manufacturing, Products, Robots, Science, Supply Chain Optimization, Transportation  

- The Pentagon is promoting the integration of autonomous robots in military operations, aiming for a future where human and robot teams collaborate effectively.
- Concerns arise around soldiers forming emotional attachments to robots and the potential implications for how humans view their own species in combat situations.
- Trust issues between military personnel and robotic systems are significant obstacles, leading to the development of training and psychological strategies to increase confidence in autonomous technology. 



[27. Jony Ive confirms he is working on an OpenAI hardware design project - Google News](https://news.google.com/read/CBMi0AFBVV95cUxNWmh1WFI3Q09ZNGE0NTc3bm5hTGRlMTJObFZBNTVGbUVtaldFMTJCTHpXQ1prM0RReUpyeUJpdFMtYk1OR2RVU3VlMERwOW1wcHprNkNMRjA3NlZLdGM0YkZuRktpSmhqckJvUFF1NUxyNFg1SFhNUWpoZGRRb1U2akxYWVZEcVJXa2t2MS1NUHpEY1JTY3k2aExhM3c5cElXTkZlazVsUU9TSlVtVWQ2X05jUTlTOFp5R1EyVjYxRndGWXlqRWM5cVVGeW9uQkhH)  

 Art & Design, Big Tech, Chatgpt, Claude, Dario Amodei, Gen AI, Hardware, Ilya Sutskever, Jony Ive, Open Source, OpenAI, Products, Sam Altman, Science  

- Jony Ive is collaborating with OpenAI on a new mobile hardware project aimed at creating a less socially disruptive device than a smartphone.
- This partnership follows discussions between Ive and OpenAI's Sam Altman about advancing AI in messaging and visual intelligence, with funding goals of up to $1 billion from tech investors before the end of 2024.
- Details about the device are limited, but the goal is to develop a consumer-focused AI product while maintaining a competitive edge over other AI hardware attempts. 



[28. Forget Nvidia: Here's a Better Top Artificial Intelligence (AI) Stock to Buy Right Now - Google News](https://news.google.com/read/CBMikAFBVV95cUxNUVctQ3dqaDVnTHZLeVVxczlsVW5iN1V1ZHEtQ0VaeUppZF9fTXlBVDh0ZjA3ZUluX3d4U3F1X3FVVElOUno4ZV9JZGxpRC15Z3NUT0IzOFRieG9UOW5zUWNSQV80SzFkdnJoSHVqTWhVaFdLU1dCT1hGYmdfeEZteTZDWUp4VVNvNndWbmxTQmU)  

 Deals, Economics, Finance, Nvidia, Products, Stocks  

- Alphabet is positioned as a formidable investment option, being built for long-term growth and having flexibility to venture into various markets under its umbrella structure.
- The company has been a leader in AI development long before the current AI boom, with successful applications across Google Search, Ads, Voice, and Gmail.
- Despite its strong connection to the AI surge, Alphabet’s stock remains affordable with modest valuation ratios compared to its growth rate, making it an attractive buy. 



[29. EC-Council Introduces AI-Powered Ethical Hacking against Cybercrime - Hacker Noon](https://hackernoon.com/ec-council-introduces-ai-powered-ethical-hacking-against-cybercrime)  

 Bias And Fairness, Cybersecurity, Ethics, Gen AI, Safety And Alignment, Science  

- EC-Council has launched the Certified Ethical Hacker CEH v13 certification, integrating AI capabilities to enhance ethical hacking skills and productivity in combating cybercrime.
- The program includes hands-on training through 221 labs, a capture-the-flag competition, and the exploration of over 550 attack techniques, focusing on AI-driven skills and top AI attack vulnerabilities.
- CEH v13 offers a comprehensive learning framework with both theoretical and practical exams to validate skills, addressing the growing importance of AI in the cybersecurity field. 



[30. Tears For Fears defend use of AI in new album cover art - NME](https://www.nme.com/news/music/tears-for-fears-defend-use-of-ai-in-new-album-cover-art-3796039)  

 Art & Design, Entertainment, Ethics, Gen AI, Intellectual Property, Music, Opinion, Society & Culture  

- Tears For Fears explained their use of AI for the album cover of 'Songs For A Nervous Planet,' stating it was part of a mixed media digital collage created with artist Vitalie Burcovschi.
- The band faced backlash from fans who criticized the use of AI in art, with comments expressing disappointment and suggesting they should employ a traditional artist instead.
- The upcoming album explores themes of love, isolation, mental health, and escapism, with the first single ‘The Girl That I Call Home’ already released. 



[31. AI Is Evolving Faster Than Experts, Including Bill Gates, Imagined - CNET](https://www.cnet.com/tech/computing/ai-is-evolving-even-faster-than-experts-including-bill-gates-imagined/#ftag=CAD590a51e)  

 Bill Gates, Microsoft, Science  

- Bill Gates highlighted the rapid advancement of generative AI technologies, noting their potential to enhance healthcare and education, while also expressing concerns about the associated risks and the need for regulatory frameworks.
- Former Google CEO Eric Schmidt and OpenAI CEO Sam Altman echoed the sentiment that AI's rapid development necessitates government collaboration for safety testing and regulation to mitigate potential harms.
- Fei-Fei Li, regarded as the "godmother of AI," launched a new company, World Labs, focused on developing AI systems for spatial intelligence, reflecting the fast-paced innovation in the AI sector. 



[32. How AI can accelerate the IT strategy creation process - within limits - SiliconANGLE - SiliconANGLE](https://siliconangle.com/2024/09/22/ai-can-accelerate-strategy-creation-process-within-limits/)  

 Gen AI, Safety And Alignment  

- Generative AI can enhance IT strategy creation by automating tasks like summarizing financial disclosures and analyzing datasets, but there are limitations in its application.
- CIOs can utilize gen AI for specific tasks, including drafting executive summaries and analyzing public and private business information to inform strategic planning.
- While AI-generated content can assist in developing parts of an IT strategy, it cannot replace the insights gained from stakeholder engagement and collaboration during the strategy process. 



[33. OpenAI Set to Launch Advanced Voice Mode on ChatGPT Soon - Analytics India Magazine](https://analyticsindiamag.com/ai-news-updates/openai-set-to-launch-advanced-voice-mode-on-chatgpt-soon/)  

 Big Tech, Chatbots, Chatgpt, Claude, Code Assistants, Gen AI, Language Models, Music, OpenAI, Perplexity, Products, Sam Altman, Speech Recognition & Synthesis, Stability AI, Virtual Assistants  

- OpenAI will launch the 'Advanced Voice Mode' on ChatGPT on September 24, 2024, initially available to a limited group of alpha testers.
- The technical capabilities of GPT-4o were showcased, including a variety of applications like real-time translation and interactive AI roles, though 'Advanced Voice Mode' was not released at that time.
- Other companies are also advancing voice AI technologies, with products like Hume AI's EVI 2 and Google's Astra, which focus on enhancing natural and multimodal interactions. 



[34. Prisoners in Finland participate in AI training programme as part of rehabilitation - Euronews](https://www.euronews.com/my-europe/2024/09/23/prisoners-in-finland-participate-in-ai-training-programme-as-part-of-rehabilitation)  

 Ethics, Gen AI, Society & Culture  

- The requested page does not exist on the Euronews website, resulting in a 404 error.
- The site discusses the increasing pressures on Europe's water due to pollution, droughts, and floods, emphasizing the importance of protecting ecosystems and managing wastewater.
- Euronews also covers climate change, providing facts, trend analyses, and expert insights on strategies for mitigation and adaptation. 



[35. AI is an accelerator for sustainability – but it is not a silver bullet - World Economic Forum](https://www.weforum.org/agenda/2024/09/ai-accelerator-sustainability-silver-bullet-sdim/)  

 Climate, Energy, Science, Sustainability  

- AI has significant potential to enhance sustainability efforts but should not be considered a standalone solution due to its inherent limitations and energy demands.
- Effective implementation and management of AI technologies are crucial for maximizing sustainability benefits while minimizing their carbon footprint.
- Collaboration among governments, industries, and civil society, along with investments in AI efficiency, is essential to ensure that the sustainability benefits of AI outweigh its environmental costs. 



[36. Feel free to ignore GenAI for now – a new kind of software developer is being born – Interview – Kubernetes whizz says devs can redefine their roles and capitalize on coding ML rewards – Software – 1 hr|5 - The Register](https://www.theregister.com/2024/09/23/hightower_interview_part_2/)  

 Gen AI, Gen AI, Jobs & Careerslabor Market, Software Development  

- Kelsey Hightower emphasizes the need for a shift in the software development role, advocating for developers to focus on leveraging software to solve problems rather than just coding.
- He warns against over-reliance on AI coding assistants, suggesting it could harm the future pipeline of engineers, but also sees opportunities for developers to enhance their skills.
- Hightower celebrates the advancements in Kubernetes and the importance of a mature infrastructure description, illustrating the evolving nature of cloud-native technologies. 



[37. This AI Startup Is Supporting Artificial Voices and the Humans Who Need Them - CNET](https://www.cnet.com/tech/services-and-software/this-ai-startup-is-supporting-artificial-voices-and-the-humans-who-need-them/#ftag=CAD590a51e)  

 Big Tech, Ethics, Gen AI, Healthcare, Products, Science, Society & Culture, Speech Recognition & Synthesis, Venture Capital  

- WellSaid, a voice AI company, creates custom AI voices for individuals with conditions like ALS, using old voicemails and videos to replicate their authentic voice.
- The company prioritizes ethical AI practices by sourcing voices from approved actors rather than public open-source data, aiming to mitigate risks associated with AI-generated voices.
- WellSaid seeks to integrate AI into daily life as a supportive tool, emphasizing its potential to enhance healthcare and improve experiences for individuals with disabilities. 



[38. AI YOU REAL? AI clones of my dead dad going viral online stopped me in my tracks as experts warn Brits to prep for digital death - The Sun](https://www.the-sun.com/tech/12516178/ai-clone-deepfake-image-dead-relative-social-media/)  

 AI Doom, Consciousness, Deepfakes, Ethics, Gen AI, Privacy, Privacy & Surveillance, Safety And Alignment, Science, Society & Culture, Uk  

- Sara Burningham encountered a deepfake image of her deceased father, leading to concerns about how generative AI reanimates images from personal data without consent.
- Experts warn that individuals should prepare for their "digital death" to prevent their online remnants from being misused, as many people currently lack a plan for handling digital assets after death.
- There are growing ethical and privacy concerns over AI's use of personal data, as well as the potential for misinformation stemming from AI-generated content. 



[39. Kai-Fu Lee launches new AI search tool - Tech in Asia](https://www.techinasia.com/news/ai-expert-kaifu-lee-launches-search-tool)  

 China, Gen AI, Products  

- Kai-Fu Lee has launched an AI search tool named BeaGo, available on Android and iOS, which aims to provide a single search result for each query.
- The BeaGo tool also incorporates images in its search results, enhancing the user experience.
- Lee, formerly head of Google’s China office, has shifted focus to AI with his startup 01.AI, following the launch of their open-source large language model, Yi-34B. 



[40. Unlock the future of the WLAN – Embrace Wi-Fi 7 with AI and cloud for next-level performance – Webinar - The Register](https://www.theregister.com/2024/09/23/unlock_the_future_of_the/)  

 Gen AI, Hardware, Science  

- Wi-Fi 7 adoption is essential for businesses to handle the surge in connected devices and application deployments, requiring potential IT infrastructure upgrades.
- Understanding cloud integration and AI-driven operations is crucial for optimizing network architecture and enhancing performance.
- A webinar on September 25, 2024, will discuss Wi-Fi 7 technology advancements and practical adoption steps, featuring insights from Juniper Networks' SVP. 



[41. Can AI feel distress? Inside a new framework to assess sentience - Nature](https://www.nature.com/articles/d41586-024-03076-z)  

 Artificial General Intelligence, Cognitive Science, Consciousness, Ethics, Safety And Alignment, Science, Singularity, Society & Culture  

- Philosopher Jonathan Birch's book, *The Edge of Sentience*, presents a precautionary framework to assess the sentience of various entities, including AI, emphasizing the moral duty to avoid suffering across different beings.
- Birch suggests a two-step approach for determining sentience: first, establishing credible possibilities of sentience through scientific meta-consensus, and second, utilizing citizen panels to create proportionate protective policies.
- The book addresses unresolved controversies surrounding sentience in humans, animals, and AI, advocating for caution and further investigation while distinguishing between the concepts of sentience and intelligence in determining moral responsibilities. 



[42. 5 Free Courses to Master Deep Learning in 2024 - Machine Learning Mastery](https://machinelearningmastery.com/5-free-courses-to-master-deep-learning-in-2024/)  

 Andrew Ng, Deepfakes, Deepmind, Education, Jobs & Careerslabor Market, Language Models, Science  

- There's a growing demand for AI skills, particularly in deep learning, which is crucial for understanding generative AI technologies used by businesses.
- The article lists five free courses to help learners master deep learning, covering fundamental concepts, practical applications, and advanced topics.
- Recommended courses include "Deep Learning Specialization" by Andrew Ng, "Practical Deep Learning for Coders" by FastAI, and "Neural Networks: Zero to Hero" by Andrej Karpathy, each tailored to different skill levels and learning approaches. 



[43. Probing the possibilities: AI summit held in Ashland - Ashland News](https://ashland.news/probing-the-possibilities-ai-summit-held-in-ashland/)  

 Gen AI, Science  

- The Ashland RegenAI Summit, held on Sept. 13-15, attracted AI enthusiasts and showcased how AI can help address local community issues, featuring speakers like Stephen Sklarew from Synaptiq, who discussed AI's role in environmental restoration.
- Keynote presentations highlighted practical applications of AI, such as improving compliance and reducing costs for nonprofits, with examples like Rogue Workforce Partnership doubling savings through AI tools.
- Artistic performances during the event emphasized the interconnectedness of AI and nature, illustrating the potential of AI as a positive force for humanity and environmental sustainability. 



[44. Cursor 0.41 Update Released: AI-Assisted Coding to Supercharge Your Development Workflow - Geeky Gadgets](https://www.geeky-gadgets.com/cursor-ai-code-editor-update/)  

 Code Assistants, Copilot, Gen AI, Job Automation, Language Models, Open Source, Products, Review, Science, Software Development  

- The Cursor AI Code Editor version 0.41 introduces new features, including a Composer feature for high-level coding tasks, improved Notepads for file organization, and Python Auto Import to reduce errors.
- The update enhances user experience with faster performance, a more customizable workspace, and easier access to tools via the integrated AI Panel.
- While the editor is free, the Composer feature requires a subscription; alternatives include free, open-source options like Pair AI and Zi. 



[45. You Won't Believe What Salesforce CEO Marc Benioff Said About Microsoft's Copilot Artificial Intelligence (AI) Assistant - The Motley Fool](https://www.fool.com/investing/2024/09/23/you-wont-believe-what-salesforce-ceo-marc-benioff/)  

 Big Tech, Code Assistants, Copilot, Gen AI, Language Models, Microsoft, Opinion, Salesforce, Satya Nadella, Science, Virtual Assistants  

- Salesforce CEO Marc Benioff criticized Microsoft's Copilot AI assistant, likening it to the disliked Clippy from the 1990s, suggesting it fails to deliver real value to customers.
- Despite the criticism, Microsoft reports significant adoption of Copilot, with 60% of Fortune 500 companies using it, and customer testimonials highlighting its efficiency.
- Benioff's comments appear motivated by Salesforce's interest in promoting its own AI tools, while Microsoft continues to demonstrate the effectiveness of its offerings. 



[46. AI Regulation: An Underreported Issue Of The Upcoming Elections - Forbes](https://www.forbes.com/councils/forbesbusinessdevelopmentcouncil/2024/09/23/ai-regulation-an-underreported-issue-of-the-upcoming-elections/)  

 Bias And Fairness, Big Tech, Ethics, Governance, Legal Issues, Policy And Regulation, Politics, Privacy, Privacy & Surveillance, Safety And Alignment, Society & Culture  

- The impending elections highlight the critical need for common-sense regulation of artificial intelligence (AI) as industries face challenges in self-regulation amidst rapid AI adoption.
- Current AI practices lack security standards, posing risks, especially in regulated sectors like healthcare; proactive measures and a risk-based approach are essential.
- The EU's recent adoption of the AI Act signals a move towards comprehensive regulation, and similar actions are expected in the U.S. as leaders aim to address the risks and benefits of AI technology. 



[47. Details of the OpenAI Lawsuit: The Plaintiffs' Fail to Plead a Violation of the CCPA - Hacker Noon](https://hackernoon.com/details-of-the-openai-lawsuit-the-plaintiffs-fail-to-plead-a-violation-of-the-ccpa)  

 Big Tech, Chatgpt, Ethics, Governance, Intellectual Property, Language Models, Lawsuit, Legal Issues, Open Source, OpenAI, Policy And Regulation, Privacy, Privacy & Surveillance, Safety And Alignment, Sam Altman  

- The plaintiffs in the OpenAI lawsuit fail to adequately plead a violation of the California Consumer Privacy Act (CCPA) due to lack of standing and failure to provide required written notice before filing the complaint.
- The CCPA's private right of action is limited and does not cover the plaintiffs' allegations, as they did not demonstrate unauthorized access, exfiltration, or theft of their personal information.
- Plaintiffs' vague and unsupported allegations regarding OpenAI's security practices do not satisfy the pleading standards necessary to establish a claim under the CCPA. 



[48. Civitai Gen-AI Makes Its Move - Forbes](https://www.forbes.com/sites/charliefink/2024/09/23/civitai-gen-ai-makes-its-move/)  

 Gen AI, Language Models, Products  

- Civitai announced "SPINE," a unified AI content workflow, integrating multiple tools for image, video, and music creation, aimed at streamlining the creative process for users.
- The launch was part of the Project Odyssey film festival, where AI-generated films were showcased, highlighting the potential of AI in storytelling and content creation.
- Backed by $5.1 million from Andreessen Horowitz, Civitai aims to expand SPINE and its community of over six million monthly users, fostering a collaborative environment through a currency system called "buzz." 



[49. 20 AI News and Analyst Ratings You Should Not Miss - Google News](https://news.google.com/read/CBMimgFBVV95cUxNc1pldms5dW9wQ3JweW0waHZBQ3ZRY1VVQ0ZHRHFSZENTOWI2LTlpU1lRQk14WHA1VlQ3NkZNcENqcFU1NTlyaDRWRThmR2Uzdlo1OUw0b3ZiVlRETHJQbzRLUmluWUR6SVJ0Q2FnSFRhbUVyYkgydVVwMmRGSlF1SnZ5Ujl0S3p3czl2c1NndEZBVjhKQ290bFhn0gGiAUFVX3lxTE5OVTdkNks3NXJIUlkwU0gtblBua042SkJKRGkyTTdTM2VHQWZVNDhGOGd6aG53UGVucXRPTEEzX3ZWZkt4TzRCTTdfMkNYMV9Fd1p0UjZOWHRPaTRiLXR3TllfY25yNWxSRXRqYjl5Q0prd3NfMjZ4dzdiUkl6a1o2c21nczVJQ1NGQnIxTlQxQ0JicFJ0eDVpNlJZVE1qWEhzdw)  

   

- The artificial intelligence market is experiencing significant growth, with a reported increase in investment by sevenfold in recent years, driven by demand for applications in data analysis, content generation, and predictive modeling.
- Prominent companies, including BlackRock, are launching substantial funds focused on AI infrastructure, highlighting a trend of increased investment in AI capabilities and data centers to support computation-heavy applications.
- By 2030, AI applications are projected to contribute up to $13 trillion to the global economy, with AI data centers potentially accounting for 13% of global electricity demand if current growth trends continue. 



[50. GE Aerospace Partners With Microsoft to Bring New AI Tools to Its Workforce - Barron's](https://www.barrons.com/articles/ge-aerospace-microsoft-artificial-intelligence-ai-a9aef4dd)  

 Big Tech, Bill Gates, Gen AI, Job Automation, Jobs & Careerslabor Market, Microsoft, Satya Nadella, Science, Transportation  

- The text does not contain any specific content apart from the page title and navigation elements. 
- No relevant information or main points are provided in the text. 
- The content consists primarily of boilerplate information. 



[51. A look at Bot Farm Corporation, a Siberia-based operation that raked in millions of dollars by deploying advanced poker-playing AI across gambling sites - Bloomberg](https://www.bloomberg.com/features/2024-poker-bots-artificial-intelligence-russia/)  

 Economics, Finance, Gaming, Language Models, Reinforcement Learning, Robots, Russia, Scams, Science, Sports  

- The page is requesting confirmation that the user is not a robot.
- Users are advised to ensure their browser supports JavaScript and cookies. 
- For further inquiries, contact support with the provided reference ID. 



[52. AI to power the corporate Windows 11 refresh? Nobody's buying that – Opinion – Microsoft should look to Apple for lessons in flogging dead horses – Personal Tech – 4 hrs|14 - The Register](https://www.theregister.com/2024/09/23/windows_11_ai_opinion/)  

 Apple, Big Tech, Corporate Software, Hardware, Microsoft, Opinion, Products, Windows 11  

- Microsoft is promoting a corporate Windows 11 hardware refresh citing AI, support for Windows 10 ending, and outdated PCs, but these reasons are largely unsupported and may not drive actual upgrades.
- The article compares Microsoft's situation to Apple's consumer strategies, emphasizing that successful innovation must offer clear, immediate benefits to users, akin to how improved camera features entice smartphone buyers.
- There's a call for Microsoft to leverage its understanding of corporate data and innovate AI features that deliver tangible results and appeal to organizational needs, rather than relying on vague promises. 



[53. Data Science Agent and Code Transformation - Google Labs](https://labs.google.com/code/)  

 Code Assistants, Language Models, Science  

- The Privacy Notice details how Google collects and utilizes user data input into /code, including prompts, datasets, and generated outputs, to enhance its products and services.
- Human reviewers may access a sample of /code input and output for quality improvement, with measures in place to protect user privacy by disconnecting data from Google Accounts.
- Users are advised not to include sensitive or personal information in their interactions with /code. 



[54. A profile of Amazon executive Rohit Prasad, who now oversees a new team of thousands to develop AI products for an Alexa upgrade and other businesses (Sebastian Herrera/Wall Street Journal) - The Wall Street Journal](https://www.wsj.com/tech/ai/rohit-prasad-amazon-alexa-ai-85e3ed71)  

 Amazon, Big Tech, Chatbots, Economics, Gen AI, India, Job Automation, Jobs & Careerslabor Market, Language Models, Products, Science, Speech Recognition & Synthesis, Virtual Assistants  

- The page does not contain substantive content, focusing instead on navigation and login prompts.
- No specific articles or topics are presented on the page. 
- It appears to be primarily boilerplate content related to the website. 



[55. Vista3D: A Novel AI Framework for Rapid and Detailed 3D Object Generation from a Single Image Using Diffusion Priors - MarkTechPost](https://www.marktechpost.com/2024/09/23/vista3d-a-novel-ai-framework-for-rapid-and-detailed-3d-object-generation-from-a-single-image-using-diffusion-priors/)  

 Art & Design, Computer Vision, Gen AI, Language Models, Products, Science  

- Vista3D is a novel AI framework developed by researchers from the National University of Singapore and Huawei Technologies for rapid and detailed 3D object generation from a single image, addressing challenges in previous methods such as view consistency and detail preservation.
- The framework employs a two-phase approach that includes initial geometry generation through Gaussian Splatting and refinement via Signed Distance Function extraction, achieving high-quality, textured meshes in about five minutes.
- Vista3D outperforms existing models in metrics such as PSNR and SSIM, demonstrating superior performance in texture and geometry quality while enabling user-driven editing through text prompts. 



[56. Unilever readies use cases, adoption plans for EU AI Act provisions - CIO Dive](https://www.ciodive.com/news/unilever-EU-AI-act-provisions/727648/)  

 Bias And Fairness, Ethics, European Union, Gen AI, Governance, Legal Issues, Policy And Regulation, Politics, Safety And Alignment, Science  

- Unilever is proactively adapting its AI strategies to comply with the EU AI Act, emphasizing responsible AI practices and training over 16,000 employees this year.
- The company’s “go wide and go deep” strategy aims to enhance productivity and identify transformative AI opportunities across its 500+ global projects, with a focus on cross-functional collaboration for compliance.
- Unilever has established a comprehensive AI assurance process to evaluate ethical risks and ensure alignment with regulations, enhancing its commitment to responsible AI deployment. 



[57. Master AI Automation with ChatGPT-o1 Series and RAG using Vector Shift - Geeky Gadgets](https://www.geeky-gadgets.com/ai-automation-with-chatgpt-o1/)  

 Chatbots, Chatgpt, Code Assistants, Gen AI, Job Automation, Language Models, Retrieval Augmented Generation, Virtual Assistants  

- OpenAI's new ChatGPT-o1 models utilize Retrieval-Augmented Generation (RAG) techniques to enhance document interactions and automate workflows, featuring advanced reasoning capabilities for complex tasks like science and coding.
- The Vector Shift platform leverages these models to create customized workflows, integrating various components such as chatbots and analytics, ultimately improving productivity across industries like healthcare and finance.
- The practical applications of these models extend to personalizing education, optimizing customer service, and streamlining legal processes, effectively driving innovation and efficiency in diverse sectors. 



In [82]:
# Render the entry at index 11 of vectorstore_list as Markdown to spot-check its formatting
display(Markdown(vectorstore_list[11]))

[AI can generate recipes that can be deadly. Food bloggers are not happy - NPR](https://www.npr.org/2024/09/23/g-s1-23843/artificial-intelligence-recipes-food-cooking-apple)

Topics: AI Doom, Ethics, Food & Drink, Health & Fitness, Safety And Alignment, Science, Society & Culture 

- Major tech companies are integrating AI recipe generation into their devices, but the reception among food bloggers and chefs is largely negative due to concerns over safety and quality.
- AI-generated recipes can be dangerous, as evidenced by instances where harmful ingredient combinations were suggested, potentially leading to health risks for users.
- Food bloggers like the Leungs emphasize the human aspect of recipe development, arguing that AI lacks the culinary experience and sensory understanding that are critical to creating effective and enjoyable recipes.



In [84]:
# Print the entry at index 11 of vectorstore_list as plain, unrendered text
print(vectorstore_list[11])

[AI can generate recipes that can be deadly. Food bloggers are not happy - NPR](https://www.npr.org/2024/09/23/g-s1-23843/artificial-intelligence-recipes-food-cooking-apple)

Topics: AI Doom, Ethics, Food & Drink, Health & Fitness, Safety And Alignment, Science, Society & Culture 

- Major tech companies are integrating AI recipe generation into their devices, but the reception among food bloggers and chefs is largely negative due to concerns over safety and quality.
- AI-generated recipes can be dangerous, as evidenced by instances where harmful ingredient combinations were suggested, potentially leading to health risks for users.
- Food bloggers like the Leungs emphasize the human aspect of recipe development, arguing that AI lacks the culinary experience and sensory understanding that are critical to creating effective and enjoyable recipes.




In [85]:
# Wrap each story's markdown text and its metadata dict in a Document;
# the resulting list is what gets loaded into the vector store
docs = [
    Document(page_content=story_md, metadata=story_meta)
    for story_md, story_meta in zip(vectorstore_list, metadata_list)
]
len(docs)

57

In [86]:
print(docs[16])


page_content='[UAE seeks closer AI, tech ties in Biden talks as China interest stirs US concern - Google News](https://news.google.com/read/CBMivAFBVV95cUxNRFBsX2V1ZUxTRlJqd1Z3YlgzOEljOG5WTUZ6SVl6RGp5eU15MFNFRkJXQTFDczZ0X3ZERVRMdFlQWWxqR1NXYjBYSzd5aWstR1U2SThSLS1GV1JaeXhoM210bnVPTnVjZWxjSFo5Rk52Rm4yWDRrQk5JVVBDdVJPYUdYVDBQYVdNXzNKelZKaW94a0ZUd0tVQVFaMTNScjlwN0hUNTVTNk95emhRTWdWR2szZzU1M1lQOWJoTg)

Topics: Big Tech, China, Gen AI, Governance, Policy And Regulation, Politics, Safety And Alignment, Science, Uae 

- President Joe Biden will host UAE President Sheikh Mohamed for talks focused on AI and technology, amid concerns over the UAE's ties with China.
- The UAE aims to strengthen its tech industry with significant investments, including a recent $1.5 billion investment from Microsoft, while distancing itself from Chinese technology.
- UAE officials emphasize the importance of developing their own AI and tech capabilities to maintain control over technology and avoid dependence on ex

In [87]:
persist_directory = "/Users/drucev/projects/AInewsbot/chroma_db_openai"

# Drop the vector store object left over from an earlier run, if any.
# On a fresh kernel the name is undefined; that error is logged, not raised.
try:
    del vectorstore
except Exception as del_err:
    log(f"{del_err}")

# Remove the on-disk directory too. Any failure (for example the directory
# not existing yet) is logged and execution continues.
try:
    shutil.rmtree(persist_directory)
except Exception as rm_err:
    log(f"Remove directory error: {rm_err}")
else:
    log(f"Directory '{persist_directory}' and all its contents have been removed successfully.")
        


2024-09-23 09:10:06,748 - AInewsbot - INFO - name 'vectorstore' is not defined
2024-09-23 09:10:06,749 - AInewsbot - INFO - Remove directory error: [Errno 2] No such file or directory: '/Users/drucev/projects/AInewsbot/chroma_db_openai'


In [88]:
embeddings_openAI = OpenAIEmbeddings(model='text-embedding-3-small')
vectorstore = Chroma.from_documents(docs, embeddings_openAI)


2024-09-23 09:10:07,282 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-09-23 09:10:07,956 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [89]:
# Similarity search: fetch the k closest documents together with their scores
query = "What is the latest with openai?"
results = vectorstore.similarity_search_with_score(query, k=20)

# Show each distinct URL at most once, and only when its score is below 1.25
urldict = {}
for doc, score in results:
    url = doc.metadata['url']
    if url in urldict:
        continue
    urldict[url] = 1
    if not score < 1.25:
        continue
    print(f"Score:   {score}")
    print(f"Content: {doc.page_content}\n")
    print(f"Metadata: {doc.metadata}\n")
    print("---")

2024-09-23 09:10:08,819 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Score:   0.7581626176834106
Content: [Introducing the OpenAI Academy - OpenAI](https://openai.com/global-affairs/openai-academy)

Topics: Artificial General Intelligence, Big Tech, Chatgpt, Claude, Code Assistants, Dario Amodei, Education, Gen AI, Ilya Sutskever, Language Models, Open Source, OpenAI, Sam Altman 

- OpenAI Academy is a new initiative aimed at investing in developers and organizations in low- and middle-income countries to promote the use of AI for economic growth and innovation.
- The program will provide training, technical guidance, API credits totaling $1 million, and support for community-building to enhance local AI talent and resources.
- OpenAI emphasizes the importance of culturally aware AI applications, aiming to make transformative technology accessible to diverse global communities.



Metadata: {'id': 10, 'site': 'OpenAI', 'title': 'Introducing the OpenAI Academy', 'url': 'https://openai.com/global-affairs/openai-academy'}

---
Score:   0.9122185111045837
C

In [90]:
# # or use local embeddings with sentence_transformers
# # Initialize your embedding model
# embeddings_hf = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# # Create the vector store with a persist_directory
# persist_directory = "/Users/drucev/projects/AInewsbot/chroma_db_huggingface"
# vectorstore_hf = Chroma.from_documents(
#     documents=docs,
#     embedding=embeddings_hf,
#     persist_directory=persist_directory
# )

# # Perform a similarity search
# query = "What is the latest with OpenAI?"
# results = vectorstore_hf.similarity_search(query, k=10)  # k is the number of results to return

# # Print the results
# for doc in results:
#     print(f"Content: {doc.page_content}\n")
#     print(f"Metadata: {doc.metadata}\n")
#     print("---")
    


In [91]:
# Convert Markdown to HTML
html_str = markdown.markdown(markdown_str, extensions=['extra'])
# display(HTML(html_str))


In [92]:
# save bullets
# Write explicitly as UTF-8: the summaries contain non-ASCII characters
# (e.g. curly apostrophes), so relying on the platform default encoding
# could fail or produce a file that other tools misread.
with open('bullets.md', 'w', encoding='utf-8') as f:
    f.write(markdown_str)


In [93]:
log("Sending bullet points email")
subject = f'AI news bullets {datetime.now().strftime("%H:%M:%S")}'
send_gmail(subject, html_str)


2024-09-23 09:10:08,929 - AInewsbot - INFO - Sending bullet points email


# Ask ChatGPT for top categories

In [94]:
print(TOP_CATEGORIES_PROMPT)

You will act as a research assistant identifying the top stories and topics
of today's news. I will provide a list of today's news stories about AI and summary bullet points in markdown
format. You are tasked with identifying the top 10-20 stories and topics of today's news. For each top story
or topic, you will create a short title and respond with a list of titles formatted as a JSON object.


Example Input Bullet Points:

[2. Sentient closes $85M seed round for open-source AI](https://cointelegraph.com/news/sentient-85-million-round-open-source-ai)

- Sentient secured $85 million in a seed funding round led by Peter Thiel's Founders Fund, Pantera Capital, and Framework Ventures for their open-source AI platform.
- The startup aims to incentivize AI developers with its blockchain protocol and incentive mechanism, allowing for the evolution of open artificial general intelligence.
- The tech industry is witnessing a rise in decentralized AI startups combining blockchain

Categories of

In [95]:
response = client.chat.completions.create(
    model=MODEL,
    messages=[
              {"role": "user", "content": TOP_CATEGORIES_PROMPT + markdown_str
              }],
    n=1,
    response_format={"type": "json_object"},
    temperature=0.3
)


2024-09-23 09:10:14,247 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [96]:
# The reply is parsed as a JSON object; the value under its first key is
# taken as the list of suggested category titles
categories_json = json.loads(response.choices[0].message.content)
suggested_categories = list(categories_json.values())[0]
suggested_categories

["Google's AI Podcast Innovations",
 "Google's NotebookLM",
 'Jony Ive and OpenAI hardware project',
 'AI beating CAPTCHA',
 'Amazon AI sales bonuses',
 'OpenAI Academy',
 'AI-generated recipes controversy',
 'UAE and US AI cooperation',
 "Microsoft's Grin MoE AI Model",
 'Qure.AI funding',
 "Alibaba's Qwen AI model",
 'OpenAI logo controversy',
 'OpenAI lawsuit',
 'AI in cybersecurity',
 'AI in sustainability',
 'AI in software development',
 'WellSaid voice AI',
 'AI and digital death',
 "Kai-Fu Lee's AI search tool",
 'Wi-Fi 7 and AI']

In [97]:
cluster_topic_list

["Google's AI Podcast Innovations"]

In [98]:
# NOTE(review): `break` outside a loop is a SyntaxError. This looks like a
# deliberate stop so that running all cells halts here until the category
# list in the next cell has been edited by hand -- confirm.
break 


SyntaxError: 'break' outside loop (1035468562.py, line 1)

In [None]:
# human should edit categories 
my_cats = ['AI in elections',
 'AI startup funding',
 'Amazon AI tools',
 'Apple AI',
 'OpenAI and ChatGPT',
 'AI energy demands',
 'LinkedIn AI privacy issues',
 'Meta and AR',
 'Microsoft AI',
 'AI-generated podcasts',
 'AI in education']


In [None]:
cat_str = "\n".join(my_cats)
print(cat_str)


In [None]:
AIdf.loc[AIdf["cluster"] < 999]



In [None]:
bullet_str = "\n\n~~~\n".join(vectorstore_list)
print(bullet_str)

In [None]:
# prompt template for the summary of today's news (cat_str and bullet_str are interpolated when this cell runs)
FINAL_SUMMARY_PROMPT = f"""You are ASA, an advanced summarization assistant, a sophisticated AI system designed to 
write a compelling summary of news input. You are able to categorize information, 
and identify trends from large volumes of news.

ASA Objective: 

I will provide today's news items about AI and summary bullet points in a markdown format, 
structured according to an input format template.

News items  are delimited by ~~~

You are tasked with using the news items to create a concise summary of today's most important topics and developments.

You will write an engaging summary of today's news encompassing the most important and frequently 
mentioned topics and themes, in an output format provided below.

ASA Input Item Format Template:

[Story-Title-s1 - source-name-s1](url-s1)

Topics: s1-topic1, s1-topic2, s1-topic3

- s1-bullet-point-1
- s1-bullet-point-2
- s1-bullet-point-3

Example ASA Input Item Format: 

[Apple Intelligence is now live in public beta. Heres what it offers and how to enable it. - TechCrunch](https://techcrunch.com/2024/09/19/apple-intelligence-is-now-live-in-public-beta-heres-what-it-offers-and-how-to-enable-it)

Topics: Apple, Big Tech, Features, Gen AI, Intelligence, Machine Learning, Products, Public Beta, Virtual Assistants 

- Apple Intelligence is now live in public beta for users in the U.S. enrolled in the public beta program, featuring generative AI capabilities like advanced writing tools and a revamped Siri.
- The platform is currently only available in U.S. English and is not accessible in the EU or China due to regulatory issues; it supports iPhone 15 Pro, Pro Max, and the new iPhone 16 line.
- Key features include photo editing tools like "Clean Up," a Smart Reply function in Mail, and improvements to Siri’s understanding and on-device task knowledge.

ASA Output Format Template:

# Engaging-topic-title-1

- item-title-1a - [source-name-1a](item-url-1a)
- item-title-1b - [source-name-1b](item-url-1b)
- item-title-1c - [source-name-1c](item-url-1c)

# Engaging-topic-title-2

- item-title-2a - [source-name-2a](item-url-2a)
- item-title-2b - [source-name-2b](item-url-2b)

Example ASA Output Format:

# A military AI revolution

- Eric Schmidt on AI warfare - [FT](https://www.ft.com/content/fe136479-9504-4588-869f-900f2b3452c4)
- Killer robots are real in Ukraine war. - [Yahoo News](https://uk.news.yahoo.com/ai-killer-robots-warning-ukraine-war-133415411.html)

ASA Instructions:
Read the input closely. 
USE ONLY INFORMATION PROVIDED IN THE INPUT.
Group news items into related topics.
Each topic should have a snappy, punchy, clever, possibly punny title. 
Each output item bullet should contain one sentence with one link.
Each topic chould contain the most significant facts from the news items without commentary or elaboration.
Each output item bullet should not repeat points or information from previous bullet points.
You will write each item in the professional but engaging, narrative style of a tech reporter 
for a national publication, providing balanced, professional, informative, providing accurate, 
clear, concise summaries in a neutral tone.

Check carefully that you only use information provided in the input below, that you include
a link in each output item, and that any bullet point does not repeat information or links previously provided.

Topic suggestions:
{cat_str}

Input:
{bullet_str}

"""

In [None]:
print(FINAL_SUMMARY_PROMPT)



In [None]:
def count_tokens(text, model="gpt-4o"):
    """Return the number of tokens in `text` for the given model's tokenizer.

    Args:
        text: string to tokenize.
        model: model name used to look up the tiktoken encoding.

    Returns:
        int: number of tokens produced by encoding `text`.

    If tiktoken has no encoding registered for `model` (it raises KeyError),
    the "o200k_base" encoding is used instead, so the count is then an
    approximation rather than an error.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to a fixed encoding instead of failing
        enc = tiktoken.get_encoding("o200k_base")

    return len(enc.encode(text))

count_tokens(FINAL_SUMMARY_PROMPT)


In [None]:
# o1-preview only supports the default sampling settings; a non-default
# temperature is rejected by the API, so no temperature is passed here.
response = client.chat.completions.create(
    model="o1-preview",
    messages=[
              {"role": "user", "content": FINAL_SUMMARY_PROMPT
              }],
    n=1,
)

# Keep response_str as the raw markdown returned by the model and escape "$"
# only in the copy rendered in the notebook, so the backslashes do not leak
# into later prompts built from response_str.
response_str = response.choices[0].message.content
display(Markdown(response_str.replace("$", "\\$")))

In [None]:
response_str = """# AI Shapes the 2024 Presidential Race

- Startup Aaru uses chatbots to predict election outcomes - [Google News](https://news.google.com/read/CBMirwFBVV95cUxOTFJnZ2ZtbEtpR1NqUTZibFp5cDdyXzFsWlJ3c3NsV2t0VWlpcWlteE16ZVZhS0k2NjdtcmJOZm1aMGNUWHk3eWJJbTZvY2NBdlVPSnp0T0M5MTFGQ3FrQm5kZUtYd25KQkxwUnFLMzE2cEJkUDY1UzlkNUZrYkxuZmlMVmpNY05kVGRxVXJsZWtKLTlQN1ZveGtjRFRxQ0t0YlNWeGhnNk1paGlkSDc0)
- Americans express concern over AI's impact on the 2024 campaign - [Google News](https://news.google.com/read/CBMisAFBVV95cUxNR3hOdkxpT3l2MkdlNkkwajVudzJRMjVmVFVzSV8tM09fcXJKU3VHUnYxR0tDaXRRZW1zZjVkamZSckJrbzFrVXFPakVOZXZxbXktLU9uWDF1UHZYVzV3QTRGS25WbUctVDlIRHRCOUNoMklKcEoyWFlPZVVlamk1NGZ6YmM2R2xpcFFPWWNWaFRJZ3ZRXzdHVkJpLVlBRzJPYm5xWTEyZzVWUDNYNzhpZg)
- AI generates humorous political memes influencing the race - [NBC Washington](https://www.nbcwashington.com/decision-2024/ai-is-helping-shape-the-2024-presidential-race-but-not-in-the-way-experts-feared/3722871/)

# AI Startups: Funding Boom and Skepticism

- Google Ventures emphasizes generative AI investments - [Fortune](https://fortune.com/2024/09/20/inside-google-ventures-15-years-startup-investing/)
- AI boosts revenue for vertical SaaS companies - [Andreessen Horowitz](https://a16z.com/vertical-saas-now-with-ai-inside/)
- AI startups receive rapid funding amid investor excitement - [Business Insider](https://www.businessinsider.com/ai-startups-rapid-funding-rounds-2024-9)
- AI startups face economic pressure and bubble skepticism - [Le Monde](https://www.lemonde.fr/en/economy/article/2024/09/21/ai-start-ups-are-feeling-the-economic-pressure_6726779_19.html)

# Amazon and Wayfair Harness AI in E-Commerce

- Amazon unveils generative AI tools at Accelerate conference - [Google News](https://news.google.com/read/CBMirAFBVV95cUxQdldnUXJ0S0l1TDRFMHNWTlVJWVRUN2JVUDg2VmdvUF80cjBWd3ZpRVczbDRROHF2QVRRS1hMd3AyVVotNnVueFFxVWswcGpqeGxqQUJnZjlqLTU2NC1VMk9OaHU4QUJzcFRjNkZacUZMUVoxWllmZmI3MF9oVU5SSlBHNjR2ZFFQVTU3anVTeloyTFJjcWdTanZqWWx3SDFjeENOLWNFQkd6SEVl)
- Wayfair partners with Google and OpenAI to enhance operations with AI - [Analytics India Magazine](https://analyticsindiamag.com/ai-breakthroughs/e-commerce-giant-wayfair-goes-all-in-on-generative-ai-partners-with-google-openai-and-others/)

# Apple's New Betas Introduce Generative AI

- Apple releases public betas with new generative AI features - [MacRumors](https://www.macrumors.com/2024/09/19/apple-seeds-first-ios-18-1-public-beta/), [TechCrunch](https://techcrunch.com/2024/09/19/apple-intelligence-is-now-live-in-public-beta-heres-what-it-offers-and-how-to-enable-it), [The Verge](https://www.theverge.com/2024/9/19/24249206/apple-intelligence-ios-18-1-public-beta)

# ChatGPT Security Woes and AI Alternatives

- Users report unfamiliar chats appearing in GPT accounts, raising security concerns - [Reddit](https://www.reddit.com/r/ChatGPT/comments/1flq5s9/woke_up_to_this_on_my_gpt_have_i_been_hacked/)
- AI-generated Reddit comments blur lines between humans and bots - [Reddit](https://www.reddit.com/r/ChatGPT/comments/1fm1boa/gpt_bots_are_taking_over_it_used_to_be_just/)
- Researchers prefer small, local AI models over ChatGPT for privacy and cost savings - [Nature](https://www.nature.com/articles/d41586-024-02998-y)

# Microsoft Powers AI with Three Mile Island

- Microsoft reopens Three Mile Island to power AI data centers under a 20-year deal - [Ars Technica](https://arstechnica.com/ai/2024/09/re-opened-three-mile-island-will-power-ai-data-centers-under-new-deal/), [Associated Press News](https://apnews.com/article/three-mile-island-nuclear-power-microsoft-8f47ba63a7aab8831a7805dfde0e2c39), [The Washington Post](https://www.washingtonpost.com/business/2024/09/20/microsoft-three-mile-island-nuclear-constellation/)

# Meta's AR and AI Developments

- Meta Connect 2024 to showcase Quest 3S, AI, and smart glasses - [Engadget](https://www.engadget.com/ar-vr/meta-connect-2024-cheaper-quest-3s-ai-smart-glasses-everything-to-expect-130011734.html)
- EssilorLuxottica extends smart glasses partnership with Meta into the 2030s - [Reuters](https://www.reuters.com/technology/essilorluxottica-expands-smart-glasses-partnership-with-meta-2024-09-17/)
- Meta's AI tools let users create images and turn them into videos - [Pocket-lint](https://www.pocket-lint.com/how-to-generate-video-meta-ai/)

# LinkedIn Faces AI Privacy Scrutiny

- LinkedIn allows users to opt out of AI training data usage amid privacy concerns - [Google News](https://news.google.com/read/CBMilgFBVV95cUxQVnFnWFdPSV9qODE0VUlLLUN6aTF0UVNNMVZCai0teFBadWdnUnVhSm9wMVY0MjM3UXBSSGkzc0JlaEFUTVIwTW5NQkFSVUQ2NmoxcGxDV1VhdTRzV2hzU0s2VlB4ZHd4ZVZxV19aOWM5aWd5enA3dGRJMnJHSFdMOHFkbnd1RzBZcTREbEVfQmM4WWpyeUE)
- LinkedIn suspends use of UK data for AI after regulatory concerns - [Google News](https://news.google.com/read/CBMiWkFVX3lxTFBkRjFaYkpzbjNUcU5SSEdVbUlibENUUC1XSGJlemhsaHMwTlpGSXlQTkpqcDlDbTFMRkhkdWhaSUJ5TTJ5UmhxdzlCQ1F6a3Q4Tjh4UnRfejUwUdIBX0FVX3lxTFBPUWd3bHExLThCUHl1a19Gc3J0RkE0YlE2dFROVURYeGJhNEdsaElZWDBtMnJvVDhRUTE2WWoxc2VYSVEyeTlQd2RnT0o5dlJqUXlqMW1fa01KbkhqRTBJ)

# AI in Entertainment and Culture

- AI creates "The Lord of the Rings: The Fellowship of the Rednecks" viral video - [Looper](https://www.looper.com/1661839/lord-of-the-rings-redneck-ai-movie/)
- AI learns to read emotions, potentially transforming industries - [Eurasia Review](https://www.eurasiareview.com/21092024-ai-is-learning-to-read-your-emotions-and-heres-why-that-can-be-a-good-thing/)
- AI-generated podcast explores fiction writing using tools like NotebookLM and Headliner - [Reddit](https://www.reddit.com/r/ChatGPT/comments/1flplb6/obsessed_with_how_my_aigenerated_podcast_about_ai/)

# Advances in Robotics and AI Models

- 1X releases generative world models to train robots - [VentureBeat](https://venturebeat.com/ai/1x-releases-generative-world-models-to-train-robots/)
- Auterion's Skynode S counters Russian jamming, empowering drones - [Google News](https://news.google.com/read/CBMixAFBVV95cUxOMWdCdlhFNTR2eGNfcXFYbExlRHNTY1BXUlRTeFBxTUJZX21YQ194YUsxako2YndYaVRTOUFpTFBBZ0ZqWGtHM0VyY0NNcWszbzBKU05SNms4OThiWDdkS0R0TE9NM09faHVqLU5YVEZLU0oxZjUySVRhM0RfMGZMeV9JU0hnNm9UQ1NjX19GZFFHYkxsdERkX2NldERDcnRsbU1GRTZpbk9GT3BfVi1fSlFKS0s0RkRDaklMcU9PQjhHakJa0gHKAUFVX3lxTFBhZFpRZnFHRWtNQURpaEF2QzBCMUpUSTNENEVWeGFONndyMjhfdE5NR043VjNkWlZkbXlaZEViUkpKRWI3ZnV6ejQ0MmZNZldsX2FhTFlFM2lzQUtHT3QyTmpYZFl2c1RtMXZSTmQ1ZjA4T1V3M0h2MExOSkwtSmd1VUNpYWdmYTd1Q0c1ZUNPalQ5dS1QbkFDbklYSDdJU216LURPNWdKTmRWbFZ1dDA4c3hmM2V6OWVCTXVqaDRZN0xsVmpWOExOamc)
- Study addresses instability in FP8 LLM training caused by SwiGLU activation - [arXiv](https://arxiv.org/abs/2409.12517)

# OpenAI's Expansion and New Developments

- OpenAI signs lease for 315,000 square feet in Mission Bay - [Hoodline](https://hoodline.com/2024/09/openai-bolsters-san-francisco-footprint-with-major-315k-square-foot-mission-bay-lease/)
- OpenAI's new model hides its reasoning process, cautioning users - [Business Insider](https://www.businessinsider.com/openai-o1-model-hides-reasoning-chatgpt-bans-users-for-asking-2024-9)
- Reddit community discusses AI and ChatGPT experiences and tools - [Reddit](https://www.reddit.com/user/NextgenAITrading/)

# Ethical Concerns over Military AI

- Expert warns of increasing threat of "killer robots" in Ukraine - [Yahoo News UK](https://uk.news.yahoo.com/ai-killer-robots-warning-ukraine-war-133415411.html)
"""
Markdown(response_str)

In [None]:
rewrite_prompt = f"""You will act as a professional editor with a strong background in technology journalism.
You have a deep understanding of current and emerging technology trends, and the ability to 
produce, edit, and curate high-quality content that engages and informs readers. You are 
especially skilled at reviewing and enhancing tech writing, helping improve clarity, conciseness, 
and coherence, and ensuring its accuracy and relevance.

Objective: The markdown newsletter provided below contains several sections consisting of bullet points.
Carefully review each section of the newsletter. Edit the newsletter for issues according
to the detailed instructions below, and respond with the updated newsletter or 'OK' if no changes
are needed.

Instructions: 
For each section, review the title and edit it to be short and engaging, and as consistent with the bullets
in the section as possible
Remove or combine bullet points which are highly duplicative or redundant.
Make bullet points as concise as possible, sticking to facts without editorial comment.
Respond with the updated newsletter only in markdown format, or the word 'OK' if no changes are needed.

Newsletter to edit: 
{response_str}

"""


In [None]:
print(rewrite_prompt)


In [None]:
# grab stories by category using vectorstore and then write up items
# this approach leads to duplication, needs rewrite to stop dupes
# (URLs are only de-duplicated within a category, so the same story can
# still be picked up under several categories)
doc_list = []    # one markdown section per entry of my_cats, in the same order
docid_list = []  # ids of every story selected for any category
similarity_cutoff = 1.25
for cat in my_cats:
    docstr = f"# {cat} \n\n"
    # Perform a similarity search
    results = vectorstore.similarity_search_with_score(cat, k=10)
    urldict = {}
    for doc, score in results:
        if urldict.get(doc.metadata['url']):
            continue
        urldict[doc.metadata['url']] = 1
        if score > similarity_cutoff:
            # NOTE(review): stopping here assumes results arrive ordered
            # from lowest to highest score -- confirm
            break
        docstr += f"{doc.page_content}\n"
        docid_list.append(doc.metadata['id'])
    # Always append, even when nothing matched: the section-writing cell
    # looks sections up by the category's position in my_cats, so doc_list
    # must keep exactly one entry per category.
    doc_list.append(docstr)

md_str = "".join(doc_list)
display(Markdown(md_str))
            


In [None]:
docid_list

In [None]:
def clean_markdown(text):
    """Strip a wrapping markdown code fence from a model response.

    Handles an opening fence of "```markdown" or "```md", or a bare "```"
    when the text also ends with a closing fence, and a closing "```".
    Whitespace before the opening fence and after the closing fence is
    tolerated. Text without fences is returned unchanged.

    Args:
        text: the response string, possibly wrapped in a code fence.

    Returns:
        str: the text with the wrapping fence lines removed.
    """
    fence = "```"

    # Strip the starting markdown fence (ignoring leading whitespace)
    head = text.lstrip()
    if head.startswith(fence):
        first_line, _, rest = head.partition("\n")
        tag = first_line[len(fence):].strip().lower()
        closes_with_fence = rest.rstrip().endswith(fence)
        if tag in ("markdown", "md") or (tag == "" and closes_with_fence):
            text = rest.lstrip()
        elif first_line.startswith("```markdown"):
            # fence immediately followed by content on the same line
            text = head[len("```markdown"):].lstrip()

    # Strip the ending markdown fence (ignoring trailing whitespace)
    tail = text.rstrip()
    if tail.endswith(fence):
        text = tail[:-len(fence)].rstrip()

    return text


In [None]:
# write sections individually
# For each category: ask the model for a draft section built from that
# category's stories, ask it again to edit the draft, then append the edited
# section to mail_md_str.

mail_md_str = ""

for current_topic, cat in enumerate(my_cats):

    section_prompt = f"""
You will act as a professional writer with a strong background in technology journalism.
You have a deep understanding of current and emerging technology trends, and the ability to 
write high-quality content that engages and informs readers. Your task is to compose a 
compelling summary of news input.

Input:
I will provide a markdown list of today's news articles on the topic: {cat}.
The input will be in the format
[Site-name-s1](url-s1)
Story-Title-s1

Topics: s1-topic1, s1-topic2, s1-topic3

- s1-bullet-point-1
- s1-bullet-point-2
- s1-bullet-point-3

[Site-name-s2](url-s2)
Story-Title-s2

Topics: s2-topic1, s2-topic2, s2-topic3

- s2-bullet-point-1
- s2-bullet-point-2
- s2-bullet-point-3

Instructions:

Read the input closely.
USE ONLY INFORMATION PROVIDED IN THE INPUT.
Provide the most significant facts without commentary or elaboration.
Write an engaging summary consisting of a title and at least 1 and no more than 5 bullet points.
Use as few bullet points as you need to provide the most significant facts.
Each bullet should contain one sentence with one link.
Each bullet should not repeat points or information from previous bullet points.
DO NOT REPEAT LINKS FROM PREVIOUS BULLET POINTS.
Write in the professional but engaging, narrative style of a tech reporter for a national publication.
Be balanced, professional, informative, providing accurate, clear, concise summaries in a respectful neutral tone.

Please check carefully that you only use information provided in the following input, and that any bullet point
does not repeat information or links prevously provided.

Example Output Format Template (EXAMPLE ONLY, DO NOT OUTPUT THIS TEMPLATE):

# Engaging title

- bullet point a - [site name a](site url a)
- bullet point b - [site name b ](site url b)

Input:

{doc_list[current_topic]}
"""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[
                  {"role": "user", "content": section_prompt
                  }],
        n=1,   
        temperature=0.2
    )

    # Keep the draft unescaped: it is embedded in the rewrite prompt below.
    # "$" is escaped only in the copy rendered in the notebook.
    response_str = response.choices[0].message.content
    display(Markdown(response_str.replace("$", "\\$")))
    
    REWRITE_PROMPT = f"""You will act as a professional editor with a strong background in technology journalism.
You have a deep understanding of current and emerging technology trends, and the ability to 
edit and curate high-quality content that engages and informs readers. You are 
especially skilled at reviewing and enhancing tech writing, helping improve clarity, conciseness, 
and coherence, and ensuring its accuracy and relevance.

Objective: Carefully review the markdown newsletter content provided below, which
contains bullet points. Edit the content for issues according
to the detailed instructions below, and respond with the updated newsletter content.

Instructions: 
Review the title and edit it to be as short and engaging, and as consistent with the bullets
in the section as possible
Ensure each URL is unique by removing or combining bullet points which contain the same URL. 
Remove or combine bullet points which are highly duplicative or redundant.
Make bullet points as concise as possible, sticking to facts without editorial comment.
Respond with the updated content only in markdown format.

Section to edit: 
{response_str}
"""
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
                  {"role": "user", 
                   "content": REWRITE_PROMPT,
                  }],
        n=1,   
        temperature=0.2
    )

    response_str = response.choices[0].message.content
    response_str = clean_markdown(response_str)
    # End every section with a blank line so the next section's "# title"
    # starts on its own line instead of being glued to the last bullet.
    mail_md_str += response_str + "\n\n"
    display(Markdown(response_str.replace("$", "\\$") ))

In [None]:
edit_prompt2 = f"""You will act like a professional editor with expertise in content optimization.
You are skilled at reviewing and enhancing written materials, specializing in
helping improve clarity, conciseness, and coherence in various types of documents,
including newsletters.

Objective: Carefully review each section of the markdown newsletter provided below. 
Each section consists of several bullet points. 

For each section, identify and combine redundant bullet points:

Instructions: 
For each section, identify bullet points containing identical URLs to other bullet points in the same section 
Rewrite the section, combining these similar bullet points to eliminate duplication.
Do not duplicate any URLs within a section.
Check the response carefully and ensure that no links are duplicated within a section.

Newsletter to edit: 
{mail_md_str}

"""

response = client.chat.completions.create(
    model=MODEL,
    messages=[
              {"role": "user", "content": edit_prompt2
              }],
    n=1,   
    temperature=0.2
)
response_str2 = response.choices[0].message.content
response_str2 = clean_markdown(response_str2)
display(Markdown(response_str2.replace("$", "\\$")))


In [None]:
log("Sending full summary email ")
subject = f'AI news summary {datetime.now().strftime("%H:%M:%S")}'
final_html_str = markdown.markdown(response_str2, extensions=['extra'])
display(HTML(final_html_str))
send_gmail(subject, final_html_str)


# Final Summary

In [None]:
# Alternatively, summarize by just giving selected stories in semantic order and hinting how to write the summary
AIdf.loc[AIdf['id'].isin(set(docid_list))]

In [None]:
# make full markdown to send to prompt
print()

# Stories whose id was selected for at least one category
selected_rows = AIdf.loc[AIdf['id'].isin(set(docid_list))]

selected_md = []
for row in selected_rows.itertuples():
    # NOTE(review): docs is indexed by row.id, which presumes a story's id
    # equals its position in docs -- confirm
    mdstr = docs[row.id].page_content
    # "$" is escaped for notebook display only; the raw text goes to the prompt
    display(Markdown(mdstr.replace('$', '\\$')))
    selected_md.append(mdstr)

markdown_str = ''.join(selected_md)
    

In [None]:
my_cat_str = "\n".join(my_cats)

TESTPROMPT = f"""
You are an advanced summarization assistant, a sophisticated AI system
designed to write a compelling summary of news input. You are able to categorize information, 
and identify trends from large volumes of news.

Objective: 
I will first provide proposed categories and then the text of today's news articles about AI and 
summary bullet points in markdown format.
Bullet points will contain a title and URL, a list of topics discussed, and a bullet-point summary of
the article. You are tasked with identifying and summarizing the most important news, recurring themes,
common facts and items. Your job is to create a concise summary of today's topics and developments.
You will write an engaging summary of today's news encompassing the most important and frequently 
mentioned topics and themes.
You will write in the professional but engaging, narrative style of a tech reporter for a national publication.
You will be balanced, professional, informative, providing accurate, clear, concise summaries in a neutral tone.
You will group stories into related topics, using the proposed categories as a general guide.

Input Format Template:

[Site-name-s1](url-s1)
Story-Title-s1

Topics: s1-topic1, s1-topic2, s1-topic3

- s1-bullet-point-1
- s1-bullet-point-2
- s1-bullet-point-3

[Site-name-s2](url-s2)
Story-Title-s2

Topics: s2-topic1, s2-topic2, s2-topic3

- s2-bullet-point-1
- s2-bullet-point-2
- s2-bullet-point-3

Example Output Format Template (THIS IS AN EXAMPLE FORMAT, DO NOT OUTPUT THIS TEMPLATE):

# Engaging-topic-title-1

- bullet-point-1a - [site-name-1a](site-url-1a)
- bullet-point-1b - [site-name-1b](site-url-1b)

# Engaging-topic-title-2

- bullet-point-2a - [site-name-2a](site-url-2a)
- bullet-point-2b - [site-name-2b](site-url-2b)

Instructions:

Read the input closely.
Very important: USE ONLY INFORMATION PROVIDED IN THE INPUT.
Provide the most significant facts without commentary or elaboration.
Each bullet should contain one sentence with one link.
Each bullet should not repeat points or information from previous bullet points.

Please check carefully that you only use information provided in the following input, that you include
all links in the input, and that any bullet point does not repeat information or links prevously provided.

Categories:
{my_cat_str}

Input:

"""



In [None]:
print(TESTPROMPT)

In [None]:
response = client.chat.completions.create(
    model=MODEL,
    messages=[
              {"role": "user", "content": TESTPROMPT + markdown_str
              }],
    n=1,   
    temperature=0.5
)

# Keep response_str as the raw markdown: it is later converted to HTML for
# the summary email, where a "\$" escape would show up as a literal backslash.
# Escape "$" only in the copy rendered in the notebook.
response_str = response.choices[0].message.content
display(Markdown(response_str.replace("$", "\\$")))


In [None]:
# Re-display the last response. response_str stays unescaped because it is
# converted to HTML for the email afterwards; "$" is escaped for display only.
response_str = response.choices[0].message.content
display(Markdown(response_str.replace("$", "\\$")))


In [None]:
log("Sending full summary email ")
subject = f'AI news summary {datetime.now().strftime("%H:%M:%S")}'
final_html_str = markdown.markdown(response_str, extensions=['extra'])
display(HTML(final_html_str))
send_gmail(subject, final_html_str)


In [None]:
log("Finished")


In [None]:
# these are some rewriting experiments that didn't work very well
# they could be applied to individual items instead

In [None]:
# Draft prompt (remove redundant sentences/bullets). The string is not bound
# to a name, so this cell only echoes it.
"""You will act like a professional editor with expertise in content optimization.
You are skilled at refining and enhancing written materials, specializing in
ensuring clarity, conciseness, and coherence in various types of documents,
including newsletters.

Objective: Edit the markdown newsletter provided below by removing any redundant
sentences or bullet points that restate previous points and contain the same link.
Leave intact bullet points that are unique and provide distinct information.

Step-by-step instructions:

Carefully read through the entire newsletter to understand the overall structure and content.
Identify sentences and bullet points that repeat information or provide identical links.
Remove all redundant sentences and bullet points that do not contribute new information or unique links.
Ensure that the remaining content flows logically and maintains the intended message and tone of the newsletter.
Double-check the final edited version for any inconsistencies or errors introduced during the editing process.
Take a deep breath and work on this problem step-by-step.
"""

In [None]:
# Draft prompt (review and suggest improvements, no rewriting). The string is
# not bound to a name, so this cell only echoes it.
"""You will act like a professional editor with expertise in content optimization.
You are skilled at reviewing and enhancing written materials, specializing in
helping improve clarity, conciseness, and coherence in various types of documents,
including newsletters.

Objective: Review the markdown newsletter provided below and advise on ways to improve it.
Note any links which are repeated, any sections which are similar and could be combined,
and any copy edits. You will only provide suggestions, and not rewrite the copy.

Step-by-step instructions:

Carefully read through the entire newsletter to understand the overall structure and content.
Identify sentences and bullet points that repeat information and provide identical links and should be removed.
Identify any sections which could be combined because they contain similar but not identical content.
Suggest improvements to any sections which are not clear, concise, and coherent.
Take a deep breath and work on this problem step-by-step.
"""

In [None]:
# Reviewer pass: ask the model which bullet points should move to a different
# section of mail_md_str. The prompt requests suggestions only, not a rewrite.
edit_prompt1 = f"""You will act like a professional editor with expertise in content optimization.
You are skilled at reviewing and enhancing written materials, specializing in
helping improve clarity, conciseness, and coherence in various types of documents,
including newsletters.

Objective: Review the markdown newsletter provided below.
It consists of a series of sections, each of which contains several bullet points.
For each section, review each bullet point and advise if it should be moved to a different section.
You will only provide suggestions, and not rewrite the newsletter or provide other comments except
instructions regarding moving bullet points between sections.

Step-by-step instructions:

Carefully read through the entire newsletter to understand the overall structure and content.
Note the titles of the various sections.
Identify sentences and bullet points that should be moved to a different section. Write the
bullet point and the section it should be moved to.
If no bullet points should be moved for a given section, state that no action is required for that section.

Check carefully to make sure all similar bullet points end up grouped together in the same section.

Take a deep breath and work on this problem step-by-step.

Newsletter to edit: 
{mail_md_str}
"""
response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": edit_prompt1}],
    n=1,
    temperature=0.2,
)

# response_str1 keeps the raw reply; "$" is escaped only in the copy that is
# rendered as Markdown.
response_str1 = response.choices[0].message.content
display(Markdown(response_str1.replace("$", "\\$")))


In [None]:
# Editor pass: hand the model the move instructions held in response_str1
# together with the newsletter in mail_md_str, and ask for the updated markdown.
edit_prompt2 = f"""You will act like a professional editor with expertise in content optimization.
You are skilled at reviewing and enhancing written materials, specializing in
helping improve clarity, conciseness, and coherence in various types of documents,
including newsletters.

Objective: Below are editing instructions followed by a markdown newsletter.
Carefully review the editing instructions and the markdown newsletter provided below.
The newsletter consists of a series of sections, each of which contains several bullet points.
Move bullet points according to the editing instructions below from one section to another 
If there is no change to a specific section, include it unchanged in the response as it appears in the input.
Respond with the updated newsletter in markdown format.

Editing instructions:

Carefully read through the entire newsletter to understand the overall structure and content.
Note the titles of the various sections. Then make only the following changes:
{response_str1}

Newsletter to edit: 
{mail_md_str}

"""

response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": edit_prompt2}],
    temperature=0.2,
    n=1,
)

# The raw reply is stored; "$" is escaped only in the copy rendered as Markdown.
response_str2 = response.choices[0].message.content
display(Markdown(response_str2.replace("$", "\\$")))


In [None]:
# De-duplication pass: ask the model to merge bullets in response_str2 that
# share a URL within the same section.
edit_prompt3 = f"""You will act like a professional editor with expertise in content optimization.
You are skilled at reviewing and enhancing written materials, specializing in
helping improve clarity, conciseness, and coherence in various types of documents,
including newsletters.

Objective: Carefully review each section of the markdown newsletter provided below. 
Each section consists of several bullet points. 

For each section, identify and combine redundant bullet points:

Instructions: 
For each section, identify bullet points containing identical URLs to other bullet points in the same section 
Rewrite the section, combining these similar bullet points to eliminate duplication.
Do not duplicate any URLs within a section.
Check the response carefully and ensure that no links are duplicated within a section.

Newsletter to edit: 
{response_str2}

"""

response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": edit_prompt3}],
    temperature=0.2,
    n=1,
)

# The raw reply is stored; "$" is escaped only in the copy rendered as Markdown.
response_str3 = response.choices[0].message.content
display(Markdown(response_str3.replace("$", "\\$")))


In [None]:
# The original hard-coded offset of 11 equals len("```markdown"), so it
# presumably dropped a leading code fence from the reply -- TODO confirm.
# Strip that prefix only when it is actually present, so a reply without a
# fence does not lose its first 11 characters.
display(Markdown(
    (response_str3[11:] if response_str3.startswith("```markdown") else response_str3)
    .replace("$", "\\$")
))


In [None]:
# Bare expression: the cell echoes the escaped string itself rather than
# rendering it as Markdown. response_str3 is not modified.
response_str3.replace("$", "\\$")

In [None]:
# Copy-edit pass over mail_md_str: tighten section titles, merge redundant
# bullets, and shorten bullets. The prompt names a single sentinel reply
# ('OK') for "no changes".
PROMPT = f"""You will act as a professional editor with a strong background in technology journalism.
You have a deep understanding of current and emerging technology trends, and the ability to 
produce, edit, and curate high-quality content that engages and informs readers. You are 
especially skilled at reviewing and enhancing tech writing, helping improve clarity, conciseness, 
and coherence, and ensuring its accuracy and relevance.

Objective: Carefully review each section of the markdown newsletter provided below, which
contains several sections consisting of bullet points. Edit the newsletter for issues according
to the detailed instructions below, and respond with the updated newsletter or 'OK' if no changes
are needed.

Instructions: 
For each section, review the title and edit it to be as short and engaging, and as consistent with the bullets
in the section as possible.
Remove or combine bullet points which are highly duplicative or redundant.
Make bullet points as concise as possible with facts.
Respond with the updated newsletter only in markdown format, without editorial comment, or the word 'OK' 
if no changes are recommended.

Newsletter to edit: 
{mail_md_str}
"""


response = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": PROMPT}],
    n=1,
    temperature=0.2,
)

# The raw reply is stored; "$" is escaped only in the copy rendered as Markdown.
response_str3 = response.choices[0].message.content
display(Markdown(response_str3.replace("$", "\\$")))


In [None]:
# The editing prompt allows a bare sentinel word ('OK' / 'Good') as the whole
# reply when nothing needs changing. Keep the current newsletter in that case
# instead of overwriting it with the sentinel.
if response_str3.strip().strip("'\"`.").lower() not in {"ok", "good"}:
    mail_md_str = response_str3

In [None]:
# Bare expression: echo the current value of CANONICAL_TOPICS in the cell output.
CANONICAL_TOPICS
