# Ad tech news weekly summary

This tool will collect last ad tech articles. From these sites by default:

- https://www.adexchanger.com/
- https://www.exchangewire.com/
- https://www.adweek.com/

It will generate a summary + audio about them.

The output will generate a Markdown with the summary if user ask for it. #tool


In [2]:
import os
import json
import gradio as gr
import requests

from typing import List
from dotenv import load_dotenv
from openai import OpenAI


from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display


In [3]:
# Initialization

load_dotenv(override=True)

openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
    
MODEL = "gpt-4o-mini"
openai = OpenAI()

OpenAI API Key exists and begins sk-proj-


In [35]:
import re
from urllib.parse import urlparse


headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


class ArticleWeb  :
    def __init__(self, url: str):
      self.url = url
      try:
          response = requests.get(url, headers=headers)
          response.raise_for_status()  # Raises an HTTPError for bad responses
          self.body = response.content
          self.domain = urlparse(url).netloc
          self.title = self.get_title()
          self.content = self.get_content()
      except requests.RequestException as e:
          print(f"Error fetching URL {url}: {str(e)}")
          self.body = ""
          self.domain = ""

    def get_title(self):
        soup = BeautifulSoup(self.body, 'html.parser')
        if soup.title:
            return soup.title.string
        return ""

    def get_content(self):
        soup = BeautifulSoup(self.body, 'html.parser')
        content_selectors = {
            "www.adexchanger.com": lambda s: s.find('div', {'class': 'article-content'}),
            "www.exchangewire.com": lambda s: s.find('article'),
            "www.adweek.com": lambda s: s.find('article')
        }
        
        selector = content_selectors.get(self.domain, lambda s: s)
        content = selector(soup)
        
        if content is None:
            return ""
        
        # Remove excessive whitespace and newlines
        text = content.get_text()
        # Replace multiple newlines with single newline
        text = re.sub(r'\n\s*\n', '\n', text)
        # Replace multiple spaces with single space
        text = re.sub(r'\s+', ' ', text)
        # Strip leading/trailing whitespace
        text = text.strip()
        content = text
            
        return content

    def __str__(self):
        try:
            if hasattr(self, 'title') and hasattr(self, 'content') and self.title and self.content:
                return f"Title: {self.title}\nURL: {self.url}\nContent: {self.content}\n\n"
        except AttributeError:
            pass
        return ""

article = ArticleWeb("https://www.adexchanger.com/marketers/how-ai-helps-butler-till-curate-high-performing-pmps/")

assert len(article.content) > 1000 

article = ArticleWeb("https://www.exchangewire.com/blog/2025/04/09/digest-court-rejects-uk-govss-request-for-secret-hearing-in-apple-data-case-spotify-unveils-new-ad-tools/")

assert len(article.content) > 1000

article = ArticleWeb("https://www.adweek.com/brand-marketing/its-a-sopranos-reunion-in-sanpellegrinos-nostalgic-ads-for-a-new-drink/?itm_source=parsely-api")

assert len(article.content) > 1000


In [5]:
# obtain a list of adtech news sites

def scrape_adexchanger_site():
    response = requests.get("https://www.adexchanger.com/latest/", headers=headers)
    body = response.content
    soup = BeautifulSoup(body, 'html.parser')
    main_tag = soup.find('main')
    # print(main_tag)

    articles_site = [link.get('href') for link in main_tag.find_all('a')]

    excluded_paths = ['/tag/', '/author/', '/category/', '/page/']
    articles_site = [
        link for link in articles_site 
        if not any(path in link for path in excluded_paths)
    ]

    return articles_site


# articles = scrape_adexchanger_site()

# print (articles)
    

In [6]:
def scrape_exchangewire_site():
  response = requests.get("https://www.exchangewire.com/emea/", headers=headers)
  body = response.content
  soup = BeautifulSoup(body, 'html.parser')
  main_tag = soup.find('div', {"class": "content row"})

  articles_site = [link.get('href') for link in main_tag.find_all('a')]
  includes_paths = ['exchangewire.com']
  articles_site = [
      link for link in articles_site 
      if any(path in link for path in includes_paths) and 'feed' not in link
  ]
  return articles_site

  

# articles += scrape_exchangewire_site()


In [7]:
def scrape_adweek_site():
  response = requests.get("https://www.adweek.com/", headers=headers)
  body = response.content
  soup = BeautifulSoup(body, 'html.parser')
  main_tag = soup.find('article')

  articles_site = [link.get('href') for link in main_tag.find_all('a')]
  excluded_paths = ['/tag/', '/author/', '/category/', '/page/', '/lineup/', '/leadership-', '/resource-library/']
  articles_site = list(dict.fromkeys(articles_site))
  articles_site = [
      link for link in articles_site 
      if not any(path in link for path in excluded_paths)
  ]
  return articles_site


# articles += scrape_adweek_site()
# print(articles)

In [39]:
# scrape all articles
# articles = list(dict.fromkeys(articles))

def scrape_all_articles(articles):
  web_articles = []
  for site in articles:
    print(site)
    article = ArticleWeb(site)
    web_articles.append(article)
  return web_articles

# web_articles = scrape_all_articles()
# print(web_articles)


def obtain_adtech_news_summary(site):
  articles = []
  if "adweek" in site.lower():
    articles = scrape_adweek_site()
  if "exchangewire" in site.lower():
    articles = scrape_exchangewire_site()
  if "adexchanger" in site.lower():
    articles = scrape_adexchanger_site()
  web_articles = scrape_all_articles(articles)
  summary = ""
  for article in web_articles:
    summary += str(article) + "\n---\n"
  return summary
    

In [25]:
system_message = """
You are a helpful assistant that summarizes news articles. You will be given a list of news articles and you will need to summarize them in a concise manner.
Given short, concise summaries, you will need to generate a longer summary with more details.
Always by accurate, concise and to the point. If you don't know the answer, just say so.
You only can summarize news on this ad tech sites: adweek.com, exchangewire.com, adexchanger.com

Act as a ad tech expert, with deep knowledge of the industry, and a good sense of what is important and what is not. But explain it in a way that is easy to understand.
"""

In [26]:
# There's a particular dictionary structure that's required to describe our function:

news_scraper_function = {
    "name": "news_scraper",
    "description": "Get the lastest news on ad tech site. Call this whenever you need to obtain the news on a site, for example when someone asks 'Could you summarize the lasts news on this site'",
    "parameters": {
        "type": "object",
        "properties": {
            "adtech_site": {
                "type": "string",
                "description": "The domain that the customer wants obtain news",
            },
        },
        "required": ["adtech_site"],
        "additionalProperties": False
    }
}

In [41]:
tools = [{"type": "function", "function": news_scraper_function}]

In [46]:

def handle_tool_call(message):
    tool_call = message.tool_calls[0]
    arguments = json.loads(tool_call.function.arguments)
    adtech_site = arguments.get('adtech_site')
    summary = obtain_adtech_news_summary(adtech_site)
    response = {
        "role": "tool",
        "content": json.dumps({"adtech_site": adtech_site, "summary": summary}),
        "tool_call_id": tool_call.id
    }
    return response, adtech_site

In [45]:
def chat(message, history):
    messages = [{"role": "system", "content": system_message}] + history + [{"role": "user", "content": message}]
    response = openai.chat.completions.create(model=MODEL, messages=messages, tools=tools)

    if response.choices[0].finish_reason=="tool_calls":
        message = response.choices[0].message
        messages.append(message)
        
        # Handle all tool calls, not just the first one
        for tool_call in message.tool_calls:
            arguments = json.loads(tool_call.function.arguments)
            adtech_site = arguments.get('adtech_site')
            summary = obtain_adtech_news_summary(adtech_site)
            
            tool_response = {
                "role": "tool",
                "content": json.dumps({"adtech_site": adtech_site, "summary": summary}),
                "tool_call_id": tool_call.id
            }
            messages.append(tool_response)
            
        response = openai.chat.completions.create(model=MODEL, messages=messages)
    
    return response.choices[0].message.content

In [47]:
gr.ChatInterface(fn=chat, type="messages").launch()


* Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




https://www.adweek.com/media/meta-stands-to-lose-tens-of-billions-of-ad-spend-in-impending-ftc-antitrust-trial/
https://www.adweek.com/brand-marketing/people-are-surprised-jcpenney-is-trendy-its-comeback-ads-challenge-that/
https://www.adweek.com/media/googles-vulnerability-search/
https://www.adweek.com/brand-marketing/guinness-irelands-favorite-stout-wants-to-be-american-too/
https://www.adweek.com/media/walmart-asks-brands-to-boost-ad-spend-by-at-least-25-despite-sales-stagnation/?itm_source=parsely-api
https://www.adweek.com/brand-marketing/its-a-sopranos-reunion-in-sanpellegrinos-nostalgic-ads-for-a-new-drink/?itm_source=parsely-api
https://www.adweek.com/media/meta-stands-to-lose-tens-of-billions-of-ad-spend-in-impending-ftc-antitrust-trial/?itm_source=parsely-api
https://www.adweek.com/sponsored/breaking-through-the-performance-plateau/?itm_source=site&itm_medium=Hero&itm_campaign=b

Error fetching URL : Invalid URL '': No scheme supplied. Perhaps you meant https://?
https://www