<a href="https://colab.research.google.com/github/djimit/Nieuwsbrief/blob/main/Nieuwsbrief.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from datetime import datetime
import pandas as pd
import time
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from urllib.parse import urljoin, urlparse

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def is_valid_url(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def is_ai_related(text):
    ai_keywords = [
        'artificial intelligence', 'machine learning', 'deep learning', 'neural network',
        'AI', 'ML', 'NLP', 'computer vision', 'robotics', 'autonomous', 'algorithm',
        'kunstmatige intelligentie', 'KI', 'machinaal leren', 'diep leren', 'neurale netwerken',
        'big data', 'data science', 'predictive analytics', 'natural language processing',
        'reinforcement learning', 'computer vision', 'expert system', 'cognitive computing',
        'machine perception', 'AI ethics', 'AI policy', 'AI regulation', 'AI governance',
        'AI research', 'AI development', 'AI innovation', 'AI technology', 'AI application'
    ]

    stop_words = set(stopwords.words('english') + stopwords.words('dutch'))
    word_tokens = word_tokenize(text.lower())
    filtered_text = [w for w in word_tokens if not w in stop_words]

    for i in range(len(filtered_text) - 1):
        if f"{filtered_text[i]} {filtered_text[i+1]}" in ai_keywords:
            return True
    return any(keyword.lower() in filtered_text for keyword in ai_keywords)

def is_irrelevant(text):
    irrelevant_keywords = ['airpods', 'iphone', 'smartphone', 'gadget']
    return any(keyword.lower() in text.lower() for keyword in irrelevant_keywords)

def scrape_article(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()

        if is_ai_related(article.title + " " + article.text) and not is_irrelevant(article.title + " " + article.text):
            return {
                'title': article.title,
                'url': url,
                'summary': article.summary,
                'keywords': ', '.join(article.keywords),
                'published_date': article.publish_date or datetime.now(),
                'source': article.source_url
            }
    except Exception as e:
        print(f"Error scraping article {url}: {str(e)}")
    return None

def scrape_news_site(base_url, ai_section_url=None):
    articles = []
    try:
        url_to_scrape = ai_section_url if ai_section_url else base_url
        response = requests.get(url_to_scrape, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('a', href=True):
            url = urljoin(base_url, link['href'])

            if not is_valid_url(url):
                continue

            if re.search(r'/\d{4}/\d{2}/\d{2}/', url) or '/article/' in url or '/news/' in url:
                article = scrape_article(url)
                if article:
                    articles.append(article)
                    print(f"Scraped: {article['title']}")
                time.sleep(1)

            if len(articles) >= 5:
                break

    except requests.RequestException as e:
        print(f"Error fetching {url_to_scrape}: {str(e)}")
    except Exception as e:
        print(f"Unexpected error scraping {base_url}: {str(e)}")

    return articles

# Lijst van websites om te scrapen
websites = [
    # Europese Commissie en gerelateerde bronnen
    {'name': 'European Commission - AI', 'url': 'https://digital-strategy.ec.europa.eu/en/policies/artificial-intelligence'},
    {'name': 'EU Science Hub - AI', 'url': 'https://ec.europa.eu/jrc/en/research-topic/artificial-intelligence'},

    # Amerikaanse overheid
    {'name': 'White House - AI', 'url': 'https://www.whitehouse.gov/ai/'},
    {'name': 'National Artificial Intelligence Initiative', 'url': 'https://www.ai.gov/'},

    # Silicon Valley en Tech bedrijven
    {'name': 'Google AI', 'url': 'https://ai.google/'},
    {'name': 'Microsoft AI Blog', 'url': 'https://blogs.microsoft.com/ai/'},
    {'name': 'OpenAI Blog', 'url': 'https://openai.com/blog/'},
    {'name': 'DeepMind Blog', 'url': 'https://deepmind.com/blog'},
    {'name': 'IBM AI Research', 'url': 'https://www.ibm.com/artificial-intelligence'},

    # Onderzoeksinstituten en universiteiten
    {'name': 'MIT AI', 'url': 'https://www.csail.mit.edu/research/artificial-intelligence'},
    {'name': 'Stanford AI Lab', 'url': 'https://ai.stanford.edu/'},
    {'name': 'Berkeley AI Research', 'url': 'https://bair.berkeley.edu/blog/'},
    {'name': 'Harvard AI', 'url': 'https://ai.harvard.edu/'},

    # AI-specifieke nieuwssites
    {'name': 'AI News', 'url': 'https://artificialintelligence-news.com/'},
    {'name': 'Machine Learning Mastery', 'url': 'https://machinelearningmastery.com/blog/'},
    {'name': 'KDnuggets', 'url': 'https://www.kdnuggets.com/'},
    {'name': 'Analytics Vidhya', 'url': 'https://www.analyticsvidhya.com/blog/'},

    # Algemene tech-nieuwssites met AI-secties
    {'name': 'TechCrunch', 'url': 'https://techcrunch.com', 'ai_section': 'https://techcrunch.com/category/artificial-intelligence/'},
    {'name': 'VentureBeat', 'url': 'https://venturebeat.com', 'ai_section': 'https://venturebeat.com/category/ai/'},
    {'name': 'MIT Technology Review', 'url': 'https://www.technologyreview.com', 'ai_section': 'https://www.technologyreview.com/topic/artificial-intelligence/'},
    {'name': 'Wired', 'url': 'https://www.wired.com', 'ai_section': 'https://www.wired.com/tag/artificial-intelligence/'},
    {'name': 'Forbes AI', 'url': 'https://www.forbes.com/ai'},

    # Nederlandse bronnen
    {'name': 'Nederlandse AI Coalitie', 'url': 'https://nlaic.com/en/news/'},
    {'name': 'TNO - AI', 'url': 'https://www.tno.nl/en/focus-areas/information-communication-technology/roadmaps/data-sharing/artificial-intelligence/'},
    {'name': 'AI Krant', 'url': 'https://www.ai-krant.nl/'},

    # Internationale organisaties
    {'name': 'OECD AI Policy Observatory', 'url': 'https://oecd.ai/'},
    {'name': 'UNESCO Artificial Intelligence', 'url': 'https://en.unesco.org/artificial-intelligence'},

    # AI Ethics en Policy
    {'name': 'AI Ethics Lab', 'url': 'https://aiethicslab.com/'},
    {'name': 'Future of Life Institute', 'url': 'https://futureoflife.org/ai-news/'},
]

def add_website(name, url, ai_section=None):
    """
    Voegt een nieuwe website toe aan de lijst om te scrapen.

    :param name: Naam van de website
    :param url: Basis-URL van de website
    :param ai_section: Optionele URL voor de AI-specifieke sectie van de website
    """
    new_site = {'name': name, 'url': url}
    if ai_section:
        new_site['ai_section'] = ai_section
    websites.append(new_site)
    print(f"Toegevoegd: {name}")

def scrape_multiple_sites():
    all_articles = []
    for site in websites:
        print(f"Scraping {site['name']}...")
        articles = scrape_news_site(site['url'], site.get('ai_section'))
        all_articles.extend(articles)

    return pd.DataFrame(all_articles)

# Voorbeeld van het toevoegen van een nieuwe website
add_website("AI Trends", "https://www.aitrends.com/")

# Run de scraper
news_df = scrape_multiple_sites()

# Toon de resultaten
display(news_df[['title', 'url', 'summary', 'keywords', 'published_date', 'source']])

Toegevoegd: AI Trends
Scraping European Commission - AI...
Scraped: Artificial Intelligence Board kicks off work on uptake of AI in the EU and implementation of the AI Act
Scraped: EU boosts European AI developers with the AI Factories call for proposals
Scraped: AI Office received strong interest for participation in drafting the first General-Purpose AI Code of Practice
Scraping EU Science Hub - AI...
Error fetching https://ec.europa.eu/jrc/en/research-topic/artificial-intelligence: 404 Client Error: Not Found for url: https://joint-research-centre.ec.europa.eu/scientific-activities/artificial-intelligence_en
Scraping White House - AI...
Error fetching https://www.whitehouse.gov/ai/: 404 Client Error: Not Found for url: https://www.whitehouse.gov/ai/
Scraping National Artificial Intelligence Initiative...
Scraping Google AI...
Scraped: Using AI to help more college students graduate
Scraping Microsoft AI Blog...
Error fetching https://blogs.microsoft.com/ai/: 403 Client Error: Forbid

Unnamed: 0,title,url,summary,keywords,published_date,source
0,Artificial Intelligence Board kicks off work o...,https://digital-strategy.ec.europa.eu/en/news/...,The AI Board is comprised of high-level repres...,"states, implementation, artificial, board, upt...",2024-09-10 19:26:43.697871,https://digital-strategy.ec.europa.eu
1,EU boosts European AI developers with the AI F...,https://digital-strategy.ec.europa.eu/en/news/...,AI Factories will be created around the EU's w...,"factories, computing, startups, providing, ai,...",2024-09-10 19:26:45.114771,https://digital-strategy.ec.europa.eu
2,AI Office received strong interest for partici...,https://digital-strategy.ec.europa.eu/en/news/...,The AI Office has received expressions of inte...,"strong, interest, generalpurpose, drafting, pr...",2024-09-10 19:26:46.775698,https://digital-strategy.ec.europa.eu
3,Using AI to help more college students graduate,https://publicpolicy.google/article/ai-helping...,When I started as Undergraduate Dean of John J...,"working, way, unable, students, graduate, jay,...",2024-09-10 19:26:49.356771,https://publicpolicy.google
4,Announcing the NeurIPS 2023 Paper Awards – Neu...,https://blog.neurips.cc/2023/12/11/announcing-...,Announcing the NeurIPS 2023 Paper AwardsBy Ami...,"abilities, emergent, data, models, dec, cst, a...",2023-12-11 00:00:00,https://blog.neurips.cc
5,How to Evaluate Jailbreak Methods: A Case Stud...,https://bair.berkeley.edu/blog/2024/08/28/stro...,How to Evaluate Jailbreak Methods: A Case Stud...,"evaluate, jailbreak, victim, prompts, methods,...",2024-08-28 00:00:00,https://bair.berkeley.edu
6,How to Evaluate Jailbreak Methods: A Case Stud...,https://bair.berkeley.edu/blog/2024/08/28/stro...,How to Evaluate Jailbreak Methods: A Case Stud...,"evaluate, jailbreak, victim, prompts, methods,...",2024-08-28 00:00:00,https://bair.berkeley.edu
7,Are We Ready for Multi-Image Reasoning? Launch...,https://bair.berkeley.edu/blog/2024/07/20/visu...,"To address this gap, this project focuses on t...","question, models, mirage, image, vhs, benchmar...",2024-07-20 00:00:00,https://bair.berkeley.edu
8,Are We Ready for Multi-Image Reasoning? Launch...,https://bair.berkeley.edu/blog/2024/07/20/visu...,"To address this gap, this project focuses on t...","question, models, mirage, image, vhs, benchmar...",2024-07-20 00:00:00,https://bair.berkeley.edu
9,TinyAgent: Function Calling at the Edge,https://bair.berkeley.edu/blog/2024/05/29/tiny...,TinyAgent: Function Calling at the EdgeThe abi...,"data, models, tinyagent, user, calling, plan, ...",2024-05-29 00:00:00,https://bair.berkeley.edu


In [14]:
from IPython.display import HTML, display
import datetime

def generate_newsletter(df):
    """
    Genereert een HTML-nieuwsbrief van de gescrapete AI-nieuws-artikelen.

    :param df: DataFrame met de gescrapete artikelen
    :return: HTML-string van de nieuwsbrief
    """
    # Sorteer artikelen op publicatiedatum, meest recente eerst
    df = df.sort_values('published_date', ascending=False)

    # Begin de HTML-structuur
    html = f"""
    <html>
    <head>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; color: #333; max-width: 800px; margin: 0 auto; padding: 20px; }}
            h1 {{ color: #2c3e50; text-align: center; }}
            h2 {{ color: #3498db; }}
            .article {{ margin-bottom: 30px; border-bottom: 1px solid #eee; padding-bottom: 20px; }}
            .article h3 {{ margin-bottom: 5px; }}
            .article p {{ margin-top: 5px; }}
            .meta {{ font-size: 0.8em; color: #7f8c8d; }}
            a {{ color: #2980b9; text-decoration: none; }}
            a:hover {{ text-decoration: underline; }}
        </style>
    </head>
    <body>
        <h1>AI News Nieuwsbrief</h1>
        <p style="text-align: center;">Gegenereerd op {datetime.datetime.now().strftime('%d-%m-%Y %H:%M')}</p>
    """

    # Voeg elk artikel toe aan de nieuwsbrief
    for _, row in df.iterrows():
        html += f"""
        <div class="article">
            <h2><a href="{row['url']}">{row['title']}</a></h2>
            <p class="meta">Gepubliceerd op: {row['published_date'].strftime('%d-%m-%Y')} | Bron: {row['source']}</p>
            <p>{row['summary']}</p>
            <p><strong>Keywords:</strong> {row['keywords']}</p>
        </div>
        """

    # Sluit de HTML-structuur
    html += """
    </body>
    </html>
    """

    return html

# Functie om de nieuwsbrief weer te geven in de notebook
def display_newsletter(df):
    newsletter_html = generate_newsletter(df)
    display(HTML(newsletter_html))

# Functie om de nieuwsbrief op te slaan als HTML-bestand
def save_newsletter(df, filename='ai_news_newsletter.html'):
    newsletter_html = generate_newsletter(df)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(newsletter_html)
    print(f"Nieuwsbrief opgeslagen als {filename}")

# Gebruik deze functies na het scrapen van de nieuws-artikelen
# Bijvoorbeeld:
# display_newsletter(news_df)
# save_newsletter(news_df)

In [16]:
from IPython.display import HTML, display
import datetime
import pytz

def standardize_dates(df):
    """
    Zet alle datums om naar 'naive' UTC datums.
    """
    def to_naive_utc(dt):
        if dt is pd.NaT:
            return dt
        if dt.tzinfo is None:
            # Als er geen tijdzone is, nemen we aan dat het UTC is
            return dt
        return dt.astimezone(pytz.UTC).replace(tzinfo=None)

    df['published_date'] = df['published_date'].apply(to_naive_utc)
    return df

def generate_newsletter(df):
    """
    Genereert een HTML-nieuwsbrief van de gescrapete AI-nieuws-artikelen.

    :param df: DataFrame met de gescrapete artikelen
    :return: HTML-string van de nieuwsbrief
    """
    # Standaardiseer de datums en sorteer
    df = standardize_dates(df)
    df = df.sort_values('published_date', ascending=False)

    # Begin de HTML-structuur
    html = f"""
    <html>
    <head>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; color: #333; max-width: 800px; margin: 0 auto; padding: 20px; }}
            h1 {{ color: #2c3e50; text-align: center; }}
            h2 {{ color: #3498db; }}
            .article {{ margin-bottom: 30px; border-bottom: 1px solid #eee; padding-bottom: 20px; }}
            .article h3 {{ margin-bottom: 5px; }}
            .article p {{ margin-top: 5px; }}
            .meta {{ font-size: 0.8em; color: #7f8c8d; }}
            a {{ color: #2980b9; text-decoration: none; }}
            a:hover {{ text-decoration: underline; }}
        </style>
    </head>
    <body>
        <h1>AI News Nieuwsbrief</h1>
        <p style="text-align: center;">Gegenereerd op {datetime.datetime.now().strftime('%d-%m-%Y %H:%M')} UTC</p>
    """

    # Voeg elk artikel toe aan de nieuwsbrief
    for _, row in df.iterrows():
        published_date = row['published_date'].strftime('%d-%m-%Y') if pd.notna(row['published_date']) else 'Onbekende datum'
        html += f"""
        <div class="article">
            <h2><a href="{row['url']}">{row['title']}</a></h2>
            <p class="meta">Gepubliceerd op: {published_date} | Bron: {row['source']}</p>
            <p>{row['summary']}</p>
            <p><strong>Keywords:</strong> {row['keywords']}</p>
        </div>
        """

    # Sluit de HTML-structuur
    html += """
    </body>
    </html>
    """

    return html

# Functie om de nieuwsbrief weer te geven in de notebook
def display_newsletter(df):
    newsletter_html = generate_newsletter(df)
    display(HTML(newsletter_html))

# Functie om de nieuwsbrief op te slaan als HTML-bestand
def save_newsletter(df, filename='ai_news_newsletter.html'):
    newsletter_html = generate_newsletter(df)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(newsletter_html)
    print(f"Nieuwsbrief opgeslagen als {filename}")

# Gebruik deze functies na het scrapen van de nieuws-artikelen
# Bijvoorbeeld:
# display_newsletter(news_df)
# save_newsletter(news_df)

In [17]:
!pip install pytz



In [18]:
display_newsletter(news_df)

In [19]:
save_newsletter(news_df)

Nieuwsbrief opgeslagen als ai_news_newsletter.html
