<a href="https://colab.research.google.com/github/djimit/Nieuwsbrief/blob/main/Nieuwsbrief.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Notebook shell magic: install the third-party packages imported below
# (requests, bs4, newspaper, pandas, nltk, pytz) into the running kernel.
!pip install requests beautifulsoup4 newspaper3k pandas nltk pytz

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from datetime import datetime
import pandas as pd
import time
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from urllib.parse import urljoin, urlparse
import pytz

# Download the NLTK data used by this notebook: the Punkt tokenizer models
# (needed by word_tokenize) and the stop-word lists.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are fetched to keep tokenizing working across versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

def is_valid_url(url):
    """Return True when *url* has both a scheme and a network location.

    Relative paths, scheme-only URLs (e.g. ``mailto:``) and scheme-less
    ``//host`` references are all rejected.
    """
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)

def is_ai_related(text):
    """Return True when *text* mentions at least one AI-related keyword.

    The text is lower-cased, tokenized with ``word_tokenize`` and stripped of
    English and Dutch stop words. Each keyword is then matched
    case-insensitively as a run of consecutive tokens, so single words
    ('ai'), two-word phrases ('machine learning') and longer phrases
    ('natural language processing') are all detected.

    Args:
        text: Text to inspect (the caller passes article title plus body).

    Returns:
        bool: True if any keyword phrase occurs in the filtered token list,
        False otherwise (including for empty or missing text).
    """
    ai_keywords = [
        'artificial intelligence', 'machine learning', 'deep learning', 'neural network',
        'AI', 'ML', 'NLP', 'computer vision', 'robotics', 'autonomous', 'algorithm',
        'kunstmatige intelligentie', 'KI', 'machinaal leren', 'diep leren', 'neurale netwerken',
        'big data', 'data science', 'predictive analytics', 'natural language processing',
        'reinforcement learning', 'computer vision', 'expert system', 'cognitive computing',
        'machine perception', 'AI ethics', 'AI policy', 'AI regulation', 'AI governance',
        'AI research', 'AI development', 'AI innovation', 'AI technology', 'AI application'
    ]

    if not text:
        return False

    # Keywords as lower-cased token tuples; the set also removes duplicates and
    # gives O(1) membership tests for each candidate n-gram.
    keyword_phrases = {tuple(keyword.lower().split()) for keyword in ai_keywords}
    longest_phrase = max(len(phrase) for phrase in keyword_phrases)

    stop_words = set(stopwords.words('english') + stopwords.words('dutch'))
    tokens = [w for w in word_tokenize(text.lower()) if w not in stop_words]

    # Slide a window of every relevant size over the tokens so that phrases
    # longer than two words are matched as well.
    for size in range(1, longest_phrase + 1):
        for start in range(len(tokens) - size + 1):
            if tuple(tokens[start:start + size]) in keyword_phrases:
                return True
    return False

def is_irrelevant(text):
    """Return True when *text* contains one of the off-topic gadget terms.

    The check is a case-insensitive substring search, so a term also matches
    inside longer words (e.g. 'gadgets').
    """
    lowered = text.lower()
    for term in ('airpods', 'iphone', 'smartphone', 'gadget'):
        if term.lower() in lowered:
            return True
    return False

def scrape_article(url):
    """Download one article and return its newsletter record, or None.

    The article is downloaded, parsed and run through ``nlp()`` before its
    summary and keywords are read. A record is returned only when the title
    plus body text is AI-related and not flagged as irrelevant. Any exception
    is printed and results in None (best-effort scraping).

    Args:
        url: Address of the article to fetch.

    Returns:
        dict | None: Record with the keys 'title', 'url', 'summary',
        'keywords', 'published_date' and 'source', or None when the article
        is filtered out or could not be scraped.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        page.nlp()

        content = page.title + " " + page.text
        # Guard clause: drop anything off-topic before building the record.
        if not is_ai_related(content) or is_irrelevant(content):
            return None

        joined_keywords = ', '.join(page.keywords)
        # Fall back to the current time when no publish date is available.
        published = page.publish_date or datetime.now()
        return {
            'title': page.title,
            'url': url,
            'summary': page.summary,
            'keywords': joined_keywords,
            'published_date': published,
            'source': page.source_url
        }
    except Exception as e:
        print(f"Error scraping article {url}: {str(e)}")
    return None

def scrape_news_site(base_url, ai_section_url=None, max_articles=5):
    """Collect AI-related articles linked from a news site's page.

    The AI section page is fetched when given, otherwise the base URL. Every
    anchor on that page is resolved to an absolute URL, and links that look
    like articles (a /YYYY/MM/DD/ path segment, '/article/' or '/news/') are
    passed to ``scrape_article``.

    Args:
        base_url: Home page of the site; used when no AI section is given and
            in the "unexpected error" message.
        ai_section_url: Optional URL of a dedicated AI section to scrape
            instead of the home page.
        max_articles: Maximum number of articles to collect from this site.

    Returns:
        list[dict]: Article records as returned by ``scrape_article``; empty
        when the page cannot be fetched or nothing matches.
    """
    articles = []
    url_to_scrape = ai_section_url if ai_section_url else base_url
    seen_urls = set()
    try:
        response = requests.get(url_to_scrape, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Relative links must be resolved against the page that was actually
        # fetched (after redirects), not against the site's home page.
        page_url = response.url or url_to_scrape

        for link in soup.find_all('a', href=True):
            if len(articles) >= max_articles:
                break

            try:
                url = urljoin(page_url, link['href'])
                if not is_valid_url(url):
                    continue
                # Links that differ only by #fragment point at the same article.
                dedupe_key = urlparse(url)._replace(fragment='').geturl()
            except ValueError:
                # A malformed href must not abort the whole site.
                continue

            if dedupe_key in seen_urls:
                continue

            if re.search(r'/\d{4}/\d{2}/\d{2}/', url) or '/article/' in url or '/news/' in url:
                seen_urls.add(dedupe_key)
                article = scrape_article(url)
                if article:
                    articles.append(article)
                    print(f"Scraped: {article['title']}")
                # Pause between article requests to avoid hammering the site.
                time.sleep(1)

    except requests.RequestException as e:
        print(f"Error fetching {url_to_scrape}: {str(e)}")
    except Exception as e:
        print(f"Unexpected error scraping {base_url}: {str(e)}")

    return articles

# Websites to scrape. Each entry is a dict with a display 'name' (printed
# while scraping) and the 'url' that is fetched for article links. An optional
# 'ai_section' key, when present, is passed to scrape_news_site as the page to
# fetch instead of 'url'.
websites = [
    {'name': 'Computable', 'url': 'https://www.computable.nl/'},
    {'name': 'Silicon Canals', 'url': 'https://siliconcanals.com/'},
    {'name': 'OfferZen', 'url': 'https://www.offerzen.com/'},
    {'name': 'Ethics and Technology', 'url': 'https://www.ethicsandtechnology.eu/'},
    {'name': 'Holland FinTech', 'url': 'https://www.hollandfintech.com/'},
    {'name': 'NOS', 'url': 'https://nos.nl/'},
    {'name': 'Adyen', 'url': 'https://www.adyen.com/'},
    {'name': 'Devoteam', 'url': 'https://nl.devoteam.com/'},
    {'name': 'Ironhack', 'url': 'https://www.ironhack.com/'},
    {'name': 'Maritime Technology', 'url': 'https://www.maritimetechnology.nl/'},
    {'name': 'Holland High Tech', 'url': 'https://www.hollandhightech.nl/'},
    {'name': 'Techleap', 'url': 'https://www.techleap.nl/'},
    {'name': 'Mikrocentrum', 'url': 'https://www.mikrocentrum.nl/'},
    {'name': 'FME', 'url': 'https://www.fme.nl/'},
    {'name': 'Dutch Tech', 'url': 'https://www.dutch-tech.nl/'},
    {'name': 'Holland Innovative', 'url': 'https://www.holland-innovative.nl/'},
    {'name': 'Prodrive Technologies', 'url': 'https://www.prodrive-technologies.com/'},
    {'name': 'TechBlog', 'url': 'https://techblog.nl/'},
    {'name': 'ALTEN', 'url': 'https://www.alten.nl/'},
    {'name': 'Netherlands Innovation', 'url': 'https://www.netherlandsinnovation.nl/'},
    {'name': 'DTX', 'url': 'https://www.dtx.nl/'},
    {'name': 'DTLS', 'url': 'https://www.dtls.nl/'},
    {'name': 'TNO', 'url': 'https://www.tno.nl/'},
    {'name': 'OpenAI', 'url': 'https://openai.com/'},
    {'name': 'MIT News', 'url': 'https://news.mit.edu/'},
    {'name': 'AI Trends', 'url': 'https://www.aitrends.com/'},
    {'name': 'KDnuggets', 'url': 'https://www.kdnuggets.com/'},
    {'name': 'MIT Technology Review', 'url': 'https://www.technologyreview.com/'},
    {'name': 'ScienceDaily', 'url': 'https://www.sciencedaily.com/'},
    {'name': 'Business.com', 'url': 'https://www.business.com/'},
    {'name': 'Artificial Intelligence News', 'url': 'https://artificialintelligence-news.com/'},
    {'name': 'Analytics Vidhya', 'url': 'https://www.analyticsvidhya.com/'},
    {'name': 'Google AI Blog', 'url': 'https://ai.googleblog.com/'},
    {'name': 'Wired', 'url': 'https://www.wired.com/'},
    {'name': 'AI Weekly', 'url': 'https://aiweekly.co/'},
    {'name': 'AI Valley', 'url': 'https://aivalley.com/'},
    {'name': 'AWS Machine Learning Blog', 'url': 'https://aws.amazon.com/blogs/machine-learning/'},
    {'name': 'DeepMind', 'url': 'https://deepmind.com/'},
    {'name': 'ExtremeTech', 'url': 'https://www.extremetech.com/'},
    {'name': 'Gizmodo', 'url': 'https://gizmodo.com/'},
    {'name': 'IEEE Spectrum', 'url': 'https://spectrum.ieee.org/'},
    {'name': 'MarkTechPost', 'url': 'https://www.marktechpost.com/'},
    {'name': 'Towards Data Science', 'url': 'https://towardsdatascience.com/'},
    {'name': 'VentureBeat', 'url': 'https://venturebeat.com/'},
    {'name': 'AI Tidbits', 'url': 'https://aitidbits.com/'},
    {'name': 'Analytics Insight', 'url': 'https://www.analyticsinsight.net/'},
    {'name': 'Wired UK', 'url': 'https://www.wired.co.uk/'},
    {'name': 'AI Business', 'url': 'https://aibusiness.com/'},
    {'name': 'Berkeley AI Research', 'url': 'https://bair.berkeley.edu/'},
    {'name': 'Medium', 'url': 'https://medium.com/'},
    {'name': 'Great Learning', 'url': 'https://www.greatlearning.com/'},
    {'name': 'NVIDIA', 'url': 'https://www.nvidia.com/'},
    {'name': 'NVIDIA Developer', 'url': 'https://developer.nvidia.com/'},
    {'name': 'TensorFlow', 'url': 'https://www.tensorflow.org/'},
    {'name': 'CMU Machine Learning', 'url': 'https://ml.cmu.edu/'},
    {'name': "There's An AI For That", 'url': 'https://theresanaiforthat.com/'},
    {'name': 'AI Scout', 'url': 'https://aiscout.net/'},
    {'name': 'OFEM Wire', 'url': 'https://ofemwire.com/'},
    {'name': 'DLabs.AI', 'url': 'https://dlabs.ai/'},
    {'name': 'The Gradient', 'url': 'https://thegradient.pub/'},
    {'name': 'Machine Learning Mastery', 'url': 'https://machinelearningmastery.com/'},
    {'name': 'ZDNet', 'url': 'https://www.zdnet.com/'},
    {'name': 'Hackernoon', 'url': 'https://hackernoon.com/'},
    {'name': 'GeekFlare', 'url': 'https://geekflare.com/'},
    {'name': 'AI Magazine', 'url': 'https://aimagazine.com/'},
    {'name': 'DATAVERSITY', 'url': 'https://www.dataversity.net/'},
    {'name': 'Emerj', 'url': 'https://emerj.com/'}
]

def add_website(name, url, ai_section=None):
    """Append a site to the module-level ``websites`` list and report it.

    Args:
        name: Display name of the site.
        url: Home page URL of the site.
        ai_section: Optional URL of the site's AI section; stored under the
            'ai_section' key only when truthy.
    """
    entry = dict(name=name, url=url)
    if ai_section:
        entry.update(ai_section=ai_section)
    websites.append(entry)
    print(f"Toegevoegd: {name}")

def scrape_multiple_sites():
    """Scrape every site in ``websites`` and return all articles as a DataFrame.

    Returns:
        pandas.DataFrame: One row per scraped article with the columns
        'title', 'url', 'summary', 'keywords', 'published_date' and 'source'.
        The columns are present even when no article was scraped, so code that
        indexes them does not fail on an empty result.
    """
    # Same keys, in the same order, as the records built by scrape_article.
    columns = ['title', 'url', 'summary', 'keywords', 'published_date', 'source']

    all_articles = []
    for site in websites:
        print(f"Scraping {site['name']}...")
        all_articles.extend(scrape_news_site(site['url'], site.get('ai_section')))

    return pd.DataFrame(all_articles, columns=columns)

def standardize_dates(df):
    """Convert the 'published_date' column to timezone-naive UTC datetimes.

    Timezone-aware values are converted to UTC and stripped of their tzinfo,
    naive values are kept unchanged, and missing values (None, NaN, NaT)
    become ``pd.NaT``. The column is rewritten in place on *df*.

    Args:
        df: DataFrame of articles; a frame without a 'published_date' column
            is returned untouched.

    Returns:
        pandas.DataFrame: The same DataFrame object that was passed in.
    """
    from datetime import timezone

    def to_naive_utc(dt):
        # pd.isna also catches None/NaN, which an identity test against
        # pd.NaT would let through to the attribute access below.
        if pd.isna(dt):
            return pd.NaT
        if dt.tzinfo is None:
            return dt
        return dt.astimezone(timezone.utc).replace(tzinfo=None)

    if 'published_date' in df.columns:
        df['published_date'] = df['published_date'].apply(to_naive_utc)
    return df

def generate_newsletter(df):
    """Render the scraped articles as a standalone HTML newsletter.

    Articles are sorted newest first after their dates have been normalized
    by ``standardize_dates``. All scraped text (title, URL, source, summary,
    keywords) is HTML-escaped before being inserted, because it comes from
    external websites.

    Args:
        df: DataFrame with the columns 'title', 'url', 'summary', 'keywords',
            'published_date' and 'source'. An empty DataFrame yields a
            newsletter without articles.

    Returns:
        str: The complete HTML document.
    """
    from datetime import timezone
    from html import escape

    # An empty frame may have no columns at all; skip date handling/sorting.
    if not df.empty:
        df = standardize_dates(df)
        df = df.sort_values('published_date', ascending=False)

    # The header labels this timestamp as UTC, so take it in UTC.
    generated_at = datetime.now(timezone.utc).strftime('%d-%m-%Y %H:%M')

    parts = [f"""
    <html>
    <head>
        <style>
            body {{ font-family: Arial, sans-serif; line-height: 1.6; color: #333; max-width: 800px; margin: 0 auto; padding: 20px; }}
            h1 {{ color: #2c3e50; text-align: center; }}
            h2 {{ color: #3498db; }}
            .article {{ margin-bottom: 30px; border-bottom: 1px solid #eee; padding-bottom: 20px; }}
            .article h3 {{ margin-bottom: 5px; }}
            .article p {{ margin-top: 5px; }}
            .meta {{ font-size: 0.8em; color: #7f8c8d; }}
            a {{ color: #2980b9; text-decoration: none; }}
            a:hover {{ text-decoration: underline; }}
        </style>
    </head>
    <body>
        <h1>AI News Nieuwsbrief</h1>
        <p style="text-align: center;">Gegenereerd op {generated_at} UTC</p>
    """]

    for _, row in df.iterrows():
        published_date = row['published_date'].strftime('%d-%m-%Y') if pd.notna(row['published_date']) else 'Onbekende datum'
        # escape() also quotes " and ', which keeps the href attribute intact.
        url = escape(str(row['url']))
        title = escape(str(row['title']))
        source = escape(str(row['source']))
        summary = escape(str(row['summary']))
        keywords = escape(str(row['keywords']))
        parts.append(f"""
        <div class="article">
            <h2><a href="{url}">{title}</a></h2>
            <p class="meta">Gepubliceerd op: {published_date} | Bron: {source}</p>
            <p>{summary}</p>
            <p><strong>Keywords:</strong> {keywords}</p>
        </div>
        """)

    parts.append("""
    </body>
    </html>
    """)

    # Join once instead of growing the string with += per article.
    return ''.join(parts)

def display_newsletter(df):
    """Render the newsletter for *df* inline in the notebook output."""
    # Imported here so IPython is only required when displaying.
    from IPython.display import HTML, display
    display(HTML(generate_newsletter(df)))

def save_newsletter(df, filename='ai_news_newsletter.html'):
    """Write the newsletter for *df* to *filename* as UTF-8 HTML.

    Args:
        df: DataFrame of articles passed on to ``generate_newsletter``.
        filename: Path of the HTML file to create or overwrite.
    """
    # Render first so a failure while generating never truncates the file.
    rendered = generate_newsletter(df)
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.write(rendered)
    print(f"Nieuwsbrief opgeslagen als {filename}")

# Scrape the websites and generate the newsletter: collect the articles once,
# show the rendered HTML in the notebook, then write it to the default file.
news_df = scrape_multiple_sites()
display_newsletter(news_df)
save_newsletter(news_df)

Collecting newspaper3k
  Using cached newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
C

Building prefix dict from /usr/local/lib/python3.10/dist-packages/jieba/dict.txt ...
DEBUG:jieba:Building prefix dict from /usr/local/lib/python3.10/dist-packages/jieba/dict.txt ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 3.2614612579345703 seconds.
DEBUG:jieba:Loading model cost 3.2614612579345703 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


Scraped: NVIDIA 在 Microsoft Azure 上為全球企業和新創公司推出生成式人工智慧代工服務
Scraping NVIDIA Developer...
Scraping TensorFlow...
Scraping CMU Machine Learning...
Error fetching https://ml.cmu.edu/: HTTPSConnectionPool(host='ml.cmu.edu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'ml.cmu.edu'. (_ssl.c:1007)")))
Scraping There's An AI For That...
Error fetching https://theresanaiforthat.com/: 403 Client Error: Forbidden for url: https://theresanaiforthat.com/
Scraping AI Scout...
Scraping OFEM Wire...
Scraping DLabs.AI...
Scraping The Gradient...
Scraping Machine Learning Mastery...
Error fetching https://machinelearningmastery.com/: 403 Client Error: Forbidden for url: https://machinelearningmastery.com/
Scraping ZDNet...
Scraped: What is ChatGPT? The world's most popular AI chatbot explained
Scraped: The best AI image generators of 2024: Tested and

Nieuwsbrief opgeslagen als ai_news_newsletter.html


In [None]:
# Add this import at the beginning of your code
from IPython.display import HTML, display

# Modify the display_newsletter function as follows
def display_newsletter(df):
    """Render the newsletter for *df* inline in the notebook output."""
    display(HTML(generate_newsletter(df)))

# Scrape all configured sites again and show the resulting newsletter inline
# (this cell displays only; it does not save the HTML to a file).
news_df = scrape_multiple_sites()
display_newsletter(news_df)

Scraping Computable...
Scraped: Boek: ‘Universum te Koop’
Scraped: Is gen-ai de nieuwe citizen developer?
Scraped: UWV ontkent inzet ai bij oplossen WIA-problemen
Scraped: Atos geeft olympisch stokje door aan Deloitte
Scraped: Denken Europese organisaties (onter-)echt dat ze tijdig NIS2-compliant zijn?
Scraping Silicon Canals...
Scraped: The latest European technology news about Artificial Intelligence (AI)
Scraped: The latest European technology news about Deeptech
