In [None]:
# prompt: Scrape the web for statements and speeches regarding Joint Expeditionary Force activities in the Arctic...include quotations from secondary sources as well

import requests
from bs4 import BeautifulSoup
import re

def scrape_arctic_statements(keywords):
  """Scrapes web pages for statements and speeches related to the Joint Expeditionary Force (JEF)
  and Arctic activities, including quotations from secondary sources.

  Every page in the built-in source list is downloaded once. For each keyword, every anchor whose
  text matches the keyword (used as a case-insensitive regular expression) is followed, and the
  linked page's title and full text are recorded. Request failures are printed and skipped; they
  never abort the run.

  Args:
    keywords: A list of keywords to search for (e.g., ['Joint Expeditionary Force', 'Arctic', 'NATO']).

  Returns:
    A list of dictionaries, where each dictionary represents a found statement/speech and
    contains the keys "title", "source", "url" and "text".
  """
  # Imported locally so the function does not depend on the cell's import list.
  from urllib.parse import urldefrag, urljoin

  # Seconds to wait for a server; without a timeout an unresponsive host blocks the cell forever.
  request_timeout = 10

  statements = []
  # Add more relevant news sources and websites here.
  sources = [
      "https://www.nato.int/cps/en/natohq/news_190311.htm", # NATO Website
      "https://www.arctic-council.org/en/", # Arctic Council
      "https://www.gov.uk/government/publications", # UK government publications
      "https://www.mil.dk/en/", # Danish military
      "https://www.forsvaret.no/en/", # Norwegian military
      "https://www.mil.ca/", # Canadian military
      "https://www.althingi.is/", # Icelandic Parliament
      "https://www.gov.ie/", # Irish Government
      "https://www.defensie.nl/en/", # Netherlands defense
  ]

  for source in sources:
    try:
      response = requests.get(source, timeout=request_timeout)
      response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
      print(f"Error fetching source {source}: {e}")
      continue

    soup = BeautifulSoup(response.content, "html.parser")
    # A link whose text matches several keywords must only be fetched once per source.
    seen_urls = set()
    for keyword in keywords:
      pattern = re.compile(keyword, re.IGNORECASE)
      # `string=` is the current name of the deprecated `text=` argument.
      for link in soup.find_all("a", href=True, string=pattern):
        # Resolve site-relative hrefs (e.g. "/en/news") against the page they came from and
        # drop any "#fragment" so the same document is not treated as several URLs.
        article_url = urldefrag(urljoin(source, link["href"])).url
        if not article_url.startswith(("http://", "https://")):
          continue  # mailto:, javascript:, tel: ... cannot be fetched
        if article_url in seen_urls:
          continue
        seen_urls.add(article_url)

        try:
          article_response = requests.get(article_url, timeout=request_timeout)
          article_response.raise_for_status()
        except requests.exceptions.RequestException as e:
          print(f"Error fetching article from {article_url}: {e}")
          continue

        article_soup = BeautifulSoup(article_response.content, "html.parser")
        title_tag = article_soup.title
        # `.string` is None for an empty <title> or one with nested markup.
        title = title_tag.string if title_tag and title_tag.string else "No Title"

        statements.append({
            "title": title,
            "source": source,
            "url": article_url,
            "text": article_soup.get_text()
        })

  return statements


if __name__ == "__main__":
  # Terms matched against the link text of each source page.
  keywords = [
      "Joint Expeditionary Force",
      "Arctic",
      "NATO",
      "defence",
      "security",
      "operations",
  ]

  found_statements = scrape_arctic_statements(keywords)

  separator = "-" * 20
  for statement in found_statements:
    # Only the first 200 characters are previewed; the full 'text' stays available
    # for further processing such as extracting quotes and specific information.
    print(
        separator,
        f"Title: {statement['title']}",
        f"Source: {statement['source']}",
        f"URL: {statement['url']}",
        f"Text Snippet: {statement['text'][:200]}...",
        sep="\n",
    )


Error fetching source https://www.nato.int/cps/en/natohq/news_190311.htm: 403 Client Error: Forbidden for url: https://www.nato.int/cps/en/natohq/news_190311.htm
Error fetching source https://www.arctic-council.org/en/: 403 Client Error: Forbidden for url: https://www.arctic-council.org/en/


  for link in soup.find_all("a", href=True, text=re.compile(keyword, re.IGNORECASE)):


Error fetching source https://www.mil.dk/en/: HTTPSConnectionPool(host='www.mil.dk', port=443): Max retries exceeded with url: /en/ (Caused by SSLError(SSLError(1, '[SSL: TLSV1_UNRECOGNIZED_NAME] tlsv1 unrecognized name (_ssl.c:1006)')))
Error fetching article from /en/exercises-and-operations: Invalid URL '/en/exercises-and-operations': No scheme supplied. Perhaps you meant https:///en/exercises-and-operations?
Error fetching article from /en/exercises-and-operations: Invalid URL '/en/exercises-and-operations': No scheme supplied. Perhaps you meant https:///en/exercises-and-operations?
Error fetching source https://www.mil.ca/: HTTPSConnectionPool(host='www.mil.ca', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f566c6ba050>: Failed to resolve 'www.mil.ca' ([Errno -2] Name or service not known)"))
Error fetching source https://www.althingi.is/: 403 Client Error: Forbidden for url: https://www.althingi.is/
Er

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re

# Request headers that make the scraper present itself as a desktop browser.
headers = {
    'User-Agent': ' '.join([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/112.0.0.0 Safari/537.36',
    ]),
    'Referer': 'https://www.google.com/',
}

# Alliance websites with pages that are likely to list news or statements.
# The paths are best guesses and may need updating to point at the real listing pages.
urls = [
    # Bucharest Nine news/statements page
    "https://bucharest9.org/en/news",
    # Joint Expeditionary Force (JEF) news
    "https://www.jef.int/en/news",
    # Visegrad Group news/statements
    "https://www.visegradgroup.eu/en/news",
    # NORDEFCO news/statements (if available)
    "https://nordefco.org/news",
    # Additional similar alliances can be added below.
    # Three Seas Initiative news
    "https://three-seas.eu/en/news",
    # Lublin Triangle (if available)
    "https://lublintriangle.eu/en/news",
    # UK-Poland-Ukraine Trilateral Initiative (if available)
    "https://ukpolukraine.org/en/news",
]

def fetch_page(url):
    """Download `url` and return the raw response body.

    The request is sent with the module-level `headers` and a 10-second timeout.
    Any `requests.RequestException` (connection error, timeout, HTTP error status)
    is reported on stdout and turned into a `None` return value.
    """
    try:
        reply = requests.get(url, timeout=10, headers=headers)
        reply.raise_for_status()
        body = reply.content
    except requests.RequestException as err:
        print(f"Request failed for {url}: {err}")
        body = None
    return body

def extract_recent_statements(html):
    """Extract the text of news/post/statement blocks from an HTML page.

    Every <article> or <div> whose CSS class contains "news", "post" or "statement"
    (case-insensitive) contributes its text, followed by a line of 80 "=" characters.
    A matching element nested inside another matching element is skipped, because its
    text is already part of the enclosing block. If no block yields any text, the text
    of the page's <main> element is returned instead.

    Despite the name, no date filtering is applied: all matching blocks are returned.

    Args:
        html: Markup of the page as str or bytes. Empty or None input yields "".

    Returns:
        The collected text as a single string; "" when nothing was found.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')
    class_pattern = re.compile(r'(news|post|statement)', re.IGNORECASE)
    separator = "\n\n" + "=" * 80 + "\n\n"

    # Matched elements are tracked by identity, not by value: two distinct tags with
    # the same markup must still be treated as two different blocks.
    matched_ids = set()
    sections = []

    # Look for article or post blocks; this selector might need customization per site.
    for article in soup.find_all(['article', 'div'],
                                 class_=lambda c: c and class_pattern.search(c)):
        # find_all yields elements in document order, so an enclosing match is always
        # seen before the matches nested inside it.
        if any(id(ancestor) in matched_ids for ancestor in article.parents):
            continue
        matched_ids.add(id(article))

        # TODO: to honour "recent", read the block's <time datetime="..."> here and
        # skip blocks older than a chosen threshold.
        text = article.get_text(separator='\n', strip=True)
        if text:
            sections.append(text + separator)

    content = "".join(sections)

    # Fallback: if no structured articles were found, grab main content from a common container.
    if not content:
        main_content = soup.find('main')
        if main_content:
            content = main_content.get_text(separator='\n', strip=True)
    return content

# Scrape every configured site in turn and collect one labelled section per site
# that produced any text.
sections = []

for url in urls:
    print(f"Processing {url}")
    html = fetch_page(url)
    extracted = extract_recent_statements(html) if html else None
    if extracted:
        sections.append(f"--- Content from {url} ---\n\n" + extracted + "\n")
    # Pause between sites so the servers are not overwhelmed.
    time.sleep(1)

final_content = "".join(sections)

# Persist everything that was collected as a single UTF-8 text file.
output_filename = "small_scale_alliance_statements.txt"
with open(output_filename, 'w', encoding='utf-8') as out_file:
    out_file.write(final_content)

print(f"Scraping complete. Data saved to '{output_filename}'")


Processing https://bucharest9.org/en/news
Request failed for https://bucharest9.org/en/news: HTTPSConnectionPool(host='bucharest9.org', port=443): Max retries exceeded with url: /en/news (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f566c7ed190>: Failed to resolve 'bucharest9.org' ([Errno -2] Name or service not known)"))
Processing https://www.jef.int/en/news
Request failed for https://www.jef.int/en/news: HTTPSConnectionPool(host='www.jef.int', port=443): Max retries exceeded with url: /en/news (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f566c6bdbd0>: Failed to resolve 'www.jef.int' ([Errno -2] Name or service not known)"))
Processing https://www.visegradgroup.eu/en/news
Request failed for https://www.visegradgroup.eu/en/news: 404 Client Error: Not Found for url: https://www.visegradgroup.eu/en/news
Processing https://nordefco.org/news
Request failed for https://nordefco.org/news: HTTPSConnectionPool(host='nordef