In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Request headers sent with every fetch so the scraper presents itself like a
# desktop Chrome browser arriving from a Google search.
_USER_AGENT_PARTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'AppleWebKit/537.36 (KHTML, like Gecko)',
    'Chrome/112.0.0.0 Safari/537.36',
)
headers = {
    'User-Agent': ' '.join(_USER_AGENT_PARTS),
    'Referer': 'https://www.google.com/',
}

# Target websites to scrape – Ukrainian government, allied governments, defense
# agencies, think tanks, news outlets, and regional initiatives.
# Entries are visited in list order. Several are marked "(if available)":
# those hosts were not verified and may not resolve.
# NOTE(review): a few entries use plain http:// rather than https:// — confirm
# whether that is intentional.
urls = [
    # Ukrainian government sites:
    "https://www.president.gov.ua/en",                              # President of Ukraine
    "https://www.president.gov.ua/en/news/speeches",                  # Speeches
    "https://www.mil.gov.ua/en",                                   # Ministry of Defense of Ukraine
    "https://www.mil.gov.ua/en/news/",                             # News - Ministry of Defense
    "https://mspu.gov.ua/en",                                      # Ministry of Strategic Industries of Ukraine
    "https://mspu.gov.ua/en/news/",                                # News - Ministry of Strategic Industries
    "https://www.kmu.gov.ua/en",                                   # Cabinet of Ministers of Ukraine
    "https://www.kmu.gov.ua/en/news",                              # News - Cabinet of Ministers
    "https://mfa.gov.ua/en",                                       # Ministry of Foreign Affairs of Ukraine
    "https://mfa.gov.ua/en/press-center/news",                     # Press center news
    "http://www.nsdc.gov.ua",                                      # National Security and Defense Council of Ukraine (if available)

    # Allied governments and defense agencies:
    "https://www.nato.int",                                        # NATO
    "https://europa.eu",                                           # European Union
    "https://www.defense.gov",                                     # U.S. Department of Defense
    "https://www.state.gov",                                       # U.S. Department of State
    "https://www.gov.uk/government/organisations/ministry-of-defence", # U.K. Ministry of Defence
    "https://www.bmvg.de/en",                                      # German Federal Ministry of Defence
    "https://www.defense.gouv.fr",                                 # French Ministry of Armed Forces
    "http://www.difesa.it/EN",                                     # Italian Ministry of Defence
    "https://www.gov.pl/web/national-defence",                     # Polish Ministry of National Defence
    "https://www.canada.ca/en/department-national-defence.html",   # Canadian Department of National Defence
    "https://www.defence.gov.au",                                  # Australian Department of Defence
    "https://www.bundeswehr.de/en",                                # Bundeswehr (alternative German site)

    # Regional groupings & initiatives:
    "https://www.visegradgroup.eu",                                # Visegrad Group
    "https://three-seas.eu",                                       # Three Seas Initiative
    "https://bucharest9.org",                                      # Bucharest Nine (if available)
    "https://nordic-baltic8.org",                                  # Nordic-Baltic Eight (if available)
    "https://lublintriangle.eu",                                   # Lublin Triangle (if available)
    "https://www.cei.int",                                         # Central European Initiative
    "https://ukpolukraine.org",                                    # UK-Poland-Ukraine Trilateral Initiative (if available)

    # Think tanks and research institutes:
    "https://www.csis.org",                                        # Center for Strategic and International Studies
    "https://www.rand.org",                                        # RAND Corporation
    "https://www.atlanticcouncil.org",                             # Atlantic Council
    "https://www.chathamhouse.org",                                # Chatham House
    "https://www.iiss.org",                                        # International Institute for Strategic Studies
    "https://www.sipri.org",                                       # Stockholm International Peace Research Institute
    "https://www.cnas.org",                                        # Center for a New American Security
    "https://www.ecfr.eu",                                         # European Council on Foreign Relations
    "https://carnegieeurope.eu",                                   # Carnegie Europe
    "https://www.epc.eu",                                          # European Policy Centre
    "https://www.cepa.org",                                        # Center for European Policy Analysis
    "https://www.brookings.edu",                                   # Brookings Institution
    "https://www.hoover.org",                                      # Hoover Institution
    "http://www.cssu.org.ua",                                      # Center for Strategic Studies, Ukraine (if available)
    "https://www.uifuture.org",                                    # Ukrainian Institute for the Future
    "https://razumkov.org.ua",                                     # Razumkov Centre
    "https://uacrisis.org",                                        # Ukrainian Crisis Media Center
    "https://www.kyivsecurityforum.com",                           # Kyiv Security Forum
    "http://www.uiir.org.ua",                                      # Ukrainian Institute of International Relations (if available)
    "https://www.iss.europa.eu",                                   # EU Institute for Security Studies

    # Additional U.S. related sites:
    "https://www.whitehouse.gov/nsc",                              # U.S. National Security Council
    "https://crsreports.congress.gov",                             # Congressional Research Service

    # Additional U.K. sites:
    "https://www.da.mod.uk",                                       # U.K. Defence Academy
    "https://committees.parliament.uk/committee/117/defence-committee",  # U.K. Defence Committee

    # Additional think tanks and research:
    "https://www.gmfus.org",                                       # German Marshall Fund of the United States
    "https://www.esiweb.org",                                      # European Stability Initiative

    # News and analysis outlets (European press and defence trade media):
    "https://www.brusselstimes.com",                               # Brussels Times
    "https://www.euractiv.com",                                    # Euractiv
    "https://www.politico.eu",                                     # Politico Europe
    "https://www.defensenews.com",                                 # Defense News
    "https://www.janes.com",                                       # Janes (publisher of Jane's Defence Weekly)
    "https://www.militarytimes.com",                              # Military Times
    "https://www.nationaldefensemagazine.org",                     # National Defense Magazine
    "https://defence-blog.com",                                    # Defence Blog
    "https://www.firepowermagazine.com",                           # Firepower Magazine

    # Additional defense and security organizations (OSCE and U.S. commands):
    "https://www.osce.org",                                        # OSCE
    "https://www.eucom.mil",                                       # U.S. European Command
    "https://www.cybercom.mil",                                    # U.S. Cyber Command
    "https://www.soc.mil",                                         # U.S. Special Operations Command

    # Additional allied national sites:
    "https://www.defensa.gob.es",                                  # Spanish Ministry of Defence
    "https://www.defensie.nl",                                     # Dutch Ministry of Defence
    "https://www.forsvarsmakten.se/en",                            # Swedish defence site — NOTE(review): hostname suggests the Armed Forces (Försvarsmakten), not the ministry; confirm
    "https://www.regjeringen.no/en/dep/fd/id504/",                  # Norwegian Ministry of Defence
    "https://www.defmin.fi/en/frontpage",                          # Finnish Ministry of Defence
    "https://www.mil.be",                                          # Belgian Defence
    "https://www.fmn.dk/en",                                        # Danish Defence
    "https://www.bundesheer.at",                                   # Austrian Federal Ministry of Defence
    "https://www.vbs.admin.ch/en",                                 # Swiss Federal Department of Defence, Civil Protection and Sport
    "https://www.defesa.gov.pt",                                   # Portuguese Ministry of National Defence
    "https://www.ssb.gov.tr",                                      # Turkish defence site — NOTE(review): ssb.gov.tr may be the defence industry agency rather than the ministry; confirm
    "https://www.mod.mil.gr",                                      # Greek Ministry of National Defence
    "https://www.army.cz/en",                                      # Czech Ministry of Defence
    "https://www.mosr.sk/en",                                      # Slovak Ministry of Defence
    "https://www.kormany.hu/en/ministry-of-defence",               # Hungarian Ministry of Defence
    "http://www.mod.bg/en",                                        # Bulgarian Ministry of Defence
    "https://www.mapn.ro/en",                                      # Romanian Ministry of National Defence

    # Baltic states and Iceland:
    "https://kam.lt/en",                                           # Lithuanian Ministry of National Defence
    "https://www.mod.gov.lv/en",                                   # Latvian Ministry of Defence
    "https://www.kaitseministeerium.ee/en",                        # Estonian Ministry of Defence
    "https://www.government.is",                                   # Icelandic Government (for foreign affairs)

    # Additional global allied sites:
    "https://www.defence.govt.nz",                                 # New Zealand Ministry of Defence
    "https://www.mindef.gov.sg",                                   # Singapore Ministry of Defence
    "https://www.mod.go.jp/e",                                     # Japan Ministry of Defence
    "https://www.mnd.go.kr/eng",                                   # South Korean Ministry of National Defense
    "https://www.mod.gov.in",                                      # Indian Ministry of Defence
    "https://www.mod.gov.il/English/Pages/default.aspx"           # Israel Ministry of Defence
]

def fetch_page(url, timeout=(10, 30)):
    """Download *url* and return the raw response body.

    The request is sent with the module-level ``headers``. Any
    ``requests.RequestException`` (connection failure, timeout, non-2xx
    status raised by ``raise_for_status``) is reported on stdout and
    swallowed so that one bad site does not abort the whole crawl.

    Args:
        url: Address of the page to fetch.
        timeout: Passed straight to ``requests.get``; either a single number
            of seconds or a ``(connect, read)`` tuple. Without a timeout
            ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None
    return response.content

def extract_content(html):
    """Pull the readable text out of an HTML document.

    Text is taken from every ``<article>`` element. Only when the page has
    no ``<article>`` at all does it fall back to ``<div class="content">``
    elements. Each extracted element is followed by a blank line, a row of
    80 ``=`` characters and another blank line.

    Returns:
        The concatenated text, or an empty string if neither kind of
        element is present.
    """
    divider = '\n\n' + '=' * 80 + '\n\n'
    parsed = BeautifulSoup(html, 'html.parser')

    def _render(nodes):
        # One text chunk per node, each terminated by the divider.
        return ''.join(
            node.get_text(separator='\n', strip=True) + divider
            for node in nodes
        )

    article_text = _render(parsed.find_all('article'))
    if article_text:
        return article_text
    # Fallback: no <article> tags were found, try generic content containers.
    return _render(parsed.find_all('div', class_='content'))

# Visit every target in list order and keep the extracted text of each page
# that could be fetched; pages that failed to download are skipped.
chunks = []
for url in urls:
    print(f"Processing {url}")
    html = fetch_page(url)
    if html:
        chunks.append(extract_content(html))
    # Pause between requests to avoid overwhelming the servers
    time.sleep(1)
final_content = "".join(chunks)

# Persist everything that was collected into a single UTF-8 text file.
with open('ukraine_defense_materials.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(final_content)

print("Scraping complete. Data saved to 'ukraine_defense_materials.txt'")


Processing https://www.president.gov.ua/en
Request failed for https://www.president.gov.ua/en: 403 Client Error: Forbidden for url: https://www.president.gov.ua/en
Processing https://www.president.gov.ua/en/news/speeches
Request failed for https://www.president.gov.ua/en/news/speeches: 403 Client Error: Forbidden for url: https://www.president.gov.ua/en/news/speeches
Processing https://www.mil.gov.ua/en
Request failed for https://www.mil.gov.ua/en: 403 Client Error: Forbidden for url: https://www.mil.gov.ua/en
Processing https://www.mil.gov.ua/en/news/
Request failed for https://www.mil.gov.ua/en/news/: 403 Client Error: Forbidden for url: https://www.mil.gov.ua/en/news/
Processing https://mspu.gov.ua/en
Processing https://mspu.gov.ua/en/news/
Processing https://www.kmu.gov.ua/en
Request failed for https://www.kmu.gov.ua/en: HTTPSConnectionPool(host='www.kmu.gov.ua', port=443): Max retries exceeded with url: /en (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object 