In [13]:
import pandas as pd, numpy as np, re, os
from urllib.request import urlopen
from urllib.request import Request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import html5lib
import time
import random
from io import BytesIO
import zlib
import gzip

In [None]:
# def get_page(url):
#     """Scrapes a URL and returns the HTML source.
    
#     Args:
#         url (string): Fully qualified URL of a page.
    
#     Returns:
#         soup (string): HTML source of scraped page.
#     """

#     response = urllib.request.urlopen(urllib.request.Request(url, 
#                                                              headers={'User-Agent': 'Mozilla'}))
#     soup = BeautifulSoup(response, 
#                          'html.parser', 
#                          from_encoding=response.info().get_param('charset'))
    
#     return soup

In [None]:
# Root URL of the SOF-ATL section of socom.mil.
# NOTE(review): `page` is not referenced by any cell visible below; the crawl
# cells define their own `base_url` with the same value.
page = "https://www.socom.mil/SOF-ATL/"

In [None]:
# Look for a robots.txt under the SOF-ATL path.
# Earlier finding: no robots.txt page present (SOCOM.MIL has one though).
# The request is made with `requests` directly because the `get_page` helper
# above exists only as commented-out code, so calling it raises NameError on a
# fresh kernel.
robots_response = requests.get(
    "https://www.socom.mil/SOF-ATL/robots.txt",
    headers={'User-Agent': 'Mozilla'},
    timeout=30,  # never let an exploratory cell hang forever
)
# Keep the body only for a successful response; None means "nothing served".
robots = robots_response.text if robots_response.status_code == 200 else None
robots

In [2]:
def fetch_page(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url (str): Fully qualified URL to request.
        timeout (float): Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host,
            which would stall the whole crawl.

    Returns:
        str or None: The decoded response body when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        itself fails (connection error, timeout, invalid URL, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable page must not abort the crawl; report it and move on.
        print(f"Request failed for {url}: {e}")
        return None

    if response.status_code != 200:
        return None

    # Decode with the encoding detected from the body itself.
    response.encoding = response.apparent_encoding  # Ensures correct encoding interpretation
    return response.text

def parse_links(html, base_url):
    """Yield every link in ``html`` that stays inside the crawl root.

    Args:
        html (str): HTML source to scan for ``<a href=...>`` tags.
        base_url (str): Crawl root; each ``href`` is resolved against it and
            kept only when the resulting URL starts with it.

    Yields:
        str: Absolute URL of each matching link, in document order.
        Duplicates are not removed.

    Any exception raised while parsing is printed and ends the iteration.
    """
    try:
        anchors = BeautifulSoup(html, 'html5lib').find_all('a', href=True)
        for anchor in anchors:
            target = urljoin(base_url, anchor['href'])
            if not target.startswith(base_url):
                # Points outside the crawl root: ignore it.
                continue
            yield target
    except Exception as e:
        print(f"Error parsing HTML: {e}")


def crawl_website(base_url, delay_range=(0.5, 2.0)):
    """Crawl every page reachable from ``base_url`` that stays under it.

    Args:
        base_url (str): Start page; also passed to ``parse_links`` as the
            crawl root.
        delay_range (tuple[float, float]): Lower and upper bound, in seconds,
            of the random pause inserted between two consecutive requests.

    Returns:
        set[str]: Every URL that was requested, whether or not the fetch
        succeeded.
    """
    visited = set()
    pages_to_visit = {base_url}

    while pages_to_visit:
        current_page = pages_to_visit.pop()
        if current_page in visited:
            continue
        visited.add(current_page)

        html_content = fetch_page(current_page)
        if html_content:
            for url in parse_links(html_content, base_url):
                # Nearly every page links back to pages already crawled;
                # queueing them again only produces no-op iterations.
                if url not in visited:
                    pages_to_visit.add(url)
        print(f"Visited: {current_page}")

        # Throttle only between real requests: no pause for skipped URLs and
        # none after the final page.
        if pages_to_visit:
            time.sleep(random.uniform(*delay_range))
    return visited

In [3]:
# Crawl the SOF-ATL section starting from its root page.
# crawl_website follows only links that start with base_url and returns the
# set of URLs it visited.

base_url = 'https://www.socom.mil/SOF-ATL/'
all_pages = crawl_website(base_url)

Visited: https://www.socom.mil/SOF-ATL/
Visited: https://www.socom.mil/SOF-ATL/Pages/cto.aspx
Visited: https://www.socom.mil/SOF-ATL/Pages/SOF_ATL_Lines_of_Effort.aspx
Visited: https://www.socom.mil/SOF-ATL/Pages/SOF-Hard-Problems.aspx
Visited: https://www.socom.mil/SOF-ATL/SOFHardProblemsDocumentLibrary/USSOCOM%20ST%20Mission%20Assured%20Communications%20-%20MAC%20MegaTalker-Apr23.docx
Visited: https://www.socom.mil/SOF-ATL/Pages/eSOF-main.aspx
Visited: https://www.socom.mil/SOF-ATL/Pages/Defense-Industrial-Base-Cybersecurity.aspx
Visited: https://www.socom.mil/SOF-ATL/Pages/programs.aspx
Visited: https://www.socom.mil/SOF-ATL/PublishingImages/ViperStrike.jpg
Visited: https://www.socom.mil/SOF-ATL/Pages/eSOF-submit-your-idea-form-instruct.aspx
Visited: https://www.socom.mil/SOF-ATL/Pages/eSOF-process.aspx
Visited: https://www.socom.mil/SOF-ATL/SOFHardProblemsDocumentLibrary/USSOCOM%20ST%20Signature%20Management%20-%20SIGMAN%20MegaTalker-Apr23.docx
Visited: https://www.socom.mil/SOF-AT

In [5]:
# Adding function to save the content from URLs

def fetch_page(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Args:
        url (str): Fully qualified URL to request.
        timeout (float): Maximum number of seconds to wait for the server;
            prevents a single unresponsive host from hanging the crawl.

    Returns:
        str or None: Body text for an HTTP 200 response. ``None`` when the
        server returns another status code or the request raises a
        ``requests.RequestException`` (which is printed, not propagated).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Treat a network failure like any other unfetchable page.
        print(f"Request failed for {url}: {e}")
        return None

    if response.status_code == 200:
        response.encoding = response.apparent_encoding  # Ensures correct encoding interpretation
        return response.text
    return None

def parse_links(html, base_url):
    """Generate the links in ``html`` that live under ``base_url``.

    Args:
        html (str): HTML source of a page.
        base_url (str): Crawl root, used both to resolve relative ``href``
            values and as the prefix a URL must have to be yielded.

    Yields:
        str: Absolute internal URLs in document order (duplicates included).

    A parsing failure is printed and terminates the generator quietly.
    """
    try:
        document = BeautifulSoup(html, 'lxml')
        resolved = (
            urljoin(base_url, tag['href'])
            for tag in document.find_all('a', href=True)
        )
        # Anything that does not start with the crawl root is external.
        yield from (candidate for candidate in resolved if candidate.startswith(base_url))
    except Exception as e:
        print(f"Error parsing HTML: {e}")

def save_content(html, url, base_dir="./website_data"):
    """Write a page's text to ``base_dir`` under a name derived from its URL.

    The file name is the URL with the ``https://`` / ``http://`` scheme
    removed, every ``/`` turned into ``_`` and ``.html`` appended.

    Args:
        html (str): Text to store.
        url (str): URL the text was fetched from.
        base_dir (str): Output directory; created when missing.
    """
    stem = url
    for token, substitute in (("https://", ""), ("http://", ""), ("/", "_")):
        stem = stem.replace(token, substitute)

    os.makedirs(base_dir, exist_ok=True)
    target = os.path.join(base_dir, stem + ".html")
    with open(target, 'w', encoding='utf-8') as handle:
        handle.write(html)

def crawl_website(base_url):
    """Crawl from ``base_url`` and save every page that could be fetched.

    Args:
        base_url (str): Start URL; also handed to ``parse_links`` as the
            crawl root.

    Returns:
        set[str]: All URLs that were requested, including those whose fetch
        returned nothing.
    """
    seen = set()
    frontier = {base_url}

    while frontier:
        current_page = frontier.pop()
        if current_page in seen:
            # Already handled through another link.
            continue
        seen.add(current_page)

        body = fetch_page(current_page)
        if body:
            save_content(body, current_page)
            frontier.update(parse_links(body, base_url))
            print(f"Visited and saved: {current_page}")

        # Random 0.5-2 s pause after each requested page.
        time.sleep(random.uniform(0.5, 2.0))
    return seen

In [6]:
# Second crawl of the same start URL, this time with the helpers redefined in
# the previous cell, whose crawl_website also saves each fetched page via
# save_content (default directory ./website_data).

base_url = 'https://www.socom.mil/SOF-ATL/'
all_pages = crawl_website(base_url)

Visited and saved: https://www.socom.mil/SOF-ATL/
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/cto.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/SOF_ATL_Lines_of_Effort.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/SOF-Hard-Problems.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/SOFHardProblemsDocumentLibrary/USSOCOM%20ST%20Mission%20Assured%20Communications%20-%20MAC%20MegaTalker-Apr23.docx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-main.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/Defense-Industrial-Base-Cybersecurity.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/programs.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-submit-your-idea-form-instruct.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-process.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/SOFHardProblemsDocumentLibrary/USSOCOM%20ST%20Signature%20Management%20-%20SIGMAN%20MegaTalker-Apr23.docx

In [8]:
# Trying again

def fetch_page(url, timeout=30):
    """Fetch ``url`` (advertising gzip/deflate support) and return its text.

    ``requests`` decompresses gzip/deflate bodies on its own but leaves the
    ``Content-Encoding`` header in place, so ``response.content`` is normally
    already plain bytes even when the header says ``gzip``. Decompressing it
    a second time fails with ``BadGzipFile``. The body is therefore only
    gunzipped when it still starts with the gzip magic number.

    Args:
        url (str): Fully qualified URL to request.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str or None: Body text for an HTTP 200 response; ``None`` for any
        other status code or when the request itself fails.
    """
    headers = {
        'Accept-Encoding': 'gzip, deflate',  # indicates that your client can accept compressed content
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable page must not abort the crawl.
        print(f"Request failed for {url}: {e}")
        return None

    if response.status_code != 200:
        return None

    response.encoding = response.apparent_encoding
    payload = response.content
    still_gzipped = (
        response.headers.get('content-encoding') == 'gzip'
        and payload[:2] == b'\x1f\x8b'  # gzip magic number
    )
    if still_gzipped:
        charset = response.apparent_encoding or 'utf-8'
        return gzip.decompress(payload).decode(charset)
    return response.text



def parse_links(html, base_url):
    """Yield the internal links found in a page.

    Args:
        html (str): HTML source to inspect.
        base_url (str): Crawl root. Relative ``href`` values are resolved
            against it, and only URLs beginning with it are yielded.

    Yields:
        str: Absolute URL of each internal ``<a href>`` in document order.

    Exceptions raised during parsing are printed; the generator then stops.
    """
    def _absolute(anchor):
        # Resolve the tag's href against the crawl root.
        return urljoin(base_url, anchor['href'])

    try:
        tree = BeautifulSoup(html, 'lxml')
        for absolute in map(_absolute, tree.find_all('a', href=True)):
            if absolute.startswith(base_url):
                yield absolute
    except Exception as e:
        print(f"Error parsing HTML: {e}")


def save_content(html, url, base_dir="./website_data"):
    """Store ``html`` in ``base_dir`` using a flattened form of ``url`` as name.

    Args:
        html (str): Text to write (UTF-8).
        url (str): Source URL; its scheme prefix is dropped and each ``/``
            becomes ``_`` to build ``<name>.html``.
        base_dir (str): Destination directory, created if it does not exist.
    """
    without_scheme = url.replace("https://", "").replace("http://", "")
    flattened = "_".join(without_scheme.split("/"))
    destination = os.path.join(base_dir, f"{flattened}.html")

    os.makedirs(base_dir, exist_ok=True)
    with open(destination, 'w', encoding='utf-8') as out_file:
        out_file.write(html)

def crawl_website(base_url):
    """Walk the site rooted at ``base_url``, saving each page that is fetched.

    Args:
        base_url (str): First URL to request; also passed to ``parse_links``
            as the crawl root.

    Returns:
        set[str]: Every URL that was requested.
    """
    done = set()
    todo = {base_url}

    def _handle(current_page):
        # Fetch one page; on success save it and queue the links it contains.
        markup = fetch_page(current_page)
        if not markup:
            return
        save_content(markup, current_page)
        for found in parse_links(markup, base_url):
            todo.add(found)
        print(f"Visited and saved: {current_page}")

    while todo:
        current_page = todo.pop()
        if current_page in done:
            continue
        done.add(current_page)
        _handle(current_page)
        # Random 0.5-2 s pause after each requested page.
        time.sleep(random.uniform(0.5, 2.0))
    return done

In [9]:
# Third crawl of the same start URL, now using the fetch_page from the previous
# cell, which sends an explicit Accept-Encoding header for gzip/deflate.

base_url = 'https://www.socom.mil/SOF-ATL/'
all_pages = crawl_website(base_url)

Visited and saved: https://www.socom.mil/SOF-ATL/
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/cto.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/SOF_ATL_Lines_of_Effort.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/SOF-Hard-Problems.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/SOFHardProblemsDocumentLibrary/USSOCOM%20ST%20Mission%20Assured%20Communications%20-%20MAC%20MegaTalker-Apr23.docx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-main.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/Defense-Industrial-Base-Cybersecurity.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/programs.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-submit-your-idea-form-instruct.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-process.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/SOFHardProblemsDocumentLibrary/USSOCOM%20ST%20Signature%20Management%20-%20SIGMAN%20MegaTalker-Apr23.docx

### Adjusting again: the previous run returned a lot of whitespace and a few undecoded characters

In [15]:
def fetch_page(url, timeout=30):
    """Fetch ``url`` and return its body as text, tolerating encoded bodies.

    ``requests`` already decompresses gzip/deflate responses while leaving the
    ``Content-Encoding`` header untouched, so ``response.content`` is usually
    plain bytes. Decompressing it again raises ``BadGzipFile`` / ``zlib.error``,
    which ``except requests.RequestException`` does not catch. Manual
    decompression is therefore only a best-effort path for bodies that really
    are still compressed; anything else falls back to ``response.text``.

    Args:
        url (str): Fully qualified URL to request.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str or None: Body text for an HTTP 200 response. ``None`` (after
        printing the reason) for other status codes or failed requests.
    """
    headers = {'Accept-Encoding': 'gzip, deflate'}  # Handles gzip and deflate encodings
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch {url} with status code: {response.status_code}")
        return None

    content_encoding = response.headers.get('Content-Encoding', '')
    raw = response.content
    try:
        # Gunzip only when the gzip magic number is still present.
        if 'gzip' in content_encoding and raw[:2] == b'\x1f\x8b':
            return gzip.decompress(raw).decode('utf-8')
        if 'deflate' in content_encoding:
            try:
                return zlib.decompress(raw).decode('utf-8')
            except zlib.error:
                # Some servers send a raw deflate stream without zlib header.
                return zlib.decompress(raw, -zlib.MAX_WBITS).decode('utf-8')
    except (OSError, EOFError, zlib.error, UnicodeDecodeError):
        # Body was not compressed after all (or is not UTF-8): use the text
        # that requests decoded instead of aborting the crawl.
        pass
    return response.text

def parse_links(html, base_url):
    """Yield internal links from ``html`` that have not been visited yet.

    Args:
        html (str): HTML source of a page.
        base_url (str): Crawl root. ``href`` values are resolved against it
            and only URLs starting with it are considered.

    Yields:
        str: Absolute URL of each internal link that is not in the
        module-level ``visited`` set.

    ``visited`` is a global that ``crawl_website`` creates when it runs. If
    this function is used before any crawl has started, that global does not
    exist; it is then treated as empty rather than raising ``NameError``,
    which the broad ``except`` below would report as an HTML parsing error
    while yielding nothing.
    """
    already_visited = globals().get('visited', ())
    try:
        soup = BeautifulSoup(html, 'lxml')
        for link in soup.find_all('a', href=True):
            full_url = urljoin(base_url, link['href'])
            if full_url.startswith(base_url) and full_url not in already_visited:
                yield full_url
    except Exception as e:
        print(f"Error parsing HTML for {base_url}: {e}")

def save_content(html, url, base_dir="./website_data"):
    """Best-effort save of a page's text under a name derived from its URL.

    The file name is the URL without its ``https://`` / ``http://`` prefix,
    with every ``/`` replaced by ``_`` and ``.html`` appended.

    Args:
        html (str): Text to write (UTF-8).
        url (str): URL the text came from.
        base_dir (str): Output directory; created when missing.

    Failures are printed rather than raised so that one unwritable page does
    not stop the crawl. Directory creation is inside the same ``try`` so that
    a failing ``os.makedirs`` is handled like a failing write.
    """
    filename = url.replace("https://", "").replace("http://", "").replace("/", "_") + ".html"
    filepath = os.path.join(base_dir, filename)
    try:
        os.makedirs(base_dir, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(html)
    except Exception as e:
        print(f"Failed to write {filepath}: {e}")

def crawl_website(base_url):
    """Crawl from ``base_url``, saving fetched pages, and return visited URLs.

    The visited set is stored in the module-level name ``visited`` (reset at
    the start of every call) because ``parse_links`` reads that global to
    skip links that were already crawled.

    Args:
        base_url (str): Start URL; also passed to ``parse_links`` as the
            crawl root.

    Returns:
        set[str]: Every URL that was requested (the same object as the
        global ``visited``).
    """
    global visited
    visited = set()
    queue = {base_url}

    while queue:
        current_page = queue.pop()
        if current_page in visited:
            continue
        visited.add(current_page)

        page_html = fetch_page(current_page)
        if page_html:
            save_content(page_html, current_page)
            for discovered in parse_links(page_html, base_url):
                queue.add(discovered)
            print(f"Visited and saved: {current_page}")

        # Random 0.5-2 s pause after each requested page.
        time.sleep(random.uniform(0.5, 2.0))
    return visited

In [16]:
# Fourth crawl of the same start URL, after the previous cell changed
# fetch_page to handle gzip/deflate content encoding and to report failed
# requests instead of silently returning None.

base_url = 'https://www.socom.mil/SOF-ATL/'
all_pages = crawl_website(base_url)

Visited and saved: https://www.socom.mil/SOF-ATL/
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/cto.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/SOF_ATL_Lines_of_Effort.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/SOF-Hard-Problems.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/SOFHardProblemsDocumentLibrary/USSOCOM%20ST%20Mission%20Assured%20Communications%20-%20MAC%20MegaTalker-Apr23.docx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-main.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/Defense-Industrial-Base-Cybersecurity.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/programs.aspx
Failed to fetch https://www.socom.mil/SOF-ATL/PublishingImages/ViperStrike.jpg with status code: 404
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-submit-your-idea-form-instruct.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/Pages/eSOF-process.aspx
Visited and saved: https://www.socom.mil/SOF-ATL/SOFH