## Notebook to scrape, download, and preprocess APOD images

In [None]:
import glob
import os
import random
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin

import numpy as np
import requests
from bs4 import BeautifulSoup
from PIL import Image

In [None]:
# Labels for the image classes. They are listed in the same order as the
# search-result files returned by the glob below, i.e. label i is meant to
# describe files[i].
# NOTE(review): the order in which glob returns matches depends on the file
# system, so this pairing may not hold on another machine -- confirm by
# inspecting the `files` output below.
class_names = [
    'planetary_nebula',
    'comet',
    'reflection_nebula',
    'aurora',
    'spiral_galaxy',
]

# every saved search-results page (*.html) in the current working directory
files = list(glob.iglob('*.html'))
files

In [None]:
def get_image_from_daily_page(link, timeout=30):
    """Download the image embedded in an APOD daily page.

    Fetches ``link``, looks up the first ``<img>`` tag in the returned HTML
    and downloads the file that its ``src`` attribute points to.

    NOTE(review): this was originally described as fetching the "full-sized"
    image; the code only follows the ``<img src=...>`` of the page, which may
    be a scaled-down rendition -- confirm against a real daily page.

    Args:
        link: URL of the daily page to scrape.
        timeout: Seconds passed to each ``requests.get`` call so that a
            stalled connection raises instead of blocking forever.

    Returns:
        A ``PIL.Image.Image`` opened from the downloaded bytes, or ``None``
        when the page contains no ``<img>`` tag with a ``src`` attribute.
    """
    page = requests.get(link, timeout=timeout)
    im_soup = BeautifulSoup(page.content, "html.parser")

    # Look the tag up once; a page without any image is not an error here,
    # the caller simply skips it.
    first_img = im_soup.find('img')
    if first_img is None:
        return None

    src = first_img.get('src')
    if not src:
        return None

    # urljoin keeps relative paths under the APOD stem (same result as plain
    # concatenation) but also copes with absolute or root-relative src values.
    url_stem = 'https://apod.nasa.gov/apod/'
    pic_url = urljoin(url_stem, src)

    response = requests.get(pic_url, timeout=timeout)

    return Image.open(BytesIO(response.content))

In [None]:
def get_img_urls_from_apod_search_page(html_file):
    """Extract the link that precedes each image in a saved APOD search page.

    For every ``<img>`` tag in the document, the closest preceding ``<a>``
    tag is located and its ``href`` is collected. The number of collected
    links is printed.

    Args:
        html_file: Path of the locally saved search-results HTML file
            (read as UTF-8).

    Returns:
        A list of ``href`` strings, one per image that has a preceding
        anchor carrying a non-empty ``href``. Duplicates are not removed.
    """
    # Use a context manager so the file handle is closed as soon as the
    # document has been parsed.
    with open(html_file, encoding="utf8") as fh:
        soup = BeautifulSoup(fh, "html.parser")

    img_refs = []
    for img_tag in soup('img'):
        anchor = img_tag.find_previous('a')
        if anchor is None:
            # image appears before any anchor in the document
            continue
        # .get() avoids a KeyError for anchors without an href attribute
        href = anchor.get('href')
        if href:
            img_refs.append(href)

    print(f"{len(img_refs)} image links recovered")

    return img_refs

In [None]:
def create_class_data_from_links(img_links, class_name, base_path, test_ratio=0.7):
    """Download, preprocess and store the images of one class.

    Each link is passed to ``get_image_from_daily_page``; links that yield no
    image are skipped. Every downloaded image goes through
    ``preprocess_image`` and is saved as ``<name>.PNG`` in either
    ``<base_path>/train/<class_name>`` or ``<base_path>/test/<class_name>``,
    where ``<name>`` is the last path segment of the link up to its first dot.

    Args:
        img_links: Sequence of daily-page URLs (must support ``len``).
        class_name: Label used as the sub-folder name in both splits.
        base_path: Root data folder, as ``str`` or ``pathlib.Path``.
        test_ratio: Despite its name, this is the fraction of links assigned
            to the *train* split; the remainder goes to the test split. The
            name is kept so existing callers keep working. The resulting
            count is clamped to ``[0, len(img_links)]``.
    """
    train_path = Path(base_path) / "train" / class_name
    test_path = Path(base_path) / "test" / class_name

    # Create both folders independently: with exist_ok=False an existing train
    # folder would abort before the test folder is created, and later saves
    # into the test split would fail.
    already_present = train_path.is_dir() and test_path.is_dir()
    train_path.mkdir(parents=True, exist_ok=True)
    test_path.mkdir(parents=True, exist_ok=True)
    if already_present:
        print(f"paths for this {class_name} already exist")
    else:
        print(f"paths for {class_name} class have been created")

    # Build the train/test assignment: the requested number of True flags,
    # padded with False, then shuffled so the split is random across links.
    n_links = len(img_links)
    n_train = min(max(round(test_ratio * n_links), 0), n_links)
    train_mask = [True] * n_train + [False] * (n_links - n_train)
    random.shuffle(train_mask)

    for is_train, link in zip(train_mask, img_links):
        # download image; pages without an image are skipped
        image = get_image_from_daily_page(link)
        if image is None:
            continue

        # process image
        image = preprocess_image(image)

        # store processed image in <split>/<class_name>/<page name>.PNG
        target_dir = train_path if is_train else test_path
        file_stem = link.split('/')[-1].split('.')[0]
        image.save((target_dir / (file_stem + '.PNG')).as_posix())


def preprocess_image(image):
    """Placeholder preprocessing step that returns ``image`` unchanged.

    Preprocessing is either done here or in model training; at present
    nothing is applied here.
    """
    return image

In [None]:
# root folder for the data; it receives the train and test subfolders
filepath = "/home/doug/Projects/apod_semantic_segmentation/data"

# NOTE(review): the [1:] slices skip the first label and the first search
# page -- presumably because that class was already downloaded; confirm
# before re-running from scratch.
for class_label, file in zip(class_names[1:], files[1:]):
    # links collected from the saved search-results page of this class
    img_urls = get_img_urls_from_apod_search_page(file)

    create_class_data_from_links(
        img_urls,
        class_name=class_label,
        base_path=filepath,
        test_ratio=0.7,
    )

In [None]:
def walk_through_dir(dir_path):
    """Print a one-line summary for dir_path and every directory below it.

    Each line reports how many sub-directories and how many files (reported
    as "images") the directory contains. Nothing is returned.
    """
    for current_dir, subdirs, entries in os.walk(dir_path):
        n_dirs, n_files = len(subdirs), len(entries)
        print(f"There are {n_dirs} directories and {n_files} images in '{current_dir}'.")

In [None]:
# print directory and file counts for everything under the train folder
walk_through_dir("/home/doug/Projects/apod_semantic_segmentation/data/train")

In [None]:
# print directory and file counts for everything under the test folder
walk_through_dir("/home/doug/Projects/apod_semantic_segmentation/data/test")