Crawling RSS and webpages, HTTP

In [4]:
# https://stackoverflow.com/questions/2018026/what-are-the-differences-between-the-urllib-urllib2-urllib3-and-requests-modul
# requests has a nicer API than urllib, besides that they have the same functionality
import urllib.request
import requests
from bs4 import BeautifulSoup
import utility as util

In [5]:
# based on:
# https://codeburst.io/building-an-rss-feed-scraper-with-python-73715ca06e1f

def get_children_names(x):
    """Return the tag names of the direct children of ``x``, in document order.

    Children without a name (text nodes) are left out. A child that carries a
    namespace prefix is reported as ``"prefix:name"``; otherwise just ``name``.

    Args:
        x: a node exposing ``.children`` (e.g. a bs4 ``Tag``).

    Returns:
        list[str]: one entry per named child.
    """
    # Filter first: ``prefix`` is only read on children that have a name.
    named_children = (child for child in x.children if child.name)
    return [
        f"{child.prefix}:{child.name}" if child.prefix else child.name
        for child in named_children
    ]

def print_additional_tags(item):
    """Print the name of each direct child of ``item`` that is not a main tag.

    The main tags (``title``, ``link``, ``pubDate``) are the ones
    ``print_main_tags`` already shows, so they are skipped here. Children
    without a tag name (text nodes such as whitespace between elements) are
    skipped as well instead of being printed as ``None``.

    Args:
        item: a node exposing ``.children`` (e.g. a bs4 ``Tag`` for an RSS
            ``<item>``).
    """
    main_tags = {"title", "link", "pubDate"}
    for child in item.children:
        name = child.name
        # Text nodes have no name; printing them would emit the string "None".
        if not name or name in main_tags:
            continue
        print(name)

def print_main_tags(item):
    """Print the title, publication date and link of ``item``, one per line.

    Args:
        item: a node whose ``find(name)`` returns an object with a ``.text``
            attribute (e.g. a bs4 ``Tag`` for an RSS ``<item>``).

    Raises:
        AttributeError: when ``item.find`` returns ``None`` for one of the
            three tags.
    """
    # Resolve all three fields (title, link, pubDate) before printing, so a
    # missing tag fails without leaving partial output behind.
    title, link, pubDate = (
        item.find(tag).text for tag in ("title", "link", "pubDate")
    )
    # Display order differs from lookup order: the date goes before the link.
    for value in (title, pubDate, link):
        print(value)
        
def parse_content(soup, limit=5):
    """Print a short summary of the first ``limit`` items of an RSS document.

    For every selected ``<item>`` the main tags are printed, followed by the
    names of its remaining child tags and a blank separator line.

    Args:
        soup: parsed RSS document exposing ``soup.rss.channel``
            (e.g. a ``bs4.BeautifulSoup`` built with the ``xml`` parser).
        limit: maximum number of items to print; ``None`` prints all of them.
            Defaults to 5.
    """
    # Only direct children of <channel>; passing `limit` lets the search stop
    # as soon as enough items have been collected.
    items = soup.rss.channel.find_all("item", recursive=False, limit=limit)
    for item in items:
        print_main_tags(item)
        print_additional_tags(item)
        print()

# search website RSS
links = [
    "https://news.ycombinator.com/rss",
    "https://astralcodexten.substack.com/feed",
]
link = links[1]
soup = None
try:
    # get RSS content from link; the timeout keeps a stalled server from
    # hanging the kernel (requests waits indefinitely when none is given)
    r = requests.get(link, timeout=10)
    print("Status code:", r.status_code)
    print()
    # stop on 4xx/5xx instead of parsing an error page as if it were a feed
    r.raise_for_status()
    # soup : bs4.BeautifulSoup
    soup = BeautifulSoup(r.content, features='xml')
except requests.RequestException as e:
    print("Exception:", e)

# soup is still None when the download failed, and soup.rss is None when the
# payload is not an RSS document; report that instead of crashing on None.
channel = soup.rss.channel if soup is not None and soup.rss is not None else None
if channel is None:
    print("No RSS channel found in", link)
else:
    print("list children in channel:")
    print(*get_children_names(channel))
    print()

    print("list children in item:")
    item = channel.find("item", recursive=False)
    # a channel without any <item> yields None here
    if item is not None:
        print_main_tags(item)
        print_additional_tags(item)
    print()

    print("list items:")
    parse_content(soup)


Status code: 200

list children in channel:
title description link image generator lastBuildDate atom:link copyright language webMaster itunes:owner itunes:author googleplay:owner googleplay:email googleplay:author item item item item item item item item item item item item item item item item item item item item

list children in item:
Open Thread 225
Mon, 23 May 2022 04:26:07 GMT
https://astralcodexten.substack.com/p/open-thread-225
description
guid
creator
enclosure
encoded

list items:
Open Thread 225
Mon, 23 May 2022 04:26:07 GMT
https://astralcodexten.substack.com/p/open-thread-225
description
guid
creator
enclosure
encoded

Your Book Review: Making Nature
Fri, 20 May 2022 20:23:34 GMT
https://astralcodexten.substack.com/p/your-book-review-making-nature
description
guid
creator
enclosure
encoded

Hidden Open Thread 224.5
Thu, 19 May 2022 18:26:34 GMT
https://astralcodexten.substack.com/p/hidden-open-thread-2245
description
guid
creator
enclosure
encoded

Lavender's Game: Silexan 

In [9]:
# search website RSS (YouTube publishes one Atom feed per channel)
links = [
    # Mental Outlaw
    "https://www.youtube.com/feeds/videos.xml?channel_id=UC7YOGHUfC1Tb6E4pudI9STA",
    # Foolish Fish
    "https://www.youtube.com/feeds/videos.xml?channel_id=UCVtWVX2xirq6Nybf5bumqwg",
]
link = links[1]
soup = None
try:
    # get RSS content from link; the timeout keeps a stalled server from
    # hanging the kernel (requests waits indefinitely when none is given)
    r = requests.get(link, timeout=10)
    print("Status code:", r.status_code)
    print()
    # stop on 4xx/5xx instead of parsing an error page as if it were a feed
    r.raise_for_status()
    # soup : bs4.BeautifulSoup
    soup = BeautifulSoup(r.content, 'xml')
except requests.RequestException as e:
    print("Exception:", e)

# display entire payload
# print(soup.prettify())

# soup is still None when the download failed, and soup.feed is None when the
# payload is not an Atom document; report that instead of crashing on None.
feed = soup.feed if soup is not None else None
entry = feed.find("entry", recursive=False) if feed is not None else None
if entry is None:
    print("No feed entry found in", link)
else:
    print("list children in feed:")
    print( *get_children_names(feed) )
    print()

    print("list children in entry:")
    print( *get_children_names(entry) )
    print()

    print("Get yt:videoId :")
    # videoId = entry.videoId
    # the prefixed name selects the element in the "yt" namespace
    videoId = entry.find("yt:videoId", recursive=False)
    print("string representation:", str(videoId))
    print("namespace:", videoId.namespace)
    print("prefix:", videoId.prefix)
    print("text content:", videoId.text)
    print()

    print("Get media:group :")
    group       = entry.find("media:group", recursive=False)
    title       = group.find("media:title", recursive=False)
    content     = group.find("media:content", recursive=False)
    thumbnail   = group.find("media:thumbnail", recursive=False)
    description = group.find("media:description", recursive=False)
    print("title", title.text)
    print("content attrs:", *content.attrs.keys())
    print("content URL:", content["url"])
    print("content type:", content["type"])
    print("content width, height:", content["width"], content["height"])
    print("thumbnail URL:", thumbnail["url"])
    print("thumbnail width, height:", thumbnail["width"], thumbnail["height"])
    # first 200 characters only, flattened onto a single line
    print("description:", description.text[:200].replace("\n", " "))
    print()

# print(entry.prettify())

# print("List children in entry:")
# for c in entry.children:
#     if c.name is None:
#         pass
#     else:
#         print(c.name)
#         print(c.text)
#         print()

Status code: 200

list children in feed:
link id yt:channelId title link author published entry entry entry entry entry entry entry entry entry entry entry entry entry entry entry

list children in entry:
id yt:videoId yt:channelId title link author published updated media:group

Get yt:videoId :
string representation: <yt:videoId>Q41LKCrufrg</yt:videoId>
namespace: http://www.youtube.com/xml/schemas/2015
prefix: yt
text content: Q41LKCrufrg

Get media:group :
title Esoteric & Occult News - 19th May 2022
content attrs: url type width height
content URL: https://www.youtube.com/v/Q41LKCrufrg?version=3
content type: application/x-shockwave-flash
content width, height: 640 390
thumbnail URL: https://i2.ytimg.com/vi/Q41LKCrufrg/hqdefault.jpg
thumbnail width, height: 480 360
description: News round of upcoming beautiful and esoteric and occult publications, and from the world of oneiric art and entertainment.  00:00 Intro  00:11 *De Radiis* https://www.blackletter-press.com/product-pa



In [10]:
# search YT website for metadata, channelId
links = [
    # Mental Outlaw
    "https://www.youtube.com/c/MentalOutlaw",
    # Foolish Fish
    "https://www.youtube.com/c/FoolishFishBooks"
]
link = links[1]
soup = None
try:
    # A HEAD request returns only the response headers, not the page body,
    # so it cannot be used to read the <meta> tags:
    # req = urllib.request.Request(link, method="HEAD")
    # resp = urllib.request.urlopen(req)

    # Alternative fetch (slower and returns less meta content); the
    # AttributeError branch below handles its response type:
    # resp = requests.get(link, timeout=10)

    # `with` closes the connection even when reading or decoding fails;
    # the timeout keeps a stalled server from hanging the kernel.
    with urllib.request.urlopen(link, timeout=10) as resp:
        try:
            # urllib response
            print("Status code:", resp.getcode())
            print()
            # honour the charset announced by the server, fall back to UTF-8
            charset = resp.headers.get_content_charset() or "utf-8"
            content = resp.read().decode(charset, errors="replace")
        except AttributeError:
            # requests response: no getcode(), exposes status_code/content
            print("Status code:", resp.status_code)
            print()
            content = resp.content

    # soup : bs4.BeautifulSoup
    # NOTE(review): 'html' lets bs4 choose among the installed HTML parsers,
    # so the resulting tree may differ between environments - confirm whether
    # a specific parser should be pinned.
    soup = BeautifulSoup(content, 'html')
    name_meta = soup.find("meta", {"itemprop" : "name"})
    meta = soup.find("meta", {"itemprop" : "channelId"})
    if name_meta is None or meta is None:
        # find() returns None when the page has no such tag
        print("Exception: channel metadata not found in", link)
    else:
        name = name_meta["content"]
        channel_id = meta["content"]
        print(f"{name}, channelId: {channel_id}")

except OSError as e:
    # urllib.error.URLError and requests.RequestException both derive from OSError
    print("Exception:", e)

Status code: 200

Foolish Fish, channelId: UCVtWVX2xirq6Nybf5bumqwg


In [12]:
# Direct children of <head> and <body> of the page parsed into `soup`.
for heading, tag_name in (
    ("Get children of head", "head"),
    ("Get children of body", "body"),
):
    print(heading)
    # attribute access on a bs4 tag looks up the child tag of that name
    section = getattr(soup.html, tag_name)
    print(*get_children_names(section))
    print()

# The channel id is carried by <meta itemprop="channelId" content="...">.
print("Get meta tag for channelId:")
meta = soup.find("meta", attrs={"itemprop": "channelId"})
print("string representation:", str(meta))
print("channelId:", meta["content"])
print()

# Flip to True to dump every <meta> tag found anywhere below <body>.
should_list_meta_tags = False
if should_list_meta_tags:
    print("List all meta tags:")
    metas = soup.html.body.findAll("meta", recursive=True)
    for meta in metas:
        print(meta)

Get children of head
meta script script script link link link link link script script script script script script script script script script script script script script script script link script link style style style style style meta link link

Get children of body
script script script iframe ytd-app script script script script script script script script script script script script script link link link title meta meta link meta link meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta link meta meta meta meta span script link span meta meta link link script link div script script script

Get meta tag for channelId:
string representation: <meta content="UCVtWVX2xirq6Nybf5bumqwg" itemprop="channelId"/>
channelId: UCVtWVX2xirq6Nybf5bumqwg



In [253]:
# Walk one level down: <body>, then its direct <ytd-app> child.
print("Get children of body:")
page_body = soup.html.body
print(*get_children_names(page_body))
print()

# recursive=False: only a <ytd-app> sitting directly under <body> matches
ytd_app = page_body.find("ytd-app", recursive=False)

print("Get children of ytd-app:")
print(*get_children_names(ytd_app))
print()

# print( ytd_app.find("ytd-masthead", recursive=False).prettify() )

# Only the <a> elements that are direct children of <ytd-app>.
print("Get <a> tags of ytd-app:")
direct_links = ytd_app.findAll("a", recursive=False)
for a in direct_links:
    print(a)
print()

# print( ytd_app.div )

Get children of body:
p meta script script script link link link link link script script script script script script script script script script script script script script script script link script link style style style style style meta link link script script script iframe ytd-app script script script script script script script script script script script script script link link link title meta meta link meta link meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta meta link meta meta meta meta span script link span meta meta link link script link div script script script

Get children of ytd-app:
ytd-masthead a a a a a a a a a a a a div

Get <a> tags of ytd-app:
<a href="https://www.youtube.com/about/" slot="guide-links-primary" style="display: none;">About</a>
<a href="https://www.youtube.com/about/press/" slot="gu