In [4]:
import requests
from bs4 import BeautifulSoup
import json

# eBird bird list for Tokyo (region JP-13), ranked by highest count.
url = "https://ebird.org/region/JP-13/bird-list?rank=hc"

def scrape_websites(websites):
    """
    Scrape bird pages and collect the following data for each one:
    1. Common Name
    2. Binomial Name
    3. Identification Info
    4. Macaulay ID of head photo

    Pages that cannot be downloaded, or that do not contain a common
    name, are reported with a printed message and skipped so that one bad
    page does not abort the whole run. Other missing fields are stored
    as None.

    :param websites: A list of bird URLs
    :return: Dict keyed by common name; each value is a dict with the keys
        "binomialName", "identification", "macaulayID" and "url"
        (ready to be dumped as JSON)
    """
    results = {}
    count = 0

    for url in websites:
        # Only the network call lives in the try block, so parsing problems
        # are not mistaken for request failures.
        try:
            # The timeout keeps a stalled connection from hanging the run.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Scraping {url} wrong: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find needed data
        heading_main = soup.select_one('span.Heading-main')
        heading_sub = soup.select_one('span.Heading-sub.Heading-sub--sci')
        paragraph = soup.select_one('p.u-stack-sm')
        macaulayID = soup.find_all('a', class_='u-showForMedium u-linkPlain')

        # The common name is the dictionary key, so a page without it
        # cannot be stored at all.
        if heading_main is None:
            print(f"Scraping {url} wrong: common name not found")
            continue
        common_name = heading_main.text.strip()

        # The ID is taken as the last whitespace-separated token of the
        # last matching link; guard against a link with empty text.
        id_tokens = macaulayID[-1].text.split() if macaulayID else []

        results[common_name] = {
            "binomialName": heading_sub.text.strip() if heading_sub is not None else None,
            "identification": paragraph.text.strip() if paragraph is not None else None,
            "macaulayID": id_tokens[-1] if id_tokens else None,
            "url": url,
        }

        # Progress output: name of the bird and a running 0-based counter
        # of successfully scraped pages.
        print(common_name)
        print(count)
        count += 1

    return results

def scrape_birdLinks(url):
    """
    Collect the species links found on a bird-list page.

    :param url: URL of the bird-list page to download
    :return: A list of absolute URLs, one per ``<a>`` element whose class
        attribute is 'Species Species--h4', in page order. Anchors without
        an href are skipped.
    :raises requests.RequestException: if the page cannot be downloaded
        (including timeouts and HTTP error statuses)
    """
    from urllib.parse import urljoin

    # The timeout keeps a stalled connection from hanging forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Use BeautifulSoup to parse HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get bird list
    linkList = []
    for specie in soup.find_all('a', class_='Species Species--h4'):
        href = specie.get('href')
        if not href:
            # An anchor without a target cannot be scraped later.
            continue
        # Resolve relative links against the page actually fetched;
        # absolute links pass through unchanged.
        linkList.append(urljoin(response.url, href))

    return linkList

# Collect the species-page links from the Tokyo highest-count bird list
# (performs a network request as soon as this cell runs).
birdLinks = scrape_birdLinks(url)

In [None]:
# Scrape the data for every species link collected in birdLinks
scraped_data = scrape_websites(birdLinks)

# Make sure the output directory exists first, so a finished scrape is not
# lost to a FileNotFoundError when the file is opened.
import os
os.makedirs('./source', exist_ok=True)

# Write data to json (ensure_ascii=False keeps non-ASCII text readable)
with open('./source/ebird_data.json', 'w', encoding='utf-8') as f:
    json.dump(scraped_data, f, ensure_ascii=False, indent=4)

Eyebrowed Thrush
0
Gray-streaked Flycatcher
1
Common Cuckoo
2
Lesser Black-backed Gull
3
Gray-faced Buzzard
4
Black Drongo
5
Eurasian Hobby
6
Eastern Yellow Wagtail
7
White-shouldered Starling
8
Oriental Honey-buzzard
9
Ruddy-breasted Crake
10
Eurasian Hoopoe
11
Swinhoe's Storm-Petrel
12
Nazca Booby
13
Chinese Egret
14
Wood Sandpiper
15
Yellow Bittern
16
Siberian Thrush
17
Wilson's Storm-Petrel
18
Siberian Blue Robin
19
Oriental Reed Warbler
20
Cocos Booby
21
Eurasian Treecreeper
22
White's Thrush
23
Brown Booby
24
Lesser Cuckoo
25
Black-browed Reed Warbler
26
Eurasian Kestrel
27
Izu Robin
28
Brown Dipper
29
Asian House-Martin
30
Demoiselle Crane
31
Oriental Pratincole
32
Chinese Pond-Heron
33
Zitting Cisticola
34
Yellow-browed Bunting
35
Swinhoe's Snipe
36
Coal Tit
37
Narcissus Flycatcher
38
Black-faced Spoonbill
39
Whimbrel
40
Japanese Robin
41
Pacific Loon
42
Japanese Pygmy Woodpecker
43
Pelagic Cormorant
44
Chestnut-eared Bunting
45
Sakhalin Leaf Warbler
46
Wedge-tailed Shearwater
