In [6]:
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# eBird bird-list page for Tokyo (region code JP-13 in the URL), listing the
# highest-count birds.
# NOTE(review): `yr=cur` and `rank=hc` presumably select the current year
# ranked by high count — confirm against eBird's URL parameters.
url = 'https://ebird.org/region/JP-13/bird-list?yr=cur&rank=hc'

def _stripped_text(node):
    """Return the whitespace-stripped text of a parsed element, or None if it is absent."""
    return node.text.strip() if node is not None else None


def scrape_websites(websites, timeout=30):
    """
    Scrape each bird page and collect the following data:
    1. Common Name
    2. Binomial Name
    3. Identification Info
    4. Macaulay ID of head photo

    Pages that cannot be fetched, or that have no common-name heading, are
    reported on stdout and skipped so that one bad page does not abort the
    whole run. Optional fields missing from a page are stored as ``None``.

    :param websites: A list of bird URLs
    :param timeout: Seconds to wait for each HTTP response before giving up
    :return: Dict keyed by common name; each value is a dict with the keys
        "binomialName", "identification", "macaulayID" and "url"
    """
    results = {}
    count = 0

    for url in websites:
        # Only the network call can raise RequestException, so keep the
        # try block narrow and handle parsing problems explicitly below.
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Scraping {url} wrong: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find needed data
        common_name = _stripped_text(soup.select_one('span.Heading-main'))
        binomial_name = _stripped_text(soup.select_one('span.Heading-sub.Heading-sub--sci'))
        identification = _stripped_text(soup.select_one('p.u-stack-sm'))
        macaulay_links = soup.find_all('a', class_='u-showForMedium u-linkPlain')

        # The common name is the result key; without it the page is unusable.
        if not common_name:
            print(f"Scraping {url} wrong: common name not found")
            continue

        # The ID is the last whitespace-separated token of the last matching
        # link; guard against a link whose text is empty.
        macaulay_id = None
        if macaulay_links:
            tokens = macaulay_links[-1].text.split()
            if tokens:
                macaulay_id = tokens[-1]

        # A later page with the same common name overwrites the earlier entry.
        results[common_name] = {
            "binomialName": binomial_name,
            "identification": identification,
            "macaulayID": macaulay_id,
            "url": url,
        }

        # Progress output: name of the scraped bird and a 0-based success counter.
        print(common_name)
        print(count)
        count += 1

    return results

def scrape_birdLinks(url, timeout=30):
    """
    Collect the species links listed on a bird-list page.

    :param url: URL of the bird-list page to fetch
    :param timeout: Seconds to wait for the HTTP response before giving up
    :return: List of species URLs in page order; relative links are resolved
        against ``url`` and anchors without an href are left out
    :raises requests.RequestException: If the request fails or the server
        answers with an HTTP error status
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Use BeautifulSoup to parse HTML
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get bird list
    species_anchors = soup.find_all('a', class_='Species Species--h4')

    # urljoin leaves absolute hrefs unchanged and resolves relative ones, so
    # every returned link can be passed straight to requests.get.
    return [
        urljoin(url, anchor.get('href'))
        for anchor in species_anchors
        if anchor.get('href')
    ]

# Get the species-page links from the Tokyo highest-count bird list.
# This performs a network request when the cell runs; the resulting list is
# what the scraping cell below slices and passes to scrape_websites.
birdLinks = scrape_birdLinks(url)

In [7]:
# Scrape the first 100 species links from the bird list
scraped_data = scrape_websites(birdLinks[:100])

# Save the result as indented UTF-8 JSON, keeping non-ASCII characters as-is
with open('ebird_data.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(scraped_data, ensure_ascii=False, indent=4))

Willow Tit
0
Mute Swan
1
Brown-eared Bulbul
2
Eyebrowed Thrush
3
Long-eared Owl
4
Eurasian Sparrowhawk
5
Red-necked Stint
6
Garganey
7
Vega Gull
8
Black Kite
9
Owston's Tit
10
Gray-streaked Flycatcher
11
Dark-sided Flycatcher
12
Greater White-fronted Goose
13
Little Bunting
14
Amur Stonechat
15
Sharp-tailed Sandpiper
16
Common Kingfisher
17
Common Cuckoo
18
Lesser Black-backed Gull
19
Common Merganser
20
Eastern Red-rumped Swallow
21
Gray-faced Buzzard
22
Azure-winged Magpie
23
Black Drongo
24
Kentish Plover
25
Eurasian Hobby
26
Eurasian Woodcock
27
Whiskered Tern
28
Black-tailed Godwit
29
Eastern Yellow Wagtail
30
White-shouldered Starling
31
Japanese Sparrowhawk
32
Bull-headed Shrike
33
Oriental Cuckoo
34
Oriental Honey-buzzard
35
Terek Sandpiper
36
Great Knot
37
Black-tailed Gull
38
Ruddy-breasted Crake
39
Sanderling
40
Long-toed Stint
41
Great Crested Tern
42
Ruff
43
Pectoral Sandpiper
44
Black Paradise-Flycatcher
45
Green Sandpiper
46
Slaty-backed Gull
47
Eurasian Hoopoe
48
Rose-r