# source: HaoChe Hung (2024)

## About: a brute-force alternative to the two-part Smithsonian API data gathering
### Smithsonian API approach: 
Part 1: get the EDAN codes; part 2: derive the image URLs from the EDAN codes.
### Brute approach: 
Run a search from https://www.si.edu/openaccess, then scrape the image URLs (and EDAN codes) from the results pages.

Note: as a next step, add scripts for retrieving EDAN codes here.

In [1]:
# Ignore unimportant system warnings
import warnings
warnings.filterwarnings("ignore")

# We will use Pandas, Numpy, and Matplotlib which is a package for visualization with Python
import pandas as pd
import numpy as np

# Load a required package 
# This is a library for accessing and parsing data through URLs
from urllib.parse import urlencode
import urllib.request, json 
from bs4 import BeautifulSoup # for web scraping

import matplotlib.pyplot as plt
import seaborn as sns # visualization styling package

# A magic function that renders figures inline in the notebook
%matplotlib inline

## links

### original sets for Richard The
butterfly 
https://www.si.edu/search/collection-images?page={page-1}&edan_q=butterfly&edan_fq%5B0%5D=set_name:%22Specimen%20Inventory%22&edan_fq%5B1%5D=media_usage:%22CC0%22

bug 
https://www.si.edu/search/collection-images?page={page-1}&edan_q=bug&edan_fq%5B0%5D=set_name:%22Specimen%20Inventory%22&edan_fq%5B1%5D=media_usage:%22CC0%22

fish
https://www.si.edu/search/collection-images?page={page-1}&edan_q=fish&edan_fq%5B0%5D=set_name:%22Fish%20Images%22&edan_fq%5B1%5D=media_usage:%22CC0%22

orchid
https://www.si.edu/search/collection-images?page={page-1}&edan_q=orchid&edan_fq%5B0%5D=topic:%22Orchids%22&edan_fq%5B1%5D=object_type:%22Living%20botanical%20specimens%22&edan_fq%5B2%5D=set_name:%22Smithsonian%20Gardens%20Orchid%20Collection%22&edan_fq%5B3%5D=media_usage:%22CC0%22

starfish
https://www.si.edu/search/collection-images?page={page-1}&edan_q=starfish&oa=1&edan_fq%5B0%5D=media_usage:CC0

wasp
https://www.si.edu/search/all?page={page-1}&edan_q=wasp&edan_fq%5B0%5D=data_source:%22NMNH%20-%20Education%20%26%20Outreach%22&edan_fq%5B1%5D=media_usage:%22CC0%22

mineral
https://www.si.edu/search/collection-images?edan_q=mineral&edan_fq%5B0%5D=object_type:%22Education%20and%20Outreach%20collections%22&edan_fq%5B1%5D=media_usage:%22CC0%22

https://www.si.edu/search/collection-images?page={page-1}&edan_q=mineral&edan_fq%5B0%5D=object_type:%22Education%20and%20Outreach%20collections%22&edan_fq%5B1%5D=media_usage:%22CC0%22

laelia
https://www.si.edu/search/collection-images?edan_q=laelia&oa=1&edan_fq%5B0%5D=media_usage:CC0

https://www.si.edu/search/collection-images?page={page-1}&edan_q=laelia&oa=1&edan_fq%5B0%5D=media_usage:CC0

### test sets for IoAD
clay (page 1)
https://www.si.edu/search/collection-images?edan_q=clay&oa=1&edan_fq%5B0%5D=media_usage:CC0

clay (page 2+)
https://www.si.edu/search/collection-images?page={page-1}&edan_q=clay&oa=1&edan_fq%5B0%5D=media_usage:CC0


In [2]:
# url = 'https://www.si.edu/search/collection-images?edan_q=bug&edan_fq%5B0%5D=set_name:%22Specimen%20Inventory%22&edan_fq%5B1%5D=media_usage:%22CC0%22'
# response = urllib.request.urlopen(url)
# html = response.read()
# mystr = html.decode("utf8") #
# response.close()
# # print (mystr)

In [3]:
def geturl(page, query='clay'):
    """Build the si.edu collection-images search URL for one results page.

    Parameters
    ----------
    page : int
        1-based results page number. Any value below 2 yields the URL
        without a ``page`` parameter; otherwise the URL's ``page``
        parameter is set to ``page - 1``.
    query : str, optional
        Search term placed in the ``edan_q`` parameter. Defaults to
        ``'clay'``, the term this function was originally hard-coded to.

    Returns
    -------
    str
        The search URL, always filtered with ``oa=1`` and
        ``media_usage:CC0``.
    """
    # Local import so the function works regardless of what the import cell loaded.
    from urllib.parse import quote_plus

    base = 'https://www.si.edu/search/collection-images'
    # quote_plus keeps multi-word or non-ASCII search terms valid inside the URL.
    filters = f'edan_q={quote_plus(query)}&oa=1&edan_fq%5B0%5D=media_usage:CC0'

    if page < 2:
        return f'{base}?{filters}'
    return f'{base}?page={page - 1}&{filters}'

In [4]:
# Result lists for the scrape: item titles and their image links.
titles, links = [], []

In [5]:
# Page numbers 1 through 32 (the range end, 33, is exclusive).
# NOTE(review): the scraping loop in this notebook iterates range(1,100)
# rather than this value — confirm which bound is intended.
page_number=range(1,33)

In [6]:
pip install html5lib

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Scrape the search-result pages and collect one (title, image link) row per teaser.
# Start from empty lists so that re-running this cell does not append a second
# copy of every row to lists left over from a previous run.
titles = []
links = []

for i in range(1, 100):
    url_now = geturl(i)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(url_now) as response:
        mystr = response.read().decode("utf8")
    soup = BeautifulSoup(mystr, "html5lib")

    each_elements = soup.find_all('div', class_='node node--teaser node--teaser-long')
    if not each_elements:
        # Nothing to collect on this page, so stop requesting further pages.
        # NOTE(review): assumes si.edu returns a page without teasers once the
        # results are exhausted — confirm against the live site.
        break

    for div in each_elements:
        img_tag = div.select_one('img')
        # A teaser without an image (or without a src) has no link to record.
        if img_tag is None or not img_tag.get('src'):
            continue
        title_tag = div.select_one('.title.delta')
        # Keep the image even when the title element is missing.
        title = title_tag.get_text(strip=True) if title_tag is not None else ''

        titles.append(title)
        links.append(img_tag['src'])

# Build the table once, after the loop, instead of rebuilding a temporary
# frame from the whole accumulated lists on every page.
df_all = pd.DataFrame({'title': titles, 'link': links})


In [None]:
# Show the scraped title/link table as the cell output.
df_all

In [None]:
# Rows whose image link already appeared in an earlier row
# (the first occurrence of each link is not flagged).
df_all.loc[df_all['link'].duplicated()]

In [None]:
# Keep the first copy of each row; a row is dropped only when every column
# matches an earlier row.
df_all_sel = df_all[~df_all.duplicated()]
df_all_sel

In [None]:
# # Extract title and link from each div
# for div in each_elements:
#     title = div.select_one('.title.delta').get_text(strip=True)
#     img_tag = div.select_one('img')
#     link = img_tag['src']

#     titles.append(title)
#     links.append(link)
# df_temp = pd.DataFrame({'title': titles, 'link': links})

# df_all=pd.concat(df_all,df_temp)


In [None]:
# Write the scraped table (row index included) to clayTest_all.csv,
# relative to the current working directory.
# NOTE(review): this exports df_all, not the de-duplicated df_all_sel
# computed in an earlier cell — confirm that is intended.
df_all.to_csv('clayTest_all.csv')