# Web scraper 

source https://blog.logrocket.com/build-python-web-scraper-beautiful-soup/

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# if I do not use a header I get a 403 error. Since I have a mac and Safari I choose this header
headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/534.57.2 (KHTML,  like Gecko) Version/5.1.7 Safari/534.57.2"}

In [3]:
def fetch_coingecko_html(url, headers):
    # make a request to the target website
    r = requests.get(url, headers = headers)
    #return html text
    return r.text

With BeautifulSoup we parse the html in structured text, and we can use these structures to find pieces of interest, like tables. The name of the tables can be find by using inspect in the browser.
There are standard libraries of pandas read_html, that finds all the tables. We only want the coin table

In [4]:
def extract_crypto_info(html):
    # parse the HTML content with Beautiful Soup
    soup = BeautifulSoup(html, "html.parser")

    # find all the cryptocurrency elements
    coin_table = soup.find("div", {"class": "coin-table"})
    crypto_elements = coin_table.find_all("tr")[1:]

    # iterate through our cryptocurrency elements
    cryptos = []
    for crypto in crypto_elements:
        # extract the information needed using our observations
        cryptos.append({
            "name": crypto.find("td", {"class": "coin-name"})["data-sort"],
            "price": crypto.find("td", {"class": "td-price"}).text.strip(),
            "change_1h": crypto.find("td", {"class": "td-change1h"}).text.strip(),
            "change_24h": crypto.find("td", {"class": "td-change24h"}).text.strip(),
            "change_7d": crypto.find("td", {"class": "td-change7d"}).text.strip(),
            "volume": crypto.find("td", {"class": "td-liquidity_score"}).text.strip(),
            "market_cap": crypto.find("td", {"class": "td-market_cap"}).text.strip()
        })

    return cryptos

In [5]:
dfs = pd.DataFrame()

for i in range(1,10):
    # Url I want to scrape
    url = f"https://www.coingecko.com/?page={i}"
    # fetch CoinGecko's HTML content
    html = fetch_coingecko_html(url, headers)

    # extract our data from the HTML document
    cryptos = extract_crypto_info(html)
    print(f"downloading page {i}")

    #put in pandas dataframe
    dfs = pd.concat([pd.DataFrame(cryptos), dfs])

downloading page 1
downloading page 2
downloading page 3
downloading page 4
downloading page 5
downloading page 6
downloading page 7
downloading page 8
downloading page 9


In [6]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900 entries, 0 to 99
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        900 non-null    object
 1   price       900 non-null    object
 2   change_1h   900 non-null    object
 3   change_24h  900 non-null    object
 4   change_7d   900 non-null    object
 5   volume      900 non-null    object
 6   market_cap  900 non-null    object
dtypes: object(7)
memory usage: 56.2+ KB


In [7]:
dfs.head()

Unnamed: 0,name,price,change_1h,change_24h,change_7d,volume,market_cap
0,Newscrypto Coin,$0.121698,1.2%,-4.7%,-30.5%,"$1,506,663","$17,889,562"
1,Ooki,$0.004013587468,0.5%,-3.8%,-4.7%,"$2,028,139","$17,889,168"
2,BitMart,$0.101433,0.1%,-1.0%,-1.9%,"$2,157,416","$17,866,867"
3,PIP,$0.206171,-1.3%,-12.0%,-12.0%,"$906,718","$17,857,156"
4,Rarible,$1.53,-2.1%,-1.6%,-4.4%,"$87,511.83","$17,770,709"
