# A Practical Web Scraping Guide

- Author: Ezequiel Ortiz Recalde
- Last update: June 10th

Site to scrape: https://coinmarketcap.com/en/

### 1- Imports

In [1]:
# General libraries
import pandas as pd
import numpy as np
import time

# String preprocessing
import re

# Scraping tools
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from scrapy import Selector

### 2- Set up and start the webdriver

In [2]:
# Set the browser to start maximized
custom_options = webdriver.ChromeOptions()
custom_options.add_argument("--start-maximized")

# Call the driver and pass the custom options
browser=webdriver.Chrome(executable_path="chromedriver",\
options=custom_options)

# Specify and open the url you want to access
url= "https://coinmarketcap.com/en/"
browser.get(url)

# Wait before making the next request
time.sleep(np.random.uniform(3,4.5))

### 3- Obtain historical data from a specific token

In [3]:
def historical_data(token:str,timeframe:str, max_loadings:int)->pd.DataFrame:
    """
    Function to obtain historical data of a specific token:
        args:
             token= string that specifies the token to search
             timeframe= date range to extract. Current available options (7, 30, 90, 365) days
             max_loadings= maximum number of times additional data is  
        returns: pandas dataframe with results
    """
    time.sleep(np.random.uniform(2,2.5))
    
    # Click the search button to be able to look for content
    browser.find_element_by_xpath("//div[contains(text(),'Search')]").click()
    time.sleep(np.random.uniform(2,2.5))
    
    # Identify the clicked search box
    opened_search_box = browser.find_element_by_xpath("//div/input[@placeholder='What are you looking for?']")
    
    # Send the desired token and press ENTER
    opened_search_box.send_keys(token+ "\n")
    time.sleep(np.random.uniform(3,4.5))

    print(f"Extracting data for: {token}")
    
    # Select the Historical Data section
    browser.find_element_by_xpath("//a[contains(text(),'Historical Data')]").click()
    time.sleep(np.random.uniform(2.5,3))
    
    # The cookies banner might appear over the Date Range menu. This will interrupt the script
    # To avoid it, we place an additional step to close the banner before proceeding
    try:
        browser.find_element_by_xpath("//div[@class='cmc-cookie-policy-banner__close']").click()
    except Exception:
        pass

    time.sleep(np.random.uniform(2.5,3))

    # Open the Data Range menu
    browser.find_element_by_xpath("//button[contains(text(),'Date Range')]").click()

    time.sleep(np.random.uniform(2.5,3))
    
    # Get the predefined timeframes
    timeframes=browser.find_elements_by_xpath("//div[contains(@class,'predefinedRanges___1WDIZ')]/ul/li")

    timeframe_options=[]
    for option in timeframes:
        timeframe = option.get_attribute("textContent")
        timeframe_options.append(timeframe)
    
    # Select a predefined data range. If it fails, print the available options
    try:
        browser.find_element_by_xpath(f"//div/ul/li[contains(text(),'{timeframe}')]").click()
    except Exception:
        print("Failed. The available timeframe options are:",timeframe_options)

    time.sleep(np.random.uniform(1.5,2))
    
    # Select the continue button
    browser.find_element_by_xpath("//button[contains(text(),'Continue')]").click()
    time.sleep(np.random.uniform(2.5,3))
    
    # Scroll to the end of the page
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight);')
    
    # Load additional data
    for i in range(max_loadings):
        browser.find_element_by_xpath("//button[contains(text(),'Load More')]").click()
        time.sleep(np.random.uniform(1.1,1.2))
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight);')
        time.sleep(np.random.uniform(1.1,1.2))

    # Define the scrapy selector for a faster data extraction
    selenium_response_text = browser.page_source
    scrapy_selector = Selector(text=selenium_response_text)

    # Get the number of columns and their names
    col_names = scrapy_selector.xpath("//table[contains(@class,'cmc-table')]//th/text()").getall()
    n_columns = len(col_names)

    # Get the row values
    rows_values=scrapy_selector.xpath("//table[contains(@class,'cmc-table')]//tr/td/text()").getall()

    # Break the extraction into n equally sized chunks
    rows_array=np.array(rows_values)

    # Specify the number of equally sized chunks
    number_of_chunks=len(rows_array)/n_columns

    # Make the partitions
    partitions=np.array_split(rows_array, number_of_chunks)

    # Create the dataframe and save the token associated with this extraction
    df=pd.DataFrame(data=partitions,columns=col_names)
    df["Token"]=token
    
    # Scroll to the top
    arbitrary_height=0
    browser.execute_script(f"window.scrollTo(0, {arbitrary_height});")

    return(df)

In [4]:
# Specify the tokens to look for
tokens=["BTC","ETH","ADA","DOGE"]

# List to save dataframes
token_data=[]

for token in tokens:
    token_data.append(historical_data(token=token,timeframe=365,max_loadings=10))

Extracting data for: BTC
Extracting data for: ETH
Extracting data for: ADA
Extracting data for: DOGE


In [5]:
# Concat list of dataframes
results=pd.concat(token_data).reset_index(drop=True)

# Preview sample of results
results.sample(8)

Unnamed: 0,Date,Open*,High,Low,Close**,Volume,Market Cap,Token
1940,"Oct 16, 2019",$0.04,$0.04,$0.04,$0.04,"$44,617,564","$1,004,433,806",ADA
960,"Aug 22, 2020",$389.03,$396.47,$382.81,$395.84,"$10,131,847,985","$44,448,918,222",ETH
1147,"Feb 17, 2020",$259.89,$266.87,$244.34,$266.36,"$26,024,080,089","$29,231,690,264",ETH
2302,"Aug 18, 2020",$0.003593,$0.003709,$0.003483,$0.003514,"$62,570,291","$442,352,423",DOGE
1723,"May 20, 2020",$0.06,$0.06,$0.05,$0.06,"$203,310,637","$1,474,606,328",ADA
1009,"Jul 04, 2020",$225.29,$230.05,$225.13,$229.07,"$5,228,310,135","$25,570,190,315",ETH
434,"Apr 01, 2020","$6,437.32","$6,612.57","$6,202.37","$6,606.78","$40,346,426,266","$120,903,014,693",BTC
927,"Sep 24, 2020",$320.99,$351.46,$318.55,$349.36,"$15,714,304,470","$39,384,347,822",ETH


### 4- Obtain Top Gainers and Losers

In [6]:
def top_gainers_or_losers(token_type:str="Top Gainers",select_period:bool=False,period:str=None)->pd.DataFrame:
    """
    Function to obtain top gainers or losers
        args:
             token_type= string to specify whether to look for Top Gainers or Top Losers
             select_period= bool to decid whether or not to select a period
             period= If select_period==True, then specify the timeframe to look for (it can be 1h, 24h, 7d or 30d) 
        returns: pandas dataframe with results
    """
    # Identify the button to select cryptocurrency data
    button = browser.find_elements_by_xpath("//span[contains(text(),'Cryptocurrencies')]")[0]
    
    # Instance of the ActionChains class with the browser object
    action=ActionChains(browser)
    
    # Hover over the button
    action.move_to_element(button).perform()

    time.sleep(np.random.uniform(1.5,2))
    
    # Select the Gainers and Losers section
    browser.find_element_by_xpath("//h6[contains(text(),'Gainers & Losers')]").click()
    
    print(f"Looking for {token_type}...")
    
    # In case a period will be selected, look for all the possible options
    # This is to be used for troubleshooting...
    if select_period==True:
        
        # Identify and click the timeframe menu
        timeframes=browser.find_element_by_xpath("//div[@class='sc-16r8icm-0 tu1guj-0 XdIOT']")
        timeframes.click()
        
        time.sleep(np.random.uniform(1.5,2.2))
        
        # Get the path for the timeframes options
        options_path=timeframes.find_elements_by_xpath("..//button")
        
        # List to save options
        timeframe_options=[]
        
        # Extract the options
        for option in options_path:
            option_text=option.get_attribute("textContent")
            timeframe_options.append(option_text)
        print("Available timeframes:",timeframe_options)
        
        # Select the timeframe
        timeframes.find_element_by_xpath(f"..//button[contains(text(),'{period}')]").click()
        print(f"The period selected is: {period}")

    else:
        print("Since no period was specified, the website default of 24h will be used")

    time.sleep(np.random.uniform(2,2.2))
    
    # Identify the columns and rows paths for the desired token_type (Top Gainers or Top Losers)
    columns_path = browser.find_elements_by_xpath(f"//div/h3[contains(text(),'{token_type}')]/..//table[contains(@class,'cmc-table')]//th")
    rows_path = browser.find_elements_by_xpath(f"//div/h3[contains(text(),'{token_type}')]/..//table[contains(@class,'cmc-table')]//tr/td")
    
    # Get the column names
    columns=[]
    for column in columns_path:
        columns.append(column.get_attribute("textContent"))

    n_columns=len(columns)
    
    # Get the rows values
    rows_values=[]
    for row in rows_path:
        rows_values.append(row.get_attribute("textContent"))
    
    # Extract the tokens
    tokens_path=browser.find_elements_by_xpath(f"//div/h3[contains(text(),\'{token_type}')]/..//table[contains(@class,'cmc-table')]\
    //tr/td//p[@color='text3']")

    tokens=[]
    for token in tokens_path:
        tokens.append(token.get_attribute("textContent"))

    # Specify the size of the chunks to partition the list of row values
    rows_array=np.array(rows_values)
    size_of_chunks=len(rows_array)/n_columns

    # Make the partitions
    partitions=np.array_split(rows_array, size_of_chunks)

    # Create the dataframe and add the token column
    df=pd.DataFrame(data=partitions,columns=columns)
    df["Token"]=tokens

    # Clean the Name column that had the Token name concatenated at the end
    df["Name"]=df.apply(lambda s: re.sub(f'{s["Token"]}$', "", s["Name"]),axis=1)
    
    return(df)

In [7]:
losers=top_gainers_or_losers(token_type="Top Losers",select_period=True,period="30d")

Looking for Top Losers...
Available timeframes: ['1h', '24h', '7d', '30d']
The period selected is: 30d


In [8]:
losers.head()

Unnamed: 0,#,Name,Price,30d,Volume(24h),Token
0,1,unilock.network1,$0.05934,100.00%,"$222,444",UNL
1,2,Crypto Village Accelerator2,$0.000000122,99.90%,"$70,737",CVA
2,3,Zurcoin3,$0.01479,99.79%,"$76,906",ZUR
3,4,3x Long Bitcoin Cash Token4,$0.09031,99.29%,"$108,277",BCHBULL
4,5,DOTDOWN5,$0.007507,99.25%,"$6,549,471",DOTDOWN


In [9]:
gainers=top_gainers_or_losers(token_type="Top Gainers",select_period=True,period="24h")

Looking for Top Gainers...
Available timeframes: ['1h', '24h', '7d', '30d']
The period selected is: 24h


In [10]:
gainers.head()

Unnamed: 0,#,Name,Price,24h,Volume(24h),Token
0,1,ZUM TOKEN1,$0.00002475,378.95%,"$82,910",ZUM
1,2,Commercial Data Storage2,$0.01357,229.80%,"$208,125",CDS
2,3,YAMv23,$41.09,217.30%,"$114,593",YAMV2
3,4,SkyBorn4,$0.000002754,175.61%,"$151,440",SKYBORN
4,5,PeepoCoin5$PEEPO,$0.00000161,132.32%,"$79,888",$PEEPO
