# Scrape crypto currencies from coinmarketcap
https://coinmarketcap.com/

## Import python standard modules

In [1]:
import sys       # sys.path
import os        # os stuff
import codecs    # codecs.open
import json      # json.dump/load
import itertools # itertools.chain
import datetime  # datetime.utcnow, ...
import csv       # csv.writer/reader
import time      # time.sleep

# data science module
import pandas as pd
import numpy as np

__Constants and helper functions__

In [2]:
def getWorkingDir(path='.'):
    """Return the closest directory at or above *path* that contains ``.git``.

    path: directory the upward search starts from (default: current directory).

    Raises AssertionError if *path* is empty or if the filesystem root is
    reached without finding a ``.git`` entry.
    """
    # raise explicitly: an ``assert`` statement is removed under ``python -O``
    if not path:
        raise AssertionError("getWorkingDir: empty path")
    current = os.path.abspath(path)
    while True:
        if os.path.exists(os.path.join(current, '.git')):
            return current
        parent = os.path.dirname(current)
        # dirname() of a filesystem root is the root itself (POSIX "/" as
        # well as a Windows drive root), so this ends the walk on any OS
        if parent == current:
            raise AssertionError(
                "getWorkingDir: no .git directory found above {!r}".format(path))
        current = parent

# BASEDIR is path to the working directory of this git repository
# (the directory containing ".git", as located by getWorkingDir)
BASEDIR = getWorkingDir()
# directory of the bundled third-party modules (put on sys.path by addModulePaths)
EXTMODULES = os.path.join(BASEDIR, "ext")
# directory below which getPath() places the cached files
CACHEDIR = os.path.join(BASEDIR, "cache")
# NOTE(review): COINS and TOKENS are not referenced in the visible part of this file
COINS = "coins"
TOKENS = "tokens"

def getPath(filename, createDir=True):
    """Return the path of *filename* inside CACHEDIR.

    filename:  file name, relative to CACHEDIR.
    createDir: when true (the default), create the parent directory of the
               returned path if it does not exist yet.
    """
    path = os.path.join(CACHEDIR, filename)
    if createDir:
        # exist_ok: no error if the directory already exists or is created
        # by someone else between a check and the creation
        os.makedirs(os.path.dirname(path), exist_ok=True)
    return path

## Import third party modules
* https://github.com/dahuebi/coinmarketcap-history.git
* https://github.com/dahuebi/coinmarketcap.git

In [3]:
# make the bundled third-party modules importable
def addModulePaths():
    """Put the third-party module directories below EXTMODULES first on sys.path."""
    # every entry is inserted at index 0, so the last name ends up in front
    for moduleDir in ("coinmarketcap-scraper", "coinmarketcap-history"):
        sys.path.insert(0, os.path.join(EXTMODULES, moduleDir))
# the call is made at module level, before the third-party imports that follow
USED_FOR_IMPORT_BY_IPYN = addModulePaths()

# import thirdparty modules
import coinmarketcap
import coinmarketcap_usd_history

### coinmarketcap-scraper wrapper
Scrape __coins__ and __tokens__ from __coinmarketcap.com__

In [4]:
# wrapper around https://github.com/dahuebi/coinmarketcap.git
def scrapeCoinList():
    """Request the list of all coins and return it parsed."""
    # reset the module's lastReqTime attribute before the request
    # NOTE(review): presumably request-throttling state - confirm in the module
    coinmarketcap.lastReqTime = None
    listHtml = coinmarketcap.requestList('coins', 'all')
    return coinmarketcap.parseList(listHtml, 'currencies')

def scrapeTokenList():
    """Request the list of all tokens and return it parsed."""
    # reset the module's lastReqTime attribute before the request
    # NOTE(review): presumably request-throttling state - confirm in the module
    coinmarketcap.lastReqTime = None
    listHtml = coinmarketcap.requestList('tokens', 'all')
    return coinmarketcap.parseList(listHtml, 'assets')

### coinmarketcap_usd_history wrapper
Scrape __coin/token__ data like _open_, _high_, _low_, _close_, _volume_, _marketcap_.

In [5]:
# wrapper around https://github.com/dahuebi/coinmarketcap-history.git
def downloadHistoricalData(currency, startDate, endDate):
    """Download the historical data of *currency* between two dates.

    startDate/endDate: datetime.datetime values are converted to "YYYYMMDD"
    strings, any other value is handed over unchanged.
    Returns the (header, rows) pair unpacked from
    coinmarketcap_usd_history.extract_data.
    """
    def toModuleDate(value):
        # date format handed to coinmarketcap_usd_history
        if not isinstance(value, datetime.datetime):
            return value
        return value.strftime("%Y%m%d")

    firstDay = toModuleDate(startDate)
    lastDay = toModuleDate(endDate)
    page = coinmarketcap_usd_history.download_data(currency, firstDay, lastDay)
    columns, records = coinmarketcap_usd_history.extract_data(page)
    return columns, records

## Scrape currencies

__Cache__ for the currencies, __json__ encoded files.

In [6]:
def readCachedCurrenciesFile(filename):
    """Load the JSON data cached under *filename*.

    Returns the decoded content, or an empty list when the file does not
    exist or does not contain valid JSON.
    """
    cachePath = getPath(filename)
    if os.path.exists(cachePath):
        with codecs.open(cachePath, "r", encoding="UTF-8") as cacheFile:
            try:
                return json.load(cacheFile)
            except json.JSONDecodeError:
                # unreadable cache: treated like a missing one
                pass
    return []

def writeCachedCurrenciesFile(filename, data):
    """Write *data* JSON encoded to the cache file *filename*.

    filename: file name, resolved through getPath().
    data:     JSON-serialisable object; written with an indent of 4.
    """
    path = getPath(filename)
    # create the target directory here rather than through a getPath()
    # keyword argument, so writing works whatever signature getPath() has
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with codecs.open(path, "w", encoding="UTF-8") as fp:
        json.dump(data, fp, indent=4)

The currencies scrape function.

In [7]:
def scrapeCurrencies():
    """Return the coin and token lists, scraping whatever is not cached.

    Each list is read from its cache file first; when that yields nothing
    the list is scraped and the result written to the cache file.
    Returns the tuple (coins, tokens).
    """
    def cachedOrScraped(cacheFilename, scrape):
        currencies = readCachedCurrenciesFile(cacheFilename)
        if currencies:
            return currencies
        currencies = scrape()
        writeCachedCurrenciesFile(cacheFilename, currencies)
        return currencies

    coins = cachedOrScraped("coins.txt", scrapeCoinList)
    tokens = cachedOrScraped("tokens.txt", scrapeTokenList)
    return coins, tokens

## Scrape historical data

__Cache__ for the historical data, __csv__.

In [8]:
def loadCurrencyFromCsv(currency):
    """Read the cached csv file of *currency*.

    Returns (header, rows): the first csv line and the list of all
    following lines. Both are empty lists if the file does not exist.
    """
    csvPath = getPath("{}.csv".format(currency))
    if not os.path.exists(csvPath):
        return [], []
    with codecs.open(csvPath, "r", encoding="UTF-8") as csvFile:
        records = csv.reader(csvFile)
        # a file without any line has no header: fall back to an empty one
        columns = next(records, [])
        rows = list(records)
    return columns, rows

def saveCurrencyToCsv(currency, header, data):
    """Save historical data for the *currency*.

    currency: name of the csv file (without extension), resolved via getPath().
    header:   column names, written as the first line.
    data:     rows; written sorted by their first column in descending order.
    """
    path = getPath("{}.csv".format(currency))
    # create the target directory here rather than through a getPath()
    # keyword argument, so saving works whatever signature getPath() has
    os.makedirs(os.path.dirname(path), exist_ok=True)
    newData = sorted(data, key=lambda row: row[0], reverse=True)
    with codecs.open(path, "w", encoding="UTF-8") as fp:
        writer = csv.writer(fp, quoting=csv.QUOTE_NONE)
        writer.writerow(header)
        writer.writerows(newData)

The function to download the historical data for one currency.

In [9]:
def downloadCurrency(currency):
    """Download historical data for a single *currency*.

    currency: mapping with a "slug" entry; the slug names the cache file
              and is the identifier passed to downloadHistoricalData.

    Only days after the newest cached date and before the current UTC day
    are requested. New rows are appended to the cached rows and the csv
    cache is rewritten. Nothing is written when there is nothing to request
    or the download yields no rows.

    Raises SystemExit if all three download attempts end in SystemExit.
    """
    def parseDate(text):
        return datetime.datetime.strptime(text, "%Y-%m-%d")

    slug = currency["slug"]
    # set default startTime
    startDate = parseDate("2001-01-01")
    # current UTC datetime, floored to the day
    endDate = datetime.datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0)

    # try to load data from cache; the newest cached date wins
    header, data = loadCurrencyFromCsv(slug)
    for row in data:
        startDate = max(startDate, parseDate(row[0]))
    # start one day after the newest known date
    startDate += datetime.timedelta(days=1)
    if startDate >= endDate:
        return

    header, newData = None, None
    for retry in range(3, 0, -1):
        try:
            header, newData = downloadHistoricalData(slug, startDate, endDate)
        except SystemExit:
            # NOTE(review): the download helper apparently reports failures
            # through SystemExit - confirm in coinmarketcap_usd_history
            if retry == 1:
                raise
            time.sleep(3)
        else:
            # success: stop here instead of downloading the same data again
            break

    if not newData:
        return
    # drop the last column (Average) from header and rows
    header = header[:-1]
    for row in newData:
        row = row[:-1]
        # convert the module's date format to the one used in the cache
        row[0] = datetime.datetime.strptime(row[0], "%b %d %Y").strftime("%Y-%m-%d")
        data.append(row)
    saveCurrencyToCsv(slug, header, data)

## Run the code
The result is a filled __CACHE__ directory.
* _coins.txt_/_tokens.txt_ contain the coin and token information
* _&lt;slug&gt;.csv_ files (one per coin/token) contain the historical data

In [10]:
# scrape coins and tokens
coins, tokens = scrapeCurrencies()
print("# coins:  {:5}".format(len(coins)))
print("# tokens: {:5}".format(len(tokens)))

totalCnt = len(coins) + len(tokens)
remaining = totalCnt
# get historical data for coins and tokens
for currency in itertools.chain(coins, tokens):
    name = currency["name"]
    # "\r" rewrites the same output line; the trailing blanks erase
    # leftovers of a longer previous name
    print("\rget {:10} ({:5}/{:5}){}".format(name, remaining, totalCnt, " "*20),
          end="", flush=True)
    downloadCurrency(currency)
    remaining -= 1
# flush is an argument of print(), not of str.format()
print("\rDone ({}){}".format(totalCnt, " "*40), flush=True)

# coins:    896
# tokens:   564
Done (1460)                                        


## Load the cached data into a pandas dataframe

In [11]:
def loadHistoricalData(minMarketCap=1000*1000):
    """Load the cached historical data of all currencies into one DataFrame.

    minMarketCap: minimal market capitalisation; a currency is included
                  only if at least one of its cached rows reaches this value.

    Returns a DataFrame indexed by "Date" holding one "<name> <column>"
    column per currency and csv column. Missing values are filled with the
    previous row's value where available and with 0 otherwise. An empty
    DataFrame is returned if no currency qualifies.
    """
    coins, tokens = scrapeCurrencies()
    currencies = coins + tokens
    dfTot = pd.DataFrame()
    cnt = 0
    remaining = len(currencies)
    for currency in currencies:
        remaining -= 1
        name = currency['name']
        slug = currency['slug']
        print("\r{:5} {:5} {}{}".format(remaining, cnt, name, " "*10), end="", flush=True)
        header, rows = loadCurrencyFromCsv(slug)
        if not rows:
            continue
        # keep the first column name, prefix all others with the currency name
        header = [header[0]] + ["{} {}".format(name, x) for x in header[1:]]
        df = pd.DataFrame(np.array(rows), columns=header)
        # "-" cells are counted as 0 so the columns convert to numbers
        df = df.replace("-", 0)
        df[header[1:]] = df[header[1:]].apply(pd.to_numeric)
        colMarketCap = "{} Market Cap".format(name)
        if not (df[colMarketCap] >= minMarketCap).any():
            continue
        df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
        dfTot = df if dfTot.empty else pd.merge(dfTot, df, on="Date", how='outer', sort=True)
        cnt += 1
    # flush is an argument of print(), not of str.format()
    print("\rDone {:5}{}".format(cnt, " "*40), flush=True)
    if dfTot.empty:
        # nothing qualified: there is no "Date" column to index by
        return dfTot
    dfTot = dfTot.ffill()                # use previous values if available
    dfTot = dfTot.fillna(value=0)        # fill missing data with 0
    dfTot = dfTot.set_index("Date")      # use "Date" column as index
    return dfTot

In [12]:
# threshold handed to loadHistoricalData: 1000 * 1000 * 1000
minMarketCap = 10 ** 9
df = loadHistoricalData(minMarketCap)
# last expression of the cell: the last rows of the frame
df.tail()

Done    52                                        


Unnamed: 0_level_0,BTC Open,BTC High,BTC Low,BTC Close,BTC Volume,BTC Market Cap,ETH Open,ETH High,ETH Low,ETH Close,...,WAX Low,WAX Close,WAX Volume,WAX Market Cap,DENT Open,DENT High,DENT Low,DENT Close,DENT Volume,DENT Market Cap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-20,11656.2,13103.0,11656.2,12899.2,11801700000,195979000000,1044.95,1167.11,1044.95,1155.15,...,1.14,1.16,2496310.0,612530000.0,0.045602,0.048224,0.040863,0.046102,14949800.0,484054000.0
2018-01-21,12889.2,12895.9,11288.2,11600.1,9935180000,216740000000,1155.68,1155.68,1021.5,1049.58,...,0.9044,1.03,4370490.0,573869000.0,0.046414,0.047101,0.036696,0.040003,11811600.0,492672000.0
2018-01-22,11633.1,11966.4,10240.2,10931.4,10537400000,195645000000,1055.35,1089.1,930.74,1003.26,...,0.848981,0.911324,9483130.0,508694000.0,0.040218,0.041826,0.031763,0.036874,10722300.0,426905000.0
2018-01-23,10944.5,11377.6,10129.7,10868.4,9660610000,184087000000,1004.17,1023.23,920.54,986.23,...,0.848617,0.96631,8204480.0,451960000.0,0.036781,0.03731,0.031261,0.035015,8258470.0,390419000.0
2018-01-24,10903.4,11501.4,10639.8,11359.4,9940990000,183419000000,987.48,1062.44,965.81,1058.78,...,0.848154,0.900626,10075800.0,475476000.0,0.035049,0.036288,0.032913,0.035048,9564350.0,372031000.0
