# Scrape crypto currencies from coinmarketcap
https://coinmarketcap.com/

## Import python standard modules

In [1]:
import sys       # sys.path
import os        # os stuff
import codecs    # codecs.open
import json      # json.dump/load
import itertools # itertools.chain
import datetime  # datetime.utcnow, ...
import csv       # csv.writer/reader
import time      # time.sleep

# data science module
import pandas as pd
import numpy as np

__Constants and helper functions__

In [2]:
def getWorkingDir(path='.'):
    """Return the root of the git working tree that contains *path*.

    Starting at *path* (made absolute), walk up through the parent
    directories and return the first one that holds a ``.git`` entry.

    Raises AssertionError if *path* is empty or if no directory up to the
    filesystem root contains ``.git``.
    """
    # raised explicitly (not via ``assert``) so the check survives ``python -O``
    if not path:
        raise AssertionError("path must not be empty")
    current = os.path.abspath(path)
    while True:
        if os.path.exists(os.path.join(current, '.git')):
            return current
        parent = os.path.dirname(current)
        # dirname() of a filesystem root is the root itself ("/" as well as
        # Windows drive roots), so an unchanged path means we ran out of parents
        if parent == current:
            raise AssertionError(
                "no .git entry found in {!r} or any parent directory".format(path))
        current = parent

# BASEDIR is path to the working directory of this git repository
# (found by getWorkingDir, starting from the current directory)
BASEDIR = getWorkingDir()
# directory whose sub-directories are put on sys.path by addModulePaths
EXTMODULES = os.path.join(BASEDIR, "ext")
# directory that getPath resolves all cache file names against
CACHEDIR = os.path.join(BASEDIR, "cache")
# NOTE(review): COINS and TOKENS are not referenced in the visible part of
# this file - confirm they are still needed
COINS = "coins"
TOKENS = "tokens"
# file name (inside CACHEDIR) of the merged csv written by singleCsv
CRYPTOMARKETSNAME = "crypto-markets.csv"

def getPath(filename, createDir=False):
    """Return the path of *filename* inside the cache directory.

    *filename* is joined onto CACHEDIR and may itself contain
    sub-directories. If *createDir* is true, the directory that will hold
    the file is created first (including missing parents).
    """
    path = os.path.join(CACHEDIR, filename)
    if createDir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test
        os.makedirs(os.path.dirname(path), exist_ok=True)
    return path

## Import third party modules
* https://github.com/dahuebi/coinmarketcap-history.git
* https://github.com/dahuebi/coinmarketcap.git

In [3]:
# set path for thirdparty modules
def addModulePaths():
    """Put the bundled third-party module directories at the front of sys.path.

    Each directory is inserted at index 0 in turn, so the last name listed
    ends up first on the search path. Returns None.
    """
    for moduleDir in ("coinmarketcap-scraper", "coinmarketcap-history"):
        sys.path.insert(0, os.path.join(EXTMODULES, moduleDir))
# called at module level so the paths are set before the third-party imports;
# the name only holds the call's return value (None)
USED_FOR_IMPORT_BY_IPYN = addModulePaths()

# import thirdparty modules
import coinmarketcap
import coinmarketcap_usd_history

### coinmarketcap-scraper wrapper
Scrape __coins__ and __tokens__ from __coinmarketcap.com__

In [4]:
# please see https://github.com/dahuebi/coinmarketcap.git
def scrapeCoinList():
    """Request the complete 'coins' listing and return its parsed entries."""
    # reset the scraper's last-request marker before asking for the page
    # (presumably its request throttling state - see the coinmarketcap module)
    coinmarketcap.lastReqTime = None
    page = coinmarketcap.requestList('coins', 'all')
    return coinmarketcap.parseList(page, 'currencies')

def scrapeTokenList():
    """Request the complete 'tokens' listing and return its parsed entries."""
    # reset the scraper's last-request marker before asking for the page
    # (presumably its request throttling state - see the coinmarketcap module)
    coinmarketcap.lastReqTime = None
    page = coinmarketcap.requestList('tokens', 'all')
    return coinmarketcap.parseList(page, 'assets')

### coinmarketcap_usd_history wrapper
Scrape __coin/token__ data like _open_, _high_, _low_, _close_, _volume_, _marketcap_.

In [5]:
# please see https://github.com/dahuebi/coinmarketcap-history.git
def downloadHistoricalData(currency, startDate, endDate):
    """Download the historical data of *currency* between two dates.

    *startDate* and *endDate* may be ``datetime.datetime`` objects, which
    are formatted as ``YYYYMMDD``; any other value is handed to
    coinmarketcap_usd_history unchanged.

    Returns the *header* and *rows* extracted from the downloaded page.
    """
    def asHistoryDate(value):
        # only datetime objects are converted, everything else passes through
        if not isinstance(value, datetime.datetime):
            return value
        return value.strftime("%Y%m%d")

    firstDay = asHistoryDate(startDate)
    lastDay = asHistoryDate(endDate)
    page = coinmarketcap_usd_history.download_data(currency, firstDay, lastDay)
    header, rows = coinmarketcap_usd_history.extract_data(page)
    return header, rows

## Scrape currencies

__Cache__ for the currencies, __json__ encoded files.

In [6]:
def readCachedCurrenciesFile(filename):
    """Read the cached currencies stored in the cache file *filename*.

    Returns the decoded JSON content of the file. An empty list is returned
    when the file does not exist or does not contain valid JSON.
    """
    cachePath = getPath(filename)
    if not os.path.exists(cachePath):
        return []
    with codecs.open(cachePath, "r", encoding="UTF-8") as cacheFile:
        try:
            cached = json.load(cacheFile)
        except json.JSONDecodeError:
            # an unreadable cache is treated like a missing one
            cached = []
    return cached

def writeCachedCurrenciesFile(filename, data):
    """Store *data* JSON encoded in the cache file *filename*.

    The directory for the file is created if it is missing. The file is
    written UTF-8 encoded with an indentation of four spaces.
    """
    cachePath = getPath(filename, createDir=True)
    with codecs.open(cachePath, "w", encoding="UTF-8") as cacheFile:
        json.dump(data, cacheFile, indent=4)

The currencies scrape function.

In [7]:
def scrapeCurrencies():
    """Return the coin and token lists, scraping only what is not cached.

    Each list is first read from its cache file (``coins.txt`` and
    ``tokens.txt``). If that yields nothing, the list is scraped and
    written to the cache file.

    Returns the tuple *(coins, tokens)*.
    """
    def cachedOrScraped(cacheName, scrape):
        # prefer the cache, fall back to scraping and refresh the cache
        entries = readCachedCurrenciesFile(cacheName)
        if entries:
            return entries
        entries = scrape()
        writeCachedCurrenciesFile(cacheName, entries)
        return entries

    coins = cachedOrScraped("coins.txt", scrapeCoinList)
    tokens = cachedOrScraped("tokens.txt", scrapeTokenList)
    return coins, tokens

## Scrape historical data

__Cache__ for the historical data, __csv__.

In [8]:
def loadCurrencyFromCsv(currency):
    """Read the cached csv file ``<currency>.csv``.

    Returns the tuple *(header, rows)*: the first csv line and a list of
    all remaining lines. Both are empty lists if the file does not exist;
    the header is an empty list if the file has no lines.
    """
    csvPath = getPath("{}.csv".format(currency))
    if not os.path.exists(csvPath):
        return [], []
    with codecs.open(csvPath, "r", encoding="UTF-8") as csvFile:
        lines = csv.reader(csvFile)
        # the default covers a file without any line
        header = next(lines, [])
        rows = list(lines)
    return header, rows

def saveCurrencyToCsv(currency, header, data):
    """Save historical data for the *currency*.

    Writes *header* followed by the rows of *data* to the cache file
    ``<currency>.csv``, creating the cache directory if needed. The rows
    are written sorted by their first column in descending order; *data*
    itself is not modified.
    """
    path = getPath("{}.csv".format(currency), createDir=True)
    newData = sorted(data, key=lambda row: row[0], reverse=True)
    with codecs.open(path, "w", encoding="UTF-8") as fp:
        # QUOTE_NONE: fields are written verbatim, never quoted
        writer = csv.writer(fp, quoting=csv.QUOTE_NONE)
        writer.writerow(header)
        writer.writerows(newData)

The function to download the historical data for one currency.

In [18]:
def downloadCurrency(currency):
    """Download historical data for a single *currency*.

    *currency* is a mapping; its ``"slug"`` entry names both the currency to
    request and the cache file. Only days after the newest cached date and
    up to the current UTC day are requested, and nothing is requested when
    the cache already reaches yesterday.

    New rows are appended to the cached rows and the cache file is
    rewritten. Returns None.

    Raises SystemExit if the download fails on all three attempts.
    """
    def parseDate(s):
        return datetime.datetime.strptime(s, "%Y-%m-%d")

    slug = currency["slug"]
    # default start date, used when nothing is cached yet
    startDate = parseDate("2001-01-01")
    # current UTC datetime, floored to the day
    endDate = datetime.datetime.utcnow()
    endDate = endDate.replace(hour=0, minute=0, second=0, microsecond=0)

    # the cached header is not needed, it is replaced by the downloaded one
    _, data = loadCurrencyFromCsv(slug)
    # continue after the newest date that is already cached
    for row in data:
        startDate = max(startDate, parseDate(row[0]))
    # advance by one day
    startDate += datetime.timedelta(days=1)
    if startDate >= endDate:
        return

    header, newData = None, None
    for retry in range(3, 0, -1):
        try:
            header, newData = downloadHistoricalData(slug, startDate, endDate)
            # success: stop here instead of repeating the request
            break
        except SystemExit:
            # a failed download surfaces as SystemExit here (presumably the
            # third-party module exits on errors - verify); re-raise only
            # once the last attempt has failed
            if retry == 1:
                raise
            time.sleep(3)

    if not newData:
        return
    # drop the last column (the "Average" column according to the
    # original comments)
    header = header[:-1]
    for row in newData:
        row = row[:-1]
        # convert the module's date format ("%b %d %Y") to "%Y-%m-%d"
        row[0] = datetime.datetime.strptime(row[0], "%b %d %Y").strftime("%Y-%m-%d")
        data.append(row)
    saveCurrencyToCsv(slug, header, data)

Put all data into a single csv file.

In [10]:
def singleCsv(currencies):
    """Merge all currencies into a single csv.

    *currencies* is an iterable of mappings with the entries ``"slug"``
    (names the cache file ``<slug>.csv``) and ``"name"``. The rows of every
    existing cache file are copied to the merged file (CRYPTOMARKETSNAME in
    the cache directory) with the currency name inserted as second column
    and every ``-`` field replaced by ``0``. The header is taken from the
    first cache file that has one, with ``Name`` inserted as second column.

    Currencies without a cache file, or with an empty one, are skipped.
    Prints the number of merged currencies.
    """
    cryptoMarketsPath = getPath(CRYPTOMARKETSNAME, createDir=True)
    cnt = 0
    headerWritten = False
    with codecs.open(cryptoMarketsPath, "w", encoding="UTF-8") as fpOut:
        # NOTE(review): with QUOTE_NONE and no escapechar a field containing
        # a comma (e.g. in a currency name) makes the writer raise csv.Error
        writer = csv.writer(fpOut, quoting=csv.QUOTE_NONE)
        for currency in currencies:
            name = currency["name"]
            path = getPath("{}.csv".format(currency["slug"]))
            # ignore missing files
            if not os.path.exists(path):
                continue
            with codecs.open(path, "r", encoding="UTF-8") as fp:
                reader = csv.reader(fp)
                hdr = next(reader, None)
                if hdr is None:
                    # empty cache file: nothing to merge
                    continue
                cnt += 1
                if not headerWritten:
                    # the merged file gets its header from the first currency
                    hdr.insert(1, "Name")
                    writer.writerow(hdr)
                    headerWritten = True
                for row in reader:
                    row.insert(1, name)
                    # replace '-' with 0
                    writer.writerow(["0" if r == "-" else r for r in row])
    print("Currencies in {}: {}".format(CRYPTOMARKETSNAME, cnt))

## Run the code
The result is a filled __CACHE__ directory.
* _coins.txt_/_tokens.txt_ contain the coin and token information
* _coin/token name.csv_ contain the historical data

In [None]:
# scrape coins and tokens
coins, tokens = scrapeCurrencies()
print("# coins:  {:5}".format(len(coins)))
print("# tokens: {:5}".format(len(tokens)))
currencies = coins + tokens

totalCnt = len(currencies)
remaining = totalCnt
# get historical data for coins and tokens
for currency in currencies:
    name = currency["name"]
    # "\r" plus trailing blanks overwrite the previous progress line in place
    print("\rget {:10} ({:5}/{:5}){}".format(name, remaining, totalCnt, " "*20),
          end="", flush=True)
    downloadCurrency(currency)
    remaining -= 1

# flush belongs to print(), not to str.format()
print("\rDone ({}){}".format(totalCnt, " "*40), flush=True)
# merge the per-currency csv files into one
singleCsv(currencies)

# coins:    896
# tokens:   564
get POS        (  203/ 1460)                    

## Load the cached data into a pandas dataframe

In [12]:
def loadHistoricalData():
    """Load the historical data of all currencies into a pandas dataframe.

    Reads the merged csv (CRYPTOMARKETSNAME in the cache directory) with
    its ``Date`` column parsed as dates and used as index. The column
    ``Market Cap`` is renamed to ``Market``, then all column names are
    lower-cased and the index is named ``date``.

    Returns the dataframe.
    """
    # TODO: may add filtering capabilities
    # use the shared constant so reader and writer (singleCsv) cannot drift
    path = getPath(CRYPTOMARKETSNAME)
    df = pd.read_csv(path, parse_dates=['Date'], index_col='Date')
    df = df.rename(columns={"Market Cap": "Market"})
    df = df.rename(str.lower, axis='columns')
    df.index.name = "date"
    return df

In [13]:
# load the merged csv; the bare `df` is the last expression of the cell,
# so the notebook displays the frame
df = loadHistoricalData()
df

Unnamed: 0_level_0,name,open,high,low,close,volume,market
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-26,BTC,11256.000000,11656.700000,10470.300000,11171.400000,9746200000,189398000000
2018-01-25,BTC,11421.700000,11785.700000,11057.400000,11259.400000,8873170000,192163000000
2018-01-24,BTC,10903.400000,11501.400000,10639.800000,11359.400000,9940990000,183419000000
2018-01-23,BTC,10944.500000,11377.600000,10129.700000,10868.400000,9660610000,184087000000
2018-01-22,BTC,11633.100000,11966.400000,10240.200000,10931.400000,10537400000,195645000000
2018-01-21,BTC,12889.200000,12895.900000,11288.200000,11600.100000,9935180000,216740000000
2018-01-20,BTC,11656.200000,13103.000000,11656.200000,12899.200000,11801700000,195979000000
2018-01-19,BTC,11429.800000,11992.800000,11172.100000,11607.400000,10740400000,192150000000
2018-01-18,BTC,11198.800000,12107.300000,10942.500000,11474.900000,15020400000,188242000000
2018-01-17,BTC,11431.100000,11678.000000,9402.290000,11188.600000,18830600000,192123000000
