# Scrape crypto currencies from coinmarketcap
https://coinmarketcap.com/

## Import python standard modules

In [8]:
import sys       # sys.path
import os        # os stuff
import codecs    # codecs.open
import json      # json.dump/load
import itertools # itertools.chain
import datetime  # datetime.utcnow, ...
import csv       # csv.writer/reader

__Constants and helper functions__

In [9]:
def getWorkingDir(path='.'):
    """Return the nearest directory at or above *path* that contains ``.git``.

    The search starts at the absolute form of *path* and moves up one parent
    directory at a time.

    Raises:
        AssertionError: if *path* is empty, or if the filesystem root is
            reached without finding a ``.git`` entry. The exception type is
            the one the earlier ``assert``-based check raised, but it is now
            raised explicitly so the check is not stripped by ``python -O``.
    """
    if not path:
        raise AssertionError("path must not be empty")
    current = os.path.abspath(path)
    while True:
        if os.path.exists(os.path.join(current, '.git')):
            return current
        parent = os.path.dirname(current)
        # dirname() of a filesystem root returns the root itself ("/" as well
        # as a Windows drive root), so an unchanged path means we are done.
        if parent == current:
            raise AssertionError(
                "no .git entry found at or above {!r}".format(path))
        current = parent

# BASEDIR is the path to the working directory of this git repository
BASEDIR = getWorkingDir()
# directory the third-party modules are expected in
EXTMODULES = os.path.join(BASEDIR, "ext")
# directory all cache files are read from and written to
CACHEDIR = os.path.join(BASEDIR, "cache")
COINS = "coins"
TOKENS = "tokens"

# exist_ok=True creates the directory only when it is missing, without the
# race of a separate "exists" check followed by "makedirs"
os.makedirs(CACHEDIR, exist_ok=True)
def getPath(filename):
    """Return the path of *filename* inside ``CACHEDIR``."""
    cachedPath = os.path.join(CACHEDIR, filename)
    return cachedPath

## Import third party modules
* https://github.com/dahuebi/coinmarketcap-history.git
* https://github.com/dahuebi/coinmarketcap.git

In [10]:
# make the third-party module directories importable
def addModulePaths():
    """Prepend the third-party module directories to ``sys.path``.

    Both directories are looked up below ``EXTMODULES``. Returns ``None``.
    """
    # Each insert goes to index 0, so the directory listed last ends up
    # first on sys.path.
    for moduleDir in ("coinmarketcap-scraper", "coinmarketcap-history"):
        sys.path.insert(0, os.path.join(EXTMODULES, moduleDir))
# The call is made for its side effect on sys.path; the bound value is None.
USED_FOR_IMPORT_BY_IPYN = addModulePaths()

# import thirdparty modules
import coinmarketcap
import coinmarketcap_usd_history

### coinmarketcap-scraper wrapper
Scrape __coins__ and __tokens__ from __coinmarketcap.com__

In [11]:
# please see https://github.com/dahuebi/coinmarketcap.git
def scrapeCoinList():
    """Request the complete 'coins' listing and return the parsed result."""
    # Reset the module's lastReqTime before the request
    # (presumably its request-throttling state -- TODO confirm).
    coinmarketcap.lastReqTime = None
    listing = coinmarketcap.requestList('coins', 'all')
    return coinmarketcap.parseList(listing, 'currencies')

def scrapeTokenList():
    """Request the complete 'tokens' listing and return the parsed result."""
    # Reset the module's lastReqTime before the request
    # (presumably its request-throttling state -- TODO confirm).
    coinmarketcap.lastReqTime = None
    listing = coinmarketcap.requestList('tokens', 'all')
    return coinmarketcap.parseList(listing, 'assets')

### coinmarketcap_usd_history wrapper
Scrape __coin/token__ data like _open_, _high_, _low_, _close_, _volume_, _marketcap_.

In [12]:
# please see https://github.com/dahuebi/coinmarketcap-history.git
def downloadHistoricalData(currency, startDate, endDate):
    """Download and extract the historical data of *currency*.

    *startDate* and *endDate* may be ``datetime.datetime`` objects, which are
    rendered as ``YYYYMMDD`` strings; any other value is passed on unchanged.

    Returns the ``(header, rows)`` pair produced by
    ``coinmarketcap_usd_history.extract_data``.
    """
    def asHistoricalDate(value):
        # only datetime objects are converted; other values pass through as given
        if not isinstance(value, datetime.datetime):
            return value
        return value.strftime("%Y%m%d")

    firstDay = asHistoricalDate(startDate)
    lastDay = asHistoricalDate(endDate)
    page = coinmarketcap_usd_history.download_data(currency, firstDay, lastDay)
    columns, records = coinmarketcap_usd_history.extract_data(page)
    return columns, records

## Scrape currencies

__Cache__ for the currencies, __json__ encoded files.

In [13]:
def readCachedCurrenciesFile(filename):
    """Load the JSON content cached under *filename*.

    Returns the decoded JSON value, or an empty list when the cache file
    does not exist or does not contain valid JSON.
    """
    cacheFile = getPath(filename)
    if os.path.exists(cacheFile):
        with codecs.open(cacheFile, "r", encoding="UTF-8") as handle:
            try:
                cached = json.load(handle)
            except json.JSONDecodeError:
                # an unreadable cache is treated like a missing one
                cached = []
        return cached
    return []

def writeCachedCurrenciesFile(filename, data):
    """Store *data* as indented JSON in the cache file *filename*.

    An existing file of that name is overwritten.
    """
    cacheFile = getPath(filename)
    with codecs.open(cacheFile, "w", encoding="UTF-8") as handle:
        json.dump(data, handle, indent=4)

The currencies scrape function.

In [14]:
def scrapeCurrencies():
    """Return the ``(coins, tokens)`` lists, scraping whatever is not cached.

    Each list is read from its cache file first (``coins.txt`` and
    ``tokens.txt``). An empty or missing cache triggers a scrape, and the
    scraped result is written back to that cache file.
    """
    def cachedOrScraped(cacheName, scrape):
        # prefer the cache; fall back to scraping and refresh the cache
        entries = readCachedCurrenciesFile(cacheName)
        if entries:
            return entries
        entries = scrape()
        writeCachedCurrenciesFile(cacheName, entries)
        return entries

    coins = cachedOrScraped("coins.txt", scrapeCoinList)
    tokens = cachedOrScraped("tokens.txt", scrapeTokenList)
    return coins, tokens

## Scrape historical data

__Cache__ for the historical data, __csv__.

In [15]:
def loadCurrencyFromCsv(currency):
    """Read the cached csv file of *currency*.

    Returns ``(header, rows)``: the first csv row and the list of all
    remaining rows. Both are empty lists when the file does not exist or
    contains no rows.
    """
    csvPath = getPath("{}.csv".format(currency))
    if not os.path.exists(csvPath):
        return [], []
    with codecs.open(csvPath, "r", encoding="UTF-8") as handle:
        allRows = list(csv.reader(handle))
    if not allRows:
        return [], []
    # the first row is the header, everything after it is data
    return allRows[0], allRows[1:]

def saveCurrencyToCsv(currency, header, data):
    """Save historical data for the *currency*.

    Writes *header* followed by the rows of *data* to the csv cache file of
    *currency*, replacing any existing file. The rows are written sorted by
    their first column in descending order; *data* itself is not modified.
    """
    path = getPath("{}.csv".format(currency))
    newData = sorted(data, key=lambda row: row[0], reverse=True)
    with codecs.open(path, "w", encoding="UTF-8") as fp:
        # QUOTE_NONE: fields are written without any quoting
        writer = csv.writer(fp, quoting=csv.QUOTE_NONE)
        writer.writerow(header)
        # writerows replaces a list comprehension used only for side effects
        writer.writerows(newData)

The function to download the historical data for one currency.

In [16]:
def downloadCurrency(currency):
    """Download historical data for a single *currency*.

    *currency* is a mapping with a ``"slug"`` entry. Only dates after the
    newest date already in the csv cache are requested, up to the current
    UTC day. New rows are appended to the cached ones and the csv cache is
    rewritten. Returns ``None``; nothing is written when the cache is
    already up to date or the download yields no rows.
    """
    def parseDate(text):
        # cache dates are stored as YYYY-MM-DD
        return datetime.datetime.strptime(text, "%Y-%m-%d")

    slug = currency["slug"]
    # default start date, used when nothing is cached yet
    startDate = parseDate("2001-01-01")
    # Current UTC day at midnight. datetime.utcnow() is deprecated, so take
    # an aware "now" and drop tzinfo to stay comparable with the naive
    # startDate.
    endDate = datetime.datetime.now(datetime.timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

    # try to load data from cache and continue after its newest date
    header, data = loadCurrencyFromCsv(slug)
    for row in data:
        startDate = max(startDate, parseDate(row[0]))
    # the newest cached day is already present, start with the following one
    startDate += datetime.timedelta(days=1)
    if startDate >= endDate:
        return

    header, newData = downloadHistoricalData(slug, startDate, endDate)
    if not newData:
        return
    # drop the last column (the "Average" column) from header and rows
    header = header[:-1]
    for row in newData:
        row = row[:-1]
        # convert the downloaded date format (e.g. "Jan 22 2018") to the
        # YYYY-MM-DD format used in the cache
        row[0] = datetime.datetime.strptime(row[0], "%b %d %Y").strftime("%Y-%m-%d")
        data.append(row)
    saveCurrencyToCsv(slug, header, data)

## Run the code
The result is a filled __CACHE__ directory.
* _coins.txt_/_tokens.txt_ contain the coin and token information
* _coin/token name.csv_ contain the historical data

In [11]:
# scrape coins and tokens (served from the cache files when available)
coins, tokens = scrapeCurrencies()
print("coins: {}".format(len(coins)))
# report the number of tokens (this line used to print the coin count again)
print("tokens: {}".format(len(tokens)))

# get historical data for coins and tokens
for currency in itertools.chain(coins, tokens):
    print("downloading currency: {}".format(currency))
    downloadCurrency(currency)

896
546
bismuth 20180122 20180123
bitswift 20180113 20180123
cryptcoin 20171229 20180123
truckcoin 20180117 20180123
pascal-lite 20180117 20180123
sling 20180122 20180123
islacoin 20180121 20180123
sydpak 20180121 20180123
coffeecoin 20160521 20180123
win-coin 20180117 20180123
rabbitcoin 20180108 20180123
ocow 20180107 20180123
global-business-revolution 20180121 20180123
9coin 20180110 20180123
bt1-cst 20180111 20180123
bitusd 20180121 20180123
verify 20180121 20180123
bitqy 20180121 20180123
flixxo 20180121 20180123
mysterium 20180121 20180123
lockchain 20180121 20180123
primas 20180121 20180123
aventus 20180121 20180123
quantum 20180121 20180123
change 20180121 20180123
bitdice 20180121 20180123
mercury-protocol 20180121 20180123
oneroot-network 20180121 20180123
ongsocial 20180121 20180123
life 20180121 20180123
obits 20180121 20180123
dao-casino 20180121 20180123
uquid-coin 20180121 20180123
exchange-union 20180121 20180123
adshares 20180121 20180123
prochain 20180121 20180123
le

everus 20180121 20180123
musiconomi 20180121 20180123
ibtc 20180121 20180123
macro1 20180121 20180123
fapcoin 20180121 20180123
hodl-bucks 20180121 20180123
btcmoon 20180121 20180123
wi-coin 20180121 20180123
president-johnson 20180121 20180123
storjcoin-x 20180121 20180123
ethereum-lite 20180121 20180123
blockchain-index 20180121 20180123
ox-fina 20180121 20180123
minex 20180121 20180123
stex 20180121 20180123
bitcoin2x 20180121 20180123
matryx 20180121 20180123
internet-of-things 20180121 20180123
chronologic 20180121 20180123
hyper-tv 20180121 20180123
cash-poker-pro 20180121 20180123
egold 20180121 20180123
corion 20180121 20180123
uahpay 20180121 20180123
anryze 20180121 20180123
encryptotel-eth 20180121 20180123
president-trump 20180121 20180123
facecoin 20180121 20180123
soma 20180121 20180123
swapcoin 20180121 20180123
first-bitcoin-capital 20180121 20180123
10mtoken 20180121 20180123
teslacoilcoin 20180121 20180123
ebit 20180121 20180123
infinity-pay 20180121 20180123
sand-coi

In [116]:
import re
import pandas as pd
import numpy as np
def loadHistoricalData():
    """Load the cached historical data of all coins/tokens as one table.

    Returns ``(header, rows)``. *header* is ``["Name", "Based"]`` followed by
    the first non-empty csv header found; when no csv header is available
    (for example when there are no currencies) it is just
    ``["Name", "Based"]``. Every row is a cached csv row prefixed with the
    currency name and the "based" name.

    Raises:
        KeyError: if a currency's normalized symbol differs from its
            lower-cased name and no currency seen so far has that symbol as
            its slug.
    """
    coins, tokens = scrapeCurrencies()
    header = None
    data = []
    # currencies seen so far, keyed by slug
    slugs = {}
    for currency in coins + tokens:
        name = currency['name']
        slug = currency['slug']
        # normalize the symbol the same way slugs look: dashes, lower case
        symbol = currency['symbol'].replace(' ', '-').lower()
        based = ""
        slugs[slug] = currency
        if name.lower() != symbol:
            # treated as a token: look up the currency whose slug equals
            # the symbol (presumably the platform it is based on -- TODO confirm)
            based = slugs[symbol]['name']
        hdr, rows = loadCurrencyFromCsv(slug)
        if not header:
            header = hdr
        data.extend([name, based] + r for r in rows)
    # header is still None when the loop never ran; fall back to an empty
    # list instead of failing on list + None
    return ["Name", "Based"] + (header or []), data

#header, histData = loadHistoricalData()
#print(len(histData))
#from pprint import pprint
#pprint(histData[:10])
#pprint(histData[-10:])
def loadAsDf(minMarketCap=1000*1000*1000):
    """Merge the cached historical data of the larger currencies into one frame.

    A currency is included when at least one of its cached rows has a
    "<name> Market Cap" value of *minMarketCap* or more. Its columns (all
    but the first csv column) are prefixed with the currency name, "-"
    entries are replaced by 0, and the frames are outer-merged on "Date".

    Returns the merged ``pandas.DataFrame``, sorted by date by the merge;
    gaps are filled with the previous row's value where one exists and with
    0 otherwise. The frame is empty when no currency qualifies.
    """
    coins, tokens = scrapeCurrencies()
    currencies = coins + tokens
    dfTot = pd.DataFrame()
    for currency in currencies:
        name = currency['name']
        slug = currency['slug']
        header, rows = loadCurrencyFromCsv(slug)
        if not rows:
            continue
        # keep the first column name as is, prefix all others with the name
        header = [header[0]] + ["{} {}".format(name, x) for x in header[1:]]
        df = pd.DataFrame(np.array(rows), columns = header)
        # "-" marks a missing value in the cached data; treat it as 0
        df = df.replace("-", 0)
        df[header[1:]] = df[header[1:]].apply(pd.to_numeric)
        colMarketCap = "{} Market Cap".format(name)
        dfCap = df.loc[df[colMarketCap] >= minMarketCap]
        if dfCap.empty:
            continue
        print(name, len(df))
        df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
        dfTot = df if dfTot.empty else pd.merge(dfTot, df, on="Date", how='outer', sort=True)
    # ffill() replaces the deprecated fillna(method="ffill")
    dfTot = dfTot.ffill()                # use previous values if available
    dfTot = dfTot.fillna(value=0)        # fill missing data with 0
    return dfTot

# Build the merged frame with the default market-cap threshold and show its
# first rows.
df = loadAsDf()
print(df.head())
#print(df.tail())

BTC 1731
ETH 900
        Date  BTC Open  BTC High  BTC Low  BTC Close  BTC Volume  \
0 2013-04-28    135.30    135.98   132.10     134.21           0   
1 2013-04-29    134.44    147.49   134.00     144.54           0   
2 2013-04-30    144.00    146.93   134.05     139.00           0   
3 2013-05-01    139.00    139.89   107.72     116.99           0   
4 2013-05-02    116.38    125.60    92.28     105.21           0   

   BTC Market Cap  ETH Open  ETH High  ETH Low  ETH Close  ETH Volume  \
0      1500520000       0.0       0.0      0.0        0.0         0.0   
1      1491160000       0.0       0.0      0.0        0.0         0.0   
2      1597780000       0.0       0.0      0.0        0.0         0.0   
3      1542820000       0.0       0.0      0.0        0.0         0.0   
4      1292190000       0.0       0.0      0.0        0.0         0.0   

   ETH Market Cap  
0             0.0  
1             0.0  
2             0.0  
3             0.0  
4             0.0  
