In [14]:
import os
import datetime

import gzip
import pandas as pd
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

# Pull all historical data

In [8]:
# Fetch the Blockchair directory listing page; its links are parsed by later cells.
try:
    html = urlopen('https://gz.blockchair.com/bitcoin/blocks/?fbclid=IwAR2DtTOZeTeLCMVQC-TugYFBlDtFIijZGoSISG1RYbE5fmmGem15hpdCVo4')
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be caught first.
    print(e)
    # Re-raise: without a fresh response `html` would be undefined (or stale
    # from an earlier run) and the following cells would work on bad data.
    raise
except URLError as e:
    print('URLError: The server could not be found!')
    raise

In [9]:
# Parse the fetched listing page using Python's built-in HTML parser backend.
blockchair_bs = BeautifulSoup(markup=html, features='html.parser')

In [10]:
# Gather every anchor tag found inside the first <pre> element of the page.
file_url_list = blockchair_bs.find('pre').find_all('a')

In [11]:
# Path of the merged TSV that receives the rows of every downloaded file.
master_filepath = 'data/Blockchair/blockchair.tsv'

# Make sure the output directory exists before any file is opened for writing.
os.makedirs(os.path.dirname(master_filepath), exist_ok=True)

# i=0 does not refer to a valid link, so start from i=1
for i in range(1, len(file_url_list)):
    file_url = file_url_list[i].attrs['href']
    download_url = 'https://gz.blockchair.com/bitcoin/blocks/' + file_url

    # Temporary path for the decompressed file; dropping the last three
    # characters assumes the link ends in '.gz' -- TODO confirm.
    filename = "data/Blockchair/" + file_url[:-3]

    # Download the archive and store its decompressed content at `filename`.
    with urlopen(download_url) as response, open(filename, 'wb') as file_out:
        file_out.write(gzip.decompress(response.read()))

    try:
        # The first file creates (or truncates) the master file and contributes
        # the header row; every later file is appended without its header.
        is_first_file = (i == 1)
        with open(master_filepath, 'w' if is_first_file else 'a') as final_tsv_file, \
                open(filename, 'r') as tsv_file:
            for row in tsv_file:
                if not is_first_file and row[:2] == 'id':
                    # Skip repeated header rows instead of writing a blank line.
                    continue
                if not row.endswith('\n'):
                    # A file without a trailing newline must not have its last
                    # row glued to the first row of the next file.
                    row += '\n'
                final_tsv_file.write(row)
    finally:
        # Delete the temporary file even when merging it failed.
        try:
            os.remove(filename)
        except OSError as e:
            print ("Error: %s - %s." % (e.filename, e.strerror))

In [15]:
# Load the merged TSV into a DataFrame indexed by block id (the CSV itself is
# written in a later cell).
# 'version_bits' and 'chainwork' are read as plain objects (strings) rather than
# being parsed as numbers.
# low_memory=False makes pandas infer each remaining column's dtype from the
# whole file instead of chunk by chunk; the warning left in this cell's output
# appears to be the mixed-dtype warning that chunked inference produces.
blockchair_df = pd.read_csv(master_filepath, sep='\t',
                            dtype={'version_bits':'object','chainwork':'object'},
                            index_col=['id'], low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [40]:
# Shift the decimal point of the amount columns by eight places
# (presumably converting satoshi to BTC -- verify against the data source).
# NOTE: this rescales in place, so running the cell twice scales the values twice.
scaled_columns = ['output_total', 'generation', 'reward']
blockchair_df[scaled_columns] = blockchair_df[scaled_columns] * 1e-8

In [42]:
# Persist the full frame (index included) as the accumulated CSV file.
csv_output_path = 'data/Blockchair/blockchair.csv'
blockchair_df.to_csv(csv_output_path)

# Daily Update

In [43]:
# Re-fetch the Blockchair directory listing so the newest file link is visible.
try:
    html = urlopen('https://gz.blockchair.com/bitcoin/blocks/?fbclid=IwAR2DtTOZeTeLCMVQC-TugYFBlDtFIijZGoSISG1RYbE5fmmGem15hpdCVo4')
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be caught first.
    print(e)
    # Re-raise: parsing below must not run on an undefined or stale `html`.
    raise
except URLError as e:
    print('URLError: The server could not be found!')
    raise

# Parse the page and collect every anchor tag inside its <pre> element.
blockchair_bs = BeautifulSoup(html, 'html.parser')
file_url_list = blockchair_bs.pre.find_all('a')

In [63]:
# Show the frame's dimensions as (rows, columns). A bare expression inside a
# loop body displays nothing, so the shape is evaluated directly as the cell's
# last expression.
blockchair_df.shape

(158, 35)

In [66]:
# Accumulated CSV that the newest rows are appended to.
csv_path = 'data/Blockchair/blockchair.csv'

# This scrapes for the last entry in the download page
file_url = file_url_list[-1].attrs['href']
download_url = 'https://gz.blockchair.com/bitcoin/blocks/' + file_url

# Temporary path for the decompressed file; dropping the last three characters
# assumes the link ends in '.gz' -- TODO confirm.
filename = "data/Blockchair/" + file_url[:-3]

# Download the archive and store its decompressed content at `filename`.
with urlopen(download_url) as response, open(filename, 'wb') as file_out:
    file_out.write(gzip.decompress(response.read()))

# Load the downloaded TSV, indexed by block id.
blockchair_df = pd.read_csv(filename, sep='\t', 
                            dtype={'version_bits':'object','chainwork':'object'}, index_col=['id'])

# Fix decimal place of certain columns
blockchair_df[['output_total','generation','reward']] = blockchair_df[['output_total','generation','reward']]*1e-8

# Keep only rows whose id is not already stored, so running this cell twice
# (or right after the historical pull, which already contains the newest file)
# does not append duplicate ids.
csv_exists = os.path.exists(csv_path)
if csv_exists:
    existing_ids = pd.read_csv(csv_path, usecols=['id'])['id']
    blockchair_df = blockchair_df.loc[~blockchair_df.index.isin(existing_ids)]

# Append; the header is written only when the CSV is being created here.
blockchair_df.to_csv(csv_path, mode='a', header=not csv_exists)

# Delete file at 'filename'
try:
    os.remove(filename)
except OSError as e:
    print ("Error: %s - %s." % (e.filename, e.strerror))

# Check the accumulated CSV for duplicate ids

In [99]:
# Reload only the id column of the accumulated CSV and verify that no id repeats.
# NOTE: this rebinds `blockchair_df` to a frame that carries just the id index.
blockchair_df = pd.read_csv('data/Blockchair/blockchair.csv', usecols=['id'], index_col=['id'])
has_duplicate_ids = blockchair_df.index.duplicated().any()
assert not has_duplicate_ids