In [1]:
import requests as re
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
# Never truncate long text cells when a frame is displayed.
# None means "no limit" (pandas >= 1.0); the former -1 sentinel was deprecated
# in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [3]:
# HTTP headers sent with the page request: identify as a desktop Chrome browser.
request_header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    ),
}

In [4]:
# Address of the page that gets downloaded and scraped in the cells below.
site_url = "https://99bitcoins.com/price-chart-history/"

In [5]:
# Download the page. Note: `re` is the alias given to the `requests` library
# in the import cell, not the standard-library regex module.
# The timeout keeps the cell from hanging forever on an unresponsive server.
r = re.get(site_url, headers=request_header, timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page,
# which would silently yield empty tables further down.
r.raise_for_status()
page = r.text
# Parse the HTML with the parser bundled with Python (no extra dependency).
soup = BeautifulSoup(page, 'html.parser')

In [6]:
def _parse_price(price_text):
    """Convert a displayed price such as '$7158.95' or '$7,158.95' to a float.

    Raises ValueError when the text is not a number after removing the
    dollar sign and thousands separators.
    """
    return float(price_text.strip().strip('$').replace(',', ''))


main_table = []  # one dict per event
url_table = []   # one dict per (event, source URL) pair

for event_div in soup.find_all('div', {'class': 'bitcoin_history'}):
    # The heading reads "<title> - <date>". Split on the LAST ' - ' so that a
    # title which itself contains ' - ' cannot displace the date.
    event_header = event_div.find('h3').text.rsplit(' - ', 1)
    event_title = event_header[0].strip(' ')
    event_date_str = event_header[1].strip(' ')
    event_date = datetime.strptime(event_date_str, '%B %d, %Y').date()
    event_id = event_div['id']

    # Spans 1 and 3 of the info box hold the price on the event date and the
    # price ten days later; either may be missing or unparseable.
    price_div = event_div.find('div', {'class': 'info'}).find_all('span')
    try:
        bitcoin_value = _parse_price(price_div[1].text)
        bitcoin_value_10_days_later = _parse_price(price_div[3].text)
    except (IndexError, ValueError):
        bitcoin_value = None
        bitcoin_value_10_days_later = None

    # The paragraph is "<main text> Source(s): <one URL per line>".
    # Normalise the singular marker, then cut at the first marker only;
    # partition() yields an empty sources part when no marker is present
    # instead of raising IndexError.
    paragraph = event_div.find('p').text.replace('Source:', 'Sources:')
    main_part, _marker, sources_part = paragraph.partition('Sources:')
    event_maintext = main_part.replace('\r', '').replace('\n', '')

    # Keep only lines that are links; strip stray '\r' / surrounding blanks.
    url_list = [
        line.strip()
        for line in sources_part.split('\n')
        if line.strip().startswith('http')
    ]

    main_table.append({
        'event_id': event_id,
        'event_title': event_title,
        'event_date': event_date,
        'bitcoin_value': bitcoin_value,
        'bitcoin_value_10_days_later': bitcoin_value_10_days_later,
        'event_maintext': event_maintext
    })

    url_table.extend(
        {'event_id': event_id, 'source_url': url}
        for url in url_list
    )

In [7]:
# Column order for the two output tables.
maincols = ['event_id', 'event_title', 'event_date', 'bitcoin_value', 'bitcoin_value_10_days_later', 'event_maintext']
urlcols = ['event_id', 'source_url']
# Passing `columns=` to the constructor selects and orders the columns and,
# unlike indexing with [cols] afterwards, still works when a table is empty
# (it then produces an empty frame with the expected headers).
main_df = pd.DataFrame(main_table, columns=maincols)
url_df = pd.DataFrame(url_table, columns=urlcols)

In [8]:
# Preview the first five rows of the events table.
main_df.head(n=5)

Unnamed: 0,event_id,event_title,event_date,bitcoin_value,bitcoin_value_10_days_later,event_maintext
0,91,U.S. regulator demands trading data from Bitcoin exchanges in manipulation probe,2018-06-11,7158.95,6709.39,"The U.S. Commodity Futures Trading Commission (CFTC) has sent subpoenas to four crypto-exchanges—Bitstamp, Kraken, ItBit, and Coinbase—demanding answers on the subject of market price distortion."
1,90,"South Korean crypto exchange, CoinRail has been hacked",2018-06-10,7638.44,6747.77,South Korean crypto exchange Coinrail loses over $40M in tokens following a hack.
2,89,U.S.Justice Department launches criminal probe into Bitcoin price manipulation,2018-05-24,7818.21,7608.5,"The Justice Department has opened a criminal probe into whether traders are manipulating the price of Bitcoin and other digital currencies. The investigation is focused on illegal practices that can influence prices -- such as spoofing, or flooding the market with fake orders to trick other traders into buying or selling."
3,88,Prosecutors raid largest South Korean exchange,2018-05-11,9289.09,8371.9,"Prosecutors raided UpBit, the largest cryptocurrency exchange in South Korea on suspicion of fraud."
4,87,Goldman Sachs announces to open a Bitcoin trading operation,2018-05-02,9021.75,8728.95,"One of the largest investment bank announced, Goldman is about to begin using its own money to trade with clients in a variety of contracts linked to the price of Bitcoin."


In [9]:
# Preview the first five rows of the event-to-source-URL table.
url_df.head(n=5)

Unnamed: 0,event_id,source_url
0,91,https://www.wsj.com/articles/u-s-regulators-demand-trading-data-from-bitcoin-exchanges-in-manipulation-probe-1528492835?mod=searchresults&page=1&pos=1
1,91,https://uk.news.yahoo.com/bitcoin-price-crypto-crash-blamed-095821221.html
2,90,https://www.cnbc.com/2018/06/10/bitcoin-tumbles-10-percent-after-news-of-south-korea-crypto-exchange-hack.html
3,89,https://www.bloomberg.com/news/articles/2018-05-24/bitcoin-manipulation-is-said-to-be-focus-of-u-s-criminal-probe
4,89,https://www.cnbc.com/2018/05/24/us-opens-criminal-probe-into-bitcoin-price-manipulation-bloomberg.html


In [10]:
# Output folder, relative to the notebook's working directory.
datadir = os.path.join('input', '99bitcoins')
# to_csv does not create missing parent folders; make sure the target exists
# so the export also works on a fresh checkout.
os.makedirs(datadir, exist_ok=True)
# Write both tables without the positional index column.
main_df.to_csv(os.path.join(datadir, '99bitcoins_main.csv'), index=False)
url_df.to_csv(os.path.join(datadir, '99bitcoins_sources.csv'), index=False)