# Bitcoin Sentiment Analysis - News Article Web Scraping

### Import Libraries

In [90]:
import pandas as pd
import numpy as np

from datetime import datetime

import pickle

from bs4 import BeautifulSoup
import requests
import time, os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Location of the ChromeDriver executable handed to Selenium below.
chromedriver = "/Applications/chromedriver"
# Publish the same location through the process environment.
os.environ.update({'webdriver.chrome.driver': chromedriver})

### Bitcoin Magazine Scraping

In [None]:
# Set up Selenium driver
# Start Chrome through the ChromeDriver path configured above and open the
# Bitcoin Magazine article index in that browser session.
driver = webdriver.Chrome(chromedriver)
driver.get('https://bitcoinmagazine.com/articles')

In [None]:
def scroll_to_bottom(driver, pause=1):
    """
    Scroll to the bottom of a webpage.
    Thank you to user53558 on github.

    The page is repeatedly scrolled to the end of its scrolling element
    until the vertical scroll offset no longer changes between passes.

    Parameters
    ----------
    driver : object exposing ``execute_script(script)``, such as the
        Selenium driver created in this notebook.
    pause : number of seconds to sleep before each scroll (default 1).

    Returns
    -------
    None
    """

    # JavaScript reporting the current vertical scroll offset. The fallback
    # branch has to read `.scrollTop`; without it the script would hand back
    # a DOM element rather than a number.
    get_position = (
        "return (window.pageYOffset !== undefined) ?"
        " window.pageYOffset : (document.documentElement ||"
        " document.body.parentNode || document.body).scrollTop;")
    # JavaScript moving the scrolling element to its full height.
    jump_to_bottom = (
        "var scrollingElement = (document.scrollingElement ||"
        " document.body);scrollingElement.scrollTop ="
        " scrollingElement.scrollHeight;")

    old_position = 0
    new_position = None

    while new_position != old_position:
        # Get old scroll position
        old_position = driver.execute_script(get_position)
        # Sleep and Scroll
        time.sleep(pause)
        driver.execute_script(jump_to_bottom)
        # Get new position
        new_position = driver.execute_script(get_position)

In [None]:
# Click "See More" button 100 times. Repeat this cell as much as necessary.
for i in range(100):
    try:
        scroll_to_bottom(driver)
        time.sleep(1)
        driver.find_element_by_xpath('/html/body/phoenix-page/div/div/div[2]/section[2]/section/div/button').click()
    except Exception:
        # Best effort: if anything in this pass fails (for example the button
        # cannot be located or clicked), move on to the next pass.
        # `Exception` is caught instead of a bare `except:` so that
        # interrupting the kernel (KeyboardInterrupt) still stops the loop.
        continue

In [None]:
# Get HTML after displaying 100 iterations of more articles
# The markup is read from the live browser session (driver.page_source) and
# parsed with Python's built-in 'html.parser'.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [None]:
# Get all individual article links
from urllib.parse import urljoin

# Each article card header wraps the anchor that points at the article.
titles = soup.find_all('phoenix-ellipsis', {'class' : 'm-ellipsis m-card--header'})

# urljoin resolves the href against the site root, so a leading slash on the
# href does not produce a double slash and an already-absolute href is kept.
bitcoin_magazine_articles = [urljoin('https://bitcoinmagazine.com/', h.find('a')['href'])
                             for h in titles]

In [None]:
# Save the collected Bitcoin Magazine article URLs to disk
with open('sentiment_pickles/pickle_bitcoin_magazine.pickle', mode='wb') as pickle_file:
    pickle.dump(bitcoin_magazine_articles, pickle_file)

In [None]:
# Reload the saved Bitcoin Magazine URLs (entry point when re-starting here)
with open('sentiment_pickles/pickle_bitcoin_magazine.pickle', mode='rb') as pickle_file:
    bitcoin_magazine_links = pickle.load(pickle_file)

In [None]:
# Initialize dictionary for creating the dataframe
# One list per dataframe column; the lists are filled in lockstep below.
bitcoin_magazine_dict = {'date' : [],
                         'website' : [],
                         'title' : [],
                         'body' : []
                        }

# Loop through all collected links and save data to the dictionary
for link in bitcoin_magazine_links:
    try:
        # A timeout keeps one unresponsive request from stalling the whole run.
        page = requests.get(link, timeout=30).text
        soup = BeautifulSoup(page, 'html5lib')

        title = soup.find('h1', {'class' : 'm-detail-header--title'}).getText()

        # The date text is parsed with the '%b %d, %Y' format.
        article_date = soup.find('phoenix-timeago').getText()
        article_date = datetime.strptime(article_date, '%b %d, %Y').date()

        # The body is the text of every paragraph and h2 heading, one per line.
        body = soup.find('div', {'class' : 'm-detail--body'}).find_all(['p', 'h2'])
        body = '\n'.join([b.getText() for b in body])
    except Exception:
        # If there is a failure, skip and move to the next article.
        # `Exception` is caught instead of a bare `except:` so that
        # interrupting the kernel (KeyboardInterrupt) still stops the loop.
        continue

    # Append only once every field was extracted, keeping the lists aligned.
    bitcoin_magazine_dict['date'].append(article_date)
    bitcoin_magazine_dict['website'].append('Bitcoin Magazine')
    bitcoin_magazine_dict['title'].append(title)
    bitcoin_magazine_dict['body'].append(body)

In [None]:
# Build the Bitcoin Magazine dataframe from the column lists gathered above
bitcoin_magazine_df = pd.DataFrame(data=bitcoin_magazine_dict)

In [None]:
# Save the Bitcoin Magazine articles dataframe to disk
with open('sentiment_pickles/pickle_bitcoin_magazine_df.pickle', mode='wb') as pickle_file:
    pickle.dump(bitcoin_magazine_df, pickle_file)

### CryptoSlate Scraping

In [None]:
# Build the URL of every CryptoSlate Bitcoin news index page (pages 1 to 173)
root = 'https://cryptoslate.com/news/bitcoin/page/'
links = ['{}{}/'.format(root, page_number) for page_number in range(1, 174)]

In [None]:
# Loop through links to get article links
cryptoslate_links = []

for link in links:
    try:
        # Bound the wait so one unresponsive request cannot hang the crawl.
        page = requests.get(link, timeout=30).text
    except requests.RequestException as err:
        # Keep the links gathered so far instead of aborting on one bad page.
        print('Skipping {}: {}'.format(link, err))
        continue
    soup = BeautifulSoup(page, 'html5lib')

    feed = soup.find('section', {'class': 'list-feed'})
    if feed is None:
        # find() returns None when the page has no matching feed section.
        print('Skipping {}: no article feed found'.format(link))
        continue

    articles = feed.find_all('a', href=True)
    article_links = [a['href'] for a in articles][0:-1] # drop navigation link (next page)

    cryptoslate_links.extend(article_links)

In [None]:
# Save the collected CryptoSlate article URLs to disk
with open('sentiment_pickles/pickle_crypto_slate.pickle', mode='wb') as pickle_file:
    pickle.dump(cryptoslate_links, pickle_file)

In [None]:
# Reload the saved CryptoSlate article URLs
with open('sentiment_pickles/pickle_crypto_slate.pickle', mode='rb') as pickle_file:
    cryptoslate_links = pickle.load(pickle_file)

In [None]:
# Initialize dictionary for creating the dataframe
# One list per dataframe column; the lists are filled in lockstep below.
cryptoslate_dict = {'date' : [],
                    'website' : [],
                    'title' : [],
                    'body' : []
                   }

# Loop through all collected links and save data to the dictionary
for link in cryptoslate_links:
    try:
        # A timeout keeps one unresponsive request from stalling the whole run.
        page = requests.get(link, timeout=30).text
        soup = BeautifulSoup(page, 'html5lib')

        title = soup.find('h1', {'class' : 'post-title'}).getText()

        # Only the first three whitespace-separated tokens of the date text
        # are kept before parsing with the '%B %d, %Y' format.
        article_date = soup.find('span', {'class' : 'post-date'}).getText()
        article_date = ' '.join(article_date.split()[0:3])
        article_date = datetime.strptime(article_date, '%B %d, %Y').date()

        # The body is the text of every paragraph and h2 heading, one per line.
        body = soup.find('div', {'class' : 'post-box clearfix'}).find('article').find_all(['p', 'h2'])
        body = '\n'.join([b.getText() for b in body])

    except (AttributeError, ValueError, requests.RequestException):
        # AttributeError: an expected element is missing (body of the article
        # is not found for premium articles).
        # ValueError: the date text did not match the expected format.
        # RequestException: the download failed or timed out.
        # In each case skip this article rather than abort the whole scrape.
        continue

    # Append only once every field was extracted, keeping the lists aligned.
    cryptoslate_dict['date'].append(article_date)
    cryptoslate_dict['website'].append('CryptoSlate')
    cryptoslate_dict['title'].append(title)
    cryptoslate_dict['body'].append(body)

In [None]:
# Build the CryptoSlate dataframe from the column lists gathered above
cryptoslate_df = pd.DataFrame(data=cryptoslate_dict)

In [None]:
# Save the CryptoSlate articles dataframe to disk
with open('sentiment_pickles/pickle_crypto_slate_df.pickle', mode='wb') as pickle_file:
    pickle.dump(cryptoslate_df, pickle_file)

### Bitcoin News Scraping

In [4]:
# Build the URL of every Bitcoin News index page to crawl (pages 2 to 1629)
root = 'https://news.bitcoin.com/page/'
# Skip first page, different format and very recent
links = ['{}{}/'.format(root, page_number) for page_number in range(2, 1630)]

In [5]:
# Loop through links to get article links
bitcoin_news_links = []

for link in links:
    try:
        # The response body is exposed as `.text` (the response object has
        # no `.texts` attribute). The timeout bounds the wait per request.
        page = requests.get(link, timeout=30).text
    except requests.RequestException as err:
        # Keep the links gathered so far instead of aborting on one bad page.
        print('Skipping {}: {}'.format(link, err))
        continue
    soup = BeautifulSoup(page, 'html5lib')

    article_list = soup.find('div', {'class': 'td-container td-pb-article-list'})
    if article_list is None:
        # find() returns None when the page has no matching container.
        print('Skipping {}: article list not found'.format(link))
        continue

    articles = article_list.find_all('div', {'class': 'story story--medium'})
    article_links = [a.find('a')['href'] for a in articles]

    bitcoin_news_links.extend(article_links)

In [6]:
# Initialize dictionary for creating the dataframe
# One list per dataframe column; the lists are filled in lockstep below.
bitcoin_news_dict = {'date' : [],
                     'website' : [],
                     'title' : [],
                     'body' : []
                    }

# Loop through all collected links and save data to the dictionary
for link in bitcoin_news_links:
    try:
        # A timeout keeps one unresponsive request from stalling the whole run.
        page = requests.get(link, timeout=30).text
        soup = BeautifulSoup(page, 'html5lib')

        article_space = soup.find('article', {'class' : 'article__body'})

        title = article_space.find('h1', {'class' : 'article__header__heading'}).getText().strip()

        # The date text is parsed with the '%b %d, %Y' format.
        article_date = soup.find('time', {'class' : 'article__info__date'}).getText().strip()
        article_date = datetime.strptime(article_date, '%b %d, %Y').date()

        # The body is the text of every paragraph and h2 heading, one per line.
        body = article_space.find_all(['p', 'h2'])
        body = '\n'.join([b.getText() for b in body])

    except Exception:
        # If there is a failure, move to the next article.
        # `Exception` is caught instead of a bare `except:` so that
        # interrupting the kernel (KeyboardInterrupt) still stops the loop.
        continue

    # Append only once every field was extracted, keeping the lists aligned.
    bitcoin_news_dict['date'].append(article_date)
    bitcoin_news_dict['website'].append('Bitcoin.com')
    bitcoin_news_dict['title'].append(title)
    bitcoin_news_dict['body'].append(body)

In [8]:
# Build the Bitcoin News dataframe from the column lists gathered above
bitcoin_news_df = pd.DataFrame(data=bitcoin_news_dict)

In [None]:
# Save the Bitcoin News articles dataframe to disk
with open('sentiment_pickles/pickle_bitcoin_news_df.pickle', mode='wb') as pickle_file:
    pickle.dump(bitcoin_news_df, pickle_file)

### Coin Telegraph Scraping

In [99]:
# Set up Selenium driver
# Chrome is started with an explicit user-agent argument, then pointed at
# Coin Telegraph's Bitcoin tag page.
opts = Options()
opts.add_argument('user-agent=Mozilla/5.0')

driver = webdriver.Chrome(chromedriver, options=opts)
driver.get('https://cointelegraph.com/tags/bitcoin')

In [103]:
# Click "See More" button 1000 times. Repeat this cell as much as necessary.
for i in range(1000):
    try:
        scroll_to_bottom(driver)
        time.sleep(1)
        driver.find_element_by_xpath('/html/body/div/div/div/div[1]/main/div/div/div[2]/div/div[2]/div/div/button').click()
    except Exception:
        # Best effort: if anything in this pass fails (for example the button
        # cannot be located or clicked), move on to the next pass.
        # `Exception` is caught instead of a bare `except:` so that
        # interrupting the kernel (KeyboardInterrupt) still stops the loop.
        continue

In [104]:
# Parse the current page markup from the live browser session
# (driver.page_source) with Python's built-in 'html.parser'.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [105]:
# Turn the href of every article title anchor into a full Coin Telegraph URL
coin_telegraph_links = [
    'https://cointelegraph.com' + anchor['href']
    for anchor in soup.find_all('a', {'class' : 'post-card-inline__title-link'})
]

In [None]:
# Save the collected Coin Telegraph article URLs to disk
with open('sentiment_pickles/pickle_coin_telegraph.pickle', mode='wb') as pickle_file:
    pickle.dump(coin_telegraph_links, pickle_file)

In [108]:
# Reload the saved Coin Telegraph URLs (entry point when re-starting here)
with open('sentiment_pickles/pickle_coin_telegraph.pickle', mode='rb') as pickle_file:
    coin_telegraph_links = pickle.load(pickle_file)

In [137]:
# Empty column store for the Coin Telegraph dataframe: one fresh list per column
coin_telegraph_dict = {column: [] for column in ('date', 'website', 'title', 'body')}

In [138]:
# User-Agent header sent with every article request below.
headers = {'User-Agent': 'Mozilla/5.0'}

# Loop through all collected links and save data to the dictionary
for link in coin_telegraph_links:
    try:
        # A timeout keeps one unresponsive request from stalling the whole run.
        page = requests.get(link, headers=headers, timeout=30).text
        soup = BeautifulSoup(page, 'html5lib')

        article_space = soup.find('article', {'class' : 'post__article'})

        title = article_space.find('h1', {'class' : 'post__title'}).getText().strip()
        # The date is read from the <time> element's `datetime` attribute and
        # parsed with the '%Y-%m-%d' format.
        article_date = article_space.find('time')['datetime']
        article_date = datetime.strptime(article_date, '%Y-%m-%d').date()

        # The body is the text of every paragraph and h2 heading, one per line.
        body = article_space.find('div', {'class' : 'post-content'}).find_all(['p', 'h2'])
        body = '\n'.join([b.getText() for b in body])
    except Exception:
        # If there is a failure, move to the next article.
        # `Exception` is caught instead of a bare `except:` so that
        # interrupting the kernel (KeyboardInterrupt) still stops the loop.
        continue

    # Append only once every field was extracted, keeping the lists aligned.
    coin_telegraph_dict['date'].append(article_date)
    coin_telegraph_dict['website'].append('Coin Telegraph')
    coin_telegraph_dict['title'].append(title)
    coin_telegraph_dict['body'].append(body)

In [139]:
# Build the Coin Telegraph dataframe from the column lists gathered above
coin_telegraph_df = pd.DataFrame(data=coin_telegraph_dict)

In [141]:
# Save the Coin Telegraph articles dataframe to disk
with open('sentiment_pickles/pickle_coin_telegraph_df.pickle', mode='wb') as pickle_file:
    pickle.dump(coin_telegraph_df, pickle_file)