# Web scraper for bitcoin websites

In [10]:
# load selenium for web scraping
from selenium import webdriver
def start_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--headless')
    #"/Users/cc/opt/anaconda3/bin/chromedriver",
    driver = webdriver.Chrome( options=options)
    #driver.get("https://etherscan.io/accounts")
    #driver.get("https://www.blockchain.com/btc/unconfirmed-transactions")
    return driver

## 1. current coin market price board
#### this is to get coin name, price, percentage change from cointelegraph.com

In [11]:
from selenium.webdriver.common.by import By
def get_data(driver):
    driver.get("https://cointelegraph.com/price-indexes")
    # get table of accounts
    #get coins and prices info from website
    coins = driver.find_elements(By.XPATH, '//h2[@class="price-index-item__name"]')
    prices = driver.find_elements(By.XPATH, '//div[@class="price-index-item__col price-index-item__col_digits"]')

    # get the text from the elements
    coin_list = [coin.text for coin in coins]
    price_list = []
    percentage_list = []
    for p in prices:
        # the first element is price, the second is percent
        price_list.append(p.text.split("\n")[0])
        percentage_list.append(p.text.split("\n")[1])
    return coin_list, price_list, percentage_list

def save_data(coin_list, price_list, percentage_list):
    #create a dataframe
    import pandas as pd 
    df = pd.DataFrame(list(zip(coin_list, price_list, percentage_list)),
                    columns =['Coin', 'Price', 'change on Percentage'])
    
    df['time'] = pd.to_datetime('now')
    df['time'] = df['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
    # add the dataframe to a csv file
    df.to_csv('coin_price.csv', mode='a', header=True, index=False)

def get_coin_prices():
    driver = start_driver()
    coin_list, price_list, percentage_list = get_data(driver)
    save_data(coin_list, price_list, percentage_list)
    driver.close()

get_coin_prices()

  df['time'] = pd.to_datetime('now')


## 2. yahoo finance news of cryptocurrencies
#### this can be used to get the latest news about most cryptocurrencies, about 5 to 170 for each type of currency.
#### this program can be assigned with the number of top currencies the user want to learn about. 3 top types of currency would take about 20 seconds to get, with about 700 news articles.

#### Firstly, we need to get about 200 kinds of coins' names

In [12]:
# get the type of bitcoins
def get_coin_list(driver):
    driver.get("https://finance.yahoo.com/cryptocurrencies/?offset=0&count=200")
    # get table of accounts
    #get coins and prices info from website
    coins = driver.find_elements(By.XPATH, '//a[@data-test="quoteLink"]')
    # get the text from the elements
    coin_list = [coin.text for coin in coins]
    return coin_list

#### Secondly, we get the news for each coin

In [13]:

import time
import pandas as pd
def get_news_list(searchword, driver):
    driver.get("https://finance.yahoo.com/quote/"+searchword)
    SCROLL_PAUSE_TIME = 0.5
    # Get scroll height
    scrollHeightstart = 0
    scrollHeight = 10000
    i=0
    #usually the maximum # of news we cant get for each coin is about 170, so here we loop for 20 times
    while i<20:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo("+ str(scrollHeightstart)+","+str(scrollHeightstart+scrollHeight)+");")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        scrollHeightstart += scrollHeight   
        i+=1
    news = driver.find_elements(By.XPATH, '//li[@class="js-stream-content Pos(r)"]')
    # get the text from the elements
    news_list = [news.text for news in news]
    #news can be split into source, time, title, and content
    return news_list

#### Then, this is for saving the news into csv

In [14]:
def write_to_df(news_list,coin_type):
    df = pd.DataFrame(news_list)
    # seperate source with "•", seperate time with "\n", seperate title with "\n",
    # seperate content with "\n"
    df['time_of_retreival'] = pd.to_datetime('now')
    df['coin_type'] = coin_type
    df[['source', 'time']] = df[0].str.split("•", expand=True)
    df[['time', 'title']] = df['time'].str.split("\n", 1,expand=True)
    df[['title', 'content']] = df['title'].str.split("\n",1,expand=True)
    
    
    # drop the first column
    df.drop(columns=[0], inplace=True)
    # save the dataframe to an existing csv file
    df.to_csv('news.csv', mode='a', header=True, index=False)

#### Finally, This is the program for running the whole news scraper
##### Feel free to change the parameter for get_coin_news(Top_N_coins)

In [15]:
# Top_N_coins can range from 1 to 200
def get_coin_news(Top_N_coins = 4):
    #1. start the web crawler
    driver = start_driver()
    #2. get the list of coins
    coin_list = get_coin_list(driver)
    #3. get the news for each coin
    for coin in coin_list[0:Top_N_coins]:
        #3.1 from the crawled news, we can get the news list, which includes--title, content, source, and time
        news_list = get_news_list(coin, driver)
        #3.2 write the news list to a csv file
        write_to_df(news_list,coin)


#run the whole second program
get_coin_news(20)

  df['time_of_retreival'] = pd.to_datetime('now')
  df[['time', 'title']] = df['time'].str.split("\n", 1,expand=True)
  df[['title', 'content']] = df['title'].str.split("\n",1,expand=True)
  df['time_of_retreival'] = pd.to_datetime('now')
  df[['time', 'title']] = df['time'].str.split("\n", 1,expand=True)
  df[['title', 'content']] = df['title'].str.split("\n",1,expand=True)
  df['time_of_retreival'] = pd.to_datetime('now')
  df[['time', 'title']] = df['time'].str.split("\n", 1,expand=True)
  df[['title', 'content']] = df['title'].str.split("\n",1,expand=True)
  df['time_of_retreival'] = pd.to_datetime('now')
  df[['time', 'title']] = df['time'].str.split("\n", 1,expand=True)
  df[['title', 'content']] = df['title'].str.split("\n",1,expand=True)
  df['time_of_retreival'] = pd.to_datetime('now')
  df[['time', 'title']] = df['time'].str.split("\n", 1,expand=True)
  df[['title', 'content']] = df['title'].str.split("\n",1,expand=True)
  df['time_of_retreival'] = pd.to_datetime('now')
  df[

KeyError: 0