In [1]:
# Import Libraries and Packages
import numpy as np
import pandas as pd
import re

## Clean Articles Final

This notebook performs the following procedures:
    
    1. Reads in extracted and cleaned data from prior processes as a csv. 
    2. Performs additional preprocessing.
    3. Discards data that does not contain selected keywords.

In [3]:
#Load the extracted and cleaned articles produced by the prior processes into a DataFrame.
articles_raw = pd.read_csv("use_rEx.csv") #Slow and memory-hungry - avoid re-running this cell
articles_raw.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
14599,Stripe.com,A complete payments platform engineered for gr...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,2018-01-23 00:00:00+00:00,Ending Bitcoin Support
14591,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: Trudeau announces pacific trade de...
14592,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: Trump to tell Davos the US 'open f...
14593,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: VR film in Davos highlights cyber ...
14594,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: IMF's Lagarde faults Modi speech i...


In [3]:
#Show the last rows of the loaded data
articles_raw.tail()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
28959,Avivah Litan,\tA member of the Gartner Blog Network\tby Avi...,Blockchain sessions were in full force at the ...,Gartner.com,https://blogs.gartner.com/avivah-litan/2018/04...,2018-04-23 23:22:25+00:00,"Blockchain, Crypto, and Women; Musings from RSAC"
28960,Cyberparse.co.uk,Enlarge / Simplified figurative process of a C...,Enlarge / Simplified figurative process of a C...,Cyberparse.co.uk,http://cyberparse.co.uk/2018/04/23/new-hacks-s...,2018-04-23 23:32:00+00:00,New hacks siphon private cryptocurrency keys f...
28961,Ali Breland and Harper Neidig,THE EU'S NEW TARGET: They're back! European re...,THE EU'S NEW TARGET: They're back! European re...,The Hill,http://thehill.com/policy/technology/overnight...,2018-04-23 23:49:29+00:00,Overnight Tech: EU investigates Apple's Shazam...
28962,Tyler Durden,Surprise! Even kids can be taught about finan...,"Authored by Caitlin Johnstone via Medium.com, ...",Zerohedge.com,https://www.zerohedge.com/news/2018-04-23/msm-...,2018-04-23 23:55:00+00:00,MSM Is Frantically Attacking Dissenting Syria ...
28963,pjbyrne,"Disclaimer: English lawyer, not practising thi...","Disclaimer: English lawyer, not practising thi...",Prestonbyrne.com,https://prestonbyrne.com/2018/04/23/on-ethereu...,2018-04-23 23:59:35+00:00,Whether Ethereum is a security


In [4]:
#Per-column summary (count, unique, top, freq); differing counts reveal columns with missing values
articles_raw.describe()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
count,28601,28774,28770,28774,28774,28774,28764
unique,7079,26417,27539,1740,27733,26813,25302
top,Scott Scanlon,If you are the site owner (or you manage this ...,"Welcome to Crypto Insider, Business Insider's ...",Youbrandinc.com,https://www.bloomberg.com/news/articles/2018-0...,2018-01-23 00:00:00+00:00,6 things Australian traders will be talking ab...
freq,1743,57,5,1730,3,10,43


__Preprocess and Clean Data__

In [5]:
#Reset index to a fresh 0..n-1 range.
#drop=True discards the old index directly instead of first turning it into an 'index'
#column and deleting that column afterwards. The two-step version raises KeyError when the
#old index has a name, and deletes a real column if one is already called 'index'.
articles_raw = articles_raw.reset_index(drop=True)
articles_raw.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
0,Stripe.com,A complete payments platform engineered for gr...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,2018-01-23 00:00:00+00:00,Ending Bitcoin Support
1,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: Trudeau announces pacific trade de...
2,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: Trump to tell Davos the US 'open f...
3,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: VR film in Davos highlights cyber ...
4,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: IMF's Lagarde faults Modi speech i...


#### Methodology: 
We have a list of keywords (defined below) that we deem relevant to Bitcoin trading and price. If an article does not mention at least one of these keywords in its title or in the first 300 characters of its text (a heuristic, based on empirical analysis, for where the lede of an article ends), we deem it irrelevant and discard it.

In [None]:
#Print the opening 300 characters of each of the first 30 articles, separated by a blank line.
#Relevant articles are expected to mention Bitcoin or another keyword within this excerpt.
for body in articles_raw['contents'].iloc[:30]:
    print(body[:300], end="\n\n")

__Filtering Keyword List__

In [8]:
#Keywords that mark an article as relevant.
#Capitalised and lower-case spellings are both listed explicitly, one topic group per line.
keywords = [
    "BTC", "btc", "BCH", "bch", "Bitcoin", "bitcoin",
    "Litecoin", "litecoin", "LTC", "ltc",
    "Ether", "ether", "Ethereum", "ethereum",
    "ETH", "eth",
    "hodl", "HODL",
    "crypto", "cryptocurrency", "cryptocurrencies",
    "Crypto", "Cryptocurrency", "Cryptocurrencies",
    "ICO", "ico", "GDAX", "gdax",
    "Blockchain", "blockchain",
]

#Plain assignment: articles_final refers to the same DataFrame object as articles_raw (no copy is made)
articles_final = articles_raw

In [9]:
#Preview the first rows of the frame before keyword filtering
articles_final.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
14599,Stripe.com,A complete payments platform engineered for gr...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,2018-01-23 00:00:00+00:00,Ending Bitcoin Support
14591,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: Trudeau announces pacific trade de...
14592,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: Trump to tell Davos the US 'open f...
14593,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: VR film in Davos highlights cyber ...
14594,"Associated Press, By Associated Press","DAVOS, Switzerland — The latest on the World ...","DAVOS, Switzerland — The Latest on the World E...",Bostonherald.com,http://www.bostonherald.com/business/business_...,2018-01-23 00:00:00+00:00,The Latest: IMF's Lagarde faults Modi speech i...


In [None]:
#Traverse the articles and collect the index labels of those to remove.
#An article is marked for removal when
#  * its text or its title is not a string (e.g. a missing value), or
#  * none of the keywords occurs in its title or in the first LEDE_CHARS characters of its text.
#Matching is a case-sensitive substring test, so short keywords such as "eth" or "ico" also
#match inside longer words - NOTE(review): confirm that this is acceptable for the filter.
LEDE_CHARS = 300        #Heuristic length of an article's lede, in characters
PROGRESS_EVERY = 1000   #Print one progress line per this many articles

n_removed = 0
indices_to_drop = []

rows = zip(articles_final.index, articles_final['contents'], articles_final['title'])
for position, (i, contents, title) in enumerate(rows):
    #Check the types BEFORE slicing: slicing a missing (non-string) value would raise TypeError
    if not (isinstance(contents, str) and isinstance(title, str)):
        print("Found error in article", i, "- Passing")
        keep = False
    else:
        first300 = contents[:LEDE_CHARS] #Grab first characters of article text
        #any() stops at the first keyword that matches
        keep = any(keyword in first300 or keyword in title for keyword in keywords)

    if not keep:
        indices_to_drop.append(i)
        #Report only when the count actually reaches a new multiple of 100
        if len(indices_to_drop) % 100 == 0:
            print("Marked", len(indices_to_drop), "articles for removal")

    if position % PROGRESS_EVERY == 0:
        print("Article", i)

n_removed = len(indices_to_drop)
print("Total:", n_removed, "articles marked for removal.")

In [11]:
#Remove every article whose index label was collected in indices_to_drop.
#NOTE(review): DataFrame.drop raises KeyError by default for labels that are no longer present,
#so re-running this cell without recomputing indices_to_drop is expected to fail - confirm.
articles_final = articles_final.drop(indices_to_drop)

In [12]:
#Renumber the remaining articles 0..n-1 after the removal.
#drop=True discards the old index directly instead of first turning it into an 'index'
#column and deleting that column afterwards. The two-step version raises KeyError when the
#old index has a name, and deletes a real column if one is already called 'index'.
articles_final = articles_final.reset_index(drop=True)
articles_final.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
0,Stripe.com,A complete payments platform engineered for gr...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,2018-01-23 00:00:00+00:00,Ending Bitcoin Support
1,Editorial Team,As it scrambles to serve a massively expanding...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,2018-01-23 00:01:00+00:00,Coinbase hires former Twitter exec to lead cus...
2,Scott Scanlon,So many cryptocurrencies. So much money to be ...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,2018-01-23 00:03:12+00:00,Should you buy bitcoin? Or Ethereum? Or Dash? ...
3,http://www.dailymail.co.uk/home/search.html?s=...,By\n\nPress Association\n\t\nPublished:\n 19:...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,2018-01-23 00:05:47+00:00,World´s biggest Bitcoin exchange wins backing ...
4,Phil Glazer,Initial coin offerings (ICOs) are a regulatory...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,2018-01-23 00:06:02+00:00,Is Regulation Needed for Institutional Investo...


In [13]:
#Write the filtered articles to disk.
#NOTE(review): to_csv writes the index as an unnamed first column by default; pass index=False
#if readers of this file should not receive that extra column - confirm with the consumers.
articles_final.to_csv("final_articles.csv")

___