In [1]:
# Import Libraries and Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import datetime
import nltk
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
eng_stopwords = nltk.corpus.stopwords.words('english')

## Clean Articles Final Notebook 2

This notebook performs the following tasks:
    1. Standardizes timestamps
    2. Restrict date range to time-window of interest
    3. Preprocesses text for LSA/LSI
    4. Merges cleaned datasets for final processing    

In [2]:
articles = pd.read_csv("final_articles.csv").drop('Unnamed: 0', axis = 1)
articles = articles.sort_values('timeStamp', ascending = True)
articles.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
0,Stripe.com,A complete payments platform engineered for gr...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,2018-01-23 00:00:00+00:00,Ending Bitcoin Support
1,Editorial Team,As it scrambles to serve a massively expanding...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,2018-01-23 00:01:00+00:00,Coinbase hires former Twitter exec to lead cus...
2,Scott Scanlon,So many cryptocurrencies. So much money to be ...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,2018-01-23 00:03:12+00:00,Should you buy bitcoin? Or Ethereum? Or Dash? ...
3,http://www.dailymail.co.uk/home/search.html?s=...,By\n\nPress Association\n\t\nPublished:\n 19:...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,2018-01-23 00:05:47+00:00,World´s biggest Bitcoin exchange wins backing ...
4,Phil Glazer,Initial coin offerings (ICOs) are a regulatory...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,2018-01-23 00:06:02+00:00,Is Regulation Needed for Institutional Investo...


__Standardize timestamps__

In [3]:
temp = pd.DatetimeIndex(articles['timeStamp']) # Gather all datetime objects
articles['date'] = temp.date                   # Pull out the date from the datetime objects and assign to Date column
articles['time'] = temp.time                   # Pull out the time from the datetime objects and assign to Time column
print(len(articles))
articles.tail(3)

20572


Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title,date,time
20569,Avivah Litan,\tA member of the Gartner Blog Network\tby Avi...,Blockchain sessions were in full force at the ...,Gartner.com,https://blogs.gartner.com/avivah-litan/2018/04...,2018-04-23 23:22:25+00:00,"Blockchain, Crypto, and Women; Musings from RSAC",2018-04-23,23:22:25
20570,Cyberparse.co.uk,Enlarge / Simplified figurative process of a C...,Enlarge / Simplified figurative process of a C...,Cyberparse.co.uk,http://cyberparse.co.uk/2018/04/23/new-hacks-s...,2018-04-23 23:32:00+00:00,New hacks siphon private cryptocurrency keys f...,2018-04-23,23:32:00
20571,pjbyrne,"Disclaimer: English lawyer, not practising thi...","Disclaimer: English lawyer, not practising thi...",Prestonbyrne.com,https://prestonbyrne.com/2018/04/23/on-ethereu...,2018-04-23 23:59:35+00:00,Whether Ethereum is a security,2018-04-23,23:59:35


__Restrict data set to date range under consideration__

In [4]:
# iterate over dates and drop those above range of interest
stop = datetime.date(year=2018,month=3,day=26)

for day in range(len(articles['date'])):
    check = articles['date'].loc[day]
    if check > stop:
        articles.drop(day, inplace = True)

In [5]:
print(len(articles))
articles.tail(3)

15888


Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title,date,time
15885,newsfeeds@nzherald.co.nz,\t NEW YORK (AP) — Twitter says it will ban or...,NEW YORK (AP) — Twitter says it will ban or re...,Nzherald.co.nz,http://www.nzherald.co.nz/business/news/articl...,2018-03-26 23:25:06+00:00,"Twitter to ban cryptocurrency ads, joining Fac...",2018-03-26,23:25:06
15886,Reuters,Sign up now to get free exclusive access to re...,Twitter Inc will start banning cryptocurrency ...,Cio.com.au,https://www.cio.com.au/article/635378/twitter-...,2018-03-26 23:43:50+00:00,Twitter to ban cryptocurrency ads,2018-03-26,23:43:50
15887,Reuters,Sign up now to get free exclusive access to re...,Twitter Inc will start banning cryptocurrency ...,Cio.com.au,https://www.cio.com.au/article/635378/twitter-...,2018-03-26 23:43:50+00:00,Twitter to ban cryptocurrency ads,2018-03-26,23:43:50


__Preprocess text for NLP formulations__

In [6]:
#Clean the articles - Remove stopwords, remove punctuation, all lowercase
cleaned_texts = []
for text in articles['contents']:
    text = re.sub('[^a-zA-Z]' ,'\n',text)
    text = [word for word in text.split() if not word in eng_stopwords]
    text = (' '.join(text))
    text = text.lower()
    cleaned_texts.append(text)
    
print(cleaned_texts[0])
print()
print(articles['contents'][0])

a complete payments platform engineered growth build scale recurring business model everything platforms need get sellers paid your business data fingertips the best way start internet business fight fraud machine learning share post twitter tom karlo january at stripe long excited possibilities cryptocurrencies experimentation innovation come in became first major payments company support bitcoin payments our hope bitcoin could become universal decentralized substrate online transactions help customers enable buyers places less credit card penetration use cases credit card fees prohibitive over past year two block size limits reached bitcoin evolved become better suited asset means exchange given overall success bitcoin community achieved hard quibble decisions made along way and certainly happy see novel ambitious project well this led bitcoin becoming less useful payments however transaction confirmation times risen substantially turn led increase failure rate transactions denominat

In [7]:
articles['contents'] = cleaned_texts
articles.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title,date,time
0,Stripe.com,a complete payments platform engineered growth...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,2018-01-23 00:00:00+00:00,Ending Bitcoin Support,2018-01-23,00:00:00
1,Editorial Team,as scrambles serve massively expanding userbas...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,2018-01-23 00:01:00+00:00,Coinbase hires former Twitter exec to lead cus...,2018-01-23,00:01:00
2,Scott Scanlon,so many cryptocurrencies so much money made lo...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,2018-01-23 00:03:12+00:00,Should you buy bitcoin? Or Ethereum? Or Dash? ...,2018-01-23,00:03:12
3,http://www.dailymail.co.uk/home/search.html?s=...,by press association published edt january upd...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,2018-01-23 00:05:47+00:00,World´s biggest Bitcoin exchange wins backing ...,2018-01-23,00:05:47
4,Phil Glazer,initial coin offerings icos regulatory wild we...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,2018-01-23 00:06:02+00:00,Is Regulation Needed for Institutional Investo...,2018-01-23,00:06:02


__Combine cleaned articles with "Markers" from Time Series event detection__ 

In [8]:
markers = pd.read_csv("marked_dates_whole.csv").drop('Unnamed: 0', axis = 1)
markers.head()

Unnamed: 0,date,label
0,2018-01-27,1
1,2018-01-26,1
2,2018-01-28,1
3,2018-01-25,1
4,2018-01-29,1


In [9]:
temp = pd.DatetimeIndex(markers['date']) #Gather all datetime objects
markers['date'] = temp.date #Pull out the date from the datetime objects and assign to Date column
markers.head()

Unnamed: 0,date,label
0,2018-01-27,1
1,2018-01-26,1
2,2018-01-28,1
3,2018-01-25,1
4,2018-01-29,1


In [10]:
# comibine dataframes 
df = articles.merge(markers, how='outer', on="date")

In [12]:
# fill missing values with int 0
df['label'] = df['label'].fillna(0)
df.index = np.arange(0,len(df))
df.tail()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title,date,time,label
40397,Reuters,sign get free exclusive access reports researc...,Twitter Inc will start banning cryptocurrency ...,Cio.com.au,https://www.cio.com.au/article/635378/twitter-...,2018-03-26 23:43:50+00:00,Twitter to ban cryptocurrency ads,2018-03-26,23:43:50,31.0
40398,Reuters,sign get free exclusive access reports researc...,Twitter Inc will start banning cryptocurrency ...,Cio.com.au,https://www.cio.com.au/article/635378/twitter-...,2018-03-26 23:43:50+00:00,Twitter to ban cryptocurrency ads,2018-03-26,23:43:50,32.0
40399,,,,,,,,2018-03-27,,31.0
40400,,,,,,,,2018-03-27,,32.0
40401,,,,,,,,2018-03-28,,32.0


In [13]:
df.to_csv("cleaned_and_merged.csv")

___