In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from datetime import datetime
import nltk
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
eng_stopwords = nltk.corpus.stopwords.words('english')

In [100]:
articles = pd.read_csv("final_articles.csv").drop('Unnamed: 0', axis = 1)
articles.head()

Unnamed: 0,author,contents,description,publisher,source_url,timeStamp,title
0,Stripe.com,A complete payments platform engineered for gr...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,2018-01-23 00:00:00+00:00,Ending Bitcoin Support
1,Editorial Team,As it scrambles to serve a massively expanding...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,2018-01-23 00:01:00+00:00,Coinbase hires former Twitter exec to lead cus...
2,Scott Scanlon,So many cryptocurrencies. So much money to be ...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,2018-01-23 00:03:12+00:00,Should you buy bitcoin? Or Ethereum? Or Dash? ...
3,http://www.dailymail.co.uk/home/search.html?s=...,By\n\nPress Association\n\t\nPublished:\n 19:...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,2018-01-23 00:05:47+00:00,World´s biggest Bitcoin exchange wins backing ...
4,Phil Glazer,Initial coin offerings (ICOs) are a regulatory...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,2018-01-23 00:06:02+00:00,Is Regulation Needed for Institutional Investo...


In [101]:
temp = pd.DatetimeIndex(articles['timeStamp']) #Gather all datetime objects
articles['date'] = temp.date #Pull out the date from the datetime objects and assign to Date column
articles['time'] = temp.time #Pull out the time from the datetime objects and assign to Time column
del articles['timeStamp'] #Delete original datetime column
articles.head()

Unnamed: 0,author,contents,description,publisher,source_url,title,date,time
0,Stripe.com,A complete payments platform engineered for gr...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,Ending Bitcoin Support,2018-01-23,00:00:00
1,Editorial Team,As it scrambles to serve a massively expanding...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,Coinbase hires former Twitter exec to lead cus...,2018-01-23,00:01:00
2,Scott Scanlon,So many cryptocurrencies. So much money to be ...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,Should you buy bitcoin? Or Ethereum? Or Dash? ...,2018-01-23,00:03:12
3,http://www.dailymail.co.uk/home/search.html?s=...,By\n\nPress Association\n\t\nPublished:\n 19:...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,World´s biggest Bitcoin exchange wins backing ...,2018-01-23,00:05:47
4,Phil Glazer,Initial coin offerings (ICOs) are a regulatory...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,Is Regulation Needed for Institutional Investo...,2018-01-23,00:06:02


In [105]:
#Clean the articles - Remove stopwords, remove punctuation, all lowercase
cleaned_texts = []
for text in articles['contents']:
    text = re.sub('[^a-zA-Z]' ,'\n',text)
    text = [word for word in text.split() if not word in eng_stopwords]
    text = (' '.join(text))
    text = text.lower()
    cleaned_texts.append(text)
    
print(cleaned_texts[0])
print()
print(articles['contents'][0])

a complete payments platform engineered growth build scale recurring business model everything platforms need get sellers paid your business data fingertips the best way start internet business fight fraud machine learning share post twitter tom karlo january at stripe long excited possibilities cryptocurrencies experimentation innovation come in became first major payments company support bitcoin payments our hope bitcoin could become universal decentralized substrate online transactions help customers enable buyers places less credit card penetration use cases credit card fees prohibitive over past year two block size limits reached bitcoin evolved become better suited asset means exchange given overall success bitcoin community achieved hard quibble decisions made along way and certainly happy see novel ambitious project well this led bitcoin becoming less useful payments however transaction confirmation times risen substantially turn led increase failure rate transactions denominat

In [108]:
articles['contents'] = cleaned_texts
articles.head()

Unnamed: 0,author,contents,description,publisher,source_url,title,date,time
0,Stripe.com,a complete payments platform engineered growth...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,Ending Bitcoin Support,2018-01-23,00:00:00
1,Editorial Team,as scrambles serve massively expanding userbas...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,Coinbase hires former Twitter exec to lead cus...,2018-01-23,00:01:00
2,Scott Scanlon,so many cryptocurrencies so much money made lo...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,Should you buy bitcoin? Or Ethereum? Or Dash? ...,2018-01-23,00:03:12
3,http://www.dailymail.co.uk/home/search.html?s=...,by press association published edt january upd...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,World´s biggest Bitcoin exchange wins backing ...,2018-01-23,00:05:47
4,Phil Glazer,initial coin offerings icos regulatory wild we...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,Is Regulation Needed for Institutional Investo...,2018-01-23,00:06:02


In [106]:
markers = pd.read_csv("marked_dates_whole.csv").drop('Unnamed: 0', axis = 1)
markers.head()

Unnamed: 0,date,label
0,2018-01-27,1
1,2018-01-26,1
2,2018-01-28,1
3,2018-01-25,1
4,2018-01-29,1


In [107]:
for i in range(0, len(markers['date'])):    
    markers['date'][i] = datetime.strptime(markers['date'][i], "%Y-%m-%d")

type(markers['date'][0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


datetime.datetime

In [109]:
temp = pd.DatetimeIndex(markers['date']) #Gather all datetime objects
markers['date'] = temp.date #Pull out the date from the datetime objects and assign to Date column
markers.head()

Unnamed: 0,date,label
0,2018-01-27,1
1,2018-01-26,1
2,2018-01-28,1
3,2018-01-25,1
4,2018-01-29,1


In [110]:
df = articles.merge(markers, how='outer', on="date")

In [111]:
df['label'] = df['label'].fillna(0)
df.head()

Unnamed: 0,author,contents,description,publisher,source_url,title,date,time,label
0,Stripe.com,a complete payments platform engineered growth...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,Ending Bitcoin Support,2018-01-23,00:00:00,0.0
1,Editorial Team,as scrambles serve massively expanding userbas...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,Coinbase hires former Twitter exec to lead cus...,2018-01-23,00:01:00,0.0
2,Scott Scanlon,so many cryptocurrencies so much money made lo...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,Should you buy bitcoin? Or Ethereum? Or Dash? ...,2018-01-23,00:03:12,0.0
3,http://www.dailymail.co.uk/home/search.html?s=...,by press association published edt january upd...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,World´s biggest Bitcoin exchange wins backing ...,2018-01-23,00:05:47,0.0
4,Phil Glazer,initial coin offerings icos regulatory wild we...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,Is Regulation Needed for Institutional Investo...,2018-01-23,00:06:02,0.0


In [112]:
df.to_csv("cleaned_and_merged.csv")