In [116]:
# Import libraries and packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from datetime import datetime

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec

# Plot styling
sns.set(style='white', context='notebook', palette='deep')

import nltk
from nltk.cluster import KMeansClusterer
from nltk.cluster import euclidean_distance

from sklearn import cluster
from sklearn import metrics
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec
from gensim.models import word2vec

from collections import defaultdict

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by pandas, gensim
# or scikit-learn in later cells are hidden as well.
warnings.filterwarnings('ignore')

# English stop-word list from NLTK; not referenced again in the visible cells.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be downloaded
# beforehand - confirm for fresh environments.
eng_stopwords = nltk.corpus.stopwords.words('english')

## Word2Vec and KMeans Clustering

This notebook performs the following tasks:
    1. Creates W2V vectors of given features
    2. Computes TF-IDF for given features
    3. Performs preprocessing 
    4. Carries out Clustering Analysis

__Bag of Words (BOW) and Word2Vec (W2V) related functions__

In [4]:
# Adapted from the open-source example at http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/
class MeanEmbeddingVectorizer(object):
    """Embed each tokenized document as the unweighted mean of its word vectors.

    Parameters
    ----------
    word2vec : mapping of token -> 1-D numeric vector
        Must contain at least one entry: the embedding dimensionality is read
        from its first value.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is accepted but ignored. It is optional so that ``fit(X)`` works
        the same way as ``TfidfEmbeddingVectorizer.fit``.
        """
        return self

    def transform(self, X):
        """Return one mean embedding per document in ``X``.

        Parameters
        ----------
        X : iterable of iterables of tokens

        Returns
        -------
        numpy.ndarray
            One row of length ``self.dim`` per document. Tokens missing from
            ``word2vec`` are skipped; a document with no known tokens is
            mapped to a zero vector.
        """
        return np.array([
            # `or` substitutes a single zero vector when no token is known,
            # so np.mean never sees an empty list.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [3]:
class TfidfEmbeddingVectorizer(object):
    """Embed each tokenized document as the mean of its IDF-weighted word vectors.

    ``word2vec`` maps token -> vector. ``word2weight`` is ``None`` until
    ``fit`` has been called and afterwards maps token -> IDF weight.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        # Dimensionality is taken from the first vector in the mapping.
        first_vector = next(iter(word2vec.values()))
        self.dim = len(first_vector)

    def fit(self, X):
        """Learn IDF weights from the tokenized documents ``X``; return ``self``."""
        # The documents are already token lists, so the analyzer is the identity.
        vectorizer = TfidfVectorizer(analyzer=lambda x: x)
        vectorizer.fit(X)
        idf_values = vectorizer.idf_
        # A word that was never seen must be at least as infrequent as any
        # known word, so unknown words default to the largest known IDF.
        max_idf = max(idf_values)
        weights = defaultdict(lambda: max_idf)
        for token, column in vectorizer.vocabulary_.items():
            weights[token] = idf_values[column]
        self.word2weight = weights

        return self

    def transform(self, X):
        """Return one row per document: the mean of ``vector * weight`` over its known tokens.

        Tokens absent from ``word2vec`` are skipped; a document with no known
        tokens yields a zero vector of length ``self.dim``.
        """
        embedded = []
        for words in X:
            weighted = [self.word2vec[w] * self.word2weight[w]
                        for w in words if w in self.word2vec]
            if not weighted:
                weighted = [np.zeros(self.dim)]
            embedded.append(np.mean(weighted, axis=0))
        return np.array(embedded)

In [102]:
# Load the cleaned and merged article table, then discard the 'Unnamed: 0'
# column (presumably an index saved by an earlier export - confirm).
df = pd.read_csv('cleaned_and_merged.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,author,contents,description,publisher,source_url,title,date,time,label
0,Stripe.com,a complete payments platform engineered growth...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,Ending Bitcoin Support,2018-01-23,00:00:00,0.0
1,Editorial Team,as scrambles serve massively expanding userbas...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,Coinbase hires former Twitter exec to lead cus...,2018-01-23,00:01:00,0.0
2,Scott Scanlon,so many cryptocurrencies so much money made lo...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,Should you buy bitcoin? Or Ethereum? Or Dash? ...,2018-01-23,00:03:12,0.0
3,http://www.dailymail.co.uk/home/search.html?s=...,by press association published edt january upd...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,World´s biggest Bitcoin exchange wins backing ...,2018-01-23,00:05:47,0.0
4,Phil Glazer,initial coin offerings icos regulatory wild we...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,Is Regulation Needed for Institutional Investo...,2018-01-23,00:06:02,0.0


### Methodology:

1. Make a dictionary for each event with word2vec
2. Perform feature engineering
3. Cluster Analysis of features
4. Rank clusters and mark the largest cluster (i.e. the cluster with the most articles)

__Preprocessing__

In [104]:
# `temp` is a second name for the same DataFrame object as `df`, not a copy:
# the column added to `temp` in the next cell is therefore also visible via `df`.
temp = df

In [106]:
# Tokenize each article: split the 'contents' string on whitespace and store
# the resulting word list in a new column 'c'.
tokenized_contents = temp['contents'].str.split()
temp['c'] = tokenized_contents

In [107]:
# Generate the list of bad indices to drop: index labels of rows whose
# tokenized contents is not a list (e.g. missing contents, for which
# .str.split() does not produce a list).
# Iterating (label, value) pairs keeps this correct even when the index is
# not the default 0..n-1 range, and avoids repeated chained lookups.
bad_indices = [
    row_label
    for row_label, tokens in temp['c'].items()
    if not isinstance(tokens, list)
]

In [36]:
# Label-0 rows with an integer label column. `.copy()` makes trial_df an
# independent frame, so the assignment below neither triggers a
# SettingWithCopy warning nor risks touching `df`.
# NOTE(review): trial_df is not used in any other visible cell.
trial_df = df.loc[df.label == 0].copy()
trial_df['label'] = trial_df['label'].astype('int64')

In [108]:
# Remove, in place, the rows whose index labels were flagged in bad_indices.
df.drop(index=bad_indices, inplace=True)

In [118]:
# Order the rows by ascending event label so that label-0 rows come first.
df = df.sort_values(by='label')
df.head()

Unnamed: 0,author,contents,description,publisher,source_url,title,date,time,label,c
0,Stripe.com,a complete payments platform engineered growth...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,Ending Bitcoin Support,2018-01-23,00:00:00,0.0,"[a, complete, payments, platform, engineered, ..."
32286,Courtney Goldsmith,tech investor tmt investments planning ramp in...,Tech investor TMT Investments is planning to r...,Cityam.com,http://www.cityam.com/282328/tech-investor-tmt...,Tech investor TMT is ready to ride the blockch...,2018-03-15,12:40:00,0.0,"[tech, investor, tmt, investments, planning, r..."
32287,Ricardo Esteves,bitcoin news price information analysis john m...,"John McAfee, founder of the software company M...",Newsbtc.com,https://www.newsbtc.com/2018/03/15/john-mcafee...,John McAfee Joins ‘Hackproof’ Startup CryptoSe...,2018-03-15,12:45:48,0.0,"[bitcoin, news, price, information, analysis, ..."
32288,Steve McCaskill,by steve mccaskill t z networking colt pccw gl...,Colt and PCCW Blockchain trial claims to have ...,TechRadar,https://www.techradar.com/news/blockchain-can-...,Blockchain can 'speed up' payment settelements...,2018-03-15,13:00:42,0.0,"[by, steve, mccaskill, t, z, networking, colt,..."
32289,Lindsay Rowntree,lindsay rowntree th mar news comments blockcha...,Blockchain is a revolutionary technology that ...,Exchangewire.com,https://www.exchangewire.com/blog/2018/03/15/b...,"Blockchain Can Clean Up Programmatic, But We H...",2018-03-15,13:00:44,0.0,"[lindsay, rowntree, th, mar, news, comments, b..."


In [125]:
def cluster(df, min_count=15, random_state=None):
    """
    Mark, for every event label >= 1, the articles in the larger K-Means cluster.

    For each integer label ``i`` from 1 to ``int(max(df.label))`` the rows with
    ``label == i`` are embedded (Word2Vec trained on that batch, combined with
    TF-IDF weights via ``TfidfEmbeddingVectorizer``) and split into two
    clusters. Rows in the cluster holding more than half of the batch get 1,
    the others get 0. On an exact tie cluster 1 is treated as the majority.

    Accepts:
        df: a pandas dataframe with a numeric ``label`` column and a ``c``
            column holding each article's list of tokens.
        min_count: minimum word frequency passed to Word2Vec (default 15,
            the value previously hard-coded).
        random_state: forwarded to KMeans; ``None`` (default) keeps the
            previous unseeded behaviour, an int makes the clustering repeatable.
    Returns: flat list of 0's and 1's, ordered by label (1, 2, ...) and, within
        a label, in the row order of ``df``. Rows with label 0 get no entry.

    Note: defining this function rebinds the name ``cluster``, which the import
    cell also binds to the ``sklearn.cluster`` module.
    """
    cluster_labels = []
    for i in range(1, int(max(df.label)) + 1):
        batch = df.loc[df.label == i]
        if batch.empty:
            # No article carries this label, so there is nothing to mark.
            continue
        model = word2vec.Word2Vec(batch.c, min_count=min_count)
        # gensim 4 renamed index2word -> index_to_key and syn0 -> vectors;
        # prefer the new names and fall back to the old ones.
        wv = model.wv
        vocab = wv.index_to_key if hasattr(wv, 'index_to_key') else wv.index2word
        vectors = wv.vectors if hasattr(wv, 'vectors') else wv.syn0
        w2v = dict(zip(vocab, vectors))
        t = TfidfEmbeddingVectorizer(w2v)
        t.fit(batch.c)
        X = t.transform(batch.c)
        kmeans = KMeans(n_clusters=2, random_state=random_state)
        kmeans.fit(X)
        assigned = np.asarray(kmeans.labels_)
        # Fraction of the batch in cluster 0. The previous len(mask)/len(labels)
        # was always 1.0, so cluster 0 was marked regardless of its size.
        majority = 0 if np.mean(assigned == 0) > 0.5 else 1
        cluster_labels.extend((assigned == majority).astype(int).tolist())
    return cluster_labels

In [126]:
# Run the per-event clustering. The result is a flat list of 0/1 marks that
# covers only rows with label >= 1 (cluster() starts its loop at label 1).
final_labels = cluster(df)

In [133]:
# Display the marks returned by cluster().
# NOTE(review): this echoes the entire list; a slice such as final_labels[:10]
# would keep the notebook output short.
final_labels

[1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [134]:
# Rows with label 0 are not clustered; give each of them a mark of 0.
zero_labels = [0] * int((df['label'] == 0).sum())
zero_labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


__Analyze Results__

In [135]:
# Sanity check: zero marks plus cluster marks must add up to the number of
# rows in df, because the combined list is later assigned as a df column.
len(zero_labels)+len(final_labels)

45217

In [136]:
# Combine the marks: zeros for the label-0 rows followed by the cluster marks
# for labels >= 1. Slicing back to the label-0 row count before concatenating
# makes this cell safe to re-run; a plain .extend() would append the cluster
# marks again on every execution and break the later column assignment.
n_zero_rows = int((df['label'] == 0).sum())
zero_labels = zero_labels[:n_zero_rows] + list(final_labels)

In [137]:
# Length of the combined marks list; it should equal the total computed
# before the two lists were merged.
len(zero_labels)

45217

In [139]:
# Attach the marks as a new column. The list is matched to rows by position,
# which relies on df being sorted by label so that label-0 rows come first.
df = df.assign(marks=zero_labels)
df.tail()

Unnamed: 0,author,contents,description,publisher,source_url,title,date,time,label,c,marks
40170,Kimberly Chin,amd heyday chip choice ethereum miners come en...,AMD has received up to 20% of its total revenu...,Business Insider,http://www.businessinsider.com/amd-stock-price...,AMD will get crushed in the race to be the ult...,2018-03-26,20:20:00,32.0,"[amd, heyday, chip, choice, ethereum, miners, ...",1
40173,Kimberly Chin,amd heyday chip choice ethereum miners come en...,AMD has received up to 20% of its total revenu...,Business Insider,http://www.businessinsider.com/amd-stock-price...,AMD will get crushed in the race to be the ult...,2018-03-26,20:20:00,32.0,"[amd, heyday, chip, choice, ethereum, miners, ...",1
40176,Camila Russo,connecting decision makers dynamic network inf...,"Coinbase Inc., which owns one of the largest U...",Bloomberg,https://www.bloomberg.com/news/articles/2018-0...,Crypto Exchange Coinbase Adds Support for Ethe...,2018-03-26,20:21:15,32.0,"[connecting, decision, makers, dynamic, networ...",1
40116,Bloomberg News,this section produced editorial department the...,Crypto company Arbitrade is 'weeks away' from ...,Financial Post,http://business.financialpost.com/technology/b...,Gold casts of Nelson Mandela’s hands sell to O...,2018-03-26,19:45:57,32.0,"[this, section, produced, editorial, departmen...",1
39825,RT,andy bauch created new money series brick bric...,A day rarely goes by without Bitcoin in the ne...,RT,https://www.rt.com/usa/422372-la-artist-hides-...,"Bitcoin LEGO mystery: LA artist hides ‘$10,000...",2018-03-26,15:56:00,32.0,"[andy, bauch, created, new, money, series, bri...",1


In [140]:
# Write the marked frame to the working directory. The file name contains
# spaces and no '.csv' extension; the index is written too (to_csv default).
# NOTE(review): left unchanged in case a downstream step reads this exact path.
df.to_csv('ready for classification')

---