In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

# Final Classification

This notebook performs the following processes:
    
    1. Feature Selection creation: Weighted-frequency of word vector
    2. Classification 

In [3]:
df = pd.read_csv('ready for classification').drop('Unnamed: 0', axis =1)
df.head()

Unnamed: 0,author,contents,description,publisher,source_url,title,date,time,label,c,marks
0,Stripe.com,a complete payments platform engineered growth...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,Ending Bitcoin Support,2018-01-23,00:00:00,0.0,"['a', 'complete', 'payments', 'platform', 'eng...",0
1,Courtney Goldsmith,tech investor tmt investments planning ramp in...,Tech investor TMT Investments is planning to r...,Cityam.com,http://www.cityam.com/282328/tech-investor-tmt...,Tech investor TMT is ready to ride the blockch...,2018-03-15,12:40:00,0.0,"['tech', 'investor', 'tmt', 'investments', 'pl...",0
2,Ricardo Esteves,bitcoin news price information analysis john m...,"John McAfee, founder of the software company M...",Newsbtc.com,https://www.newsbtc.com/2018/03/15/john-mcafee...,John McAfee Joins ‘Hackproof’ Startup CryptoSe...,2018-03-15,12:45:48,0.0,"['bitcoin', 'news', 'price', 'information', 'a...",0
3,Steve McCaskill,by steve mccaskill t z networking colt pccw gl...,Colt and PCCW Blockchain trial claims to have ...,TechRadar,https://www.techradar.com/news/blockchain-can-...,Blockchain can 'speed up' payment settelements...,2018-03-15,13:00:42,0.0,"['by', 'steve', 'mccaskill', 't', 'z', 'networ...",0
4,Lindsay Rowntree,lindsay rowntree th mar news comments blockcha...,Blockchain is a revolutionary technology that ...,Exchangewire.com,https://www.exchangewire.com/blog/2018/03/15/b...,"Blockchain Can Clean Up Programmatic, But We H...",2018-03-15,13:00:44,0.0,"['lindsay', 'rowntree', 'th', 'mar', 'news', '...",0


__Train/Test Split__

In [59]:
len(df[df.marks==1])

30024

In [60]:
len(df)

45217

In [6]:
#splitting the data into train and test
train, test = train_test_split(df, test_size=0.20, random_state= 67)

In [7]:
train_Y = train.marks

In [8]:
test_Y = test.marks

In [21]:
test_Y.shape

(11305,)

## Starting the Feature Selection Process

In [10]:
#performing Tf-idf on title
titles= list(train['title'])

tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)
tfidf_titles = tfidf.fit_transform(titles)


In [None]:
#for testing set
titles_test= list(test['title'])

tfidf_titles_test = tfidf.fit_transform(titles_test)

In [30]:
#performing Tf-idf on title
content= list(train['contents'])
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)
tfidf.fit(content)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=False,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [32]:
tfidf_contents = tfidf.transform(content)

In [33]:
tfidf_contents.shape


(33912, 94766)

In [34]:
#for testing set
titles_content= list(test['contents'])


tfidf_content_test = tfidf.transform(titles_content)

In [35]:
tfidf_content_test.shape

(11305, 94766)

__Functions for weighted-frequency W2V vectors__

In [None]:
## modified the code from open source website http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ 
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [9]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

__function calls and object creations__

In [10]:

model = word2vec.Word2Vec(train['c'], min_count=15)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))


In [11]:
model_test = word2vec.Word2Vec(test['c'], min_count=15)
w2v_test = dict(zip(model_test.wv.index2word, model_test.wv.syn0))

In [12]:
#tf_idf with word to vec
t = TfidfEmbeddingVectorizer(w2v)
t.fit(train['contents'])
X = t.transform(train['contents'])

In [14]:
#the same for test
X_test = t.transform(test['contents'])

In [15]:
X.shape

(36173, 100)

In [53]:
tfidf_titles.shape

(38434, 13237)

In [None]:
np.append(X_test, np.array([binary_html_test]).T,1)

In [51]:
features = np.append(X, np.array([tfidf_titles]).T)

In [52]:
features.shape

(3843401,)

In [39]:
features_test = np.append(X_test, tfidf_titles_test)

In [43]:
features.shape

(3843401,)

## Model Training: 

### Random Forrest Classifier on Weighted Frequency Features

In [16]:
#training random forest
from sklearn.ensemble import RandomForestClassifier
rf_d = RandomForestClassifier(n_estimators=1000)
rf_d.fit(X, train_Y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

__Training and Testing Accuracies__

In [17]:
print('training accuracy is: ', rf_d.score(X, train_Y))

training accuracy is:  0.860448400741


In [18]:
print('testing accuracy is:', rf_d.score(X_test, test_Y))

testing accuracy is: 0.63025210084


### Random Forrest Classifier on tf-idf features

In [None]:
#training random forest
from sklearn.ensemble import RandomForestClassifier
rf_d = RandomForestClassifier(n_estimators=1000)
rf_d.fit(tfidf_titles, train_Y)


__Training and Testing Accuracies__

In [None]:
print('training accuracy is: ', rf_d.score(X, train_Y))

### Logistic Regression Classifier

In [13]:
#training a logistic regression model and recording the accuracy
from sklearn.linear_model import LogisticRegression
lr_od = LogisticRegression()
lr_od.fit(tfidf_contents, train_Y)
print('training accuracy is: ',lr_od.score(tfidf_contents, train_Y))
# print('testing accuracy is:', lr_od.score(tfidf_content_test, test_Y))

training accuracy is:  0.771142958245


__Training and Testing Accuracies__

In [36]:
print('testing accuracy is:', lr_od.score(tfidf_content_test, test_Y))

testing accuracy is: 0.713047324193


In [23]:
df_without_clusters = pd.read_csv('cleaned_and_merged.csv')
df_without_clusters

Unnamed: 0.1,Unnamed: 0,author,contents,description,publisher,source_url,title,date,time,label
0,0,Stripe.com,a complete payments platform engineered growth...,"At Stripe, we’ve long been excited about the p...",Stripe.com,https://stripe.com/blog/ending-bitcoin-support,Ending Bitcoin Support,2018-01-23,00:00:00,0.0
1,1,Editorial Team,as scrambles serve massively expanding userbas...,As it scrambles to serve a massively expanding...,Finextra.com,https://www.finextra.com/newsarticle/31558/coi...,Coinbase hires former Twitter exec to lead cus...,2018-01-23,00:01:00,0.0
2,2,Scott Scanlon,so many cryptocurrencies so much money made lo...,So many cryptocurrencies. So much money to be ...,Youbrandinc.com,https://www.youbrandinc.com/crytocurrency/shou...,Should you buy bitcoin? Or Ethereum? Or Dash? ...,2018-01-23,00:03:12,0.0
3,3,http://www.dailymail.co.uk/home/search.html?s=...,by press association published edt january upd...,The Tokyo-based firm has been awarded a paymen...,Daily Mail,http://www.dailymail.co.uk/wires/pa/article-53...,World´s biggest Bitcoin exchange wins backing ...,2018-01-23,00:05:47,0.0
4,4,Phil Glazer,initial coin offerings icos regulatory wild we...,Initial coin offerings (ICOs) are a regulatory...,Hackernoon.com,https://hackernoon.com/is-regulation-needed-fo...,Is Regulation Needed for Institutional Investo...,2018-01-23,00:06:02,0.0
5,5,Financial Times,hannah murphy london bitflyer tokyo based oper...,Exchange to offer trading of bitcoin/euro pair...,Financial Times,https://www.ft.com/content/5bf1462c-ff84-11e7-...,Japan’s bitFlyer set to stage Europe expansion,2018-01-23,00:06:21,0.0
6,6,Topix.com,not topix user yet forgot your password news p...,The fundamentals are as untenable as they were...,Topix.com,http://www.topix.com/tech/p2p/2018/01/dont-try...,Don't Try To Catch The Bitcoin Knife,2018-01-23,00:11:59,0.0
7,7,Chloe Aiello,var postloadfunctions var foresee enabled var ...,ETF Managers Group Founder and CEO Sam Masucci...,CNBC,https://www.cnbc.com/2018/01/22/marijuana-etf-...,Here's why one investor is way more comfortabl...,2018-01-23,00:26:00,0.0
8,8,Bitcoinist.net,real time prices vires numeris bitcoin ethereu...,Miami has apparently witnessed the “hottest” w...,Bitcoinist.com,http://bitcoinist.com/achain-showcased-collabo...,Achain Showcased its Collaborative Platform at...,2018-01-23,00:30:10,0.0
9,9,Sasha Lekach,the bitcoin craze officially jumped real estat...,The bitcoin craze has officially jumped to rea...,Mashable,http://mashable.com/2018/01/22/cryptohomes-rea...,Homeowners everywhere are listing their proper...,2018-01-23,00:36:22,0.0


In [24]:
df_without_clusters['mark'] =df_without_clusters.label.apply(lambda a: (a>0)*True)

In [29]:
#splitting the data into train and test
train1, test1 = train_test_split(df_without_clusters, test_size=0.20, random_state= 67)

In [None]:
#performing Tf-idf on title
titles1= list(train1['contents'])


tfidf1 = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True)
tfidf_titles1 = tfidf1.fit_transform(titles1)

In [None]:

X_test1 = t_test1.transform(test1['c'])

___