In [106]:
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
import numpy as np
import pandas as pd
import pickle
import keras
from tqdm import tqdm_notebook
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

Using TensorFlow backend.


This notebook trains a doc2vec model with gensim, transforms all texts into document vectors, and then evaluates those vectors with a logistic-regression baseline and a simple feedforward neural network built in Keras.

In [2]:
# Work from the project data folder so the relative 'df/...' paths used below resolve.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
%cd '/data/hyperpartisan-news-detection'

/data/hyperpartisan-news-detection


# train doc2vec model

In [3]:
# Only the article text is needed to train doc2vec; rows without text are dropped.
train, test, byarticle = (
    pd.read_csv(csv_path)['text'].dropna()
    for csv_path in ('df/train_df.csv', 'df/test_df.csv', 'df/byarticle_df.csv')
)

In [4]:
# Tokenise every article with simple_preprocess. Train and test documents are wrapped
# as TaggedDocument, tagged with their position in the (null-free) series; the
# by-article texts are only tokenised.
train = [
    TaggedDocument(words=simple_preprocess(text), tags=[position])
    for position, text in enumerate(train)
]
test = [
    TaggedDocument(words=simple_preprocess(text), tags=[position])
    for position, text in enumerate(test)
]
byarticle = list(map(simple_preprocess, byarticle))

In [5]:
# Untrained doc2vec model: 50-dimensional document vectors, 10 training epochs.
# The Keras network further down is built with input_dim=50, so vector_size must stay in sync with it.
model = Doc2Vec(vector_size=50, min_count=5, epochs=10)

In [6]:
# Build the model vocabulary from the tagged training documents (run before model.train).
model.build_vocab(train)

In [7]:
# Train on the tagged training documents for model.epochs passes.
# Long-running: the recorded run took about 1h 7min of wall time.
%time model.train(train, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 2h 46min 24s, sys: 3min 24s, total: 2h 49min 49s
Wall time: 1h 7min 18s


In [9]:
# Save a first copy of the trained model to the path returned by gensim's get_tmpfile helper
# (presumably a location inside a temporary directory — confirm against the gensim docs).
fname = get_tmpfile("doc2vec_model")
model.save(fname)
# Kept for reference, not executed: how to reload the model and trim training-only state.
#model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
#model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [15]:
# Save the trained model relative to the working directory; a later cell reloads it from this name.
model.save('doc2vec_model')

# convert all docs to vectors and save

In [3]:
# Reload the saved model from disk; add_col reads this object through the name model_load.
fname = 'doc2vec_model'
model_load = Doc2Vec.load(fname) 

In [24]:
# Reload the complete frames (every column, rows with missing text included) so the
# inferred vectors can be stored next to the existing columns.
train, test, byarticle = (
    pd.read_csv(csv_path)
    for csv_path in ('df/train_df.csv', 'df/test_df.csv', 'df/byarticle_df.csv')
)

In [25]:
def add_col(df, model=None):
    """Attach an inferred doc2vec vector to every row of ``df`` that has text.

    A ``doc2vec`` column of dtype ``object`` is added to ``df`` in place, and the
    same frame is returned. Rows whose ``text`` is null keep ``NaN`` in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.
    model : optional
        Object exposing ``infer_vector(tokens)``. Defaults to the notebook-level
        ``model_load``, so existing ``add_col(df)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the ``doc2vec`` column filled in.
    """
    if model is None:
        # Resolved at call time: the model-loading cell only has to run before the first call.
        model = model_load
    df['doc2vec'] = np.nan
    # object dtype lets a single cell hold a whole vector
    df['doc2vec'] = df['doc2vec'].astype(object)
    # Walk the text column directly; iterrows() would build a full Series for every
    # row even though only 'text' is read.
    for idx, text in tqdm_notebook(df['text'].items(), total=df.shape[0]):
        if pd.isnull(text):
            continue
        df.at[idx, 'doc2vec'] = model.infer_vector(simple_preprocess(text))
    return df

In [26]:
# Infer vectors for the by-article set (645 rows in the recorded run).
byarticle = add_col(byarticle)

HBox(children=(IntProgress(value=0, max=645), HTML(value='')))




In [28]:
# Infer vectors for the test set (150000 rows in the recorded run).
test = add_col(test)

HBox(children=(IntProgress(value=0, max=150000), HTML(value='')))




In [72]:
# Infer vectors for the training set (600000 rows in the recorded run).
train = add_col(train)

HBox(children=(IntProgress(value=0, max=600000), HTML(value='')))




In [98]:
def update_df(df):
    """Return a copy of ``df`` with the ``doc2vec`` column spread over one column per component.

    The vectors cannot stay in a single column because ``to_csv`` would write each
    of them out as a string. The input frame is left untouched; the new columns are
    appended after the remaining original columns.
    """
    vector_columns = df['doc2vec'].apply(pd.Series)
    other_columns = df.drop(columns='doc2vec')
    return pd.concat([other_columns, vector_columns], axis='columns')

In [102]:
# Replace the single 'doc2vec' column of each frame with one column per vector component.
# Done one frame at a time so each old frame can be released as soon as its name is rebound.
byarticle = update_df(byarticle)
test = update_df(test)
train = update_df(train)

In [104]:
# Persist the expanded frames (without the pandas index) for the modelling cells below.
for frame, destination in (
    (train, 'df/train_df_d2v.csv'),
    (test, 'df/test_df_d2v.csv'),
    (byarticle, 'df/byarticle_df_d2v.csv'),
):
    frame.to_csv(destination, index=False)

# test with logistic regression

In [128]:
# prepare dataset: reload the saved feature frames and drop incomplete rows
# NOTE(review): dropna() discards a row when ANY column is null, not only the
# vector columns — confirm that rows with other missing fields should be excluded.
train, test, byarticle = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        'df/train_df_d2v.csv',
        'df/test_df_d2v.csv',
        'df/byarticle_df_d2v.csv',
    )
)

In [129]:
# Preview the model inputs: every column from position 4 onward
# (the doc2vec components 0..49 in the recorded output).
train[train.columns[4:]].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-1.716436,0.706684,-0.086777,-0.524081,-0.114935,0.241039,0.197158,-0.156622,0.129688,1.137669,...,0.447064,-0.315349,0.397872,0.960722,1.146916,-0.69739,-0.575663,0.195232,-0.529706,-0.722719
1,-0.353557,0.020375,1.06034,-0.36451,-0.069735,1.911804,0.378816,0.590775,0.415128,1.305202,...,-1.069766,0.143367,0.364875,0.43479,0.562769,0.327206,-0.244797,1.245764,-0.053976,0.403655
2,-0.348614,0.729325,0.362333,-0.658784,-0.986217,1.053551,1.782581,0.894405,0.189247,0.883016,...,-0.028958,0.135226,0.256453,1.278639,0.237575,-0.361848,-0.70773,0.381473,-0.580781,1.007206
3,-0.342161,1.712023,-0.156347,-0.343225,-0.441173,0.518577,-0.033561,0.812273,-0.518142,2.251147,...,-1.381582,1.078593,0.525991,0.87058,-0.732706,-1.782752,0.578307,1.085978,0.937498,0.365913
4,1.347809,-1.603338,-1.08381,0.831758,0.58977,0.247571,-0.950736,-0.31326,0.593456,-1.296615,...,0.936097,0.240834,1.035502,-0.20084,0.024016,-0.457281,0.745475,-1.427883,-2.120091,0.455509


In [130]:
# Baseline: logistic regression fitted on the training document vectors.
clf = LogisticRegression(verbose=1)
clf.fit(train[train.columns[4:]], train['label'])

# Mean accuracy per dataset. The feature columns start at position 3 in the
# by-article frame and at position 4 in the train/test frames.
for heading, frame, first_feature in (
    ('byarticle score: ', byarticle, 3),
    ('train score: ', train, 4),
    ('test score: ', test, 4),
):
    print(heading)
    print(clf.score(frame[frame.columns[first_feature:]], frame['label']))



[LibLinear]byarticle score: 
0.5736434108527132
train score: 
0.7607104647275159
test score: 
0.5513124931924627


# Keras model

In [132]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [133]:
# Fix NumPy's global random seed before the Keras models are built.
# NOTE(review): only NumPy is seeded here; whether that is sufficient for repeatable
# runs on the TensorFlow backend is not established by this notebook — confirm.
seed = 7
np.random.seed(seed)

In [136]:
# Model example 1: one hidden layer of 64 ReLU units feeding a single sigmoid output.
# input_dim=50 is the doc2vec vector size used when the vectors were trained.
model_d2v_01 = Sequential([
    Dense(64, activation='relu', input_dim=50),
    Dense(1, activation='sigmoid'),
])
model_d2v_01.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The test split is passed as validation data, so val_loss / val_acc are test-set figures.
model_d2v_01.fit(
    x=train[train.columns[4:]],
    y=train['label'],
    batch_size=32,
    epochs=5,
    verbose=2,
    validation_data=(test[test.columns[4:]], test['label']),
)

Train on 598200 samples, validate on 137715 samples
Epoch 1/5
 - 77s - loss: 0.4354 - acc: 0.7962 - val_loss: 0.8469 - val_acc: 0.5572
Epoch 2/5
 - 76s - loss: 0.4173 - acc: 0.8062 - val_loss: 0.8997 - val_acc: 0.5482
Epoch 3/5
 - 75s - loss: 0.4122 - acc: 0.8092 - val_loss: 0.8266 - val_acc: 0.5688
Epoch 4/5
 - 76s - loss: 0.4091 - acc: 0.8107 - val_loss: 0.8715 - val_acc: 0.5529
Epoch 5/5
 - 77s - loss: 0.4068 - acc: 0.8120 - val_loss: 0.8900 - val_acc: 0.5575


<keras.callbacks.History at 0x7f092425a2b0>

In [139]:
# Loss and accuracy of the network on the by-article set. Its feature columns start at
# position 3, whereas the train/test frames use position 4.
model_d2v_01.evaluate(x=byarticle[byarticle.columns[3:]], y=byarticle['label'])



[0.9181516446808512, 0.5333333335181539]