In [1]:
# Drew on some of this work: https://www.kaggle.com/vsmolyakov/keras-cnn-with-fasttext-embeddings

In [17]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
import os, re, csv, math, codecs
import numpy as np
from keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout
from keras.models import Sequential
from keras import regularizers, optimizers
from keras.callbacks import EarlyStopping

In [3]:
nyt_df = pd.read_csv('../data/nyt_ts_headline.csv', index_col=0)
breitbart_df = pd.read_csv('../data/breitbart_ts_headline.csv', index_col=0)

In [4]:
nyt_df.head()

Unnamed: 0,headline
2016-01-01T00:00:00Z,The Week on Instagram
2016-01-01T22:00:56Z,Mass Master
2016-01-01T21:17:09Z,Friday Night Music: More Wild Reeds
2016-01-01T00:00:00Z,Our Favorite Styles Photos of 2015
2016-01-01T19:10:36Z,Wishes for the New Year


In [5]:
MAX_NB_WORDS = 100000
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

In [6]:
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('./wiki-news-300d-1M.vec', encoding='utf-8')
i = 0
for line in tqdm(f):
    if i != 0:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    i += 1
f.close()
print('found %s word vectors' % len(embeddings_index))

766it [00:00, 7655.32it/s]

loading word embeddings...


999995it [01:44, 9588.25it/s] 

found 999994 word vectors





In [7]:
nyt_df['y'] = [[1, 0]] * len(nyt_df)
breitbart_df['y'] = [[0, 1]] * len(breitbart_df)
df_all = pd.concat([nyt_df, breitbart_df], ignore_index=True)
df_all = df_all.sample(frac=1., random_state=42)

In [8]:
df_all['cleaned'] = df_all['headline'].apply(lambda x: 
                            ' '.join([word for word in tokenizer.tokenize(x) 
                             if word not in stop_words]))

In [9]:
df_all.head()

Unnamed: 0,headline,y,cleaned
224829,Chicago Cubs Award Controversial Fan Steve Bar...,"[0, 1]",Chicago Cubs Award Controversial Fan Steve Bar...
296055,"Carpetbagging: Norman Lear, Hollywood Producer...","[0, 1]",Carpetbagging Norman Lear Hollywood Producer W...
260125,Donald Trump: Missing FBI Texts ‘One of the Bi...,"[0, 1]",Donald Trump Missing FBI Texts One Biggest Sto...
12328,Classical Music Listings for Feb. 19-25,"[1, 0]",Classical Music Listings Feb 19 25
261893,Germany Revealed as Biggest EU Rule Breaker as...,"[0, 1]",Germany Revealed Biggest EU Rule Breaker Bloc ...


In [39]:
max_seq_len = 20

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(df_all['cleaned'])

df_all['x'] = tokenizer.texts_to_sequences(df_all['cleaned'])

word_index = tokenizer.word_index
print('dict size:', len(word_index))

df_all['x'] = [np.array(x) for x in sequence.pad_sequences(df_all['x'].values, maxlen=max_seq_len)]

dict size: 63746


In [41]:
batch_size = 256
n_epochs = 10

n_filters = 64
embed_dim = 300
weight_decay = 1e-4
n_classes = 2

In [42]:
#embedding matrix
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 18713


In [43]:
print("sample words not found: ", np.random.choice(words_not_found, 10))

sample words not found:  ['toutant' 'adama' 'hulaween' 'finnerty' 'apax' 'keefe' 'gaymerx' '发生的时候'
 'mandvi' 'dbacks']


In [44]:
test_split = 0.2

n_test = int(len(df_all) * test_split)

x_vals = np.stack(df_all['x'].values)
y_vals = np.stack(df_all['y'].values)

train_X, test_X = x_vals[:-n_test], x_vals[-n_test:]
train_y, test_y = y_vals[:-n_test], y_vals[-n_test:]

In [45]:
#CNN architecture
model = Sequential()
model.add(Embedding(nb_words, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model.add(Conv1D(n_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(n_filters, 7, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(n_classes, activation='sigmoid'))  #multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 300)           19123800  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 20, 64)            134464    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 10, 64)            0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 10, 64)            28736     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
__________

In [46]:
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [47]:
hist = model.fit(train_X, train_y, 
                 batch_size=batch_size, 
                 epochs=n_epochs, 
                 callbacks=callbacks_list, 
                 validation_split=0.1, 
                 shuffle=True, 
                 verbose=2)

ValueError: Error when checking input: expected embedding_2_input to have shape (20,) but got array with shape (1,)