In [1]:
import numpy as np
import pandas as pd
import gensim
from ast import literal_eval

from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input, CuDNNLSTM, LSTM

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed ISOT articles. The CSV's first column is consumed as
# the index on read and then discarded in favour of a fresh 0..n-1 index.
dataset = (
    pd.read_csv("../data/ISOT_Preprocessed/data.csv", index_col=0)
      .reset_index(drop=True)
)

In [3]:
# Show the loaded frame (rich display of the cell's last expression).
dataset

Unnamed: 0,text,label
0,"['donald', 'trump', 'couldn', 'wish', 'america...",0
1,"['house', 'intelligence', 'committee', 'chairm...",0
2,"['friday', 'reveal', 'milwaukee', 'sheriff', '...",0
3,"['christmas', 'day', 'donald', 'trump', 'annou...",0
4,"['pope', 'francis', 'use', 'annual', 'christma...",0
...,...,...
44893,"['brussels', 'reuter', 'nato', 'ally', 'tuesda...",1
44894,"['london', 'reuters', 'lexisnexi', 'provider',...",1
44895,"['minsk', 'reuter', 'shadow', 'disuse', 'sovie...",1
44896,"['moscow', 'reuter', 'vatican', 'secretary', '...",1


In [4]:
# The 'text' column stores each token list as its string repr; literal_eval
# parses every cell back into an actual Python list of tokens.
articles = dataset.loc[:, 'text'].apply(literal_eval)
articles

0        [donald, trump, couldn, wish, americans, happy...
1        [house, intelligence, committee, chairman, dev...
2        [friday, reveal, milwaukee, sheriff, david, cl...
3        [christmas, day, donald, trump, announce, work...
4        [pope, francis, use, annual, christmas, day, m...
                               ...                        
44893    [brussels, reuter, nato, ally, tuesday, welcom...
44894    [london, reuters, lexisnexi, provider, legal, ...
44895    [minsk, reuter, shadow, disuse, sovietera, fac...
44896    [moscow, reuter, vatican, secretary, state, ca...
44897    [jakarta, reuters, indonesia, buy, 11, sukhoi,...
Name: text, Length: 44898, dtype: object

In [5]:
# Keep only the articles with strictly fewer than `max_length` tokens.
max_length = 1000
lengths = np.array([len(tokens) for tokens in articles])

# One boolean mask shared by both objects so `dataset` and `articles`
# stay row-aligned; both are then re-indexed from 0.
short_enough = lengths < max_length
dataset = dataset.loc[short_enough].reset_index(drop=True)
articles = articles.loc[short_enough].reset_index(drop=True)

In [6]:
# Token count of the longest remaining article; used further down as the
# fixed sequence length for padding and for the embedding layer.
article_length = max(len(tokens) for tokens in articles)
article_length

986

In [7]:
# Dimensionality of the word vectors (also the embedding layer's output_dim).
vec_size = 100

# Train Word2Vec on the tokenised articles in a single run.
# Passing the corpus to the constructor already builds the vocabulary AND
# trains the model, so the number of epochs is set here. The previous separate
# `word_model.train(..., epochs=10)` call ran a second training pass on top of
# the constructor's default one, restarting the learning-rate schedule.
word_model = gensim.models.Word2Vec(
    articles,
    vector_size = vec_size,
    window = 5,
    workers = 12,
    epochs = 10,
)

# Keyed vectors: the token -> vector lookup used to fill the embedding matrix.
wv = word_model.wv

In [8]:
# Fit the token -> integer-id vocabulary on the tokenised articles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(articles)

# Replace every article by its sequence of integer ids.
encoded_articles = tokenizer.texts_to_sequences(articles)

# One more row than there are known words: the Keras Tokenizer does not assign
# id 0 to any word, so ids run from 1 to len(word_index).
vocabulary_size = 1 + len(tokenizer.word_index)

In [9]:
# Sanity check: the length filter applied earlier keeps only articles with
# fewer than `max_length` tokens, so no encoded sequence should exceed that
# limit and this is expected to print 0. The threshold reuses `max_length`
# instead of repeating the literal, so the check follows the constant if it
# is ever tuned.
count = sum(len(sequence) > max_length for sequence in encoded_articles)
print(count)

0


In [10]:
# Bring every encoded article to the same length (`article_length` ids);
# padding='post' places the padding after the real tokens.
padded_articles = pad_sequences(encoded_articles, maxlen = article_length, padding='post')

In [11]:
# Expected: (number of articles, article_length).
padded_articles.shape

(44665, 986)

In [12]:
# Embedding table indexed by tokenizer id: row `k` receives the Word2Vec
# vector of the word whose id is `k`. Rows for words that the Word2Vec model
# has no vector for are left as all zeros.
emb_matrix = np.zeros((vocabulary_size, vec_size))
for token, row in tokenizer.word_index.items():
    if not wv.has_index_for(token):
        continue
    emb_matrix[row] = wv.get_vector(token)

In [13]:
# Hold out 25% of the articles. The seed is fixed so that the partition, and
# therefore every metric computed from it, is the same after a kernel restart.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    padded_articles,
    dataset['label'],
    test_size=0.25,
    random_state=SPLIT_SEED,
)

In [19]:
# NOTE(review): these imports belong in the notebook's top import cell, and
# ReLU is not used in this cell.
from keras.initializers import Constant
from keras.layers import Dropout, ReLU

# Binary classifier over padded id sequences:
#   embedding (initialised from the Word2Vec matrix) -> LSTM emitting one
#   32-d vector per time step -> flatten -> two dense ReLU layers with
#   dropout -> single sigmoid output.
model = Sequential([
    Embedding(
        input_dim=vocabulary_size,
        output_dim=vec_size,
        input_length=article_length,
        embeddings_initializer=Constant(emb_matrix),
    ),
    LSTM(32, return_sequences=True),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.20),
    Dense(16, activation='relu'),
    Dropout(0.10),
    Dense(1, activation='sigmoid'),
])

In [20]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 986, 100)          20719700  
                                                                 
 lstm_1 (LSTM)               (None, 986, 32)           17024     
                                                                 
 flatten_1 (Flatten)         (None, 31552)             0         
                                                                 
 dense_1 (Dense)             (None, 128)               4038784   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 16)                2064      
                                                                 
 dropout_1 (Dropout)         (None, 16)               

In [21]:
# Configure training: RMSprop with learning rate 1e-5, binary cross-entropy
# loss (matches the single sigmoid output), accuracy reported as the metric.
model.compile(optimizer=RMSprop(learning_rate=1e-5),loss='binary_crossentropy', metrics=['accuracy'])

In [22]:
# Training hyper-parameters passed to model.fit below.
epochs=20
batch_size=128

In [23]:
# Train the model; the returned History object is the cell's displayed value.
# NOTE(review): the held-out test split is passed as validation_data, so the
# per-epoch validation metrics are computed on the test set and there is no
# separate, untouched test set left for a final evaluation.
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test,y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x22600c0ba90>