In [1]:
with open("../data/reliable_news_prep", "r") as reliable_file:
    rel = [line.strip() for line in reliable_file]
with open("../data/fake_news_prep", "r") as fake_file:
    fake = [line.strip() for line in fake_file]

In [2]:
length=50000

In [4]:
text = rel[:length]+fake[:length]

In [5]:
labels = [0 if i<length else 1 for i in range(2*length)] # reliable - 0; fake - 1

In [7]:
from training_preprocess import prepare_fastText_embedding_matrix as prepare_embedding
from training_preprocess import sequence_vectorize
from training_preprocess import train_val_test_split as split

In [8]:
X_train, X_val, X_test, Y_train, Y_test, Y_val = split(text, labels, 0.2, 0.1)

In [9]:
x_train, x_val, x_test, word_index = sequence_vectorize(X_train, X_val, X_test, sequence_length=1100)

In [10]:
embedding_matrix = prepare_embedding(word_index, "../data/cc.en.300.bin.gz")

In [11]:
from keras import layers, models, optimizers, regularizers

Using TensorFlow backend.


In [26]:
label_to_vector([1])

array([[0., 1.]])

In [12]:
import numpy as np

def label_to_vector(labels):
    array = np.zeros([len(labels), 2])
    for i, label in enumerate(labels):
        array[i, label] = 1
    return array

In [13]:
def bidirectional_LSTM(embedding_dim, dropout_rate, input_shape, num_classes, is_embedding_trainable, embedding_matrix):
    
    # Add an Input Layer
    input_layer = layers.Input((input_shape, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(input_dim=embedding_matrix.shape[0],
                                       output_dim=embedding_dim,
                                       weights=[embedding_matrix],
                                       trainable=is_embedding_trainable)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.6)(embedding_layer)
    
    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.LSTM(20))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(70, activation="relu")(lstm_layer)
    output_layer2 = layers.Dropout(dropout_rate)(output_layer1)
    output_layer3 = layers.Dense(num_classes, activation="softmax")(output_layer2)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer3)
    
    return model

In [14]:
y_train = label_to_vector(Y_train)

In [15]:
y_val = label_to_vector(Y_val)

In [16]:
import keras

model = bidirectional_LSTM(300, 0.2, 1100, 2, True, embedding_matrix)
model.compile(optimizer='Adam', loss="binary_crossentropy", metrics=['accuracy'])

callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]

model.fit(x_train,
          y_train,
          epochs=10,
          callbacks=callbacks,
          validation_data=(x_val, y_val),
          verbose=2,  # Logs once per epoch.
          batch_size=128)

model.save_weights('../data/sequence_model_with_pre_trained_embedding.h5')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 70000 samples, validate on 20000 samples
Epoch 1/10
 - 3184s - loss: 0.1924 - acc: 0.9271 - val_loss: 0.0998 - val_acc: 0.9681
Epoch 2/10
 - 3173s - loss: 0.0917 - acc: 0.9690 - val_loss: 0.0872 - val_acc: 0.9718
Epoch 3/10
 - 3170s - loss: 0.0684 - acc: 0.9768 - val_loss: 0.0851 - val_acc: 0.9735
Epoch 4/10
 - 3165s - loss: 0.0459 - acc: 0.9850 - val_loss: 0.0835 - val_acc: 0.9729
Epoch 5/10
 - 3171s - loss: 0.0367 - acc: 0.9878 - val_loss: 0.0999 - val_acc: 0.9744
Epoch 6/10
 - 3184s - loss: 0.0262 - acc: 0.9917 - val_loss: 0.0998 - val_acc: 0.9726
Epoch 7/10
 - 3191s - loss: 0.0259 - acc: 0.9916 - val_loss: 0.1146 - val_acc: 0.9721


In [17]:
y_test = label_to_vector(Y_test)

In [19]:
model.evaluate(x_test, y_test)



[0.10529134182436392, 0.9726]

In [20]:
text = "Last Sunday, Rudy Giuliani, former mayor of New York City and current lawyer of President Donald Trump, insisted that there was “nothing wrong” with President Donald Trump’s 2016 campaign team accepting assistance from the Russians. While he called the action possibly ill-advised, it wasn’t illegal. These statements were made in relation to the release of the report into the investigation led by special counsel Robert Mueller into the alleged Russian interference in the 2016 presidential elections. These were also made after the Democrats in the House of Representatives have pledged more in-depth investigations into possible obstruction of justice and campaign misconduct committed by President Trump and members of his inner circle. To quote Giuliani, “There’s nothing wrong with taking information from Russians.” His statement obviously referred to the now-infamous June 2016 Trump Tower meeting between Paul Manafort, Donald Trump, Jr. and Jared Kushner with individuals connected with the Russian government. At the time, the Trump campaign team was seeking damaging information about Hillary Clinton. Since the release of the Mueller report, there has been numerous back-and-for the exchanges between the President’s supporters and critics. Senator Elizabeth Warren, for one thing, has called on the House of Representatives to start impeachment proceedings. Senator Mitt Romney (Republican, Utah) has tweeted that he was “sickened” by the findings, especially the “dishonesty and misdirection by individuals in the highest office of the land including the President”. He has apparently broke with his party although this isn’t the first time – and possibly not the last either – that he has criticized President Trump.But Giuliani said that Senator Romney should “stop the bull”. He added it’s a common practice to accept negative information regarding a political opponent, and that while he would have advised the President not to do it, there was no crime committed. His advice, he said, would have been made “just out of excess caution”.When he was asked about the appropriateness of using information stolen by foreign enemies, Giuliani answered that, “It depends on the stolen material.”President Trump has, of course, been busy tweeting his reactions to the Mueller report and to the reactions from politicians, analysts and media outfits. Among his tweets on Sunday, “Despite No Collusion, No Obstruction, The Radical Left Democrats do not want to go on to Legislate for the good of the people, but only to Investigate and waste time. This is costing our Country greatly, and will cost the Dems big time in 2020!”From his tweets, President Trump continues to assert that he has been completely and fully cleared of misconduct in relation to the Russian interference. He also says that the Democrats’ continued efforts will prove costly to the government and the public. But emphasis must be made that Mueller didn’t explicitly clear the president of obstruction of justice. Instead, he cited guidelines of the Justice Department that stated an incumbent president shouldn’t be indicted. The leaders of the Democratic Party hasn’t been spared. Many of the party’s rising stars including some of its 2020 presidential aspirants have called for impeachment proceedings. Senator Nancy Pelosi (Democratic, California), the House Speaker, isn’t too keen on impeachment but favors a step-by-step approach to Congress’ oversight over the President.Senator Mike Lee (Republican, Utah), a member of the Senate Judiciary Committee, has said, “I think politically speaking it would be a mistake for them to do it.” He added that the most important takeaway from the report was that “there was no collusion” and, thus, the Democrats are making a mistake if they pursue with impeachment proceedings.He further said that, “It’s time to move on” and the American public won’t stand for it since the Mueller investigations didn’t uncover any crimes. Going back to Giuliani, he said that the President’s legal team is considering the release of a detailed written rebuttal to counter the allegations made in the Mueller report. He said that the rebuttal may become necessary regardless of whether the hearings will be done or not and whether other issues will be raised or not in the future. At the moment, he added, “We think the public debate is playing out about as well as it can.”Representative Jerrold Nadler (Democratic, New York), however, didn’t rule out an impeachment. Nadler, the chairman of the House committee in charge of impeachment hearings, has expressed his puzzlement about why Mueller didn’t file charges of criminal conspiracy against the people present and/or involved in the 2016 Trump Tower meeting.  He asserts that the parties “entered into a meeting of the minds…to get stolen material on Hillary” and it constitutes conspiracy. Nadler also said that these offenses are impeachable. The Democrats’ present focus should then be to “go where the evidence leads us.”The Mueller report released to the public has been redacted, and Nadler has subpoenaed the Justice Department for its full and complete version. He has also called Mueller, Attorney General William Bar, and Don McGahn, the former White House counsel, to testify before his committee. The plot just keeps getting thicker!"

In [21]:
from text_preprocess import en_lemmatize, filter_stopwords, filter_punctuation, filter_urls

In [22]:
text = filter_stopwords(filter_punctuation(en_lemmatize(filter_urls([text]))))

In [24]:
_, _, text_vect, _ = sequence_vectorize(X_train, X_val, text, sequence_length=1100)

In [None]:
[1,0]

In [25]:
model.predict(text_vect)

array([[0.9381239, 0.0618761]], dtype=float32)

In [28]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1100)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1100, 300)         3000300   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 1100, 300)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 40)                51360     
_________________________________________________________________
dense_1 (Dense)              (None, 70)                2870      
_________________________________________________________________
dropout_1 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 142       
Total para