In [1]:
# imports

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

import random

import os
import pandas as pd
import numpy as np

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Fetch the NLTK resources *before* building the analyzer: the SIA constructor
# loads the VADER lexicon from disk, so on a fresh environment instantiating
# it first raises LookupError.
#file for punkt splitter
nltk.download('punkt');
#file for vader sentiment
nltk.download('vader_lexicon');

# Shared VADER analyzer used by text_to_sentiments().
sia = SIA()

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

plt.rcParams["figure.figsize"]=20,20
%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Number of time steps (sentences) each article's score sequence is
# padded/truncated to before it is fed to the network.
series_len = 100
# Sentinel written into padded time steps; it must match the value the
# Masking layer is configured with so those steps are skipped.
mask_value = -10
# NOTE(review): not referenced anywhere in the visible notebook — TODO confirm
# whether it is still needed.
max_len = 0

def text_to_sentiments(text):
    """Score every sentence of ``text`` with the module-level VADER analyzer.

    Parameters
    ----------
    text : str
        Article body. Anything that is not a non-blank string (e.g. ``None``
        or the NaN pandas yields for a missing CSV cell) is treated as an
        article without sentences.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_sentences, 4)`` whose columns are the VADER
        ``neg``, ``neu``, ``pos`` and ``compound`` scores, in that order.
        Shape ``(0, 4)`` when there is nothing to score.
    """
    # Select the scores by key so the column order does not depend on the
    # ordering of the dict returned by polarity_scores().
    score_keys = ('neg', 'neu', 'pos', 'compound')
    no_scores = np.empty((0, len(score_keys)))

    # Guard before tokenizing: sent_tokenize() rejects non-strings, and
    # stacking an empty list of rows would raise.
    if not isinstance(text, str) or not text.strip():
        return no_scores

    sentences = nltk.tokenize.sent_tokenize(text)
    if not sentences:
        return no_scores

    rows = []
    for sentence in sentences:
        polarity = sia.polarity_scores(sentence)
        rows.append([polarity[key] for key in score_keys])
    return np.array(rows, dtype=float)

def rating_to_score_array(t):
    """One-hot encode a fact rating.

    The positions follow the order FALSE, mostly false, mixture, mostly true,
    TRUE. Matching is exact (case-sensitive); a rating outside that list
    produces an all-zero vector.
    """
    ordered_labels = ('FALSE', 'mostly false', 'mixture', 'mostly true', 'TRUE')
    return np.array([int(t == label) for label in ordered_labels])

In [3]:

# Load the fact-checked articles and keep only the columns used below.
dataset_path = os.path.join("sources", "snopes_checked_v02.csv")
# NOTE(review): 'ANSI' is an alias of the Windows-only 'mbcs' codec, so this
# read raises LookupError on Linux/macOS. Replace it with the file's explicit
# code page (presumably 'cp1252' — confirm against the data) for portability.
dataset_load = pd.read_csv(dataset_path, encoding='ANSI')

dataset = pd.DataFrame({
                        'title': dataset_load['article_title_phase2'],
                        'body': dataset_load['original_article_text_phase2'],
                        'category': dataset_load['article_category_phase1'],
                        'descr_rating': dataset_load['fact_rating_phase1']
                        })

# One row per article: the per-sentence sentiment matrix and the one-hot label.
df = pd.DataFrame({
                   'scores': dataset.body.apply(text_to_sentiments),
                   'veracity': dataset.descr_rating.apply(rating_to_score_array)
                   })

# to_csv does not create missing directories, so make sure the target exists.
out_dir = 'out'
os.makedirs(out_dir, exist_ok=True)
df.to_csv(os.path.join(out_dir, 'truefake_mcintire.csv'), index=False)

# pad_sequences accepts the list of variable-length (n_sentences, 4) arrays
# directly. Wrapping that ragged list in np.array first is deprecated and is
# an error on recent NumPy releases.
# Articles longer than series_len keep their last sentences (truncating='pre');
# shorter ones are filled at the end (padding='post') with mask_value so the
# Masking layer can skip those steps.
X = tf.keras.preprocessing.sequence.pad_sequences(
    df['scores'].to_list(),
    maxlen=series_len,
    padding='post',
    truncating='pre',
    dtype='float64',
    value=mask_value,
)

# Every label vector has the same length, so this stacks into a 2-D array.
y = np.array(df['veracity'].to_list())

In [4]:
# Hold out 25% of the samples for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Shown as the cell output to sanity-check the training tensor's dimensions.
X_train.shape

(234, 100, 4)

In [5]:
# Stacked-LSTM classifier over the padded per-sentence sentiment sequences.
# Input and output sizes are read from the training arrays so the network
# cannot drift out of sync with the data (a label/prediction shape mismatch
# makes categorical_crossentropy raise at fit time).
input_shape = X_train.shape[1:]   # (time steps, features per step)
n_classes = y_train.shape[1]      # width of the one-hot label vectors

model = Sequential([
    # Skip time steps that consist entirely of the padding sentinel.
    Masking(mask_value=mask_value, input_shape=input_shape),
    # First LSTM emits the full sequence so the second one can consume it.
    LSTM(100, return_sequences=True),
    LSTM(50),
    # One probability per veracity class.
    Dense(n_classes, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (None, 100, 4)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 100, 100)          42000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 72,251
Trainable params: 72,251
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# Train for 20 epochs with mini-batches of 2 samples. The held-out split is
# passed as validation data, so the val_* metrics in `history` are measured on
# X_test / y_test.
# NOTE(review): the traceback saved under this cell reports label shape (2, 5)
# against prediction shape (2, 1), i.e. it was produced by a model with a
# single output unit — rebuild the model and re-run to refresh the output.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=2)

Epoch 1/20


ValueError: in user code:

    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\engine\training.py:748 train_step
        loss = self.compiled_loss(
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\losses.py:149 __call__
        losses = ag_call(y_true, y_pred)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\losses.py:253 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\losses.py:1535 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\keras\backend.py:4687 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    C:\Users\emiel\.virtualenvs\Thesis_Notebook-tvt70hiV\lib\site-packages\tensorflow\python\framework\tensor_shape.py:1134 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (2, 5) and (2, 1) are incompatible


In [None]:
# Training-curve plots, with thanks to Jason Brownlee:
# https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
curves = history.history
print(curves.keys())
# One figure per metric: the training curve first, then the validation curve
# (labelled 'test' in the legend).
for metric in ('accuracy', 'loss'):
    plt.plot(curves[metric])
    plt.plot(curves['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()