In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import random

In [2]:
# imports
import os
import pandas as pd
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()

import nltk
#file for punkt splitter
nltk.download('punkt');
#file for vader sentiment
nltk.download('vader_lexicon');

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"]=20,20
%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\emiel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<br><br>

## Data loading/collection

In [3]:
series_len = 200
mask_value = -10
max_len = 0

def text_to_sentiments(text):
    sentences = nltk.tokenize.sent_tokenize(text)
    scores = [ sia.polarity_scores(s)['compound'] for s in sentences] #list of compound score per sentence
    
    if len(scores) < series_len:
        t  = series_len - len(scores)
        scores = np.pad(scores, (t, 0), mode='constant', constant_values=mask_value))
        scores = scores.reshape((1, series_len))
    else:
        scores = scores[(-1*series_len):]
        scores = scores.reshape((1, series_len))
        
    return scores

In [4]:
try:
    df = pd.read_csv(os.path.join('out', 'truefake_series.csv'))
    print('Succesfully loaded data')
except FileNotFoundError:
    print('ERROR: Couldn\'t find data, will start building it now...')
    dataset_path_true = os.path.join("sources", "ISOT", "True.csv")
    dataset_path_fake = os.path.join("sources", "ISOT", "Fake.csv")

    dataset_load_true = pd.read_csv(dataset_path_true, encoding='utf-8') # make sure to use the right encoding
    dataset_load_fake = pd.read_csv(dataset_path_fake, encoding='utf-8') 
    
    df_T = pd.DataFrame(columns = ['scores', 'veracity'])
    df_F = pd.DataFrame(columns = ['scores', 'veracity'])

    df_T['scores'] = dataset_load_true.text.apply(text_to_sentiments)
    df_T['veracity'] = 1

    df_F['scores'] = dataset_load_fake.text.apply(text_to_sentiments)
    df_F['veracity'] = 0

    df = pd.concat([df_T,df_F], ignore_index=True)
    df = df[df['scores'].map(lambda d: len(d)) > 0]
    df.reset_index(drop = True, inplace=True)
    
    df = pd.concat([df_T,df_F], ignore_index=True)
    df = df[df['scores'].map(lambda d: len(d)) > 0]
    df.reset_index(drop = True, inplace=True)
    
    df.to_csv(os.path.join('out', 'truefake_series.csv'), index=False)
    
finally:
    display(df)

ERROR: Couldn't find data, will start building it now...


Unnamed: 0,scores,veracity
0,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",1
1,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",1
2,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",1
3,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",1
4,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",1
...,...,...
44893,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",0
44894,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",0
44895,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",0
44896,"[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10...",0


<br><br>
## Splitting data

In [5]:
def train_test_split(df_in, test_size, shuffle=True):
    
    if shuffle:
        df_in = df_in.sample(frac=1).reset_index(drop=True)
    
    mask_test = np.random.rand(len(df_in)) < test_size # the sample is large enough to probably not care about proper division
    
    train = df_in[~mask_test]
    test = df_in[mask_test]
    
    X_train = train['scores'].tolist()
    y_train = train['veracity'].tolist()
    
    X_test = test['scores'].tolist()
    y_test = test['veracity'].tolist()
    
    return (X_train, y_train), (X_test, y_test)

In [21]:
x = pd.DataFrame({'A': [np.array([0.1,0.2,0,3]),np.array([0.3,0.4,0,5])]})
y = x['A'].to_numpy

y convert to numpy array of samples, steps, 1

<bound method IndexOpsMixin.to_numpy of 0    [0.1, 0.2, 0.0, 3.0]
1    [0.3, 0.4, 0.0, 5.0]
Name: A, dtype: object>

In [6]:
np.random.seed(160)
random.seed(160)

# # turn T/F labels into 1/0
# d = {'T': 1, 'F':0}
# df['veracity'] = df['veracity'].map(d)

(X_train, y_train), (X_test, y_test) = train_test_split(df, test_size = 0.5)

print("train: ",len(X_train), len(y_train),
     "\ntest: ",len(X_test), len(y_test),
     "\ntest/total: {:0.3f}".format(len(X_test)/(len(X_test)+len(X_train)))
    )

print(y_train[:10])
print(type(X_train[0]))

train:  22499 22499 
test:  22399 22399 
test/total: 0.499
[0, 1, 1, 1, 0, 0, 1, 0, 0, 1]
<class 'list'>


In [7]:
from keras.datasets import imdb
(XX_train, yy_train), (XX_test, yy_test) = imdb.load_data(num_words=3)

In [17]:
print(type(XX_train[0]))
print(XX_train[0])
print('\n')
print(type(X_train[1]))
print(X_train[100])

<class 'list'>
[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


<class 'list'>
[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.

<br><br>

## Building the model

In [11]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

In [14]:
model = Sequential()

# 200 [series_len] timesteps (sentences) with a 1-dim feature (sentiment), returns a 1-dim target
# model.add(Embedding(input_dim=1, #1 dimensional input (the sentiment of the sentence in a single var) ((int?))
#                     output_dim=1, #1 dimensional output (true or fake in a single var)
#                     input_length=series_len)) #series_len amount of timesteps per instance

model.add(Masking(mask_value=mask_value))

model.add(LSTM(100))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 1)            1         
_________________________________________________________________
masking_1 (Masking)          (None, 200, 1)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               40800     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 40,902
Trainable params: 40,902
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3


InvalidArgumentError:  indices[0,0] = -10 is not in [0, 1)
	 [[node sequential_1/embedding_1/embedding_lookup (defined at <ipython-input-15-749b5fa260b3>:1) ]] [Op:__inference_train_function_10719]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_1/embedding_1/embedding_lookup:
 sequential_1/embedding_1/embedding_lookup/8086 (defined at c:\users\emiel\appdata\local\programs\python\python38\lib\contextlib.py:113)

Function call stack:
train_function
