In [18]:
import glob
import os 
cwd = os.getcwd()
files = [a for a in glob.iglob(cwd + '**/**/*.txt', recursive=True)]

In [19]:
import pandas as pd

data = pd.DataFrame(columns=['text', 'sentiment', 'truthful'])

for file in files:
    f = open(file)
    text = f.read()
    real = 0
    sentiment = 0
    if 'truthful' in file:
        real = 1
    if 'positive_polarity' in file:
        sentiment = 1
    data = data.append({'text': text, 'sentiment': sentiment, 'truthful': real}, ignore_index=True)
    
data.head()

Unnamed: 0,text,sentiment,truthful
0,"excellent staff and customer service, very cle...",1,0
1,My stay at this hotel was one of the best I ha...,1,0
2,We just got back from a trip to Chicago for my...,1,0
3,I have to say that the Hard Rock Hotel in Chic...,1,0
4,My husband and I recently stayed at the Hard R...,1,0


In [20]:
import re

import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

data['urls_count'] = data['text'].apply(lambda x: len(re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', x)))
data['words_count'] = data['text'].apply(lambda x: len(x.split()))
data['sentences_count'] = data['text'].apply(lambda x: len(sent_tokenize(x)))
data['words_per_sentence'] = data['words_count'] / data['sentences_count']

data['text'] = data['text'].apply(lambda x: x.lower().strip())
data['text'] = data['text'].apply(lambda x: re.sub(r" +", " ", x))
data['text'] = data['text'].apply(lambda x: re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", x))
data['text'] = data['text'].apply(lambda x: re.sub(r"/[-\/\\^$*+?.()|[\]{}]/g", "", x))
data['text'] = data['text'].apply(lambda x: re.sub(r"[iex\*][nsx\*][ftx\*][pjx\*]", "", x))
data['text'] = data['text'].apply(lambda x: re.sub(r"[0-9]+", "", x))
data['text'] = data['text'].apply(lambda x: re.sub(r"#[a-zA-Z]+", "", x))
data['text'] = data['text'].apply(lambda x: re.sub(r"['\";:,.?!\/\\()\[\]+]", "", x))
data['text'] = data['text'].apply(lambda x: re.sub(r"[-_]", " ", x))
data['text'] = data['text'].apply(lambda x: re.sub(r" +", " ", x))

data = data[data['text'] != '']

data = data[data['sentiment'] == 1]

y = data['truthful']
X = data.drop(columns='truthful')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fredybotas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fredybotas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

TOP_K = 30000

MAX_SEQUENCE_LENGTH = 10000

def sequence_vectorize(train_texts, val_texts):
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index, max_length

In [22]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stop_words_and_lemmatize(data):
    lemmatizer=WordNetLemmatizer()
    res = []
    for a in data:
        input_str = word_tokenize(a)
        temp_str = ""
        for word in input_str:
            if word in stop_words:
                continue
            temp_str += lemmatizer.lemmatize(word)
            temp_str += " "
        temp_str = temp_str[:-1]
        res.append(temp_str)
    return res

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fredybotas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
import tensorflow as tf

x_train, x_val = X_train.text.values, X_test.text.values
x_train = remove_stop_words_and_lemmatize(x_train)
x_val = remove_stop_words_and_lemmatize(x_val)

x_train, x_val, word_index, max_length = sequence_vectorize(x_train, x_val)
num_features = min(len(word_index) + 1, TOP_K)

In [43]:
embedding_vector_length = 256

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(TOP_K, embedding_vector_length, input_length=max_length),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_test), epochs=5, batch_size=16)

Train on 640 samples, validate on 160 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x165fb3320>

In [37]:
y_pred = model.predict(x_val)
y_pred = [1 if a > 0.5 else 0 for a in y_pred]

print(confusion_matrix(list(y_test), list(y_pred)))

f1_score(list(y_test), y_pred, average='weighted')

[[74  8]
 [14 64]]


0.8621764705882352