In [2]:
import numpy as np
import regex as re
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
en_stop = set(nltk.corpus.stopwords.words('english'))
from sklearn.metrics import confusion_matrix

from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation

In [3]:
#load data
train = pd.read_csv('Quora/train.csv', encoding="utf8")
test = pd.read_csv('Quora/test.csv', encoding="utf8")

In [4]:
#Clean Data
stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word) > 3]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

In [5]:
train['cleaned_text'] = train['question_text'].apply(preprocess_text)
test['cleaned_text'] = test['question_text'].apply(preprocess_text)

In [6]:
#convert text data into token vectors, 
vocabulary_size = 20000

tokenizer = Tokenizer(num_words = vocabulary_size)

#train data
tokenizer.fit_on_texts(train['cleaned_text'])
train_sequences = tokenizer.texts_to_sequences(train['cleaned_text'])

#apply a padding method to add zeros and set the fixed size into each vector.
train_data = pad_sequences(train_sequences, maxlen=50)



# test data
tokenizer.fit_on_texts(test['cleaned_text'])
test_sequences = tokenizer.texts_to_sequences(test['cleaned_text'])

#apply a padding method to add zeros and set the fixed size into each vector.
test_data = pad_sequences(test_sequences, maxlen=50)

In [7]:
#embeddings
embeddings_index = dict()
f = open('C:/Projects/NLP/Home Work/Glove/glove.6B.100d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [8]:
# create a weight matrix for words in training docs

embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [9]:
model = Sequential()
model.add(Embedding(vocabulary_size, 100, input_length=50, weights=[embedding_matrix], trainable=False))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           80400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 2,110,651
Trainable params: 110,651
Non-trainable params: 2,000,000
_________________________________________________________________
None


In [10]:
model.fit(train_data, np.array(train['target']), validation_split=0.4, epochs = 2)

Train on 783673 samples, validate on 522449 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2778edbf240>

In [11]:
#predict test data

y_pred = model.predict(test_data)

y_pred[y_pred>0.5]=1 
y_pred[y_pred<=0.5]=0 

In [19]:
test[:2]['cleaned_text']

0    many woman become rude arrogant little wealth ...
1    apply college engineering college engineering ...
Name: cleaned_text, dtype: object

In [20]:
test_data[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   11,   21,
          28, 1443, 3287,  548, 1791,  147],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,  236,   42,   66,   42,   66,
        1257, 3256,  388, 1319,  236,  388]])

In [30]:
#predict test data
y_pred = model.predict(test_data[:2])

In [35]:
#result
for i in zip(test[:2]['cleaned_text'], y_pred):
    print(i[0])
    print('Positive' if i[1][0] >0.5 else 'Negative')
    print('**************************************') 

many woman become rude arrogant little wealth power
Negative
**************************************
apply college engineering college engineering wait comedk result supposed apply result
Negative
**************************************
