In [408]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime
import tensorflow as tf

import numpy as np
import pandas as pd

In [409]:
df_test = pd.read_csv(r'C:\Users\Aaron\OneDrive\Desktop\nlp-getting-started\test.csv')
df_train = pd.read_csv(r'C:\Users\Aaron\OneDrive\Desktop\nlp-getting-started\train.csv')

In [410]:
#Preprocessing Train Data
import re
import string

#Remove URLS

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [411]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df_train.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
t
@bbcmtd Wholesale Markets ablaze 


In [412]:
#Train
df_train["text"] = df_train.text.map(remove_URL) # map(lambda x: remove_URL(x))
df_train["text"] = df_train.text.map(remove_punct)

#Test
df_test["text"] = df_test.text.map(remove_URL) # map(lambda x: remove_URL(x))
df_test["text"] = df_test.text.map(remove_punct)

In [413]:
# Handling the stop words
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

words = set(nltk.corpus.words.words())
stop = set(stopwords.words("english"))

def remove_nonwords(text):
    filtered_words = [w for w in nltk.wordpunct_tokenize(text) if w.lower() in words]
    return " ".join(filtered_words)

def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

In [414]:
df_train["text"] = df_train.text.map(remove_stopwords)
#df_train["text"] = df_train.text.map(remove_nonwords)
df_test["text"] = df_test.text.map(remove_stopwords)
#df_test["text"] = df_test.text.map(remove_nonwords)

In [415]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

#df_train_word = df_train.text.append(df_test.text)
counter = counter_word(df_train.text) #Vocab from train and test

In [416]:
counters = dict((k, v) for k, v in counter.items() if v <2)
len(counter)

17971

In [417]:
num_unique_words = len(counter)

In [418]:
#Train Test Split
X = df_train.text.to_numpy()
y = df_train.target.to_numpy()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, random_state = 10)

In [419]:
#Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(X_train) # fit only to training WHY?

In [420]:
# each word has unique index
word_index = tokenizer.word_index
len(word_index)

14994

In [421]:
# Convert text to number sequence (Train data)
train_seq = tokenizer.texts_to_sequences(X_train)
val_seq = tokenizer.texts_to_sequences(X_test)

# Test Data
test_seq = tokenizer.texts_to_sequences(df_test['text'])

In [422]:
# We want the same legnth for every sequence

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

train_padded = pad_sequences(train_seq, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_seq, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape

test_padded = pad_sequences(test_seq, maxlen=max_length, padding="post", truncating="post")

In [423]:
# Create Simple RNN model
from tensorflow.keras import layers
from tensorflow import keras


model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words,32, input_length=max_length))
model.add(layers.SimpleRNN(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_62"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_62 (Embedding)     (None, 20, 32)            575072    
_________________________________________________________________
simple_rnn_31 (SimpleRNN)    (None, 64)                6208      
_________________________________________________________________
dense_74 (Dense)             (None, 1)                 65        
Total params: 581,345
Trainable params: 581,345
Non-trainable params: 0
_________________________________________________________________


In [424]:
begin_time = datetime.datetime.now()

loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]
model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(train_padded, y_train, epochs=20, validation_data=(val_padded, y_test), verbose=2)

print(datetime.datetime.now() - begin_time)

Epoch 1/20
179/179 - 5s - loss: 0.5724 - accuracy: 0.7085 - val_loss: 0.4925 - val_accuracy: 0.7931
Epoch 2/20
179/179 - 1s - loss: 0.2581 - accuracy: 0.9047 - val_loss: 0.5536 - val_accuracy: 0.7663
Epoch 3/20
179/179 - 2s - loss: 0.1102 - accuracy: 0.9651 - val_loss: 0.6708 - val_accuracy: 0.7458
Epoch 4/20
179/179 - 2s - loss: 0.0689 - accuracy: 0.9755 - val_loss: 0.7767 - val_accuracy: 0.7616
Epoch 5/20
179/179 - 2s - loss: 0.0493 - accuracy: 0.9807 - val_loss: 0.8562 - val_accuracy: 0.7574
Epoch 6/20
179/179 - 1s - loss: 0.0456 - accuracy: 0.9802 - val_loss: 0.7745 - val_accuracy: 0.7558
Epoch 7/20
179/179 - 2s - loss: 0.0360 - accuracy: 0.9849 - val_loss: 0.8569 - val_accuracy: 0.7463
Epoch 8/20
179/179 - 1s - loss: 0.0331 - accuracy: 0.9846 - val_loss: 0.8562 - val_accuracy: 0.7237
Epoch 9/20
179/179 - 1s - loss: 0.0312 - accuracy: 0.9858 - val_loss: 0.9262 - val_accuracy: 0.7432
Epoch 10/20
179/179 - 2s - loss: 0.0285 - accuracy: 0.9865 - val_loss: 0.9468 - val_accuracy: 0.7437

In [425]:
B = np.where(model.predict(test_padded) > 0.5, 1, 0)
pd.DataFrame(B).to_csv('modelSimpleRNN.csv')

In [426]:
# Create LSTM model

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_63"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_63 (Embedding)     (None, 20, 32)            575072    
_________________________________________________________________
lstm_29 (LSTM)               (None, 64)                24832     
_________________________________________________________________
dense_75 (Dense)             (None, 1)                 65        
Total params: 599,969
Trainable params: 599,969
Non-trainable params: 0
_________________________________________________________________


In [427]:
begin_time = datetime.datetime.now()

loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(train_padded, y_train, epochs=20, validation_data=(val_padded, y_test), verbose=2)

print(datetime.datetime.now() - begin_time)

Epoch 1/20
179/179 - 4s - loss: 0.5599 - accuracy: 0.7024 - val_loss: 0.4602 - val_accuracy: 0.8067
Epoch 2/20
179/179 - 2s - loss: 0.2909 - accuracy: 0.8893 - val_loss: 0.4851 - val_accuracy: 0.7852
Epoch 3/20
179/179 - 2s - loss: 0.1526 - accuracy: 0.9485 - val_loss: 0.6460 - val_accuracy: 0.7621
Epoch 4/20
179/179 - 5s - loss: 0.1010 - accuracy: 0.9683 - val_loss: 0.7350 - val_accuracy: 0.7584
Epoch 5/20
179/179 - 3s - loss: 0.0737 - accuracy: 0.9760 - val_loss: 0.8851 - val_accuracy: 0.7721
Epoch 6/20
179/179 - 2s - loss: 0.0544 - accuracy: 0.9811 - val_loss: 0.7071 - val_accuracy: 0.7752
Epoch 7/20
179/179 - 2s - loss: 0.0413 - accuracy: 0.9809 - val_loss: 1.1804 - val_accuracy: 0.7658
Epoch 8/20
179/179 - 3s - loss: 0.0323 - accuracy: 0.9841 - val_loss: 1.3946 - val_accuracy: 0.7736
Epoch 9/20
179/179 - 3s - loss: 0.0388 - accuracy: 0.9823 - val_loss: 1.0638 - val_accuracy: 0.7799
Epoch 10/20
179/179 - 2s - loss: 0.0348 - accuracy: 0.9827 - val_loss: 1.2915 - val_accuracy: 0.7684

In [428]:
B = np.where(model.predict(test_padded) > 0.5, 1, 0)
pd.DataFrame(B).to_csv('modelLSTM.csv')

In [429]:
# Create GRU model

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.GRU(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_64"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_64 (Embedding)     (None, 20, 32)            575072    
_________________________________________________________________
gru_4 (GRU)                  (None, 64)                18816     
_________________________________________________________________
dense_76 (Dense)             (None, 1)                 65        
Total params: 593,953
Trainable params: 593,953
Non-trainable params: 0
_________________________________________________________________


In [430]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(train_padded, y_train, epochs=20, validation_data=(val_padded, y_test), verbose=2)

Epoch 1/20
179/179 - 3s - loss: 0.6611 - accuracy: 0.5927 - val_loss: 0.5285 - val_accuracy: 0.7511
Epoch 2/20
179/179 - 2s - loss: 0.3664 - accuracy: 0.8483 - val_loss: 0.4647 - val_accuracy: 0.7910
Epoch 3/20
179/179 - 2s - loss: 0.1992 - accuracy: 0.9285 - val_loss: 0.5125 - val_accuracy: 0.7747
Epoch 4/20
179/179 - 2s - loss: 0.1127 - accuracy: 0.9618 - val_loss: 0.6527 - val_accuracy: 0.7873
Epoch 5/20
179/179 - 2s - loss: 0.0815 - accuracy: 0.9730 - val_loss: 0.7538 - val_accuracy: 0.7742
Epoch 6/20
179/179 - 2s - loss: 0.0664 - accuracy: 0.9797 - val_loss: 0.7311 - val_accuracy: 0.7820
Epoch 7/20
179/179 - 2s - loss: 0.0597 - accuracy: 0.9837 - val_loss: 0.7677 - val_accuracy: 0.7715
Epoch 8/20
179/179 - 2s - loss: 0.0541 - accuracy: 0.9809 - val_loss: 0.7293 - val_accuracy: 0.7773
Epoch 9/20
179/179 - 2s - loss: 0.0482 - accuracy: 0.9848 - val_loss: 0.8412 - val_accuracy: 0.7747
Epoch 10/20
179/179 - 2s - loss: 0.0407 - accuracy: 0.9842 - val_loss: 1.1274 - val_accuracy: 0.7637

<tensorflow.python.keras.callbacks.History at 0x2b2c72a8a88>

In [431]:
B = np.where(model.predict(test_padded) > 0.5, 1, 0)
pd.DataFrame(B).to_csv('modelGRU.csv')