In [371]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime
import tensorflow as tf

import numpy as np
import pandas as pd

In [390]:
df_test = pd.read_csv(r'C:\Users\Aaron\OneDrive\Desktop\nlp-getting-started\test.csv')
df_train = pd.read_csv(r'C:\Users\Aaron\OneDrive\Desktop\nlp-getting-started\train.csv')

In [391]:
#Preprocessing Train Data
import re
import string

#Remove URLS

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [392]:
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in df_train.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
t
@bbcmtd Wholesale Markets ablaze 


In [393]:
#Train
df_train["text"] = df_train.text.map(remove_URL) # map(lambda x: remove_URL(x))
df_train["text"] = df_train.text.map(remove_punct)

#Test
df_test["text"] = df_test.text.map(remove_URL) # map(lambda x: remove_URL(x))
df_test["text"] = df_test.text.map(remove_punct)

In [394]:
# Handling the stop words
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

words = set(nltk.corpus.words.words())
stop = set(stopwords.words("english"))

def remove_nonwords(text):
    filtered_words = [w for w in nltk.wordpunct_tokenize(text) if w.lower() in words]
    return " ".join(filtered_words)

def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

In [395]:
df_train["text"] = df_train.text.map(remove_stopwords)
#df_train["text"] = df_train.text.map(remove_nonwords)
df_test["text"] = df_test.text.map(remove_stopwords)
#df_test["text"] = df_test.text.map(remove_nonwords)

In [396]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

#df_train_word = df_train.text.append(df_test.text)
counter = counter_word(df_train.text) #Vocab from train and test

In [397]:
counters = dict((k, v) for k, v in counter.items() if v <2)
len(counter)

17971

In [398]:
num_unique_words = len(counter)

In [399]:
#Train Test Split
X = df_train.text.to_numpy()
y = df_train.target.to_numpy()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, random_state = 10)

In [400]:
#Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(X_train) # fit only to training WHY?

In [401]:
# each word has unique index
word_index = tokenizer.word_index
len(word_index)

14994

In [402]:
# Convert text to number sequence (Train data)
train_seq = tokenizer.texts_to_sequences(X_train)
val_seq = tokenizer.texts_to_sequences(X_test)

# Test Data
test_seq = tokenizer.texts_to_sequences(df_test['text'])

In [403]:
# We want the same legnth for every sequence

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

train_padded = pad_sequences(train_seq, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_seq, maxlen=max_length, padding="post", truncating="post")
train_padded.shape, val_padded.shape

test_padded = pad_sequences(test_seq, maxlen=max_length, padding="post", truncating="post")

In [404]:
# Create Simple RNN model
from tensorflow.keras import layers
from tensorflow import keras


model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words,32, input_length=max_length))
model.add(layers.SimpleRNN(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_60"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_60 (Embedding)     (None, 20, 32)            575072    
_________________________________________________________________
simple_rnn_30 (SimpleRNN)    (None, 64)                6208      
_________________________________________________________________
dense_72 (Dense)             (None, 1)                 65        
Total params: 581,345
Trainable params: 581,345
Non-trainable params: 0
_________________________________________________________________


In [405]:
begin_time = datetime.datetime.now()

loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]
model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(train_padded, y_train, epochs=20, validation_data=(val_padded, y_test), verbose=2)

print(datetime.datetime.now() - begin_time)

Epoch 1/20
179/179 - 2s - loss: 0.5815 - accuracy: 0.6940 - val_loss: 0.4921 - val_accuracy: 0.7862
Epoch 2/20
179/179 - 1s - loss: 0.2635 - accuracy: 0.9000 - val_loss: 0.5460 - val_accuracy: 0.7700
Epoch 3/20
179/179 - 1s - loss: 0.1041 - accuracy: 0.9667 - val_loss: 0.6743 - val_accuracy: 0.7574
Epoch 4/20
179/179 - 1s - loss: 0.0655 - accuracy: 0.9758 - val_loss: 0.6942 - val_accuracy: 0.7505
Epoch 5/20
179/179 - 1s - loss: 0.0506 - accuracy: 0.9804 - val_loss: 0.7470 - val_accuracy: 0.7568
Epoch 6/20
179/179 - 1s - loss: 0.0412 - accuracy: 0.9825 - val_loss: 0.8990 - val_accuracy: 0.7679
Epoch 7/20
179/179 - 1s - loss: 0.0392 - accuracy: 0.9827 - val_loss: 0.9353 - val_accuracy: 0.6991
Epoch 8/20
179/179 - 1s - loss: 0.0340 - accuracy: 0.9858 - val_loss: 0.9173 - val_accuracy: 0.7574
Epoch 9/20
179/179 - 1s - loss: 0.0313 - accuracy: 0.9856 - val_loss: 0.9538 - val_accuracy: 0.7495
Epoch 10/20
179/179 - 1s - loss: 0.0314 - accuracy: 0.9860 - val_loss: 0.9334 - val_accuracy: 0.7384

In [359]:
B = np.where(model.predict(test_padded) > 0.5, 1, 0)
pd.DataFrame(B).to_csv('modelSimpleRNN.csv')

In [406]:
# Create LSTM model

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_61"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_61 (Embedding)     (None, 20, 32)            575072    
_________________________________________________________________
lstm_28 (LSTM)               (None, 64)                24832     
_________________________________________________________________
dense_73 (Dense)             (None, 1)                 65        
Total params: 599,969
Trainable params: 599,969
Non-trainable params: 0
_________________________________________________________________


In [407]:
begin_time = datetime.datetime.now()

loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(train_padded, y_train, epochs=20, validation_data=(val_padded, y_test), verbose=2)

print(datetime.datetime.now() - begin_time)

Epoch 1/20
179/179 - 4s - loss: 0.5488 - accuracy: 0.7166 - val_loss: 0.4499 - val_accuracy: 0.7973
Epoch 2/20
179/179 - 2s - loss: 0.2824 - accuracy: 0.8919 - val_loss: 0.4921 - val_accuracy: 0.7894
Epoch 3/20
179/179 - 2s - loss: 0.1473 - accuracy: 0.9494 - val_loss: 0.6513 - val_accuracy: 0.7668
Epoch 4/20
179/179 - 2s - loss: 0.0913 - accuracy: 0.9725 - val_loss: 0.7549 - val_accuracy: 0.7794
Epoch 5/20
179/179 - 2s - loss: 0.0695 - accuracy: 0.9790 - val_loss: 0.9721 - val_accuracy: 0.7679
Epoch 6/20
179/179 - 2s - loss: 0.0643 - accuracy: 0.9802 - val_loss: 0.7780 - val_accuracy: 0.7768
Epoch 7/20
179/179 - 2s - loss: 0.0566 - accuracy: 0.9802 - val_loss: 0.8242 - val_accuracy: 0.7773
Epoch 8/20
179/179 - 2s - loss: 0.0438 - accuracy: 0.9814 - val_loss: 0.8104 - val_accuracy: 0.7689
Epoch 9/20
179/179 - 2s - loss: 0.0367 - accuracy: 0.9853 - val_loss: 1.1584 - val_accuracy: 0.7747
Epoch 10/20
179/179 - 2s - loss: 0.0320 - accuracy: 0.9862 - val_loss: 1.1007 - val_accuracy: 0.7694

In [362]:
B = np.where(model.predict(test_padded) > 0.5, 1, 0)
pd.DataFrame(B).to_csv('modelLSTM.csv')

In [207]:
# Create GRU model

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.GRU(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential_41"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_41 (Embedding)     (None, 20, 32)            10432     
_________________________________________________________________
gru_3 (GRU)                  (None, 64)                18816     
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 65        
Total params: 29,313
Trainable params: 29,313
Non-trainable params: 0
_________________________________________________________________


In [208]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(train_padded, y_train, epochs=20, validation_data=(val_padded, y_test), verbose=2)

Epoch 1/20
179/179 - 3s - loss: 0.6841 - accuracy: 0.5702 - val_loss: 0.6853 - val_accuracy: 0.5688
Epoch 2/20
179/179 - 1s - loss: 0.6836 - accuracy: 0.5709 - val_loss: 0.6838 - val_accuracy: 0.5688
Epoch 3/20
179/179 - 1s - loss: 0.6479 - accuracy: 0.6174 - val_loss: 0.5369 - val_accuracy: 0.7479
Epoch 4/20
179/179 - 1s - loss: 0.5136 - accuracy: 0.7588 - val_loss: 0.5082 - val_accuracy: 0.7600
Epoch 5/20
179/179 - 1s - loss: 0.4978 - accuracy: 0.7718 - val_loss: 0.5397 - val_accuracy: 0.7579
Epoch 6/20
179/179 - 1s - loss: 0.4883 - accuracy: 0.7756 - val_loss: 0.5239 - val_accuracy: 0.7616
Epoch 7/20
179/179 - 1s - loss: 0.4837 - accuracy: 0.7768 - val_loss: 0.5074 - val_accuracy: 0.7600
Epoch 8/20
179/179 - 1s - loss: 0.4784 - accuracy: 0.7777 - val_loss: 0.5141 - val_accuracy: 0.7563
Epoch 9/20
179/179 - 1s - loss: 0.4785 - accuracy: 0.7830 - val_loss: 0.5170 - val_accuracy: 0.7532
Epoch 10/20
179/179 - 1s - loss: 0.4704 - accuracy: 0.7844 - val_loss: 0.5156 - val_accuracy: 0.7574

<tensorflow.python.keras.callbacks.History at 0x2b2c4d26ec8>

In [209]:
B = np.where(model.predict(test_padded) > 0.5, 1, 0)
pd.DataFrame(B).to_csv('modelGRU.csv')

[['',
  'e',
  'e',
  '',
  '',
  '',
  'r',
  'e',
  '',
  '',
  '',
  'n',
  '',
  'e',
  '',
  'r',
  '',
  'h',
  'q',
  'u',
  '',
  '',
  'e',
  '',
  '',
  '',
  '',
  '',
  '',
  'l',
  'l',
  '',
  'h',
  '',
  'f',
  '',
  'r',
  'g',
  '',
  'v',
  'e',
  '',
  'u',
  ''],
 ['f',
  '',
  'r',
  'e',
  '',
  '',
  '',
  'f',
  '',
  'r',
  'e',
  '',
  'n',
  'e',
  '',
  'r',
  '',
  'l',
  '',
  '',
  'r',
  '',
  'n',
  'g',
  'e',
  '',
  '',
  '',
  '',
  '',
  '',
  'c',
  '',
  'n',
  '',
  '',
  ''],
 ['r',
  'e',
  '',
  '',
  '',
  'e',
  'n',
  '',
  '',
  '',
  '',
  '',
  '',
  'e',
  '',
  '',
  '',
  'h',
  'e',
  'l',
  '',
  'e',
  'r',
  '',
  'p',
  'l',
  '',
  'c',
  'e',
  '',
  'n',
  '',
  '',
  '',
  'f',
  '',
  'e',
  '',
  '',
  '',
  'f',
  'f',
  '',
  'c',
  'e',
  'r',
  '',
  '',
  'e',
  'v',
  '',
  'c',
  'u',
  '',
  '',
  '',
  '',
  'n',
  '',
  '',
  'h',
  'e',
  'l',
  '',
  'e',
  'r',
  '',
  'p',
  'l',
  '',
  'c',
  'e',
  '',
  