### Imports

In [28]:
import keras.losses
import pandas as pd
import os

In [29]:
import importlib
import My_PythonPackage.nlp_utils as nlp_utils
importlib.reload(nlp_utils)

<module 'My_PythonPackage.nlp_utils' from 'C:\\Users\\guimi\\Documents\\Python_Projects\\My_PythonPackage\\nlp_utils.py'>

In [30]:
os.getcwd()

'C:\\Users\\guimi\\Documents\\Python_Projects\\DataScience\\NLP\\Text_classification'

### Importing datasets

In [31]:
df_train_raw = pd.read_csv('../../Datasets/nlp-getting-started/train.csv')
df_test_raw = pd.read_csv('../../Datasets/nlp-getting-started/test.csv')

In [32]:
df_train_raw.sample(20)

Unnamed: 0,id,keyword,location,text,target
2809,4041,disaster,"Calgary, AB",The @rbcinsurance quote website = disaster. Tr...,0
5476,7815,quarantine,,Reddit Will Now Quarantine Offensive Content h...,0
5493,7839,quarantine,London,Reddit updates content policy promises to quar...,0
5144,7335,nuclear%20reactor,World,Finnish ministers: Fennovoima nuclear reactor ...,1
4905,6981,massacre,,Sousse beach massacre linked to Tunis museum a...,1
6819,9766,trapped,central chazifornia,salute to all the kids still trapped in adult ...,0
912,1319,bloody,PH,Friday supposed to be a happy day but it's a b...,0
840,1219,blizzard,Ontario Canada,@TCGReno just hard reset my Xbox,0
1093,1579,bombed,"Dundas, Ontario",Jays rocking #MLB @JoeyBats19 just bombed one ...,0
4924,7013,mayhem,"Lynchburg, VA",Anyone else think that Stephen sounds like And...,0


In [33]:
print((df_train_raw.target == 1).sum())
print((df_train_raw.target == 0).sum())

3271
4342


### Data Preprocessing

In [34]:
import re
import string

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_punct(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)


In [35]:
df_train = df_train_raw.copy()
df_test = df_test_raw.copy()

df_train['text'] = df_train['text'].map(lambda x: remove_URL(x))
df_train['text'] = df_train['text'].map(lambda x: remove_punct(x))
df_train['text'] = df_train['text'].map(lambda x: x.lower())

df_test['text'] = df_test['text'].map(lambda x: remove_URL(x))
df_test['text'] = df_test['text'].map(lambda x: remove_punct(x))
df_test['text'] = df_test['text'].map(lambda x: x.lower())

In [36]:
df_train.tail(20)

Unnamed: 0,id,keyword,location,text,target
7593,10848,,,i just heard a really loud bang and everyone i...,0
7594,10849,,,a gas thing just exploded and i heard screams ...,1
7595,10850,,,nws flash flood warning continued for shelby c...,1
7596,10851,,,rt livingsafely nws issues severe thunderstorm...,1
7597,10852,,,mh370 aircraft debris found on la reunion ...,1
7598,10853,,,fatherofthree lost control of car after overta...,1
7599,10854,,,13 earthquake in 9km ssw of anza california ip...,1
7600,10855,,,evacuation order lifted for town of roosevelt,1
7601,10859,,,breaking la refugio oil spill may have been co...,1
7602,10860,,,a siren just went off and it wasnt the forney ...,1


In [37]:
stop_words = nlp_utils.usingStopwords('en')
df_train['text'] = df_train['text'].map(lambda x: nlp_utils.removeStopwords(x,stop_words))
df_test['text'] = df_test['text'].map(lambda x: nlp_utils.removeStopwords(x,stop_words))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guimi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
df_train.tail(20)

Unnamed: 0,id,keyword,location,text,target
7593,10848,,,heard really loud bang everyone asleep great,0
7594,10849,,,gas thing exploded heard screams whole street ...,1
7595,10850,,,nws flash flood warning continued shelby count...,1
7596,10851,,,rt livingsafely nws issues severe thunderstorm...,1
7597,10852,,,mh370 aircraft debris found la reunion missing...,1
7598,10853,,,fatherofthree lost control car overtaking coll...,1
7599,10854,,,13 earthquake 9km ssw anza california iphone u...,1
7600,10855,,,evacuation order lifted town roosevelt,1
7601,10859,,,breaking la refugio oil spill may costlier big...,1
7602,10860,,,siren went wasnt forney tornado warning,1


### Selecting variables for model

In [39]:
X = df_train.text
y = df_train.target

### Splitting data into train and test

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

### Vocabulary

In [41]:
tokens = []
termos = {}
for text in X:
    for token in text.split():
        tokens.append(token)
        if token in termos:
            termos[token] += 1
        else:
            termos[token] = 1
vocab_size = len(termos)
vocab_size

17971

In [42]:
# get 5 words with highest frequency from termos
sorted(termos.items(), key=lambda x: x[1], reverse=True)[:5]

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

### Train Numericalização

In [43]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [44]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
word2index = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(X_train)

In [45]:
max_len = nlp_utils.findMaxLen(train_sequences)
max_len

25

### Train padding

In [46]:
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')

### Validation Numericalization and Padding

In [47]:
val_sequences = tokenizer.texts_to_sequences(X_val)
val_padded = pad_sequences(val_sequences, maxlen=max_len, padding='post', truncating='post')

In [48]:
# print(X_train[0])
print(train_sequences[0])
print(train_padded[0])

[5625, 3710, 2254, 76, 339, 127, 50, 1908, 3711, 1016, 149, 3712, 3713]
[5625 3710 2254   76  339  127   50 1908 3711 1016  149 3712 3713    0
    0    0    0    0    0    0    0    0    0    0    0]


### RNN model

In [49]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy


In [50]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len),
    LSTM(64, dropout=0.1),
    Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 32)            575072    
                                                                 
 lstm_1 (LSTM)               (None, 64)                24832     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 599,969
Trainable params: 599,969
Non-trainable params: 0
_________________________________________________________________


In [51]:
loss = BinaryCrossentropy(from_logits=False)
optim = Adam(learning_rate=0.001)
metrics = ['accuracy']
model.compile(loss=loss, optimizer=optim, metrics=metrics)
model.fit(train_padded, y_train, epochs=20, validation_data=(val_padded, y_val),verbose=2)

Epoch 1/20
191/191 - 6s - loss: 0.5671 - accuracy: 0.6941 - val_loss: 0.4531 - val_accuracy: 0.7997 - 6s/epoch - 33ms/step
Epoch 2/20
191/191 - 4s - loss: 0.3044 - accuracy: 0.8829 - val_loss: 0.5010 - val_accuracy: 0.7919 - 4s/epoch - 21ms/step
Epoch 3/20
191/191 - 4s - loss: 0.1586 - accuracy: 0.9466 - val_loss: 0.6691 - val_accuracy: 0.7623 - 4s/epoch - 21ms/step
Epoch 4/20
191/191 - 4s - loss: 0.1088 - accuracy: 0.9662 - val_loss: 0.7811 - val_accuracy: 0.7643 - 4s/epoch - 20ms/step
Epoch 5/20
191/191 - 4s - loss: 0.0832 - accuracy: 0.9772 - val_loss: 0.6429 - val_accuracy: 0.7610 - 4s/epoch - 19ms/step
Epoch 6/20
191/191 - 4s - loss: 0.0725 - accuracy: 0.9810 - val_loss: 0.7756 - val_accuracy: 0.7754 - 4s/epoch - 19ms/step
Epoch 7/20
191/191 - 4s - loss: 0.0624 - accuracy: 0.9811 - val_loss: 0.6420 - val_accuracy: 0.7623 - 4s/epoch - 19ms/step
Epoch 8/20
191/191 - 3s - loss: 0.0539 - accuracy: 0.9828 - val_loss: 0.9388 - val_accuracy: 0.7754 - 3s/epoch - 18ms/step
Epoch 9/20
191/1

<keras.callbacks.History at 0x1cde6fbc6d0>

### Predictions

In [87]:
X_test = df_test.text
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

In [89]:
index2word = {}
for key in word2index:
    index2word[word2index[key]] = key

text = []
for index in test_sequences[0]:
    text.append(index2word[index])
    # type(index)
text

['happened', 'terrible', 'car', 'crash']

In [90]:
predictions = model.predict(test_padded)



In [91]:
predictions

array([[0.23070061],
       [0.9997521 ],
       [0.9997464 ],
       ...,
       [0.99978197],
       [0.0514433 ],
       [0.9997755 ]], dtype=float32)

### Pre Processing

In [53]:
# pattern = re.compile(r'https?://\S+|www\.\S+')
# for t in df_train.text:
#     matches = pattern.findall(t)
#     for match in matches:
#         print(t)
#         print(match)
#         print(pattern.sub())
