### Imports

In [88]:
import keras.losses
import pandas as pd
import os

In [89]:
import importlib
import My_PythonPackage.nlp_utils as nlp_utils
# Re-import the helper module so that edits made to nlp_utils.py on disk are
# picked up without restarting the kernel.
importlib.reload(nlp_utils)

<module 'My_PythonPackage.nlp_utils' from 'C:\\Users\\Guilherme\\Documents\\Programming\\Python\\Python_Projects\\My_PythonPackage\\nlp_utils.py'>

In [90]:
# Display the kernel's current working directory; the relative dataset paths
# used below are resolved against it.
os.getcwd()

'C:\\Users\\Guilherme\\Documents\\Programming\\Python\\Python_Projects\\DataScience\\NLP\\Text_classification'

### Importing datasets

In [91]:
# Load the raw train and test CSVs (paths are relative to the working directory).
df_train_raw, df_test_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Datasets/nlp-getting-started/train.csv',
        '../../Datasets/nlp-getting-started/test.csv',
    )
)

In [92]:
# Peek at 20 random rows; the fixed seed keeps the displayed sample stable
# when the notebook is re-run from a fresh kernel.
df_train_raw.sample(20, random_state=42)

Unnamed: 0,id,keyword,location,text,target
3115,4471,electrocuted,Oblivion?,Just thought I'd let you all know...\nIt's pro...,0
1872,2691,crush,,Only had a crush on one girl in high school an...,0
2375,3414,derail,Road to the Billionaires Club,@TemecaFreeman GM! I pray any attack of the en...,1
7528,10766,wreckage,Mumbai,Wreckage 'Conclusively Confirmed' as From MH37...,1
2681,3845,detonate,"Sydney, Australia",New music from @ApolloBrown featuring M.O.P.? ...,0
5003,7136,military,,Lot of 20 Tom Clancy Military Mystery Novels -...,0
1062,1533,bomb,wny,this is about to be a bomb ass firework pictur...,0
7466,10682,wounds,,Sorrower - Fresh Wounds Over Old Scars (2015...,1
960,1389,body%20bag,"Missouri, USA",Check out Ameribag Healthy Back Bag Shoulder C...,0
5027,7168,mudslide,London,#GBBO The difference between Paul and Mary my ...,0


In [93]:
# Class balance: number of rows with target 1, then number with target 0.
print(
    *((df_train_raw['target'] == label).sum() for label in (1, 0)),
    sep='\n',
)

3271
4342


### Data Preprocessing

In [94]:
import re
import string

def remove_URL(text):
    """Return ``text`` with every ``http(s)://`` or ``www.`` URL stripped out.

    A URL is taken to run from its prefix up to the next whitespace character;
    the whitespace around it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = {ord(char): None for char in string.punctuation}
    return text.translate(drop_table)


In [95]:
# Work on copies so the raw frames stay untouched.
df_train = df_train_raw.copy()
df_test = df_test_raw.copy()

# Clean the training text in one chain: strip URLs, then punctuation, then lowercase.
df_train['text'] = (
    df_train['text']
    .map(remove_URL)
    .map(remove_punct)
    .map(str.lower)
)

In [96]:
# Inspect the last 20 rows to check the URL / punctuation / lowercase cleaning.
df_train.tail(20)

Unnamed: 0,id,keyword,location,text,target
7593,10848,,,i just heard a really loud bang and everyone i...,0
7594,10849,,,a gas thing just exploded and i heard screams ...,1
7595,10850,,,nws flash flood warning continued for shelby c...,1
7596,10851,,,rt livingsafely nws issues severe thunderstorm...,1
7597,10852,,,mh370 aircraft debris found on la reunion ...,1
7598,10853,,,fatherofthree lost control of car after overta...,1
7599,10854,,,13 earthquake in 9km ssw of anza california ip...,1
7600,10855,,,evacuation order lifted for town of roosevelt,1
7601,10859,,,breaking la refugio oil spill may have been co...,1
7602,10860,,,a siren just went off and it wasnt the forney ...,1


In [97]:
# Fetch the 'en' stop-word collection from the helper module, then run the
# helper's removeStopwords over each training text.
stop_words = nlp_utils.usingStopwords('en')
df_train['text'] = df_train['text'].apply(nlp_utils.removeStopwords, args=(stop_words,))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [98]:
# Inspect the last 20 rows again, now after the removeStopwords pass.
df_train.tail(20)

Unnamed: 0,id,keyword,location,text,target
7593,10848,,,heard really loud bang everyone asleep great,0
7594,10849,,,gas thing exploded heard screams whole street ...,1
7595,10850,,,nws flash flood warning continued shelby count...,1
7596,10851,,,rt livingsafely nws issues severe thunderstorm...,1
7597,10852,,,mh370 aircraft debris found la reunion missing...,1
7598,10853,,,fatherofthree lost control car overtaking coll...,1
7599,10854,,,13 earthquake 9km ssw anza california iphone u...,1
7600,10855,,,evacuation order lifted town roosevelt,1
7601,10859,,,breaking la refugio oil spill may costlier big...,1
7602,10860,,,siren went wasnt forney tornado warning,1


### Selecting variables for model

In [99]:
# Model input is the cleaned text column; the label is the target column.
X, y = df_train['text'], df_train['target']

### Splitting data into train and validation

In [100]:
from sklearn.model_selection import train_test_split
# Keep 80% of the rows for training and hold out the rest for validation;
# the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)


### Vocabulary

In [101]:
# Flatten the whole corpus into one list of whitespace-separated tokens.
tokens = [word for doc in X for word in doc.split()]

# termos maps each distinct token to its number of occurrences
# (keys appear in order of first occurrence).
termos = {}
for word in tokens:
    termos[word] = termos.get(word, 0) + 1

# Number of distinct tokens across all of X.
vocab_size = len(termos)
vocab_size

17971

In [102]:
# Rank (token, count) pairs by count, highest first, and show the top five.
by_frequency = sorted(termos.items(), key=lambda pair: pair[1], reverse=True)
by_frequency[:5]

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

### Train Numericalization

In [103]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [104]:
# Fit the word index on the training split only; the validation split is
# encoded later with this same tokenizer.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
# Word -> integer index mapping learned from X_train.
word2index = tokenizer.word_index
# Encode every training text as a list of word indices.
train_sequences = tokenizer.texts_to_sequences(X_train)

In [105]:
# presumably the length of the longest encoded training sequence (findMaxLen is
# defined in nlp_utils, not shown here) — TODO confirm. Used as the padded length below.
max_len = nlp_utils.findMaxLen(train_sequences)
max_len

25

### Train padding

In [106]:
# Bring every training sequence to exactly max_len entries: shorter ones get
# zeros appended at the end, longer ones lose their tail.
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')

### Validation Numericalization and Padding

In [107]:
# Encode the validation texts with the tokenizer fitted on the training split,
# then pad/truncate them to the same max_len with the same 'post' settings.
val_sequences = tokenizer.texts_to_sequences(X_val)
val_padded = pad_sequences(val_sequences, maxlen=max_len, padding='post', truncating='post')

In [112]:
# print(X_train[0])
# Show the first training example as a raw index list and as its padded row.
print(train_sequences[0], train_padded[0], sep='\n')

[5625, 3710, 2254, 76, 339, 127, 50, 1908, 3711, 1016, 149, 3712, 3713]
[5625 3710 2254   76  339  127   50 1908 3711 1016  149 3712 3713    0
    0    0    0    0    0    0    0    0    0    0    0]


### RNN model

In [119]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy


In [117]:
# Embedding -> single LSTM -> one sigmoid unit, assembled layer by layer.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 32)            575072    
                                                                 
 lstm_1 (LSTM)               (None, 64)                24832     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 599,969
Trainable params: 599,969
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Binary cross-entropy on probabilities: the model's final Dense layer already
# applies a sigmoid, hence from_logits=False.
loss = BinaryCrossentropy(from_logits=False)
optim = Adam(learning_rate=0.001)
metrics = ['accuracy']
model.compile(loss=loss, optimizer=optim, metrics=metrics)

# Keep the returned History object so the per-epoch training/validation
# loss and accuracy can be inspected or plotted afterwards instead of being
# discarded once the cell finishes.
history = model.fit(
    train_padded,
    y_train,
    epochs=20,
    validation_data=(val_padded, y_val),
    verbose=2,
)

Epoch 1/20


In [109]:
# pattern = re.compile(r'https?://\S+|www\.\S+')
# for t in df_train.text:
#     matches = pattern.findall(t)
#     for match in matches:
#         print(t)
#         print(match)
#         print(pattern.sub('', t))
