In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import  torchtext

In [2]:
data = pd.read_csv('datasets/ham-spam/spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [12]:
data.head(4)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [8]:
train,test = train_test_split(data,test_size= 0.2, random_state= 42)

In [18]:
train.reset_index(drop=True , inplace= True)
test.reset_index(drop=True,inplace= True)

In [20]:
test

Unnamed: 0,v1,v2
0,ham,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
1,ham,I sent my scores to sophas and i had to do sec...
2,spam,We know someone who you know that fancies you....
3,ham,Only if you promise your getting out as SOON a...
4,spam,Congratulations ur awarded either å£500 of CD ...
...,...,...
1110,ham,&lt;DECIMAL&gt; m but its not a common car he...
1111,ham,Rightio. 11.48 it is then. Well arent we all u...
1112,ham,Yes i have. So that's why u texted. Pshew...mi...
1113,ham,"Get the door, I'm here"


In [21]:
train.shape, test.shape

((4457, 2), (1115, 2))

In [50]:
# Saving Train and test data in csv files
train.to_csv('datasets/ham-spam/train.csv', index=False)
test.to_csv('datasets/ham-spam/test.csv', index=False)

In [51]:
ls datasets

 Volume in drive C is OSDisk
 Volume Serial Number is E6C6-5DA6

 Directory of C:\Users\raze\Documents\Personal\OneDrive\DS\Deep Learning\TEST\datasets

09/14/2021  07:54 PM    <DIR>          .
09/14/2021  07:54 PM    <DIR>          ..
09/11/2021  06:45 PM             6,148 .DS_Store
09/08/2021  06:37 PM    <DIR>          cifar10
09/14/2021  07:54 PM    <DIR>          data
09/14/2021  07:54 PM    <DIR>          ham-spam
09/14/2021  07:54 PM    <DIR>          tweets
               1 File(s)          6,148 bytes
               6 Dir(s)  391,168,126,976 bytes free


In [80]:
import numpy as np
import torch
import torchtext
from torchtext.legacy.data import Field,LabelField, BucketIterator, TabularDataset

In [67]:
''' NLTK provides a function called word_tokenize() for splitting strings
into tokens (nominally words). It splits tokens based on white space and punctuation'''
import nltk
nltk.download('punkt') # that's the punkt word tokenizer
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\raze\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### The parameters of a Field specify how the data should be processed.We use the TEXT field to define how the text should be processed, and the LABEL field to process the labels. in the example its tokenized as individual  words

In [79]:
TEXT  = Field(tokenize= word_tokenize)

In [81]:
LABEL = LabelField(dtype = torch.float)

In [82]:
datafields = [("labels", LABEL), ("text", TEXT)]

In [92]:
trn, tst = TabularDataset.splits(path='./datasets/ham-spam',
                                 train= 'train.csv',
                                 test= 'test.csv',
                                 format = 'csv',
                                 skip_header = True,
                                 fields = [("labels", LABEL), ("text", TEXT)]
                                )

In [91]:
trn[:4]

[<torchtext.legacy.data.example.Example at 0x25101b67550>,
 <torchtext.legacy.data.example.Example at 0x251009278b0>,
 <torchtext.legacy.data.example.Example at 0x25100927b80>,
 <torchtext.legacy.data.example.Example at 0x25100927130>]

In [93]:
len(trn), len(tst)

(4457, 1115)

In [100]:
trn[5].__dict__.keys()

dict_keys(['labels', 'text'])

In [110]:
trn[5].__dict__.values()

dict_values(['ham', ['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']])

In [114]:
trn[5].text

['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']

In [115]:
trn[5].labels

'ham'

In [120]:
print(vars(trn[5]))

{'labels': 'ham', 'text': ['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']}


#### build a vocabulary. This is a effectively a look up table where every unique word in your data set has a corresponding index (an integer). Each index is used to construct a one-hot vector for each word.

In [133]:
TEXT.build_vocab(trn, max_size = 10000)

In [134]:
LABEL.build_vocab(trn)

In [135]:
len(TEXT.vocab), len(LABEL.vocab)

(10002, 2)

In [168]:
TEXT.vocab.freqs.items()



In [154]:
TEXT.vocab.freqs.most_common(10)
# the no. is the unique ID of the word
# compact presentaton of the one hot incoding


[('.', 3862),
 ('to', 1750),
 ('I', 1574),
 (',', 1468),
 ('you', 1462),
 ('?', 1256),
 ('!', 1134),
 ('a', 1068),
 ('the', 946),
 ('...', 923)]

In [153]:
# show text by position
TEXT.vocab.itos[:10] 

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']

In [160]:
# show position by text
TEXT.vocab.stoi['to']

3

Now, we will create iterators that will iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.
#### We'll use a BucketIterator which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.

In [180]:
batch_size = 64

train_iterator, test_iterator = BucketIterator.splits((trn,tst),
                                                     batch_size = batch_size,
                                                     sort_key = lambda x: len(x.text),
                                                     sort_within_batch = False)


TypeError: 'TabularDataset' object is not callable

In [173]:
len(trn[5].text)

9

In [181]:
import torch.nn as nn

In [182]:
class RNN(nn.Module):
    def __init__ (self, input_dim, embedding_dim, hidden_dim , output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim,embedding_dim) # encode into a dense format 
        self.rnn = nn.RNN(embedding_dim, hidden_dim) # the basic recurrent neural network> current WORD EMBEDDING + previous HIDDEN_STATE 
        self.fc = nn.Linear(hidden_dim,output_dim) # last FC layer that gives prediction
    def forward(self, text):   # text : [sentence_lenght , batch_size]
        embedded = self.embedding(text) # convert to dense vector > embedded: [sentence_lenght, batch_size, embedding_dim] 
        output, hidden = self.rnn(embedded) # output_shape: [sentence_lengh, batch_size, hidden_dim]- after concatenation. hidden_shape = [1, batch_ size , hidden_dim] > one hidden output for each sentence ] that will go to the linear stage 
        hidden_1D = hidden.squeeze(0) # when finish loopin the assert will be true
        assert torch.equal(output[-1,:,:], hidden_1D)
        return self.fc(hidden_1D)

In [184]:
input_dim = len(TEXT.vocab) # no. feature input is as the size of the vocabulary
embedding_dim = 100
hidden_dim = 256
output_dim = 1 # 0/1 - spam 

In [185]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [186]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr= 0.000006)

#### we will use BCEWithLogitsLoss loss as our loss function - Creates a criterion that measures the Binary Cross Entropy between the target and the output
This loss combines a Sigmoid layer and the BCELoss in one single class.

In [188]:
criterion = nn.BCEWithLogitsLoss()  # good for binary

In [189]:
# helper function for training 
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
     
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.labels)
        
        rounded_preds = torch.round(torch.sigmoid(predictions)) # round to get 0 / 1
        correct = (rounded_preds == batch.labels).float() 
        
        acc = correct.sum() / len(correct)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [191]:
num_epochs = 7

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')

| Epoch: 01 | Train Loss: 0.404 | Train Acc: 86.03% 
| Epoch: 02 | Train Loss: 0.402 | Train Acc: 86.28% 
| Epoch: 03 | Train Loss: 0.401 | Train Acc: 86.39% 
| Epoch: 04 | Train Loss: 0.401 | Train Acc: 86.26% 
| Epoch: 05 | Train Loss: 0.400 | Train Acc: 86.37% 
| Epoch: 06 | Train Loss: 0.402 | Train Acc: 86.29% 
| Epoch: 07 | Train Loss: 0.400 | Train Acc: 86.46% 


## 
## Testing

In [194]:
model.eval()

RNN(
  (embedding): Embedding(10002, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [222]:
epoch_loss = 0
epoch_acc = 0

with torch.no_grad():
    for b in test_iterator:
        predict = model(b.text).squeeze(1)
                
        loss = criterion(predict,b.labels)
#         print ('loss:')
#         print(loss)
        
        rounded_preds = torch.round(torch.sigmoid(predict))
#         print ('rounded_preds:')
#         print(rounded_preds)
        
#         print(f'rounded_preds: {rounded_preds}')
              
        correct = (rounded_preds == b.labels).float()
        print(f'correct.sum(): {correct.sum()}')
        print(f'len(correct): {len(correct)}')
        accur = (correct.sum()/ len(correct))
        print(f'accur: {accur}')
        
        epoch_loss += loss.item()
        epoch_acc += accur.item()
        
test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'len(test_iterator): {len(test_iterator)}') 
print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

correct.sum(): 58.0
len(correct): 64
accur: 0.90625
correct.sum(): 62.0
len(correct): 64
accur: 0.96875
correct.sum(): 47.0
len(correct): 64
accur: 0.734375
correct.sum(): 51.0
len(correct): 64
accur: 0.796875
correct.sum(): 53.0
len(correct): 64
accur: 0.828125
correct.sum(): 58.0
len(correct): 64
accur: 0.90625
correct.sum(): 59.0
len(correct): 64
accur: 0.921875
correct.sum(): 62.0
len(correct): 64
accur: 0.96875
correct.sum(): 57.0
len(correct): 64
accur: 0.890625
correct.sum(): 58.0
len(correct): 64
accur: 0.90625
correct.sum(): 54.0
len(correct): 64
accur: 0.84375
correct.sum(): 52.0
len(correct): 64
accur: 0.8125
correct.sum(): 46.0
len(correct): 64
accur: 0.71875
correct.sum(): 33.0
len(correct): 64
accur: 0.515625
correct.sum(): 37.0
len(correct): 64
accur: 0.578125
correct.sum(): 37.0
len(correct): 64
accur: 0.578125
correct.sum(): 52.0
len(correct): 64
accur: 0.8125
correct.sum(): 27.0
len(correct): 27
accur: 1.0
len(test_iterator): 18
| Test Loss: 0.542 | Test Acc: 81.60% |

## LSTM

In [230]:
# change the network stracture !

class RNN(nn.Module):
    def __init__ (self, input_dim, embedding_dim, hidden_dim , output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim,embedding_dim) # encode into a dense format 
            # LSTM - change !
        self.rnn = nn.LSTM(embedding_dim, hidden_dim) # the basic recurrent neural network> current WORD EMBEDDING + previous HIDDEN_STATE 
        self.fc = nn.Linear(hidden_dim,output_dim) # last FC layer that gives prediction
    def forward(self, text):   # text : [sentence_lenght , batch_size]
        embedded = self.embedding(text) # convert to dense vector > embedded: [sentence_lenght, batch_size, embedding_dim] 
            # implement DropOut 
        embedded_dropout = self.dropout(embedded,0.2)
            # LSTM - change !  ['hidden' : 'hidden, _']  > in final output we get the last cell state of LSTM cell
        output, hidden = self.rnn(embedded) # output_shape: [sentence_lengh, batch_size, hidden_dim]- after concatenation. hidden_shape = [1, batch_ size , hidden_dim] > one hidden output for each sentence ] that will go to the linear stage 
        hidden_1D = hidden.squeeze(0) # when finish loopin the assert will be true
        assert torch.equal(output[-1,:,:], hidden_1D)
        return self.fc(hidden_1D)

In [231]:
num_epochs = 7

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')


| Epoch: 01 | Train Loss: 0.399 | Train Acc: 86.47% 
| Epoch: 02 | Train Loss: 0.399 | Train Acc: 86.53% 
| Epoch: 03 | Train Loss: 0.402 | Train Acc: 86.42% 
| Epoch: 04 | Train Loss: 0.400 | Train Acc: 86.46% 
| Epoch: 05 | Train Loss: 0.398 | Train Acc: 86.52% 
| Epoch: 06 | Train Loss: 0.399 | Train Acc: 86.57% 
| Epoch: 07 | Train Loss: 0.400 | Train Acc: 86.51% 


In [232]:
model.eval()

RNN(
  (embedding): Embedding(10002, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [233]:
epoch_loss = 0
epoch_acc = 0

with torch.no_grad():
    for b in test_iterator:
        predict = model(b.text).squeeze(1)
                
        loss = criterion(predict,b.labels)

        
        rounded_preds = torch.round(torch.sigmoid(predict))

              
        correct = (rounded_preds == b.labels).float()
#         print(f'correct.sum(): {correct.sum()}')
#         print(f'len(correct): {len(correct)}')
        accur = (correct.sum()/ len(correct))
#         print(f'accur: {accur}')
        
        epoch_loss += loss.item()
        epoch_acc += accur.item()
        
test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'len(test_iterator): {len(test_iterator)}') 
print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

len(test_iterator): 18
| Test Loss: 0.518 | Test Acc: 84.46% |
