In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.metrics import classification_report

In [2]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [3]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        # text.shape = [batch_size, seq_length]
        embedded = self.embedding(text)
        # embedded.shape = [batch_size, seq_length, embedding_dim]
        output, (hidden, cell) = self.rnn(embedded)
        # output.shape = [batch_size, seq_length, hidden_dim]
        # hidden.shape = [num_layers * num_directions, batch_size, hidden_dim]
        # cell.shape = [num_layers * num_directions, batch_size, hidden_dim]
        hidden = self.dropout(hidden[-1,:,:])
        # hidden.shape = [batch_size, hidden_dim]
        return self.fc(hidden)

In [4]:
def tokenizer(sentence):
    return sentence.split()

In [5]:
def encoding(X,y,word_dic,max_len):

    sentences_encoding = []

    for i in range(len(X)):
        sentence = X[i].split()
        encoded_sentence = []
        for j in range(0,min(len(sentence),max_len)):
            encoded_sentence.append(word_dic[sentence[j]])

        if max_len > len(sentence):
            tam = max_len-len(sentence)
            zeros = [0]*tam
            encoded_sentence.extend(zeros)

        sentences_encoding.append(np.array(encoded_sentence))

    return np.array(sentences_encoding), np.array(y)

In [6]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [7]:
newsgroups_train.filenames.shape

(11314,)

In [8]:
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [9]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=['comp.graphics','comp.windows.x','sci.crypt','comp.sys.ibm.pc.hardware'])
newsgroups_test = fetch_20newsgroups(subset='test', categories=['comp.graphics','comp.windows.x','sci.crypt','comp.sys.ibm.pc.hardware'])

In [10]:
newsgroups_train.data[:10]

 'From: mkagalen@lynx.dac.northeastern.edu (michael kagalenko)\nSubject: Some thoughts on Clipper proposal \nOrganization: Division of Academic Computing, Northeastern University, Boston, MA. 02115 USA\nLines: 25\n\n\nI envision incorporation of new standart into\nvarious communication systems, thus making it prevalent on the market & \ntherefore cheap. The way to do that may be detaching crypto chip from \ncommunication equipment. It seems logical to provide Clipper chip \nto the end-user not as a part of phone, fax, modem & like but in the\nform of smart-card compatible with various telecomm. products. Banks \nwill encourage extensive use of new cards to make transactions by phone.\nNatural step will be to cross-reference this card to the person in the \ngovernment databases - or else this new version of "wiretap proposal" \nmake no sence at all; one wish to eavesdrop (spell.) on the particular \nperson, not on the particular modem or phone.\n \nAs a side note, I disagree with one po

In [11]:
print(list(newsgroups_train.target_names))

['comp.graphics', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'sci.crypt']


In [12]:
num_classes = len(np.unique(newsgroups_train.target))
num_classes

4

In [13]:
label2int = {}
for label in list(newsgroups_train.target_names):
    label2int[label] = len(label2int)

In [14]:
label2int

{'comp.graphics': 0,
 'comp.sys.ibm.pc.hardware': 1,
 'comp.windows.x': 2,
 'sci.crypt': 3}

In [15]:
X_train,y_train = newsgroups_train.data,newsgroups_train.target
X_test,y_test = newsgroups_test.data,newsgroups_test.target

In [16]:
len(X_train)

2362

In [17]:
len(y_train)

2362

In [18]:
word2index = {'OOV':0}
for i in range(len(X_train)):
    for word in X_train[i].split():
        if word not in word2index:
            word2index[word] = len(word2index)

In [19]:
vocab_size=len(word2index)

In [None]:
qt_palavras = [0]*10000
for i in range(len(X_train)):
    num_sentences = 0
    tam_sentenca = len(X_train[i].split())
    print(tam_sentenca)
    qt_palavras[tam_sentenca]+=1

In [21]:
import plotly.express as px
import pandas as pd

fig = px.bar(qt_palavras[1:400])
fig.show()

In [22]:
max_len = 128

In [23]:
train_vectors,y_train_int = encoding(X_train,y_train,word2index,max_len)
test_vectors,y_test_int = encoding(X_train,y_train,word2index,max_len)

In [24]:
train_dataset = TextDataset(train_vectors, y_train_int)
test_dataset = TextDataset(test_vectors, y_test_int)

BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [25]:
INPUT_DIM = vocab_size#max_len#len(train_vectors[0])
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = num_classes
N_LAYERS = 1
DROPOUT = 0.2

In [26]:
model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)


dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1



In [27]:
model

LSTM(
  (embedding): Embedding(80122, 100)
  (rnn): LSTM(100, 256, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=256, out_features=4, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [28]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=1e-4)

In [29]:
N_EPOCHS = 20
for epoch in range(N_EPOCHS):

    model.train()

    total_loss = 0

    for X_batch, y_batch in train_loader:

        optimizer.zero_grad()

        predictions = model(X_batch)

        loss = criterion(predictions, y_batch)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

        total_loss = total_loss / len(train_loader)

    print(f'Epoch: {epoch+1} \t Train Loss: {total_loss:.6f}')

Epoch: 1 	 Train Loss: 0.018413
Epoch: 2 	 Train Loss: 0.018800
Epoch: 3 	 Train Loss: 0.019151
Epoch: 4 	 Train Loss: 0.018023
Epoch: 5 	 Train Loss: 0.017483
Epoch: 6 	 Train Loss: 0.014127
Epoch: 7 	 Train Loss: 0.018062
Epoch: 8 	 Train Loss: 0.016614
Epoch: 9 	 Train Loss: 0.017638
Epoch: 10 	 Train Loss: 0.015290
Epoch: 11 	 Train Loss: 0.015029
Epoch: 12 	 Train Loss: 0.012199
Epoch: 13 	 Train Loss: 0.012606
Epoch: 14 	 Train Loss: 0.013425
Epoch: 15 	 Train Loss: 0.015206
Epoch: 16 	 Train Loss: 0.014092
Epoch: 17 	 Train Loss: 0.013925
Epoch: 18 	 Train Loss: 0.015832
Epoch: 19 	 Train Loss: 0.015975
Epoch: 20 	 Train Loss: 0.013063


In [30]:
y_pred = []
y_test = []
for text, label in test_loader:
    y_prob = model(text)
    _, predicted = torch.max(y_prob, 1)
    y_pred.extend(predicted.tolist())
    y_test.extend(label.tolist())

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.39      0.57      0.46       584
           1       0.61      0.42      0.50       590
           2       0.52      0.49      0.50       593
           3       0.93      0.89      0.91       595

    accuracy                           0.59      2362
   macro avg       0.61      0.59      0.59      2362
weighted avg       0.62      0.59      0.59      2362

