# Text Classification with LSTM and Word Embeddings

In [43]:
!pip install keras

Defaulting to user installation because normal site-packages is not writeable
Collecting keras
  Downloading keras-3.13.2-py3-none-any.whl.metadata (6.3 kB)
Collecting namex (from keras)
  Downloading namex-0.1.0-py3-none-any.whl.metadata (322 bytes)
Collecting optree (from keras)
  Downloading optree-0.18.0-cp312-cp312-win_amd64.whl.metadata (35 kB)
Collecting ml-dtypes (from keras)
  Downloading ml_dtypes-0.5.4-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Downloading keras-3.13.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 26.6 MB/s eta 0:00:00
Downloading ml_dtypes-0.5.4-cp312-cp312-win_amd64.whl (212 kB)
Downloading namex-0.1.0-py3-none-any.whl (5.9 kB)
Downloading optree-0.18.0-cp312-cp312-win_amd64.whl (312 kB)
Installing collected packages: namex, optree, ml-dtypes, keras
Successfully installed keras-3.13.2 ml-dtypes-0.5.4 namex-0.1.0 optree-0.18.0


In [46]:
!pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-win_amd64.whl.metadata (4.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.12.19-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.7.0-py3-none-any.whl.metadata (1.5 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-3.3.0-py3-none-any.whl.metadata (6



### Load and pre-process the dataset

We will use a dataset that contains over 11,000 tweets associated with disaster keywords like “crash”, “quarantine”, and “bush fires” as well as the location and keyword itself. Every tweet is classified according to whether the tweet really referred to a disaster event or not (a joke with the word or a movie review or something non-disastrous). 

The classification task consists of determining the label of the tweet (disaster or not) given the text of the tweet

We load a pre-trained model for word embedding. We will need it to convert tokens to their ids.

In [2]:
import gensim
import numpy as np
import gensim.downloader as api

# Import model.
model_w2v = api.load('glove-wiki-gigaword-300')

# Add padding token and embedding to the model, so that we can use it later when we pad the sequences.
pad_tok = '<pad>'
pad_emb = np.zeros(300)
model_w2v.add_vector(pad_tok, pad_emb)
pad_tok_id = model_w2v.key_to_index[pad_tok]

# Add unknown token and embedding to the model, so that we can use it later when we convert tokens to indices.
unk_tok = '<unk>'
unk_emb = np.random.normal(size=300)
model_w2v.add_vector(unk_tok, unk_emb)
unk_tok_id = model_w2v.key_to_index['<unk>']



In [None]:
import numpy as np
import pandas as pd

from keras.preprocessing import sequence
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split

filename =  'data/tweets.csv'
max_len = 20
test_size = 0.2

# Load the data. Drop the columns that we won't use.
df = pd.read_csv(filename)
df.drop(['id','keyword','location'], axis=1, inplace=True)

# Extract the text and labels from the dataframe.		
text = df['text'].values
labels = df['target'].values

# Split the data into training and test sets.	
text_train,text_test, labels_train, labels_test = train_test_split(text, labels, test_size=test_size)

# Tokenize the text using the NLTK TweetTokenizer. 
tokenizer = TweetTokenizer()

tokens_train = [tokenizer.tokenize(sentence) for sentence in text_train]
tokens_test = [tokenizer.tokenize(sentence) for sentence in text_test]

# Convert the tokens to indices using the model's vocabulary. If a token is not in the vocabulary, use the index of the unknown token.
tokens_ids_train = [[model_w2v.key_to_index.get(token, unk_tok_id) for token in sentence] for sentence in tokens_train]
tokens_ids_test = [[model_w2v.key_to_index.get(token, unk_tok_id) for token in sentence] for sentence in tokens_test]
    
# Pad the sequences to ensure that they all have the same length.
x_train = sequence.pad_sequences(tokens_ids_train, maxlen=max_len)
x_test = sequence.pad_sequences(tokens_ids_test, maxlen=max_len)

# Get the labels for the training and test sets.
y_train = labels_train
y_test = labels_test

### Definition of the model

We define the classification model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.ModuleList):

	# We provide as parameters the pre-trained word embeddings, the hidden size of the LSTM layer, the number of layers of the LSTM, and the dropout rate. 
	def __init__(self, vectors, hidden_size, num_layers, dropout):
		super(TweetClassifier, self).__init__()
		
		# ensure vectors is a tensor
		if not torch.is_tensor(vectors):
			vectors = torch.tensor(vectors)

		self.hidden_dim = hidden_size
		self.LSTM_layers = num_layers
	
        # init embedding layer
		self.embedding = nn.Embedding.from_pretrained(embeddings=vectors)
		self.dropout = nn.Dropout(dropout)
		# LSTM layer. 

		# ADD LSTM LAYER HERE. See lstm documentation at https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html for more details. 

		# Classification layers. The input size is the same as the hidden size of the LSTM, since we are using the output of the last time step of the LSTM as input to the classification layers. The output size is 1, since we are doing binary classification.
		self.fc1 = nn.Linear(in_features=hidden_size, out_features=257)
		self.fc2 = nn.Linear(257, 1)
		
	def forward(self, x):
		# Initialize the hidden and cell states of the LSTM. The shape of the hidden and cell states should be (num_layers, batch_size, hidden_size).
		h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim))
		c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim))
		
		torch.nn.init.xavier_normal_(h)
		torch.nn.init.xavier_normal_(c)

		# Pass the input through the embedding layer, then through the LSTM layer, and finally through the classification layers. 
		# The output of the LSTM layer is a tuple containing the output of the last time step and the hidden and cell states. We only need the output of the last time step, which is the first element of the tuple. 
		# We can use this output as input to the classification layers.
		out = self.embedding(x)
		out, (hidden, cell) = self.lstm(out, (h,c))
		out = self.dropout(out)

		# Pass the output of the LSTM layer through the classification layers to get the final output

		# ADD CODE HERE

		
		return out

We define the Dataset for the Dataloader

In [5]:
from torch.utils.data import Dataset

class DatasetMaper(Dataset):
	'''
	Handles batches of dataset
	'''
	def __init__(self, x, y):
		self.x = x
		self.y = y
		
	def __len__(self):
		return len(self.x)
		
	def __getitem__(self, idx):
		return self.x[idx], self.y[idx]

### Training the model

In [6]:
from torch.utils.data import DataLoader
import torch.optim as optim

def compute_accuracy(ground_truth, predictions):
	true_positives = 0
	true_negatives = 0
		
	for true, pred in zip(ground_truth, predictions):
		if (pred > 0.5) and (true == 1):
			true_positives += 1
		elif (pred < 0.5) and (true == 0):
			true_negatives += 1
		else:
			pass
				
	return (true_positives+true_negatives) / len(ground_truth)

batch_size = 64
learning_rate = 0.01
epochs = 5

training_set = DatasetMaper(x_train, y_train)
test_set = DatasetMaper(x_test, y_test)
		
loader_training = DataLoader(training_set, batch_size=batch_size)
loader_test = DataLoader(test_set)

# We pass the pre-trained word embeddings to the model to initialize the embedding layer
model = TweetClassifier(vectors=model_w2v.vectors, hidden_size=128, num_layers=2, dropout=0.5)		
optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
for epoch in range(epochs):
			
	train_predictions = []			
	model.train()
	for x_batch, y_batch in loader_training:
		x = x_batch.type(torch.LongTensor)
		y = y_batch.type(torch.FloatTensor)

		y = torch.unsqueeze(y, 1)		
		y_pred = model(x)

		loss = F.binary_cross_entropy(y_pred, y)		
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()
				
		train_predictions += list(y_pred.squeeze().detach().numpy())
	# Evaluate the model on the test set
	test_predictions = []
	model.eval()
	with torch.no_grad():
		for x_batch, y_batch in loader_test:
			x = x_batch.type(torch.LongTensor)
			y = y_batch.type(torch.FloatTensor)			
			y_pred = model(x)
			test_predictions += list(y_pred.detach().numpy())
				
			
	train_accuracy = compute_accuracy(labels_train, train_predictions)
	test_accuracy = compute_accuracy(labels_test, test_predictions)
			
	print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuracy, test_accuracy))

Epoch: 1, loss: 0.58573, Train accuracy: 0.57291, Test accuracy: 0.58306
Epoch: 2, loss: 0.45349, Train accuracy: 0.59852, Test accuracy: 0.65857
Epoch: 3, loss: 0.62765, Train accuracy: 0.62824, Test accuracy: 0.72817
Epoch: 4, loss: 0.48637, Train accuracy: 0.70345, Test accuracy: 0.73867
Epoch: 5, loss: 0.40029, Train accuracy: 0.75156, Test accuracy: 0.73539
