<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/06_logistic_regression_embedding_layer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import itertools

import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Utilities

In [2]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

## Read Data

In [3]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/train.csv")
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/val.csv")

train_df.processed_tweet = train_df.processed_tweet.fillna('[]').apply(lambda x: eval(x) if x is not None else [])
val_df.processed_tweet = val_df.processed_tweet.fillna('[]').apply(lambda x: eval(x) if x is not None else [])

## remove blank
train_df = train_df[train_df.processed_tweet.str.len()!=0]
val_df = val_df[val_df.processed_tweet.str.len()!=0]

train_df = train_df.dropna()
val_df = val_df.dropna()

## reset index
train_df = train_df.reset_index(drop = True)
val_df = val_df.reset_index(drop = True)

In [4]:
train_df.label.value_counts()

0.0    3999
1.0    3987
Name: label, dtype: int64

In [5]:
val_df.label.value_counts()

0    1000
1     999
Name: label, dtype: int64

In [6]:
train_df.head(3)

Unnamed: 0,raw_tweet,processed_tweet,label
0,Want to say a huge thanks to @WarriorAssaultS ...,"[want, say, huge, thank, ff, thank, support, :)]",1.0
1,@jaynehh_ you just need a job and get a letter...,"[need, job, get, letter, work, place, say, wor...",1.0
2,"@knhillrocks HA yes, make it quick tho :D","[ha, ye, make, quick, tho, :d]",1.0


## Build Vocab

In [7]:
special_words = ['__PAD__','</e>','__UNK__']
unique_words = list(set(itertools.chain.from_iterable(train_df.processed_tweet.tolist())))
vocab = special_words + unique_words
vocab = {w:i for i,w in enumerate(vocab)}
print(f"Number of words in vocab : {len(vocab)}")

Number of words in vocab : 9092


## Train & Val

In [8]:
def tweet_to_tensor(processed_tweet_list, unk_token = '__UNK__'):
  to_tensor_list = []
  unk_token_id = vocab[unk_token]

  for w in processed_tweet_list:
    to_tensor_list.append(vocab.get(w,unk_token_id))

  to_tensor = torch.tensor(to_tensor_list)
  return to_tensor

In [9]:
train_df['tensor_tweet'] = [tweet_to_tensor(tweet) for tweet in train_df.processed_tweet]
val_df['tensor_tweet'] = [tweet_to_tensor(tweet) for tweet in val_df.processed_tweet]

In [10]:
train_df.tensor_tweet.values[0]

tensor([6073, 3484, 7897, 3366, 4470, 3366, 4273, 4816])

In [11]:
train_df.head(3)

Unnamed: 0,raw_tweet,processed_tweet,label,tensor_tweet
0,Want to say a huge thanks to @WarriorAssaultS ...,"[want, say, huge, thank, ff, thank, support, :)]",1.0,"[tensor(6073), tensor(3484), tensor(7897), ten..."
1,@jaynehh_ you just need a job and get a letter...,"[need, job, get, letter, work, place, say, wor...",1.0,"[tensor(3192), tensor(4189), tensor(5739), ten..."
2,"@knhillrocks HA yes, make it quick tho :D","[ha, ye, make, quick, tho, :d]",1.0,"[tensor(2848), tensor(414), tensor(969), tenso..."


In [12]:
train_x = train_df['tensor_tweet'].tolist()
train_y = train_df['label'].tolist()
val_x = val_df['tensor_tweet'].tolist()
val_y = val_df['label'].tolist()

## Converting a tweet to tensor

In [13]:
# class SentimentDataset:

#   def __init__(self, features, labels):
#     self.features = features
#     self.labels = labels


#   def __getitem__(self, index):
#     feature = torch.tensor(self.features[index])
#     label = torch.tensor(self.labels[index],dtype = torch.long)
#     return (feature, label)

#   def __len__(self):
#     return len(self.features)

#   ## dataset
# train_ds = SentimentDataset(train_x, train_y)
# val_ds = SentimentDataset(val_x, val_y)
# print(train_ds[0])  ## (tensor([5536, 8668, 5554,  186, 7898,  186, 2368, 4767]), tensor(1))

## Issue with default dataloader

```
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = False, num_workers = 2)
```

The above code will give you error as all the tweets in the batch are not of same length. To handle this situaton we will you **collate_fn** function and we will add padding to the max length of tweet in the batch.

In [14]:
train_x[:3]  ## list of tensor

[tensor([6073, 3484, 7897, 3366, 4470, 3366, 4273, 4816]),
 tensor([3192, 4189, 5739, 5386, 2122, 1351, 3484, 2122, 5386,   69, 7984, 5100]),
 tensor([2848,  414,  969,  239, 5206, 1499])]

In [15]:
train_y[:3] ## list of float

[1.0, 1.0, 1.0]

In [16]:
def custom_collate(data):
  features = [d[0] for d in data]
  labels = [d[1] for d in data]

  padded_features = pad_sequence(features, batch_first=True, padding_value= vocab['__PAD__'])
  labels = torch.tensor(labels, dtype = torch.float32)
  return (padded_features, labels)

train_dl = DataLoader(list(zip(train_x, train_y)), batch_size = 2, collate_fn = custom_collate, shuffle = True)
example = next(iter(train_dl))
feature, label = example[0], example[1]

In [17]:
feature

tensor([[3560, 4699, 3560, 2122, 1599, 3560,  726, 2152, 4315, 7470, 2878, 2802,
          634],
        [3685, 6759, 2982, 2348,  233,  253, 5100,    0,    0,    0,    0,    0,
            0]])

In [18]:
label

tensor([0., 1.])

In [19]:
### dataloader
BATCH_SIZE = 64
train_dl = DataLoader(list(zip(train_x, train_y)), batch_size = BATCH_SIZE, collate_fn = custom_collate, shuffle = True)
val_dl = DataLoader(list(zip(val_x, val_y)), batch_size = BATCH_SIZE, collate_fn = custom_collate, shuffle = False)

## Model

In [20]:
!pip install -q  pytorch-lightning

In [21]:
import pytorch_lightning as pl
from torchmetrics import Accuracy

In [22]:
## set seed
np.random.seed(121)
torch.manual_seed(121)
pl.seed_everything(121)

INFO:lightning_fabric.utilities.seed:Seed set to 121


121

In [23]:
## Model
class SentimentModel(pl.LightningModule):

  def __init__(self, num_embeddings, embedding_dim, learning_rate):
    super().__init__()
    self.learning_rate = learning_rate

    ## Define Model
    self.num_classes = 1
    self.embed_layer = nn.Embedding(num_embeddings= num_embeddings, embedding_dim=embedding_dim)
    self.dense1 = nn.Linear(in_features= embedding_dim, out_features= 16)
    self.relu = nn.ReLU()
    self.dense2 = nn.Linear(in_features= 16, out_features= 1)
    self.sigmoid = nn.Sigmoid()    ## Prob b/w 0 to 1 => 1 if prob >0.5 else 0

    ## define loss
    self.loss_fn = nn.BCELoss()
    ## define metrics
    self.train_accuracy = Accuracy(task = "binary", num_classes = 2, threshold= 0.5)
    self.val_accuracy = Accuracy(task = "binary", num_classes = 2, threshold= 0.5)


  def forward(self,feature, verbose = False):
    out_embed = self.embed_layer(feature)
    out_mean = torch.mean(out_embed, dim = 1)
    out_dense1 = self.dense1(out_mean)
    out = self.relu(out_dense1)
    out_dense2 = self.dense2(out)
    out_sigmoid = self.sigmoid(out_dense2)

    if verbose:
      print(f"Input shape : {feature.shape}")
      print(f"Embed shape : {out_embed.shape}")
      print(f"Mean shape : {out_mean.shape}")
      print(f"Dense-1 shape : {out_dense1.shape}")
      print(f"Dense-2 shape : {out_dense2.shape}")
      print(f"logits shape : {out_sigmoid.shape}")

    output = torch.squeeze(out_sigmoid, dim = 1)
    return output

  def training_step(self, batch, batch_idx):
    feature, label = batch[0],batch[1]
    logits = self(feature)
    loss = self.loss_fn(logits, label)
    self.train_accuracy(logits,label)
    self.log_dict({"train_loss": loss, "train_accuracy": self.train_accuracy}, on_step = False, on_epoch = True, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    feature, label = batch[0],batch[1]
    logits = self(feature)
    loss = self.loss_fn(logits, label)
    self.val_accuracy(logits,label)
    self.log_dict({"val_loss": loss,  "val_accuracy": self.val_accuracy}, on_step = False, on_epoch = True, prog_bar = True)

    return loss

  def on_train_epoch_end(self):
    self.train_accuracy.reset()

  def on_validation_epoch_end(self):
     print(f"Epoch : {self.current_epoch} Validation Accuracy : {self.val_accuracy.compute()}")
     self.val_accuracy.reset()

  def configure_optimizers(self):
     optimizer = optim.Adam(self.parameters(), lr =self.learning_rate)
     return optimizer

In [24]:
model = SentimentModel(num_embeddings= len(vocab), embedding_dim=100, learning_rate=0.001)
model

SentimentModel(
  (embed_layer): Embedding(9092, 100)
  (dense1): Linear(in_features=100, out_features=16, bias=True)
  (relu): ReLU()
  (dense2): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (loss_fn): BCELoss()
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
)

In [25]:
feature, label

(tensor([[3560, 4699, 3560, 2122, 1599, 3560,  726, 2152, 4315, 7470, 2878, 2802,
           634],
         [3685, 6759, 2982, 2348,  233,  253, 5100,    0,    0,    0,    0,    0,
             0]]),
 tensor([0., 1.]))

In [26]:
## test model architecture
logits = model(feature,verbose = True)
print(f"Logits : {logits}")
print(f"Loss : {model.loss_fn(logits, label)}")

Input shape : torch.Size([2, 13])
Embed shape : torch.Size([2, 13, 100])
Mean shape : torch.Size([2, 100])
Dense-1 shape : torch.Size([2, 16])
Dense-2 shape : torch.Size([2, 1])
logits shape : torch.Size([2, 1])
Logits : tensor([0.4578, 0.4569], grad_fn=<SqueezeBackward1>)
Loss : 0.6976990699768066


## Train Model

In [27]:
## logger
logger = pl.loggers.CSVLogger("logs", name="sentiment_analysis")

## checkpoints
checkpoint_callback  = pl.callbacks.ModelCheckpoint(
                                                filename='{epoch}-{val_loss:.2f}-{val_accuracy:.2f}',
                                                every_n_epochs = 2,
                                                save_top_k = -1,
                                                monitor='val_loss',
                                                )


model = SentimentModel(num_embeddings= len(vocab), embedding_dim=100, learning_rate=0.001)

trainer = pl.Trainer(accelerator="cpu",
                     max_epochs = 10,
                     check_val_every_n_epoch=2,
                     callbacks=[checkpoint_callback],
                     logger=logger

                    )

## Train the Model
trainer.fit(model, train_dl, val_dl)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type           | Params
--------------------------------------------------
0 | embed_layer    | Embedding      | 909 K 
1 | dense1         | Linear         | 1.6 K 
2 | relu           | ReLU           | 0     
3 | dense2         | Linear         | 17    
4 | sigmoid        | Sigmoid        | 0     
5 | loss_fn        | BCELoss        | 0     
6 | train_accuracy | BinaryAccuracy | 0     
7 | val_accuracy   | BinaryAccuracy | 0     
--------------------------------------------------
910 K     Trainable params
0         Non-trainable params
910 K     Total params
3.643     Total estimated model params size (

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Epoch : 0 Validation Accuracy : 0.3984375


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1 Validation Accuracy : 0.9769884943962097


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3 Validation Accuracy : 0.985992968082428


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 5 Validation Accuracy : 0.9919959902763367


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 7 Validation Accuracy : 0.9914957284927368


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch : 9 Validation Accuracy : 0.9944972395896912


## Load the model from checkpoints

In [28]:
# model = SentimentModel.load_from_checkpoint("logs/sentiment_analysis/version_13/checkpoints/epoch=9-val_loss=0.03-val_accuracy=0.99.ckpt",
#                   num_embeddings= len(vocab), embedding_dim=100, learning_rate=0.001)

## Predict

In [29]:
model.eval()

SentimentModel(
  (embed_layer): Embedding(9092, 100)
  (dense1): Linear(in_features=100, out_features=16, bias=True)
  (relu): ReLU()
  (dense2): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (loss_fn): BCELoss()
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
)

In [30]:
tweet = "I love this movies"
procesed_tweet = process_tweet(tweet)
tensor_tweet = tweet_to_tensor(procesed_tweet)
tensor_tweet = tensor_tweet.view(1,-1)
print(tensor_tweet.shape)
preds = model(tensor_tweet)[0].item()
int(preds>0.5)

torch.Size([1, 2])


1

In [31]:
tweet = "I hate this movies :("
procesed_tweet = process_tweet(tweet)
tensor_tweet = tweet_to_tensor(procesed_tweet)
tensor_tweet = tensor_tweet.view(1,-1)
print(tensor_tweet.shape)
preds = model(tensor_tweet)[0].item()
int(preds>0.5)

torch.Size([1, 3])


0

In [32]:
tweet = "Thank you so much"
procesed_tweet = process_tweet(tweet)
tensor_tweet = tweet_to_tensor(procesed_tweet)
tensor_tweet = tensor_tweet.view(1,-1)
print(tensor_tweet.shape)
preds = model(tensor_tweet)[0].item()
int(preds>0.5)

torch.Size([1, 2])


1