<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/05_logistic_regression_lighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter
from sklearn.linear_model import LogisticRegression

## Read Data

In [4]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/train.csv")
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/val.csv")

train_df.processed_tweet = train_df.processed_tweet.fillna('[]').apply(lambda x: eval(x) if x is not None else [])
val_df.processed_tweet = val_df.processed_tweet.fillna('[]').apply(lambda x: eval(x) if x is not None else [])

In [5]:
train_df.label.value_counts()

1.0    4000
0.0    4000
Name: label, dtype: int64

In [6]:
val_df.label.value_counts()

1    1000
0    1000
Name: label, dtype: int64

## Create Word Freq by label

In [7]:
pos_freq_dict = Counter(list(itertools.chain.from_iterable(train_df[train_df.label==1]['processed_tweet'].tolist())))
pos_freq_dict.most_common(10)

[(':)', 2866),
 (':-)', 530),
 ('thank', 507),
 (':d', 504),
 ('love', 322),
 ('follow', 306),
 ('...', 221),
 ('day', 193),
 ('good', 191),
 ('like', 186)]

In [8]:
neg_freq_dict = Counter(list(itertools.chain.from_iterable(train_df[train_df.label==0]['processed_tweet'].tolist())))
neg_freq_dict.most_common(10)

[(':(', 3636),
 (':-(', 404),
 ("i'm", 293),
 ('...', 268),
 ('miss', 242),
 ('pleas', 219),
 ('follow', 202),
 ('want', 192),
 ('like', 190),
 ('get', 189)]

## Create Features

- pos_freq : sum of positive freq of all unique words in tweet
- neg_freq : sum of negative freq of all unique words in the tweet

In [9]:
train_df['pos_freq'] = train_df.processed_tweet.apply(lambda x: np.sum([pos_freq_dict.get(w,0) for w in set(x)]))
train_df['neg_freq'] = train_df.processed_tweet.apply(lambda x: np.sum([neg_freq_dict.get(w,0) for w in set(x)]))

val_df['pos_freq'] = val_df.processed_tweet.apply(lambda x: np.sum([pos_freq_dict.get(w,0) for w in set(x)]))
val_df['neg_freq'] = val_df.processed_tweet.apply(lambda x: np.sum([neg_freq_dict.get(w,0) for w in set(x)]))

train_df['bias'] = 1
val_df['bias'] = 1

In [10]:
train_df.head(6)

Unnamed: 0,raw_tweet,processed_tweet,label,pos_freq,neg_freq,bias
0,Want to say a huge thanks to @WarriorAssaultS ...,"[want, say, huge, thank, ff, thank, support, :)]",1.0,3575.0,358.0,1
1,@jaynehh_ you just need a job and get a letter...,"[need, job, get, letter, work, place, say, wor...",1.0,958.0,464.0,1
2,"@knhillrocks HA yes, make it quick tho :D","[ha, ye, make, quick, tho, :d]",1.0,690.0,144.0,1
3,@shartyboy Thanks for texting me back :)) I'm ...,"[thank, text, back, :), i'm, text, tomorrow, :)]",1.0,3650.0,512.0,1
4,Laying out a greetings card range for print to...,"[lay, greet, card, rang, print, today, love, j...",1.0,990.0,240.0,1
5,#FollowFriday @CCIFCcanada @AdamEvnmnt @boxcal...,"[followfriday, top, engag, member, commun, wee...",1.0,3026.0,58.0,1


In [11]:
## features  : [bias, pos_freq, neg_freq]

train_x = train_df[['bias','pos_freq','neg_freq']].fillna(0).values
train_y = train_df.label.fillna(0).values

val_x = val_df[['bias','pos_freq','neg_freq']].fillna(0).values
val_y = val_df.label.fillna(0).values

## Logistic Regression using torch lighting

In [12]:
!pip install -q lightning

In [13]:
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torchmetrics import Accuracy
from lightning.pytorch.loggers import CSVLogger

warnings.filterwarnings('ignore')

In [14]:
## set seed
np.random.seed(121)
torch.manual_seed(121)
pl.seed_everything(121)

INFO:lightning_fabric.utilities.seed:Seed set to 121


121

In [15]:
class SentimentDataset(Dataset):

  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __getitem__(self, index):
    feature = torch.tensor(self.features[index], dtype = torch.float32)
    label = torch.tensor(self.labels[index], dtype = torch.float32)
    return (feature, label)

  def __len__(self):
    return len(self.features)

In [16]:
class SentimentModel(pl.LightningModule):

    def __init__(self, in_feature, out_feature, learning_rate):
        super().__init__()
        hidden_unit = 32
        self.lr = learning_rate
        self.fc1 = nn.Linear(in_features= in_feature, out_features= hidden_unit)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features= hidden_unit, out_features= out_feature)
        self.sigmoid = nn.Sigmoid()

        self.loss_fn = nn.BCELoss()
        self.accuracy = Accuracy(task = "binary", num_classes = out_feature)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        out = torch.squeeze(out, dim = 1)
        return out

    def training_step(self,batch, batch_idx):
        x, y = batch
        yhat = self(x)
        loss = self.loss_fn(yhat, y)
        acc = self.accuracy(yhat,y)
        self.log_dict({'train_loss': loss, 'train_acc': acc}, on_step = True, on_epoch = True,prog_bar = True)
        return loss


    def validation_step(self, batch, batch_idx):
        x, y = batch
        yhat = self(x)
        # yhat = yhat.round()
        loss = self.loss_fn(yhat, y)
        acc = self.accuracy(yhat,y)
        self.log_dict({'val_loss': loss, 'val_acc': acc}, on_step = True, on_epoch = True,prog_bar = True)
        return loss

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr = self.lr)
        return optimizer

In [17]:
BATCH_SIZE = 64
train_ds = SentimentDataset(train_x, train_y)
val_ds = SentimentDataset(val_x, val_y)
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True, num_workers= 2)
val_dl = DataLoader(val_ds, batch_size = BATCH_SIZE, shuffle = False, num_workers= 2)

In [18]:
feature, label = next(iter(train_dl))
print(f"#feature : {feature.shape}  #label : {label.shape}")

#feature : torch.Size([64, 3])  #label : torch.Size([64])


In [19]:
## logger
logger = CSVLogger("logs", name="sentiment_analysis")

## checkpoints
checkpoint_callback  = pl.callbacks.ModelCheckpoint(
                                                filename='{epoch}-{val_loss:.2f}-{val_accuracy:.2f}',
                                                every_n_epochs = 2,
                                                save_top_k = -1,
                                                monitor='val_loss_epoch',
                                                )


model = SentimentModel(in_feature = feature.shape[1], out_feature = 1, learning_rate= 1e-3)

In [20]:
# test the model
# output should be same as label shape
model.forward(feature).shape

torch.Size([64])

In [21]:
trainer = pl.Trainer(accelerator="cpu",
                     max_epochs = 10,
                     check_val_every_n_epoch=2,
                     callbacks=[checkpoint_callback],
                     logger=logger

                    )

## Train the Model
trainer.fit(model, train_dl, val_dl)

INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type           | Params
--------------------------------------------
0 | fc1      | Linear         | 128   
1 | relu     | ReLU           | 0     
2 | fc2      | Linear         | 33    
3 | sigmoid  | Sigmoid        | 0     
4 | loss_fn  | BCELoss        | 0     
5 | accuracy | BinaryAccuracy | 0     
--------------------------------------------
161       Trainable params
0         Non-trainable params
161       Total params
0.001     Total es

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
# ## Load the model
# model = SentimentModel.load_from_checkpoint("lightning_logs/version_8/checkpoints/epoch=9-val_loss=0.02-val_accuracy=0.00.ckpt",
#                                      in_feature = feature.shape[1], out_feature = 1, learning_rate = 1e-3)

## Predict

In [26]:
import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [30]:
def predict(tweet):
  model.eval()
  processed_tweet = process_tweet(tweet)
  pos_freq = np.sum([pos_freq_dict.get(w,0) for w in processed_tweet])
  neg_freq = np.sum([neg_freq_dict.get(w,0) for w in processed_tweet])
  row = torch.tensor([[1, pos_freq, neg_freq]], dtype = torch.float32)
  logits = model(row).item()
  pred = 1 if logits>0.5 else 0
  return pred

In [31]:
tweet = "I love this movie"
predict(tweet)

1

In [32]:
tweet = "I hate this movie"
predict(tweet)

0