<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/04_logistic_regression_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter
from sklearn.linear_model import LogisticRegression

## Read Data

In [2]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/train.csv")
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data/val.csv")

train_df.processed_tweet = train_df.processed_tweet.fillna('[]').apply(lambda x: eval(x) if x is not None else [])
val_df.processed_tweet = val_df.processed_tweet.fillna('[]').apply(lambda x: eval(x) if x is not None else [])

In [3]:
train_df.label.value_counts()

1.0    4000
0.0    4000
Name: label, dtype: int64

In [4]:
val_df.label.value_counts()

1    1000
0    1000
Name: label, dtype: int64

## Create Word Freq by label

In [5]:
pos_freq_dict = Counter(list(itertools.chain.from_iterable(train_df[train_df.label==1]['processed_tweet'].tolist())))
pos_freq_dict.most_common(10)

[(':)', 2866),
 (':-)', 530),
 ('thank', 507),
 (':d', 504),
 ('love', 322),
 ('follow', 306),
 ('...', 221),
 ('day', 193),
 ('good', 191),
 ('like', 186)]

In [6]:
neg_freq_dict = Counter(list(itertools.chain.from_iterable(train_df[train_df.label==0]['processed_tweet'].tolist())))
neg_freq_dict.most_common(10)

[(':(', 3636),
 (':-(', 404),
 ("i'm", 293),
 ('...', 268),
 ('miss', 242),
 ('pleas', 219),
 ('follow', 202),
 ('want', 192),
 ('like', 190),
 ('get', 189)]

## Create Features

- pos_freq : sum of positive freq of all unique words in tweet
- neg_freq : sum of negative freq of all unique words in the tweet

In [7]:
train_df['pos_freq'] = train_df.processed_tweet.apply(lambda x: np.sum([pos_freq_dict.get(w,0) for w in set(x)]))
train_df['neg_freq'] = train_df.processed_tweet.apply(lambda x: np.sum([neg_freq_dict.get(w,0) for w in set(x)]))

val_df['pos_freq'] = val_df.processed_tweet.apply(lambda x: np.sum([pos_freq_dict.get(w,0) for w in set(x)]))
val_df['neg_freq'] = val_df.processed_tweet.apply(lambda x: np.sum([neg_freq_dict.get(w,0) for w in set(x)]))

train_df['bias'] = 1
val_df['bias'] = 1

In [8]:
train_df.head(6)

Unnamed: 0,raw_tweet,processed_tweet,label,pos_freq,neg_freq,bias
0,Want to say a huge thanks to @WarriorAssaultS ...,"[want, say, huge, thank, ff, thank, support, :)]",1.0,3575.0,358.0,1
1,@jaynehh_ you just need a job and get a letter...,"[need, job, get, letter, work, place, say, wor...",1.0,958.0,464.0,1
2,"@knhillrocks HA yes, make it quick tho :D","[ha, ye, make, quick, tho, :d]",1.0,690.0,144.0,1
3,@shartyboy Thanks for texting me back :)) I'm ...,"[thank, text, back, :), i'm, text, tomorrow, :)]",1.0,3650.0,512.0,1
4,Laying out a greetings card range for print to...,"[lay, greet, card, rang, print, today, love, j...",1.0,990.0,240.0,1
5,#FollowFriday @CCIFCcanada @AdamEvnmnt @boxcal...,"[followfriday, top, engag, member, commun, wee...",1.0,3026.0,58.0,1


In [9]:
## features  : [bias, pos_freq, neg_freq]

train_x = train_df[['bias','pos_freq','neg_freq']].fillna(0).values
train_y = train_df.label.fillna(0).values

val_x = val_df[['bias','pos_freq','neg_freq']].fillna(0).values
val_y = val_df.label.fillna(0).values

## Logistic Regression DL Model

In [10]:
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings('ignore')

In [11]:
class SentimentDataset(Dataset):

  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __getitem__(self, index):
    feature = torch.tensor(self.features[index], dtype = torch.float32)
    label = F.one_hot(torch.tensor(self.labels[index], dtype = torch.long), num_classes=2)
    label = label.float()
    return (feature, label)


  def __len__(self):
    return len(self.features)


BATCH_SIZE = 64
# train dataset
train_ds = SentimentDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True, num_workers= 2)
# val dataloader
val_ds = SentimentDataset(val_x, val_y)
val_dl = DataLoader(val_ds, batch_size = BATCH_SIZE, shuffle = False, num_workers= 2)

In [12]:
class SentimentModel(nn.Module):

  def __init__(self, in_feature, out_feature):
    super(SentimentModel, self).__init__()
    hidden_unit = 32
    self.fc1 = nn.Linear(in_features= in_feature, out_features= hidden_unit)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(in_features= hidden_unit, out_features= out_feature)
    self.log_softmax = nn.LogSoftmax()

  def forward(self, x):
    out = self.fc1(x)
    out = self.relu(out)
    out = self.fc2(out)
    out = self.log_softmax(out)
    return out

In [13]:
## example = [feature, label]
example = next(iter(train_dl))
feature, label = example[0], example[1]
print(f"feature shape : {feature.shape}")
in_feature = feature.shape[1]
out_feature = label.shape[1]
print(f"in feature : {in_feature}")
print(f"out feature : {out_feature}")

feature shape : torch.Size([64, 3])
in feature : 3
out feature : 2


In [14]:
N_EPOCH = 10
LEARNING_RATE = 1e-3

model = SentimentModel(in_feature, out_feature)
optimizer = Adam(model.parameters(), lr = LEARNING_RATE)
loss_fn = nn.BCEWithLogitsLoss()

In [15]:
## Test the architecture and loss function one batch
model.eval()
logits = model(feature)
print("logits  : ",logits.shape)
print("label  : ",label.shape)
print("Loss : ", loss_fn(logits, label))

logits  :  torch.Size([64, 2])
label  :  torch.Size([64, 2])
Loss :  tensor(0.3675, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [16]:
def calculate_accuracy(logits, label):
    corrected = 0
    values, indices = torch.topk(logits, k = 1)
    pred_label = torch.squeeze(indices, dim = 1)
    values, indices = torch.topk(label, k = 1)
    true_label = torch.squeeze(indices, dim = 1)
    corrected += torch.sum(true_label == pred_label).item()
    return corrected

In [17]:
for ep in range(N_EPOCH):

  train_loss = 0.0
  val_loss = 0.0
  train_corrected = 0.0
  val_corrected = 0.0
  train_samples = 0
  val_samples = 0

  ## training
  model.train()
  for feature, label in train_dl:
    logits = model(feature)
    loss = loss_fn(logits, label)
    train_loss += loss.item()
    train_corrected += calculate_accuracy(logits, label)
    train_samples += len(label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


  ## validation
  model.eval()
  for feature, label in val_dl:
    logits = model(feature)
    loss = loss_fn(logits, label)
    val_loss += loss.item()
    val_corrected += calculate_accuracy(logits, label)
    val_samples += len(label)

  print(f"Epoch : {ep+1}\
          Train Loss : {train_loss/len(train_dl):.4f}\
          Train Accuracy : {train_corrected/train_samples:.4f}\
          Val Loss : {val_loss/len(val_dl):.4f}\
          Val Accuracy : {val_corrected/val_samples:.4f} ")

Epoch : 1          Train Loss : 0.4134          Train Accuracy : 0.9891          Val Loss : 0.3669          Val Accuracy : 0.9925 
Epoch : 2          Train Loss : 0.4354          Train Accuracy : 0.9908          Val Loss : 0.5239          Val Accuracy : 0.9710 
Epoch : 3          Train Loss : 0.4270          Train Accuracy : 0.9920          Val Loss : 0.3646          Val Accuracy : 0.9915 
Epoch : 4          Train Loss : 0.4602          Train Accuracy : 0.9843          Val Loss : 0.3950          Val Accuracy : 0.9920 
Epoch : 5          Train Loss : 0.4704          Train Accuracy : 0.9930          Val Loss : 0.3961          Val Accuracy : 0.9915 
Epoch : 6          Train Loss : 0.4398          Train Accuracy : 0.9931          Val Loss : 0.3843          Val Accuracy : 0.9905 
Epoch : 7          Train Loss : 0.4074          Train Accuracy : 0.9926          Val Loss : 0.3706          Val Accuracy : 0.9910 
Epoch : 8          Train Loss : 0.4176          Train Accuracy : 0.9870          Va

## Accuracy

In [18]:
## test the accuract of one batch
model.eval()
example = next(iter(train_dl))
feature, label = example[0][:3,], example[1][:3,]
logits = model(feature)
print("logits : ", logits)

## pred
values, indices = torch.topk(logits, k = 1)
pred = torch.squeeze(indices, dim = 1)
print(f"values : {values}")
print(f"indices : {indices}")
print(f"Pred : {pred}")

## true
values, indices = torch.topk(label, k = 1)
true = torch.squeeze(indices, dim = 1)
print(f"values : {values}")
print(f"indices : {indices}")
print(f"Pred : {true}")

logits :  tensor([[ -89.2363,    0.0000],
        [-330.2580,    0.0000],
        [-336.5211,    0.0000]], grad_fn=<LogSoftmaxBackward0>)
values : tensor([[0.],
        [0.],
        [0.]], grad_fn=<TopkBackward0>)
indices : tensor([[1],
        [1],
        [1]])
Pred : tensor([1, 1, 1])
values : tensor([[1.],
        [1.],
        [1.]])
indices : tensor([[1],
        [1],
        [1]])
Pred : tensor([1, 1, 1])


## Predict

In [19]:
import re
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [21]:
def predict(tweet):
  model.eval()
  processed_tweet = process_tweet(tweet)
  pos_freq = np.sum([pos_freq_dict.get(w,0) for w in processed_tweet])
  neg_freq = np.sum([neg_freq_dict.get(w,0) for w in processed_tweet])
  row = torch.tensor([[1, pos_freq, neg_freq]], dtype = torch.float32)
  logits = model(row)
  values, indices = torch.topk(logits, k = 1)
  pred = torch.squeeze(indices, dim = 1)
  return pred.item()

In [22]:
tweet = "I love this movie"
predict(tweet)

1

In [23]:
tweet = "I hate this movie"
predict(tweet)

0