In [1]:
import os
import torch
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm as tq
from torchtext.data import TabularDataset

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
USE_STEMEER = False
SEED = 42
QUICK = True
TEST_SIZE = 0.2

In [3]:
# seeding function for reproducibility
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

In [4]:
# Reading the csv file and removing unnecessary columns
df = pd.read_csv("../input/sentiment140/training.1600000.processed.noemoticon.csv",
                 encoding="latin1",
                 header=None)
df = df.rename(columns={0:"sentiment",
                        1:"id",
                        2:"time",
                        3:"query",
                        4:"username",
                        5:"text"})
df = df[["sentiment","text"]]
df["sentiment"] = df["sentiment"].map({0: 0, 4: 1})
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.sentiment.value_counts()
# Looks like dataset is well balanced :)

1    800000
0    800000
Name: sentiment, dtype: int64

## Text Preprocessing

In [6]:
import re
from nltk.stem.porter import PorterStemmer

use_stemmer = USE_STEMEER
if use_stemmer:
      porter_stemmer = PorterStemmer()

def preprocess_word(word):
    # Remove punctuation
    word = word.strip('\'"?!,.():;')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    return word


def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)


def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|;\s?D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet


def preprocess_tweet(tweet):
    processed_tweet = []
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip('"\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    # Convert to lower case
    tweet = tweet.lower()
    
    words = tweet.split()
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                word = str(porter_stemmer.stem(word))
        processed_tweet.append(word)
    return ' '.join(processed_tweet)

In [7]:
# Example output
print(df.text[0])
print(preprocess_tweet(df.text[0]))

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
user_mention url  aww thats a bummer you shoulda got david carr of third day to do it emo_pos


In [8]:
%%time
df['Processed_text'] = df.text.apply(preprocess_tweet)

CPU times: user 5min 4s, sys: 493 ms, total: 5min 5s
Wall time: 5min 4s


In [9]:
df.head()

Unnamed: 0,sentiment,text,Processed_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",user_mention url aww thats a bummer you shoul...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,@Kenichan I dived many times for the ball. Man...,user_mention i dived many times for the ball m...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",user_mention no its not behaving at all im mad...


In [10]:
df = df[["Processed_text", "sentiment"]]
df.to_csv("train.csv", index=None)

In [11]:
import torchtext
tweet = torchtext.data.Field(lower=True) # , tokenize="spacy")
targets = torchtext.data.RawField(is_target=True)
fields = [("Processed_text",tweet ), ("sentiment",targets)]

In [12]:
%%time
dataset = TabularDataset(path="./train.csv", format="CSV", fields=fields, skip_header=True)

CPU times: user 42.4 s, sys: 1.34 s, total: 43.7 s
Wall time: 43.6 s


In [13]:
tweet.build_vocab(dataset, max_size=100_000, min_freq=5, vectors="glove.6B.100d")

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                           
100%|█████████▉| 399470/400000 [00:26<00:00, 15211.95it/s]

In [14]:
vocab = tweet.vocab
vocab_size = len(vocab)
print(vocab_size)

57280


In [15]:
train_dataset, valid_dataset = dataset.split(1-TEST_SIZE)

In [16]:
biter = torchtext.data.BucketIterator(dataset=train_dataset, 
                                      batch_size=4,
                                      sort_key=lambda x: len(x.comment_text),
                                      train=True, 
                                      sort=False,
                                      shuffle=True)
for i in biter:
    print(i.Processed_text.shape)
    print(len(i.sentiment))
    break

torch.Size([17, 4])
4


In [17]:
train_biter = torchtext.data.BucketIterator(dataset=train_dataset, 
                                      batch_size=100,
                                      sort_key=lambda x: len(x.comment_text),
                                      train=True, 
                                      sort=False,
                                      shuffle=True)
valid_biter = torchtext.data.BucketIterator(dataset=valid_dataset, 
                                      batch_size=100,
                                      sort_key=lambda x: len(x.comment_text),
                                      train=True, 
                                      sort=False,
                                      shuffle=True)

In [18]:
MAX = -1
for text, cls in valid_biter:
    MAX = max(MAX, torch.max(text).item())
for text, cls in train_biter:
    MAX = max(MAX, torch.max(text).item())
print(MAX)

100%|█████████▉| 399470/400000 [00:40<00:00, 15211.95it/s]

57279


## Bag of embedding model

In [19]:
import torch.nn as nn

class TextSentiment(nn.Module):
    """
    from torchtext examples
    """
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.embedding.weight.data.copy_(vocab.vectors)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        r"""
        Arguments:
            text: 1-D tensor representing a bag of text tensors
        """
        x = self.embedding(text)
        return self.fc(x)

model = TextSentiment(vocab_size, embed_dim=100, num_class=2)

In [20]:
model.embedding.require_grad = False
#model.embedding.require_grad_(False)

In [21]:
for text, cls in train_biter:
    cls = torch.tensor([int(i) for i in cls])
    text = text.T
    output = model(text)
    print(output.shape)
    break

torch.Size([100, 2])


In [22]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.5)

In [23]:
num_epochs = 10

for epoch in range(num_epochs):

    train_loss = 0.0
    valid_loss = 0.0
    valid_acc  = 0.0
    # Train the model
    model.train()
    bar = tq(train_biter, postfix={"train_loss":0.0, "Accuracy":0.0}, leave=False, disable=True)
    for text, cls in bar:
        optimizer.zero_grad()
        cls = torch.tensor([int(i) for i in cls])
        text = text.T
        output = model(text)
        loss = criterion(output, cls)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        acc = torch.sum(cls == torch.argmax(output, axis=1)).item() / cls.shape[0]
        bar.set_postfix(ordered_dict={"train_loss":loss.item() , "Accuracy":acc})
    
    model.eval()
    LOSS = 0.0
    i  = 0
    with torch.no_grad():
        bar = tq(valid_biter, postfix={"valid_loss":0.0, "Accuracy":0.0}, leave=False, disable=True)
        for text, cls in bar:
            cls = torch.tensor([int(i) for i in cls])
            text = text.T
            output = model(text)
            loss = criterion(output, cls)
            acc = torch.sum(cls == torch.argmax(output, axis=1)).item() / cls.shape[0]
            valid_loss += loss.item()
            valid_acc += acc
            bar.set_postfix(ordered_dict={"valid_loss":loss.item(), "Accuracy":acc})
    
    print(f"epoch {epoch}")
    print(f"training   loss : {train_loss/len(train_biter)}")
    print(f"validation loss : {valid_loss/len(valid_biter)}")
    print(f"validation acc  : {valid_acc/len(valid_biter)}")
    scheduler.step()

epoch 0
training   loss : 0.6167557886173017
validation loss : 0.6130311371944844
validation acc  : 0.6494156250000094
epoch 1
training   loss : 0.5565248841885477
validation loss : 0.5341699340287596
validation acc  : 0.7375093750000036
epoch 2
training   loss : 0.5360430646222085
validation loss : 0.5567478049639613
validation acc  : 0.7033531250000131
epoch 3
training   loss : 0.5254401282477192
validation loss : 0.5123430011235177
validation acc  : 0.7516062499999984
epoch 4
training   loss : 0.5165074256272055
validation loss : 0.5066159102600067
validation acc  : 0.7601374999999981
epoch 5
training   loss : 0.5000989233423024
validation loss : 0.49557027911767365
validation acc  : 0.7715031249999954
epoch 6
training   loss : 0.4972173641738482
validation loss : 0.49462904264219104
validation acc  : 0.7684999999999959
epoch 7
training   loss : 0.4952928430680186
validation loss : 0.4932508804276586
validation acc  : 0.7686593749999973
epoch 8
training   loss : 0.4932814750424586
v

## LSTM

In [24]:
import torch.nn as nn

class TextSentimentLSTM(nn.Module):
    """
    from torchtext examples
    """
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.embedding.weight.data.copy_(vocab.vectors)
        self.LSTM = nn.LSTM(embed_dim, 40, 1, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(40*2, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        r"""
        Arguments:
            text: 1-D tensor representing a bag of text tensors
        """
        x = self.embedding(text)
        x = self.LSTM(x)
        h = torch.transpose(x[1][0], 0, 1)
        h = torch.reshape(h, (h.shape[0], -1))
        return self.fc(h)

model = TextSentimentLSTM(vocab_size, embed_dim=100, num_class=2)

In [25]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.5)

In [26]:
num_epochs = 10

for epoch in range(num_epochs):

    train_loss = 0.0
    valid_loss = 0.0
    valid_acc  = 0.0
    # Train the model
    model.train()
    bar = tq(train_biter, postfix={"train_loss":0.0, "Accuracy":0.0}, leave=False, disable=True)
    for text, cls in bar:
        optimizer.zero_grad()
        cls = torch.tensor([int(i) for i in cls])
        text = text.T
        output = model(text)
        loss = criterion(output, cls)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        acc = torch.sum(cls == torch.argmax(output, axis=1)).item() / cls.shape[0]
        bar.set_postfix(ordered_dict={"train_loss":loss.item() , "Accuracy":acc})
    
    model.eval()
    LOSS = 0.0
    i  = 0
    with torch.no_grad():
        bar = tq(valid_biter, postfix={"valid_loss":0.0, "Accuracy":0.0}, leave=False, disable=True)
        for text, cls in bar:
            cls = torch.tensor([int(i) for i in cls])
            text = text.T
            output = model(text)
            loss = criterion(output, cls)
            acc = torch.sum(cls == torch.argmax(output, axis=1)).item() / cls.shape[0]
            valid_loss += loss.item()
            valid_acc += acc
            bar.set_postfix(ordered_dict={"valid_loss":loss.item(), "Accuracy":acc})
    
    print(f"epoch {epoch}")
    print(f"training   loss : {train_loss/len(train_biter)}")
    print(f"validation loss : {valid_loss/len(valid_biter)}")
    print(f"validation acc  : {valid_acc/len(valid_biter)}")
    scheduler.step()

epoch 0
training   loss : 0.5463770840130746
validation loss : 0.4868140486255288
validation acc  : 0.7636468749999956
epoch 1
training   loss : 0.4715270173503086
validation loss : 0.4559296695981175
validation acc  : 0.7832468749999961
epoch 2
training   loss : 0.44864414104493333
validation loss : 0.4402766204159707
validation acc  : 0.7933031249999928
epoch 3
training   loss : 0.43469986379845066
validation loss : 0.43563658468425276
validation acc  : 0.7951468749999928
epoch 4
training   loss : 0.4249087890703231
validation loss : 0.4235063418187201
validation acc  : 0.8024656249999904
epoch 5
training   loss : 0.4165096571436152
validation loss : 0.41776211448013784
validation acc  : 0.8061999999999919
epoch 6
training   loss : 0.4129209479445126
validation loss : 0.4174101486802101
validation acc  : 0.8067968749999924
epoch 7
training   loss : 0.4097412711556535
validation loss : 0.41239662929903714
validation acc  : 0.8096499999999888
epoch 8
training   loss : 0.406795225145760