In [1]:
import os
import torch
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm as tq
from torchtext.data import TabularDataset

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
USE_STEMEER = False
SEED = 42
QUICK = True
TEST_SIZE = 0.2
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda:0'

In [3]:
# seeding function for reproducibility
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

In [4]:
# Reading the csv file and removing unnecessary columns
df = pd.read_csv("../input/sentiment140/training.1600000.processed.noemoticon.csv",
                 encoding="latin1",
                 header=None)
df = df.rename(columns={0:"sentiment",
                        1:"id",
                        2:"time",
                        3:"query",
                        4:"username",
                        5:"text"})
df = df[["sentiment","text"]]
df["sentiment"] = df["sentiment"].map({0: 0, 4: 1})
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.sentiment.value_counts()
# Looks like dataset is well balanced :)

1    800000
0    800000
Name: sentiment, dtype: int64

## Text Preprocessing

In [6]:
import re
from nltk.stem.porter import PorterStemmer

use_stemmer = USE_STEMEER
if use_stemmer:
      porter_stemmer = PorterStemmer()

def preprocess_word(word):
    # Remove punctuation
    word = word.strip('\'"?!,.():;')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    return word


def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)


def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|;\s?D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet


def preprocess_tweet(tweet):
    processed_tweet = []
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip " and ' from tweet
    tweet = tweet.strip('"\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    # Convert to lower case
    tweet = tweet.lower()
    
    words = tweet.split()
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                word = str(porter_stemmer.stem(word))
        processed_tweet.append(word)
    return ' '.join(processed_tweet)

In [7]:
# Example output
print(df.text[2])
print(preprocess_tweet(df.text[2]))

@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
user_mention i dived many times for the ball managed to save 50% the rest go out of bounds


In [8]:
%%time
df['Processed_text'] = df.text.apply(preprocess_tweet)

CPU times: user 4min 23s, sys: 421 ms, total: 4min 23s
Wall time: 4min 24s


In [9]:
df.head()

Unnamed: 0,sentiment,text,Processed_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",user_mention url aww thats a bummer you shoul...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,0,@Kenichan I dived many times for the ball. Man...,user_mention i dived many times for the ball m...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",user_mention no its not behaving at all im mad...


In [10]:
df = df[["Processed_text", "sentiment"]]
df.to_csv("train.csv", index=None)

In [11]:
import torchtext
tweet = torchtext.data.Field(lower=True) # , tokenize="spacy")
targets = torchtext.data.RawField(is_target=True)
fields = [("Processed_text",tweet ), ("sentiment",targets)]

In [12]:
%%time
dataset = TabularDataset(path="./train.csv", format="CSV", fields=fields, skip_header=True)

CPU times: user 36.6 s, sys: 961 ms, total: 37.6 s
Wall time: 37.6 s


In [13]:
tweet.build_vocab(dataset, max_size=100_000, min_freq=5, vectors="glove.6B.100d")
vocab = tweet.vocab
vocab_size = len(vocab)
print(vocab_size)

.vector_cache/glove.6B.zip: 862MB [06:29, 2.21MB/s]                           
100%|█████████▉| 398520/400000 [00:22<00:00, 18628.73it/s]

57280


In [14]:
train_dataset, valid_dataset = dataset.split(1-TEST_SIZE)

biter = torchtext.data.BucketIterator(dataset=train_dataset, 
                                      batch_size=4,
                                      sort_key=lambda x: len(x.comment_text),
                                      train=True, 
                                      sort=False,
                                      shuffle=True)

In [15]:
for i in biter:
    print(i.Processed_text.shape)
    print(len(i.sentiment))
    break

torch.Size([17, 4])
4


In [16]:
train_biter = torchtext.data.BucketIterator(dataset=train_dataset, 
                                      batch_size=200,
                                      sort_key=lambda x: len(x.comment_text),
                                      train=True, 
                                      sort=False,
                                      shuffle=True)
valid_biter = torchtext.data.BucketIterator(dataset=valid_dataset, 
                                      batch_size=200,
                                      sort_key=lambda x: len(x.comment_text),
                                      train=True, 
                                      sort=False,
                                      shuffle=True)

In [17]:
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [18]:
class TextSentimentTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, hdim=400):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=False)
        self.embedding.weight.data.copy_(vocab.vectors)
        self.pos_encoder = PositionalEncoding(embed_dim)
        encoder_layers = nn.TransformerEncoderLayer(embed_dim, 2, hdim, 0.1)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, 6)
        self.fc = nn.Linear(embed_dim*2, 100)
        self.fc2 = nn.Linear(100, num_class)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.2)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        #self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        r"""
        Arguments:
            text: 1-D tensor representing a bag of text tensors
        """
        x = self.drop(self.embedding(text)).transpose(0, 1)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = torch.cat((x[0].unsqueeze(0), x[-1].unsqueeze(0)), 0)
        x = torch.transpose(x, 0, 1)
        x = torch.reshape(x, (x.shape[0], -1))
        x = self.fc(x)
        return self.fc2(self.relu(x))

model = TextSentimentTransformer(vocab_size, embed_dim=100, num_class=2).to(device)
# for i in model.embedding.parameters():
#     i.requires_grad = False

In [19]:
for text, cls in train_biter:
    cls = torch.tensor([int(i) for i in cls]).to(device)
    text = text.T.to(device)
    output = model(text)
    print(output.shape)
    break

100%|█████████▉| 398520/400000 [00:40<00:00, 18628.73it/s]

torch.Size([200, 2])


In [20]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.5)

In [21]:
num_epochs = 10

for epoch in range(num_epochs):

    train_loss = 0.0
    valid_loss = 0.0
    valid_acc  = 0.0
    # Train the model
    model.train()
    bar = tq(train_biter, postfix={"train_loss":0.0, "Accuracy":0.0}, leave=False, disable=True)
    for text, cls in bar:
        optimizer.zero_grad()
        cls = torch.tensor([int(i) for i in cls]).to(device)
        text = text.T.to(device)
        output = model(text)
        loss = criterion(output, cls)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        acc = torch.sum(cls == torch.argmax(output, axis=1)).item() / cls.shape[0]
        bar.set_postfix(ordered_dict={"train_loss":loss.item() , "Accuracy":acc})
    
    model.eval()
    LOSS = 0.0
    i  = 0
    with torch.no_grad():
        bar = tq(valid_biter, postfix={"valid_loss":0.0, "Accuracy":0.0}, leave=False, disable=True)
        for text, cls in bar:
            cls = torch.tensor([int(i) for i in cls]).to(device)
            text = text.T.to(device)
            output = model(text)
            loss = criterion(output, cls)
            acc = torch.sum(cls == torch.argmax(output, axis=1)).item() / cls.shape[0]
            valid_loss += loss.item()
            valid_acc += acc
            bar.set_postfix(ordered_dict={"valid_loss":loss.item(), "Accuracy":acc})
    
    print(f"epoch {epoch}")
    print(f"training   loss : {train_loss/len(train_biter)}")
    print(f"validation loss : {valid_loss/len(valid_biter)}")
    print(f"validation acc  : {valid_acc/len(valid_biter)}")
    scheduler.step()

epoch 0
training   loss : 0.4583382269134745
validation loss : 0.4136390585638583
validation acc  : 0.8151781250000011
epoch 1
training   loss : 0.4087041346682236
validation loss : 0.4020123929902911
validation acc  : 0.8144468750000007
epoch 2
training   loss : 0.39286177440546455
validation loss : 0.392120366692543
validation acc  : 0.8239218750000014
epoch 3
training   loss : 0.3813425683812238
validation loss : 0.391161370575428
validation acc  : 0.8240000000000015
epoch 4
training   loss : 0.3722769321012311
validation loss : 0.3827553320955485
validation acc  : 0.827787500000003
epoch 5
training   loss : 0.35749877492198723
validation loss : 0.38739893317222596
validation acc  : 0.8267343750000026
epoch 6
training   loss : 0.35148258655564857
validation loss : 0.3850011092983186
validation acc  : 0.8263656250000028
epoch 7
training   loss : 0.3472252311650664
validation loss : 0.39349095568060877
validation acc  : 0.8282656250000007
epoch 8
training   loss : 0.34278127789264545
