In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import string
import re

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('archive/training.1600000.processed.noemoticon.csv', encoding = "ISO-8859-1", engine="python")
data.columns = ["label", "time", "date", "query", "username", "text"]
data.head()

Unnamed: 0,label,time,date,query,username,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [3]:
data.shape

(1599999, 6)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   label     1599999 non-null  int64 
 1   time      1599999 non-null  int64 
 2   date      1599999 non-null  object
 3   query     1599999 non-null  object
 4   username  1599999 non-null  object
 5   text      1599999 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [5]:
data.dtypes

label        int64
time         int64
date        object
query       object
username    object
text        object
dtype: object

In [6]:
# checking if there is any null value
np.sum(data.isnull().any(axis=1))

0

In [7]:
# checking how many unique labels
data['label'].unique()

array([0, 4])

In [8]:
# cheking how many unique users
len(data['username'].unique())
# there are users tweeted multiple times 

659775

# Data Preparation

In [9]:
# Work with only the text and labels
data = data[['text', 'label']]

In [10]:
# Convert label 4 as probability 1
data['label'][data['label'] == 4] = 1

In [11]:
# For now work with small amount of data
positive = data[data['label'] == 1]
negative = data[data['label'] == 0]
data = pd.concat([negative[:4000], positive[:4000]])
data.reset_index(inplace=True)

In [12]:
# making text lower case
data['text'] = data['text'].str.lower()

In [13]:
# remove stop words from tweets
def remove_stopwords(text):
    cleaned_text = []
    for word in text.split():
        if word not in nltk.corpus.stopwords.words('english'):
            cleaned_text.append(word)
    return " ".join(cleaned_text)

data['text'] = data['text'].apply(remove_stopwords)

In [14]:
data

Unnamed: 0,index,text,label
0,0,upset can't update facebook texting it... migh...,0
1,1,@kenichan dived many times ball. managed save ...,0
2,2,whole body feels itchy like fire,0
3,3,"@nationwideclass no, behaving all. i'm mad. he...",0
4,4,@kwesidei whole crew,0
...,...,...,...
7995,803994,lipton tea -so delicious calming (even teine)....,1
7996,803995,@bryndrescher lol history belongs last person ...,1
7997,803996,cooked pasta sister. birthday tom,1
7998,803997,today's gonna gooooood,1


In [15]:
# remove emails and urls
def remove_emails(text):
    return re.sub(r'[\w\.\+\-]+\@[\w\.]+\.[a-z]{2,3}', '' , text)

def remove_urls(text):
    return re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', '', text)

data['text'] = data['text'].apply(remove_emails)
data['text'] = data['text'].apply(remove_urls)

In [16]:
data.at[49, 'text']

"broadband plan 'a massive broken promise'  via ~tautao still waiting broadband"

In [17]:
# removing punctuations
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))
data['text'] = data['text'].apply(remove_punctuations)

In [18]:
# removing numbers
def remove_digits(text):
    return text.translate(str.maketrans('', '', string.digits))
data['text'] = data['text'].apply(remove_digits)

In [19]:
# tokenize tweet text
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].apply(tokenizer.tokenize)

In [20]:
data

Unnamed: 0,index,text,label
0,0,"[upset, cant, update, facebook, texting, it, m...",0
1,1,"[kenichan, dived, many, times, ball, managed, ...",0
2,2,"[whole, body, feels, itchy, like, fire]",0
3,3,"[nationwideclass, no, behaving, all, im, mad, ...",0
4,4,"[kwesidei, whole, crew]",0
...,...,...,...
7995,803994,"[lipton, tea, so, delicious, calming, even, te...",1
7996,803995,"[bryndrescher, lol, history, belongs, last, pe...",1
7997,803996,"[cooked, pasta, sister, birthday, tom]",1
7998,803997,"[todays, gonna, gooooood]",1


In [21]:
# Applying Stemming
# Lemmatization is better than stemming

# stemmer = nltk.PorterStemmer()
# def replace_by_stem_words(text):
#     text = [stemmer.stem(word) for word in text]
#     return text

# data['text'] = data['text'].apply(replace_by_stem_words)

In [22]:
# Applying Lemmatizer
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data

data['text'] = data['text'].apply(lemmatizer_on_text)

In [23]:
# adding <bos> and <eos> tokens
def add_start_and_end_tokens(tokens):
    tokens.insert(0, '<bos>')
    tokens.append('<eos>')
    return tokens
data['text'] = data['text'].apply(add_start_and_end_tokens)

In [24]:
# finding out number of unique words
all_words = []
for tweet in data['text']:
    for word in tweet:
        all_words.append(word)
print("total number of lemmatize words in all tweets", len(all_words))
unique_words = list(set(all_words))
print("total number of unique lemmatize words in all tweets", len(unique_words))

word_counts = [all_words.count(word) for word in unique_words]
unique_words = [word for _, word in sorted(zip(word_counts, unique_words), key=lambda pair: pair[0], reverse=True)]
unique_words = unique_words[:1000]

total number of lemmatize words in all tweets 78446
total number of unique lemmatize words in all tweets 15182


In [25]:
# indexing words
word_to_ix = {word:i for i,word in enumerate(unique_words)}
ix_to_word = {i:word for i,word in enumerate(unique_words)}

# LSTM

In [26]:
# creating train and test set
X = data.text
y = data.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = list(X_train)
X_test = list(X_test)
y_train = list(y_train)
y_test = list(y_test)

In [27]:
# word to vec
class FormattedDataset(Dataset):
    def __init__(self, X, y, unique_words, max_len):
        super().__init__()
        self.X = X
        self.y = y
        self.unique_words = unique_words
        self.word_to_ix = {word:i for i,word in enumerate(unique_words)}
        self.max_len = max_len
        
    def __getitem__(self, index):
        selected_tweet = self.X[index]
        tweet_length = len(selected_tweet)
        tweet_vec = torch.zeros((len(self.unique_words), self.max_len))
        for i in range(tweet_length):
            if self.word_to_ix.get(selected_tweet[i], 0):
                tweet_vec[self.word_to_ix[selected_tweet[i]]][self.max_len - tweet_length + i] = 1
        return {'text':tweet_vec.T, 'label':self.y[index]}
        
    def __len__(self):
        return len(self.y)


train_data = FormattedDataset(X_train, y_train, unique_words, max([len(x) for x in X]))
test_data = FormattedDataset(X_test, y_test, unique_words, max([len(x) for x in X]))

train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=32, shuffle=True)

In [28]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.is_available())
device = "cpu"

True


In [35]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_sizes, num_layers, num_classes):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_sizes = hidden_sizes
        self.rnn = nn.GRU(input_size, hidden_sizes[0], num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc2 = nn.Linear(hidden_sizes[1], num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_sizes[0]).to(device)
 
        out, _ = self.rnn(x, h0)
        # out: batch_size, seq_length, hidden_size 
        out = out[:, -1, :]
        out = F.relu(self.fc(out))
        out = F.dropout(out, p=0.2)
        out = self.fc2(out)
        return out

In [38]:
input_size = len(unique_words)
hidden_size = [512, 128]
num_layers = 2
num_classes = 2
learning_rate = 0.001

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [39]:
num_epochs = 2
n_total_steps = len(train_loader)

# training
for epoch in range(num_epochs):
    for i, example in enumerate(train_loader):
        tweets = example['text'].to(device)
        labels = example['label'].to(device)
        
        # forward pass
        outputs = model(tweets)
        loss = criterion(outputs, labels)
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1)%20 == 0:
            print(f'epoch {epoch+1} / {num_epochs}, step {i+1}/{n_total_steps}, loss = {loss.item():.4f}')
            
    # testing
    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        for example in test_loader:
            tweets = example['text'].to(device)
            labels = example['label'].to(device)
            outputs = model(tweets)

            # value, index
            _, predictions = torch.max(outputs, 1)
            n_samples += labels.shape[0]
            n_correct += (predictions == labels).sum().item()

        accuracy = 100.0 * (n_correct/ n_samples)
        print(f'accuracy = {accuracy}')

epoch 1 / 2, step 20/200, loss = 0.6936
epoch 1 / 2, step 40/200, loss = 0.6930
epoch 1 / 2, step 60/200, loss = 0.6847
epoch 1 / 2, step 80/200, loss = 0.6890
epoch 1 / 2, step 100/200, loss = 0.6848
epoch 1 / 2, step 120/200, loss = 0.6036
epoch 1 / 2, step 140/200, loss = 0.6023
epoch 1 / 2, step 160/200, loss = 0.6412
epoch 1 / 2, step 180/200, loss = 0.5669
epoch 1 / 2, step 200/200, loss = 0.6102
accuracy = 65.5
epoch 2 / 2, step 20/200, loss = 0.5022
epoch 2 / 2, step 40/200, loss = 0.4131
epoch 2 / 2, step 60/200, loss = 0.5875
epoch 2 / 2, step 80/200, loss = 0.5911
epoch 2 / 2, step 100/200, loss = 0.4226
epoch 2 / 2, step 120/200, loss = 0.5808
epoch 2 / 2, step 140/200, loss = 0.4618
epoch 2 / 2, step 160/200, loss = 0.5281
epoch 2 / 2, step 180/200, loss = 0.6574
epoch 2 / 2, step 200/200, loss = 0.5082
accuracy = 69.625
