# Sentiment Analysis using RNN
### By: Dhruv Jayant Verma

#### Loading Data

In [6]:
import pandas as pd

# The CSV file carries no header row, so the column labels are supplied here
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Read the tweets, decoding the file as ISO-8859-1 to avoid errors while reading it
df = pd.read_csv(
    'twitter.csv',
    names=column_names,
    encoding='ISO-8859-1',
)

# Preview the first few rows
print(df.head())

   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


#### Exploring Data

In [7]:
# Count how many tweets carry each value of the target label
target_counts = df['target'].value_counts()
print(target_counts)

target
0    800000
4    800000
Name: count, dtype: int64


### Data Preprocessing
#### Mapping Labels
I'll map the labels from `[0, 4]` to `[0, 1]`.

In [8]:
# Map target labels to 0 and 1 (0 stays 0, 4 becomes 1).
# `replace` is used instead of `map` so the cell can be re-run safely:
# with `map({0: 0, 4: 1})`, a second run would turn every already-mapped 1 into NaN.
df['target'] = df['target'].replace({4: 1})

# Surface any unexpected label value immediately instead of carrying it downstream
assert df['target'].isin([0, 1]).all(), "unexpected values in 'target' after mapping"

## Using a Sample of Original Data
Since the original dataset is too large to process quickly, I'll use a `10%` random sample of it for faster processing.

In [9]:
# Keep a reproducible 10% random subset of the rows for faster processing,
# renumbering the index from 0 afterwards
df = (
    df
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

### Data Cleaning
I'll preprocess the text data to remove noise and prepare it for tokenization.

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used below for tokenisation and stop-word removal
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Shared helpers used by preprocess_text
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and #hashtags,
    drop every character that is not a letter or whitespace, tokenise,
    remove English stop words, and Porter-stem the remaining tokens.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned tweet as a single string; it may be empty if no token
        survives the filtering.
    """
    # Lowercase
    text = text.lower()
    # Remove URLs. The dot after "www" is escaped so only a literal "www."
    # prefix counts as a URL; unescaped, the pattern also swallowed ordinary
    # words that merely contain "www" (e.g. "awwwww").
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove special characters, numbers, punctuations
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and stem the tokens
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)

# Clean every tweet and store the result in a new column next to the raw text
df['clean_text'] = df['text'].map(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhruv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [11]:
# Preview the first five rows, including the new clean_text column
df.head(n=5)

Unnamed: 0,target,ids,date,flag,user,text,clean_text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh hope ok
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo...",cool tweet app razr
2,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...,know famili drama lamehey next time u hang kim...
3,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...,school email wont open geographi stuff revis s...
4,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem,upper airway problem


## Tokenization and Vocabulary Building
### Creating the Vocabulary
Here, I'll create a vocabulary dictionary by extracting all the unique words from the cleaned text.

In [12]:
from collections import Counter

# Flatten all cleaned tweets into one long list of tokens
all_words = ' '.join(df['clean_text']).split()

# Tally how often each token occurs
word_counts = Counter(all_words)

# Number the distinct tokens from 1 upward, in the order they were first seen
vocab = {word: idx for idx, word in enumerate(word_counts, start=1)}

# Index 0 is never assigned to a word (this notebook uses 0 both as the padding
# value and as the fallback for words missing from the vocabulary), so the
# index space needs one slot more than the number of known words
vocab_size = 1 + len(vocab)

## Encoding and Padding
I'll encode each tweet as a sequence of integers and pad them to a fixed length.

In [14]:
import torch
from torch.nn.utils.rnn import pad_sequence

def encode_tweet(text):
    """Translate a whitespace-separated tweet into a list of vocabulary indices.

    Tokens that are missing from ``vocab`` are encoded as 0.
    """
    indices = []
    for token in text.split():
        indices.append(vocab.get(token, 0))
    return indices

# Integer-encode every cleaned tweet
df['encoded_text'] = df['clean_text'].map(encode_tweet)

# Fixed sequence length used for padding/truncation
# (a constant here; it could instead be computed from the data)
max_len = 50

def pad_sequence_custom(sequence, length=None, pad_value=0):
    """Right-pad or truncate a sequence of token ids to a fixed length.

    Args:
        sequence: iterable of token ids.
        length: target length; when omitted, the notebook-level ``max_len``
            is used, which keeps existing single-argument calls unchanged.
        pad_value: value appended to sequences that are too short (default 0).

    Returns:
        A new list with exactly ``length`` items: the input followed by
        ``pad_value`` entries if it was too short, or its first ``length``
        items if it was too long.

    Raises:
        ValueError: if ``length`` is negative.
    """
    if length is None:
        # Resolved at call time so the notebook-wide setting is picked up
        length = max_len
    if length < 0:
        raise ValueError(f'length must be non-negative, got {length}')

    sequence = list(sequence)
    shortfall = length - len(sequence)
    if shortfall > 0:
        return sequence + [pad_value] * shortfall
    return sequence[:length]

# Bring every encoded tweet to the same fixed length
df['padded_text'] = df['encoded_text'].map(pad_sequence_custom)

In [15]:
# Preview the first five rows with the encoded and padded columns
df.head(n=5)

Unnamed: 0,target,ids,date,flag,user,text,clean_text,encoded_text,padded_text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh hope ok,"[1, 2, 3]","[1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo...",cool tweet app razr,"[4, 5, 6, 7]","[4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...,know famili drama lamehey next time u hang kim...,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 14, 18,...","[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 14, 18,..."
3,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...,school email wont open geographi stuff revis s...,"[24, 25, 26, 27, 28, 29, 30, 31, 24]","[24, 25, 26, 27, 28, 29, 30, 31, 24, 0, 0, 0, ..."
4,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem,upper airway problem,"[32, 33, 34]","[32, 33, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Creating PyTorch Datasets and DataLoaders
### Creating the Dataset Class

In [16]:
from torch.utils.data import Dataset, DataLoader

class Sentiment(Dataset):
    """Dataset pairing encoded tweets with their sentiment labels.

    Each item is a ``(text, label)`` tuple of tensors: the token ids as
    ``torch.long`` and the label as ``torch.float``.
    """

    def __init__(self, texts, labels):
        # Indexable collections of equal length: token-id sequences and labels
        self.texts, self.labels = texts, labels

    def __len__(self):
        # The number of samples is the number of stored texts
        return len(self.texts)

    def __getitem__(self, idx):
        # Tensors are built on access; the stored data is left untouched
        return (
            torch.tensor(self.texts[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float),
        )

## Splitting the Data
Split the dataset into training and validation sets.

In [17]:
from sklearn.model_selection import train_test_split

# Pull the padded sequences (features) and the 0/1 targets (labels) out of the frame
X, y = df['padded_text'].tolist(), df['target'].tolist()

# Hold out 10% for validation; stratifying on y keeps the class ratio in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Wrap each split in the Dataset class
train_dataset = Sentiment(texts=X_train, labels=y_train)
val_dataset = Sentiment(texts=X_val, labels=y_val)

## Creating DataLoaders

In [18]:
batch_size = 16

# Training batches are reshuffled every epoch; validation keeps a fixed order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

## Making the Neural Network Architecture
I'll use an RNN architecture: an Embedding layer followed by a (bidirectional) LSTM layer and a final linear layer.

In [19]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """Embedding -> (bi)directional LSTM -> linear layer, one output row per sequence.

    Args:
        vocab_size: number of rows in the embedding table; index 0 is the
            padding index.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of outputs of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied to the embeddings and, when
            ``n_layers > 1``, between the stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers=2, bidirectional=True, dropout=0.5):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer a non-zero value has no effect and triggers a UserWarning, so
        # it is zeroed in that case.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional, batch_first=True,
                            dropout=lstm_dropout)

        # A bidirectional LSTM yields one final hidden state per direction
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) raw outputs.

        No sigmoid is applied here. The sequences are not packed, so the
        final hidden states are computed over the full sequence including
        any padding positions.
        """
        embedded = self.dropout(self.embedding(x))

        lstm_out, (hidden, cell) = self.lstm(embedded)

        # Concatenate the final forward and backward hidden states of the top layer
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        return self.fc(hidden)

## Model Hyperparameters

In [20]:
# Seed torch's global RNG so that weight initialisation (and anything else
# drawing from that RNG afterwards) is repeatable between runs
torch.manual_seed(42)

# vocab_size is taken as computed in the vocabulary-building cell
embedding_dim = 100
hidden_dim = 128
output_dim = 1  # Binary classification
n_layers = 2
bidirectional = True
dropout = 0.3

model = SentimentRNN(vocab_size, embedding_dim, hidden_dim, output_dim,
                     n_layers, bidirectional, dropout)

## Moving the Model to the GPU for Computing

In [21]:
# Prefer the GPU when one is visible to torch, otherwise stay on the CPU
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
print('Using CUDA' if cuda_available else 'CUDA not available')

# Move the model's parameters to the chosen device
model = model.to(device)

Using CUDA


## Model Training
### Defining Loss Function and Optimizer

In [22]:
# Binary cross-entropy on raw model outputs (the sigmoid is applied inside the loss)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Adam over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

## Training the Model

In [23]:
epochs = 2  # Increase the number of epochs as needed

for epoch in range(epochs):
    # ---- Training pass: dropout active, parameters updated after every batch ----
    model.train()
    train_losses = []

    for batch_texts, batch_labels in train_loader:
        batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        # The model returns one column per sample; drop it to match the label shape
        logits = model(batch_texts).squeeze(1)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        train_losses.append(batch_loss.item())

    # ---- Validation pass: eval mode, no gradient tracking ----
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch_texts, batch_labels in val_loader:
            batch_texts, batch_labels = batch_texts.to(device), batch_labels.to(device)
            logits = model(batch_texts).squeeze(1)
            val_losses.append(criterion(logits, batch_labels).item())

    # Report the mean per-batch loss for this epoch
    print(f'Epoch {epoch+1}')
    print(f'Training Loss: {sum(train_losses) / len(train_loader):.3f}')
    print(f'Validation Loss: {sum(val_losses) / len(val_loader):.3f}')

Epoch 1
Training Loss: 0.545
Validation Loss: 0.482
Epoch 2
Training Loss: 0.478
Validation Loss: 0.469


## Evaluating the Model
### Calculating Metrics
I'll calculate accuracy, precision, recall, and F1-score.

In [24]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Accumulators for the predictions and true labels over the whole validation set
all_preds, all_labels = [], []

model.eval()
with torch.no_grad():
    for batch_texts, batch_labels in val_loader:
        logits = model(batch_texts.to(device)).squeeze(1)
        # Sigmoid maps the raw outputs to (0, 1); rounding turns them into 0/1 labels
        hard_preds = torch.sigmoid(logits).round()
        all_preds.extend(hard_preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# Metrics for the positive class (label 1)
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels,
    all_preds,
    average='binary',
)

print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')

Accuracy: 0.776
Precision: 0.762
Recall: 0.805
F1 Score: 0.782


## Making Predictions
Define a function to preprocess and predict the sentiment of new tweets.

In [25]:
def predict_sentiment(model, text):
    """Score a raw tweet with the given model.

    The text goes through the same cleaning, encoding and padding helpers as
    the training data, and the model's output is passed through a sigmoid.

    Args:
        model: the trained network; it is switched to eval mode.
        text: the raw tweet as a string.

    Returns:
        A float between 0 and 1; the notebook treats scores >= 0.5 as Positive.
    """
    model.eval()
    cleaned = preprocess_text(text)
    # Reuse encode_tweet so inference encodes text exactly like training did
    padded = pad_sequence_custom(encode_tweet(cleaned))

    # Run on the device the model actually lives on; fall back to the
    # notebook-level `device` only if the model has no parameters
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    # unsqueeze(0) adds the batch dimension expected by the model
    batch = torch.tensor(padded, dtype=torch.long).unsqueeze(0).to(target_device)
    with torch.no_grad():
        probability = torch.sigmoid(model(batch))
    return probability.item()

# Example usage: score one hand-written tweet and label it with a 0.5 threshold
new_tweet = "I absolutely love the new design of your website!"
score = predict_sentiment(model, new_tweet)
if score >= 0.5:
    sentiment = 'Positive'
else:
    sentiment = 'Negative'
print(f'Sentiment: {sentiment} (Score: {score:.3f})')

Sentiment: Positive (Score: 0.981)


## Saving the Model for future use and training

In [26]:
import joblib

# Persist the trained model object to disk
# (joblib serialises the whole object, not only its weights)
model_path = 'sentiment_analysis.joblib'
joblib.dump(model, model_path)

# Read the file back into a separate variable
loaded_model = joblib.load(model_path)

In [28]:
# Score a sentence with the model restored from disk, so the save/load round
# trip is actually exercised (previously the in-memory `model` was used here
# and `loaded_model` was never called)
example = "She is a bad person. I really hate her!"
score = predict_sentiment(loaded_model, example)
sentiment = 'Positive' if score >= 0.5 else 'Negative'
print(f'Sentiment: {sentiment} (Score: {score:.3f})')

Sentiment: Negative (Score: 0.018)
