In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch.nn.utils.rnn as rnn_utils
import numpy as np
import pandas as pd
import random
import string
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from torch.utils.data import random_split
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau


In [2]:
# set random seeds for reproducibility
seed = 42
torch.manual_seed(seed)
random.seed(seed)


In [3]:
# Load data
df_full = pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip', sep='\t', compression='zip')
# df_full.info()

# # shuffle the rows
# df_full = df_full.sample(frac=1, random_state=42)

# # reset the index after shuffling
# df_full = df_full.reset_index(drop=True)

In [4]:
# padding function for each movie review sentence
def create_padding(sentence):
    words = sentence.split()
    words = words[:10] if len(words) > 10 else words + ["<PAD>"] * (10 - len(words))
    words = ' '.join(words)
    return words



In [5]:
# Relevant columns only

data = df_full.drop(['SentenceId'], axis=1)
# data = data.iloc[:75000, :]

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   PhraseId   156060 non-null  int64 
 1   Phrase     156060 non-null  object
 2   Sentiment  156060 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB


In [6]:
# Remove punctuations

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

data["Phrase"] = data["Phrase"].apply(lambda text: remove_punctuation(text))

# Remove STOPWORDS

", ".join(stopwords.words('english'))

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

data["Phrase"] = data["Phrase"].apply(lambda text: remove_stopwords(text))

# Remove most common words

cnt = Counter()
for text in data["Phrase"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

data["Phrase"] = data["Phrase"].apply(lambda text: remove_freqwords(text))

# # Remove Stemming 

# stemmer = PorterStemmer()
# def stem_words(text):
#     return " ".join([stemmer.stem(word) for word in text.split()])

# data["Phrase"] = data["Phrase"].apply(lambda text: stem_words(text))

# # Lemmatisation

# nltk.download('wordnet')

# lemmatizer = WordNetLemmatizer()
# wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
# def lemmatize_words(text):
#     pos_tagged_text = nltk.pos_tag(text.split())
#     return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

# data["Phrase"] = data["Phrase"].apply(lambda text: lemmatize_words(text))

In [7]:
# apply the function to the dataframe column 'Phrase'

# data = data[data['Phrase'].apply(lambda x: len(x.split()) >= 3)]

data['Phrase'] = data['Phrase'].apply(lambda x: create_padding(x))


In [8]:
# Tokenize and pad sequences
vocab = set(" ".join(data["Phrase"]).split())
vocab.add("<PAD>")
word_to_ix = {word: i+1 for i, word in enumerate(vocab)}

In [9]:
# collate function to provide equal length of tokens in each row of the batch
def collate_fn(batch):
    # Assuming each element of batch is a sequence of tensors
    # Pad sequences to the same length
    x_batch, y_batch = zip(*batch)
    x_batch = rnn_utils.pad_sequence(x_batch, batch_first=True)
    return x_batch, y_batch

In [10]:
# Define dataset class
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.num_classes = len(set(data["Sentiment"]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data.iloc[index]["Phrase"]
        y = self.data.iloc[index]["Sentiment"]
        x_tokenized = self.tokenizer(x)
        x_tokenized_len = len(x_tokenized)
        x_tokenized_ids = [word_to_ix[word] if word in word_to_ix else word_to_ix['<PAD>'] for word in x_tokenized]
        x_tokenized_tensor = torch.tensor(x_tokenized_ids)
        return x_tokenized_tensor, y

In [11]:
# Define model architecture

class SentimentRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, embedding_dim=512, dropout_prob=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_prob = dropout_prob
        
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_prob)
        
    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        embeds = self.embedding(x)
        rnn_out, hidden = self.rnn(embeds, hidden)
        rnn_out = self.dropout(rnn_out)
        out = self.fc(rnn_out[:, -1, :])
        return out
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device),
                  weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device))
        return hidden


In [12]:
# Define model parameters
input_size = len(vocab) + 1
hidden_size = 128
output_size = 5
num_layers = 8

In [13]:
# # Rebalancing an unbalanced dataset
# # Find class weights
# class_counts = data['Sentiment'].value_counts()
# print(data['Sentiment'].values)
# print(class_counts)
# class_weights = 1 / torch.tensor(class_counts, dtype=torch.float)
# print(class_weights)
# # Compute weights for each data point
# weights = class_weights[data['Sentiment'].values]
# print(weights[4])
# # Create a sampler for weighted random sampling
# sampler = WeightedRandomSampler(weights, len(weights), replacement=False)


In [14]:
def validate(model, data_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for x_batch, y_batch in data_loader:
            x_batch = torch.stack(tuple(x_batch)).to(device)
            y_batch = torch.LongTensor(y_batch).to(device)
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            running_loss += loss.item() * x_batch.size(0)
            _, predicted = torch.max(y_pred, 1)
            correct_preds += (predicted == y_batch).sum().item()
            total_preds += x_batch.size(0)
    val_loss = running_loss / len(data_loader.dataset)
    val_accuracy = correct_preds / total_preds
    return val_loss, val_accuracy


# Train model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentRNN(input_size, hidden_size, output_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.001)
batch_size = 32
num_epochs = 250
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
dataset = SentimentDataset(data, tokenizer)

# Split dataset into train and validation sets
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print(len(train_dataset))

# Loader with weighted sampling enabled
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=None, collate_fn=collate_fn, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=None, collate_fn=collate_fn, drop_last=True)

# variables to store best loss and best model
best_val_loss = float('inf')
best_model_path = '/kaggle/working/best_model.pth'

# Load best model:
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path))

# Create a learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, factor=0.8, patience=1)

# Train loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0    
    for x_batch, y_batch in train_loader:
        x_batch = torch.stack(tuple(x_batch)).to(device)
        y_batch = torch.tensor(y_batch).to(device)
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * x_batch.size(0)
    epoch_loss = running_loss / len(dataset)
    # Validation code here
    val_loss, val_accuracy = validate(model, val_loader, criterion)
    
    # Save best model based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_path = 'best_model.pth'
        torch.save(model.state_dict(), best_model_path)
    
    # Retrieve the current learning rate
    lr = optimizer.param_groups[0]['lr']
    
    # Update learning rate
    scheduler.step(val_loss)
        
    print(f"Epoch {epoch+1}/{num_epochs}, Train_loss: {epoch_loss:.6f}, Val_loss: {val_loss:.6f}, Val_accu: {val_accuracy:.6f}, LR: {lr:.6f}")

124848
Epoch 1/250, Train_loss: 1.034209, Val_loss: 1.279444, Val_accu: 0.512981, LR: 0.010000
Epoch 2/250, Train_loss: 1.031441, Val_loss: 1.283886, Val_accu: 0.512981, LR: 0.010000
Epoch 3/250, Train_loss: 1.031824, Val_loss: 1.281852, Val_accu: 0.512981, LR: 0.010000
Epoch 4/250, Train_loss: 1.030119, Val_loss: 1.286238, Val_accu: 0.512981, LR: 0.008000
Epoch 5/250, Train_loss: 1.030688, Val_loss: 1.285304, Val_accu: 0.512981, LR: 0.008000
Epoch 6/250, Train_loss: 1.029447, Val_loss: 1.278929, Val_accu: 0.512981, LR: 0.006400
Epoch 7/250, Train_loss: 1.030248, Val_loss: 1.279174, Val_accu: 0.512981, LR: 0.006400
Epoch 8/250, Train_loss: 1.029499, Val_loss: 1.280181, Val_accu: 0.512981, LR: 0.006400
Epoch 9/250, Train_loss: 1.029176, Val_loss: 1.279068, Val_accu: 0.512981, LR: 0.005120
Epoch 10/250, Train_loss: 1.029121, Val_loss: 1.278980, Val_accu: 0.512981, LR: 0.005120
Epoch 11/250, Train_loss: 1.029017, Val_loss: 1.278928, Val_accu: 0.512981, LR: 0.004096
Epoch 12/250, Train_los

In [15]:
# Load best model
best_model = SentimentRNN(input_size, hidden_size, output_size, num_layers).to(device)
best_model.load_state_dict(torch.load(best_model_path))

# Evaluate model

best_model.eval() # Set model to evaluation mode
test_sentence = "This movie was terrible. I can not stand this move!"
test_sequence = [word_to_ix[word] if word in word_to_ix else 0 for word in test_sentence.split()[:10]]
test_sequence += [0] * (10 - len(test_sequence))
test_sequence = torch.tensor(test_sequence, dtype=torch.long).unsqueeze(0).to(device)
with torch.no_grad():
    output = model(test_sequence)
    prediction = torch.argmax(output, dim=1).item()
print(f"Test Sentence: {test_sentence}")
print(f"Prediction: {prediction}")

Test Sentence: This movie was terrible. I can not stand this move!
Prediction: 2


In [16]:
# test file and predictions
# Load data
df_test = pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip', sep='\t', compression='zip')
# Relevant columns only
data_test = df_test.drop(['SentenceId'], axis=1)

# Remove punctuations

data_test["Phrase"] = data_test["Phrase"].apply(lambda text: remove_punctuation(text))

# Remove STOPWORDS

data_test["Phrase"] = data_test["Phrase"].apply(lambda text: remove_stopwords(text))

# Remove most common words

cnt = Counter()
for text in data_test["Phrase"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

data_test["Phrase"] = data_test["Phrase"].apply(lambda text: remove_freqwords(text))

# Remove Stemming 

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

data_test["Phrase"] = data_test["Phrase"].apply(lambda text: stem_words(text))

# # Lemmatisation

# nltk.download('wordnet')

# lemmatizer = WordNetLemmatizer()
# wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
# def lemmatize_words(text):
#     pos_tagged_text = nltk.pos_tag(text.split())
#     return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

# data_test["Phrase"] = data_test["Phrase"].apply(lambda text: lemmatize_words(text))

data_test.info()
# apply the function to the dataframe column 'Phrase'
# data_test = data_test.iloc[:10000, :]
data_test['Phrase'] = data_test['Phrase'].apply(lambda x: create_padding(x))

# feed into the evaluation model
best_model.eval() # Set model to evaluation mode
y_pred_submission = []
for sentence_test in data_test['Phrase']:
    sentence_test = [word_to_ix[word] if word in word_to_ix else 0 for word in sentence_test.split()[:10]]
    sentence_test += [0] * (10 - len(sentence_test))
    sentence_test = torch.tensor(sentence_test, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(sentence_test)
        prediction = torch.argmax(output, dim=1).item()
        y_pred_submission.append(prediction)
d = {'PhraseId': data_test['PhraseId'], 'Sentiment': prediction} 
submission = pd.DataFrame(data=d)
submission

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   PhraseId  66292 non-null  int64 
 1   Phrase    66292 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2
...,...,...
66287,222348,2
66288,222349,2
66289,222350,2
66290,222351,2


In [17]:
submission.to_csv('/kaggle/working/submission.csv', index=False, header=True)