In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils.rnn as rnn_utils
import numpy as np
import pandas as pd

In [2]:
# Load data
df_full = pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip', sep='\t', compression='zip')
# df_full.info()

In [3]:
# Tokenize and pad sequences
vocab = set(" ".join(df_full["Phrase"]).split())
vocab.add("<PAD>")
word_to_ix = {word: i+1 for i, word in enumerate(vocab)}

In [4]:
# padding function for each movie review sentence
def create_padding(sentence):
    words = sentence.split()
    words = words[:10] if len(words) > 10 else words + ["<PAD>"] * (10 - len(words))
    words = ' '.join(words)
    return words



In [5]:
# Relevant columns only
data = df_full.drop(['SentenceId'], axis=1)
data.info()
# apply the function to the dataframe column 'Phrase'
# data = data[data['Phrase'].apply(lambda x: len(x.split()) >= 3)]

data['Phrase'] = data['Phrase'].apply(lambda x: create_padding(x))
data.info()
# data = data.iloc[:10000, :]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   PhraseId   156060 non-null  int64 
 1   Phrase     156060 non-null  object
 2   Sentiment  156060 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   PhraseId   156060 non-null  int64 
 1   Phrase     156060 non-null  object
 2   Sentiment  156060 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   PhraseId   156060 non-null  int64 
 1   Phrase     156060 non-null  object
 2   Senti

In [6]:
# collate function to provide equal length of tokens in each row of the batch
def collate_fn(batch):
    # Assuming each element of batch is a sequence of tensors
    # Pad sequences to the same length
    x_batch, y_batch = zip(*batch)
    x_batch = rnn_utils.pad_sequence(x_batch, batch_first=True)
    return x_batch, y_batch

In [7]:
# Define dataset class
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.num_classes = len(set(data["Sentiment"]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data.iloc[index]["Phrase"]
        y = self.data.iloc[index]["Sentiment"]
        x_tokenized = self.tokenizer(x)
#         print(type(x_tokenized))
        x_tokenized_len = len(x_tokenized)
        x_tokenized_ids = [word_to_ix[word] if word in word_to_ix else word_to_ix['<PAD>'] for word in x_tokenized]
        x_tokenized_tensor = torch.tensor(x_tokenized_ids)
#         print(f'x_tokenized: {x_tokenized_tensor} and y: {y}')
        return x_tokenized_tensor, y

In [8]:
# Define model architecture
class SentimentRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(device)
        c0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(device)
        out, _ = self.rnn(x, (h0, c0))
        out = out[:, -1, :]
        out = self.fc(out)
        return out

In [9]:
# Define model parameters
input_size = len(vocab) + 1
hidden_size = 128
output_size = 5
num_layers = 5

In [10]:
# Train model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentimentRNN(input_size, hidden_size, output_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
batch_size = 32
num_epochs = 200
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
dataset = SentimentDataset(data, tokenizer)
print(type(dataset))
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
print(type(loader))

# variables to store best loss and best model
best_val_loss = float('inf')
best_model_path = None

# Train loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0    
    for x_batch, y_batch in loader:
#         print('epoch: '+ str(epoch))
#         print(f"x_batch is:  {x_batch}")
#         print(f"y_batch is:  {y_batch}")
        x_batch = torch.stack(tuple(x_batch)).to(device)
        y_batch = torch.tensor(y_batch).to(device)
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * x_batch.size(0)
    epoch_loss = running_loss / len(dataset)
#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

<class '__main__.SentimentDataset'>
<class 'torch.utils.data.dataloader.DataLoader'>
Epoch 1/200, Loss: 1.0780
Epoch 2/200, Loss: 0.8509
Epoch 3/200, Loss: 0.7574
Epoch 4/200, Loss: 0.6961
Epoch 5/200, Loss: 0.6467
Epoch 6/200, Loss: 0.6047
Epoch 7/200, Loss: 0.5610
Epoch 8/200, Loss: 0.5199
Epoch 9/200, Loss: 0.4801
Epoch 10/200, Loss: 0.4429
Epoch 11/200, Loss: 0.4074
Epoch 12/200, Loss: 0.3747
Epoch 13/200, Loss: 0.3478
Epoch 14/200, Loss: 0.3225
Epoch 15/200, Loss: 0.3032
Epoch 16/200, Loss: 0.2832
Epoch 17/200, Loss: 0.2679
Epoch 18/200, Loss: 0.2536
Epoch 19/200, Loss: 0.2432
Epoch 20/200, Loss: 0.2339
Epoch 21/200, Loss: 0.2250
Epoch 22/200, Loss: 0.2171
Epoch 23/200, Loss: 0.2122
Epoch 24/200, Loss: 0.2047
Epoch 25/200, Loss: 0.2018
Epoch 26/200, Loss: 0.1961
Epoch 27/200, Loss: 0.1914
Epoch 28/200, Loss: 0.1892
Epoch 29/200, Loss: 0.1841
Epoch 30/200, Loss: 0.1816
Epoch 31/200, Loss: 0.1778
Epoch 32/200, Loss: 0.1783
Epoch 33/200, Loss: 0.1749
Epoch 34/200, Loss: 0.1722
Epoch 

In [11]:
# Evaluate model
model.eval() # Set model to evaluation mode
test_sentence = "This movie was terrible. I can not stand this move!"
test_sequence = [word_to_ix[word] if word in word_to_ix else 0 for word in test_sentence.split()[:10]]
test_sequence += [0] * (10 - len(test_sequence))
test_sequence = torch.tensor(test_sequence, dtype=torch.long).unsqueeze(0).to(device)
with torch.no_grad():
    output = model(test_sequence)
    prediction = torch.argmax(output, dim=1).item()
print(f"Test Sentence: {test_sentence}")
print(f"Prediction: {prediction}")

Test Sentence: This movie was terrible. I can not stand this move!
Prediction: 0


In [12]:
# test file and predictions
# Load data
df_test = pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip', sep='\t', compression='zip')
# Relevant columns only
data_test = df_test.drop(['SentenceId'], axis=1)
data_test.info()
# apply the function to the dataframe column 'Phrase'
# data_test = data_test.iloc[:75000, :]
data_test['Phrase'] = data_test['Phrase'].apply(lambda x: create_padding(x))

# feed into the evaluation model
model.eval() # Set model to evaluation mode
y_pred_submission = []
for sentence_test in data_test['Phrase']:
    sentence_test = [word_to_ix[word] if word in word_to_ix else 0 for word in sentence_test.split()[:10]]
    sentence_test += [0] * (10 - len(sentence_test))
    sentence_test = torch.tensor(sentence_test, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(sentence_test)
        prediction = torch.argmax(output, dim=1).item()
        y_pred_submission.append(prediction)
d = {'PhraseId': data_test['PhraseId'], 'Sentiment': prediction} 
submission = pd.DataFrame(data=d)
submission

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   PhraseId  66292 non-null  int64 
 1   Phrase    66292 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2
...,...,...
66287,222348,2
66288,222349,2
66289,222350,2
66290,222351,2


In [13]:
submission.to_csv('/kaggle/working/submission.csv', index=False, header=True)