In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from gensim.models import Word2Vec
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/emre2020/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
train=pd.read_csv("labeledTrainData.tsv.zip", header=0, \
                    delimiter="\t", quoting=3)
train=train.drop(['id'],axis=1)
train.head()

Unnamed: 0,sentiment,review
0,1,"""With all this stuff going down at the moment ..."
1,1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,0,"""The film starts with a manager (Nicholas Bell..."
3,0,"""It must be assumed that those who praised thi..."
4,1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
train.size

50000

In [4]:
def filter(sentence):
    result = [word.lower() for word in nltk.word_tokenize(sentence) if word.isalnum()]
    return result

In [5]:
example = "Hello, Buddy."
print(filter(example))

['hello', 'buddy']


In [6]:
train["review"] = train["review"].apply(filter)

In [7]:
from gensim.models import KeyedVectors

In [8]:
# Load vectors directly from the file
word2_vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [9]:
def word2_vec_func(sentence):
    
    result = [word2_vec[word] for word in sentence if word in word2_vec]

    return result

In [10]:
train["review"] = train["review"].apply(word2_vec_func)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train['review'], train['sentiment'], test_size=0.2)

In [13]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable

In [14]:
class MyModel(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        super(MyModel,self).__init__()

        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)

        self.sigmoid = nn.Sigmoid()

        
    def forward(self, text):
        _, (hidden, _) = self.rnn(text)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        return self.sigmoid(self.fc(hidden.squeeze(0)))
 

In [15]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = MyModel(EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT)
model

MyModel(
  (rnn): LSTM(300, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sigmoid): Sigmoid()
)

In [16]:
# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

In [17]:
def acc(base,pred):
    correct = torch.eq(base,pred).sum().item()
    result = (correct/len(pred))*100
    return result

In [27]:
def collate_fn(batch):
    labels = torch.Tensor([item[1] for item in batch])
    sequences = [torch.FloatTensor(item[0]) for item in batch]
    lengths = torch.LongTensor([len(seq) for seq in sequences])

    # Pad the sequences
    sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True)

    return sequences, lengths, labels

# Create DataLoader
train_data = list(zip(X_train, y_train))
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)

val_data = list(zip(X_test,y_test))
valid_loader = DataLoader(val_data,batch_size=32, shuffle=True,collate_fn=collate_fn)

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)
criterion = criterion.to(device)

# Assuming we have a validation data loader: valid_loader
for epoch in range(10):  # assuming 10 epochs
    model.train()
    for i, (sequences, lengths, labels) in enumerate(train_loader):
        # Convert tensors to device
        sequences = sequences.to(device)
        labels = labels.to(device)

        sequences = pack_padded_sequence(sequences, lengths, batch_first=True, enforce_sorted=False)
        outputs = model(sequences).squeeze()
        
        loss = criterion(outputs, labels)
        accuracy = acc(labels, torch.round(outputs).detach())
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print (f"Epoch {epoch} , Training Loss: {loss:.4f} Training Accuracy:  {accuracy}")

    model.eval()
    total_val_loss = 0
    total_val_acc = 0

    # Evaluation with the validation data
    with torch.inference_mode():
        for i, (sequences, lengths, labels) in enumerate(valid_loader):
            sequences = sequences.to(device)
            labels = labels.to(device)
            
            sequences = pack_padded_sequence(sequences, lengths, batch_first=True, enforce_sorted=False)
            outputs = model(sequences).squeeze()
            total_val_loss += criterion(outputs, labels).item()
            total_val_acc += acc(labels, torch.round(outputs).detach())
            
    print(f"Epoch {epoch}, Validation Loss: {total_val_loss/len(valid_loader):.4f}, Validation Accuracy: {total_val_acc/len(valid_loader):.4f}")



Epoch 0 , Training Loss: 0.1696 Training Accuracy:  93.75
Epoch 0, Validation Loss: 0.2358, Validation Accuracy: 90.5454
Epoch 1 , Training Loss: 0.1074 Training Accuracy:  93.75
Epoch 1, Validation Loss: 0.2449, Validation Accuracy: 90.2667


In [None]:
example = "I AM IN LOVE WİTH İT"

In [None]:
vector = word2_vec_func(filter(example))
vector

In [None]:
# Convert the example to a word vector using `word2_vec_func`
vector = word2_vec_func(example)

# Create a tensor from the word vector
input_tensor = torch.tensor(vector)

# Add an extra dimension to match the batch size (assuming batch size of 1)
input_tensor = input_tensor.unsqueeze(0)

# Move the tensor to the appropriate device
input_tensor = input_tensor.to(device)

# Get the lengths of the sequences (in this case, the single example)
lengths = torch.tensor([input_tensor.size(1)])

# Pack the sequence using `pack_padded_sequence`
packed_sequence = pack_padded_sequence(input_tensor, lengths, batch_first=True)

# Pass the packed sequence through your model
output = model(packed_sequence)


In [None]:
# Apply sigmoid activation function
probabilities = torch.sigmoid(output)

# Set a threshold to determine the class
threshold = 0.5
predictions = (probabilities > threshold).int()

# Convert predictions to the desired format (positive/negative or 1/0)
class_labels = ['negative', 'positive']
predicted_class = class_labels[predictions.item()]

print("Predicted class:", predicted_class)
probabilities