In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torchtext
import time
import random
import pandas as pd
import spacy
import torch.optim as optim


In [None]:
# !pip install -U torch==1.8.0 torchtext==0.9.0

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Colab-only step: ask the user to upload files into the runtime.
# NOTE(review): presumably this is how IMDB_Dataset.csv gets into the working directory — confirm.
from google.colab import files
upload = files.upload()


# Import Required Libraries & Data Loading

In [None]:
# Load the training data; the path is relative, so the CSV must be in the working directory.
df=pd.read_csv('IMDB_Dataset.csv')
# Quick sanity check: (rows, columns) followed by the first ten rows.
print(df.shape)
df.head(10)

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): nothing visible in this notebook reads from the mount — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Summary statistics for each column of the raw frame.
df.describe()

In [None]:
# Label-based slice, so both ends are included (rows 0 through 100).
# NOTE(review): this renders a large table; a smaller preview is usually enough.
df.loc[0:100]

# Data Preparation

In [None]:
"""
sentiment : 0 = negative, 1 = positive 
use the following to get the sentiment of a sentence :  
sentiment = 0 if sentiment is negative else 1


use np.where to get the sentiment of a sentence :
"""
df['sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)

In [None]:
# Confirm the sentiment column now holds 0/1 integers.
df.head()

In [None]:
# Positional rename: first column becomes the text column, second the label column.
# These names are reused as the torchtext field names later in the notebook.
df.columns = ['TEXT_COLUMN_NAME', 'LABEL_COLUMN_NAME']

In [None]:

# Choose the GPU when one is available, otherwise the CPU, and report the choice.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("the device being used is ", DEVICE)

In [None]:
"""
Load the spacy model and load the English language model from https://spacy.io/usage/models
"""
#loading the spacy model
nlp = spacy.load("en_core_web_sm")
### ADD YOUR SPACY MODEL HERE ###

# python -m spacy download en_core_web_sm
# import spacy
# nlp = spacy.load("en_core_web_sm")
# https://spacy.io/usage/models

In [None]:
# General settings: every value a reader might want to tune lives in this cell.

RANDOM_SEED = 26
# Seed each random number generator the notebook relies on, not only torch:
# the data-split cells use Python's `random`, and numpy is seeded for completeness.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Upper bound on the number of tokens kept when the TEXT vocabulary is built.
VOCABULARY_SIZE = 15000
# Optimizer step size.
LEARNING_RATE = 0.002
# Number of examples per mini-batch.
BATCH_SIZE = 112
# Number of full passes over the training split.
NUM_EPOCHS = 20
# Size of each token embedding vector.
EMBEDDING_DIM = 400
# Size of the recurrent hidden state.
HIDDEN_DIM = 64
# Negative / positive.
NUM_CLASSES = 2

# Text & label Preparation

In [None]:
# Define feature processing
"""
Define the fields for the data.
"""
# TEXT describes how the review text is processed: it is tokenized with spaCy,
# using the `en_core_web_sm` language model for the tokenizer.
TEXT = torchtext.legacy.data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')

In [None]:
# Define label processing: LABEL describes the class target and yields torch.long tensors.
LABEL = torchtext.legacy.data.LabelField(dtype = torch.long)

In [None]:
"""
Define the fields for the data.
"""

df.to_csv('moviedata.csv', index = None)
df = pd.read_csv('moviedata.csv')
df.head()

In [None]:
# Build the torchtext dataset from the CSV saved earlier in the notebook.
# Each (name, field) pair maps one CSV column, in order, to its processing field;
# the names are the attributes used to access the columns on examples and batches.
fields = [('TEXT_COLUMN_NAME', TEXT), ('LABEL_COLUMN_NAME', LABEL)]

dataset = torchtext.legacy.data.TabularDataset(
                    # Same relative path the frame was written to, so the cell does not
                    # depend on the working directory being /content.
                    path = 'moviedata.csv',
                    format = 'csv',
                    # The first row holds the column names, not data.
                    skip_header = True,
                    fields = fields
)

# Reference:
# https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.TabularDataset

# Data Split

In [None]:
# Split dataset into train (80%) and test (20%) sets.
# `random.seed()` returns None, so passing its result as `random_state` hands
# torchtext no state at all. Seed first, then pass the explicit generator state.
random.seed(RANDOM_SEED)
train_data, test_data = dataset.split(split_ratio = [0.8, 0.2], random_state = random.getstate())

print('Length of train data', len(train_data))
print('Length of test data', len(test_data))

In [None]:
# Carve a validation set (15%) out of the training split.
# NOTE: this rebinds `train_data`, so running the cell twice shrinks it again;
# re-run the train/test split cell first when repeating this step.
# `random.seed()` returns None, so seed first and pass the explicit generator state.
random.seed(RANDOM_SEED)
train_data, val_data = train_data.split(split_ratio = [0.85, 0.15], random_state = random.getstate())

print('Length of train data', len(train_data))
print('Length of valid data', len(val_data))

# Data Observation after Tokenization

In [None]:
# Look at one tokenized training example (index 2000, not the first one).

print(vars(train_data.examples[2000]))

In [None]:
# Build the vocabularies from the training split only.
# The text vocabulary is capped at VOCABULARY_SIZE entries via `max_size`.

TEXT.build_vocab(train_data, max_size = VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

print(f'vocabulary size: {len(TEXT.vocab)}')
print(f'Label Size: {len(LABEL.vocab)}')

The 2 extra entries in the vocabulary are the special tokens that were added: `<unk>` (unknown) and `<pad>` (padding).

In [None]:
# self.text = data.Field(
#             tokenize=tokenizer,
#             lower=True,
#             include_lengths=True,
#             preprocessing=generate_n_grams,
#         )
# https://discuss.pytorch.org/t/save-and-loading-vocabulary/83173
# self.text.vocab.freqs.most_common(25)

In [None]:
# Print the 25 most frequent tokens with their counts, taken from the vocabulary's frequency counter.
most_common_words = TEXT.vocab.freqs.most_common(25)
print(most_common_words)

In [None]:
# Tokens corresponding to the first 20 indices

print(TEXT.vocab.itos[:20]) # itos = integer to string

# Data Preparation for Batch-wise Implementation

In [None]:
# Define the data loaders.
#
# BucketIterator.splits builds one iterator per dataset in the tuple and returns
# them in the same order (train, validation, test).
# Reference: https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Dataset.splits

train_loader, valid_loader, test_loader = torchtext.legacy.data.BucketIterator.splits(
        # The datasets must be passed together as one tuple.
        (train_data, val_data, test_data),
        # Use the notebook-wide setting rather than a second, hard-coded batch size.
        batch_size = BATCH_SIZE,
        sort_within_batch = False,
        # Review length in tokens.
        sort_key = lambda x : len(x.TEXT_COLUMN_NAME),
        # Batches are created directly on the selected device.
        device = DEVICE
    )

In [None]:
#  Group similar length text sequences together in batches.
# torchtext_train_dataloader, torchtext_valid_dataloader = torchtext.data.BucketIterator.splits(
    
#                               # Datasets for iterator to draw data from
#                               (train_dataset, valid_dataset),

#                               # Tuple of train and validation batch sizes.
#                               batch_sizes=(train_batch_size, valid_batch_size),

#                               # Device to load batches on.
#                               device=device, 

#                               # Function to use for sorting examples.
#                               sort_key=lambda x: len(x['text']),


#                               # Repeat the iterator for multiple epochs.
#                               repeat=True, 

#                               # Sort all examples in data using `sort_key`.
#                               sort=False, 

#                               # Shuffle data on each epoch run.
#                               shuffle=True,

#                               # Use `sort_key` to sort examples in each batch.
#                               sort_within_batch=True,
#                               )

# # Print number of batches in each split.
# print('Created `torchtext_train_dataloader` with %d batches!'%len(torchtext_train_dataloader))
# print('Created `torchtext_valid_dataloader` with %d batches!'%len(torchtext_valid_dataloader))


In [None]:
# # Loop through regular dataloader.
# print('PyTorch DataLoader\n')
# for batch in torch_train_dataloader:
  
#   # Let's check batch size.
#   print('Batch size: %d\n'% len(batch['text']))
#   print('LABEL\tLENGTH\tTEXT'.ljust(10))

#   # Print each example.
#   for text, label in zip(batch['text'], batch['label']):
#     print('%s\t%d\t%s'.ljust(10) % (label, len(text), text))
#   print('\n')
  
#   # Only look at first batch. Reuse this code in training models.
#   break
  

# # Create batches - needs to be called before each loop.
# torchtext_train_dataloader.create_batches()

# # Loop through BucketIterator.
# print('PyTorchText BuketIterator\n')
# for batch in torchtext_train_dataloader.batches:

#   # Let's check batch size.
#   print('Batch size: %d\n'% len(batch))
#   print('LABEL\tLENGTH\tTEXT'.ljust(10))
  
#   # Print each example.
#   for example in batch:
#     print('%s\t%d\t%s'.ljust(10) % (example['label'], len(example['text']), example['text']))
#   print('\n')
  
#   # Only look at first batch. Reuse this code in training models.
#   break

In [None]:
# Smoke-test the iterators: print the tensor shapes of the first batch of each split
# (the number of rows depends on the longest document in the respective batch).

for heading, loader in (('Train', train_loader),
                        ('\nValid:', valid_loader),
                        ('\nTest:', test_loader)):
    print(heading)
    for batch in loader:
        print(f'Text matrix size: {batch.TEXT_COLUMN_NAME.size()}')
        print(f'Target vector size: {batch.LABEL_COLUMN_NAME.size()}')
        # Only the first batch is needed.
        break

# Model Building

In [None]:

class RNN(torch.nn.Module):
    """Single-layer RNN classifier: embedding -> nn.RNN -> linear layer.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the recurrent hidden state.
        output_dim: Number of output logits (classes).
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return class logits for a batch of token-index sequences.

        Args:
            text: LongTensor of token indices, shape [sentence length, batch size].

        Returns:
            Tensor of shape [batch size, output_dim] holding unnormalized logits.
        """
        # embedded dim: [sentence length, batch size, embedding dim]
        embedded = self.embedding(text)

        # nn.RNN returns a pair, so unpack it *before* using the hidden state:
        #   output dim: [sentence length, batch size, hidden dim]
        #   hidden dim: [1, batch size, hidden dim]
        output, hidden = self.rnn(embedded)

        # Drop the leading layer axis -> [batch size, hidden dim], then classify
        # from the final hidden state.
        return self.fc(hidden.squeeze(0))


In [None]:
# Re-seed so the weight initialization is reproducible regardless of what ran before.
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=len(TEXT.vocab),     # one embedding row per vocabulary entry (incl. special tokens)
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES
)

# The iterators create their batches on DEVICE, so the model has to live there too.
model = model.to(DEVICE)

# Use the learning rate from the settings cell instead of a separate hard-coded value.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Define Accuracy

In [None]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` over `data_loader`, in percent.

    Args:
        model: Module mapping a features tensor to logits of shape [batch, classes].
        data_loader: Iterable yielding (features, targets) pairs.
        device: Device the features and targets are moved to before the forward pass.

    Returns:
        0-dim float tensor in the range [0, 100].

    Raises:
        ValueError: If `data_loader` yields no examples.
    """
    # Evaluate in eval mode (matters for dropout / batch-norm layers) and restore
    # the caller's mode afterwards so a surrounding training loop is unaffected.
    was_training = model.training
    model.eval()
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0
    try:
        with torch.no_grad():
            for features, targets in data_loader:
                features = features.to(device)
                targets = targets.to(device)
                logits = model(features)
                # The predicted class is the index of the largest logit.
                predicted_labels = logits.argmax(dim=1)
                num_examples += targets.size(0)
                correct_pred += (predicted_labels == targets).sum()
    finally:
        model.train(was_training)

    if num_examples == 0:
        raise ValueError("data_loader yielded no examples; accuracy is undefined")
    return correct_pred.float() / num_examples * 100

# Model Run

In [None]:
# loss = nn.BCEWithLogitsLoss()
# input = torch.randn(3, requires_grad=True)
# target = torch.empty(3).random_(2)
# output = loss(input, target)
# output.backward()

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
  model.train()
  for batch_idx, batch_data in enumerate(train_loader):

      text = batch_data.TEXT_COLUMN_NAME.to(DEVICE)
      labels = batch_data.LABEL_COLUMN_NAME.to(DEVICE)

      ### FORWARD AND BACK PROP
      optimizer.zero_grad()
      # Feed the current batch's text (not a stale `batch` variable from another cell).
      # The model returns logits of shape [batch size, num classes].
      predictions = model(text)
      # cross_entropy expects raw logits and integer class indices.
      loss = F.cross_entropy(predictions, labels)
      loss.backward()

      ### UPDATE MODEL PARAMETERS
      # Without this step the gradients are computed but the weights never change.
      optimizer.step()

      ### LOGGING
      if not batch_idx % 50:
        print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

  # End-of-epoch evaluation; no gradients are needed here.
  with torch.no_grad():
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
  print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

# Model Testing

In [None]:
# Tokenizer-only English pipeline; inference just needs to split text into tokens.
nlp = spacy.blank("en")

def predict_sentiment(model, sentence):
    """Return the model's probability that `sentence` is a positive review.

    Args:
        model: Trained classifier that maps a [sentence length, 1] LongTensor to logits.
        sentence: Raw text to classify.

    Returns:
        Probability of the positive class as a Python float.

    Raises:
        ValueError: If `sentence` contains no tokens.
    """
    model.eval()
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokens:
        raise ValueError("sentence must contain at least one token")

    indices = [TEXT.vocab.stoi[t] for t in tokens]
    # Shape [sentence length, 1]: a batch containing this single sentence.
    tensor = torch.LongTensor(indices).unsqueeze(1).to(DEVICE)

    # The label vocabulary is built from the training data, so the positive class
    # is not guaranteed to sit at index 1; look its index up instead.
    # Assumes labels were read from the CSV as the strings '0'/'1' — TODO confirm.
    positive_idx = LABEL.vocab.stoi['1']

    with torch.no_grad():
        probabilities = torch.nn.functional.softmax(model(tensor), dim=1)
    return probabilities[0][positive_idx].item()

print('Probability positive:')
predict_sentiment(model, "This is such an awesome movie, I really love it!")

In [None]:
# Second check with a clearly negative review; the value shown is still the
# probability of the positive class.
print('Probability positive:')
predict_sentiment(model, "I really hate this movie. It is really bad and sucks!")