<a href="https://colab.research.google.com/github/coldsober-irene/NLP-fellowship/blob/main/NN_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
from google.colab import drive
# Mount Google Drive so that files stored there are reachable under /content/gdrive.
drive.mount('/content/gdrive')
# Make the Week6 folder the working directory; the relative CSV path below is resolved against it.
os.chdir("/content/gdrive/MyDrive/NLP-FELLOWSHIP/Week6")
full_dataset = pd.read_csv('50k_imdb_movie_reviews.csv')
# Preview the first rows of the loaded frame.
full_dataset.head()

Mounted at /content/gdrive


Unnamed: 0,review,sentiment,set
0,I went and saw this movie last night after bei...,1,test
1,Actor turned director Bill Paxton follows up h...,1,test
2,As a recreational golfer with some knowledge o...,1,test
3,"I saw this film in a sneak preview, and it is ...",1,test
4,Bill Paxton has taken the true story of the 19...,1,test


In [None]:
!pip install torch==1.8.0 torchtext==0.9.0 #compatibility

In [None]:
import pandas as pd
import re
import torch
from torchtext.legacy import data
from torchtext.legacy.data import Dataset, Example
from torchtext.legacy.data import BucketIterator
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Summary of the raw review text; a `unique` count below `count` means some reviews are repeated.
full_dataset['review'].describe()

count                                                 50000
unique                                                49582
top       Loved today's show!!! It was a variety and not...
freq                                                      5
Name: review, dtype: object

In [None]:
# Drop rows whose review text repeats an earlier row (pandas keeps the first occurrence by default).
full_dataset = full_dataset.drop_duplicates(subset=['review'])

In [None]:
# De-duplicate on the review text (idempotent, so re-running this cell is harmless).
full_dataset = full_dataset.drop_duplicates(subset=['review'])

# Select rows by the value of the `set` column, keeping only the two modelling columns.
train_dataset = full_dataset.loc[full_dataset['set'] == 'train', ['review', 'sentiment']]
test_dataset = full_dataset.loc[full_dataset['set'] == 'test', ['review', 'sentiment']]

In [None]:
def preprocessing(texts):
  """Clean an iterable of strings and return the cleaned strings as a list.

  Each string is lower-cased, then emoji/pictograph characters, URLs and
  HTML-style tags are removed. Finally, every run of characters that is not
  a word character, an apostrophe or whitespace is replaced by one space.

  Args:
    texts: iterable of ``str``.

  Returns:
    list of ``str``: one cleaned string per input string, in input order.
  """
  # The patterns do not depend on the input, so build them once per call
  # rather than once per string.
  # NOTE(review): the last emoji range (U+24C2-U+1F251) is very wide and also
  # spans non-emoji blocks such as CJK ideographs; kept as-is to preserve
  # the existing cleaning behaviour.
  emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"  # emoticons
                              u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                              u"\U0001F680-\U0001F6FF"  # transport & map symbols
                              u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                              u"\U00002702-\U000027B0"
                              u"\U000024C2-\U0001F251"
                              "]+", flags=re.UNICODE)
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  html_pattern = re.compile('<.*?>')
  punctuation_pattern = re.compile(r"[^\w\d'\s]+")

  cleaned_text = []
  for text in texts:
    text = text.lower()
    text = emoji_pattern.sub(r'', text)
    text = url_pattern.sub(r'', text)
    text = html_pattern.sub(r'', text)
    # Punctuation runs become a single space so neighbouring words stay separated.
    text = punctuation_pattern.sub(' ', text)
    cleaned_text.append(text)

  return cleaned_text

In [None]:
# Reproducibility settings.
SEED = 42
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # Check this

# Fixed sequence length handed to the text field (hyperparameter).
max_document_length = 300

# Review text: spaCy-tokenised, lower-cased, passed through `preprocessing`,
# batched as (batch, sequence) and returned together with the sequence lengths.
TEXT = data.Field(
    tokenize='spacy',
    lower=True,
    preprocessing=preprocessing,
    fix_length=max_document_length,
    batch_first=True,
    include_lengths=True,
)

# Sentiment label: a single non-sequential value used as-is (no vocabulary lookup).
LABEL = data.Field(sequential=False, use_vocab=False)

class DataFrameDataset(Dataset):
    """torchtext ``Dataset`` whose examples are the rows of a pandas DataFrame.

    Every row is turned into an ``Example`` with ``Example.fromlist``, which
    pairs the row's values positionally with the entries of ``fields``.

    Args:
        df: source frame; its column order must line up with ``fields``.
        fields: sequence of ``(name, Field)`` pairs describing each column.
    """

    def __init__(self, df: pd.DataFrame, fields: list):
        examples = []
        for _, row in df.iterrows():
            examples.append(Example.fromlist(list(row), fields))
        super().__init__(examples, fields)

In [None]:
import random  # cell-local: only needed to make the split below reproducible

# Both datasets use the same (column name, Field) layout.
dataset_fields = (
    ('review', TEXT),
    ('sentiment', LABEL)
)

# Dataset.split() shuffles with Python's `random` module, which
# torch.manual_seed() does not seed. Seed it explicitly and pass the resulting
# state so the validation/test partition is the same on every run.
# NOTE(review): split() is called with its default ratio, so the first returned
# dataset (validation) receives the larger share (0.7 per torchtext docs) -
# confirm this is the intended proportion.
random.seed(SEED)
torch_valid_dataset, torch_test_dataset = DataFrameDataset(
    df=test_dataset,
    fields=dataset_fields
).split(random_state=random.getstate())

torch_train_dataset = DataFrameDataset(
    df=train_dataset,
    fields=dataset_fields
)

In [None]:
# Vocabulary is built from the training split only, capped at `max_size`
# entries, with fastText vectors attached to it.
max_size = 30000 #hyperparameter
TEXT.build_vocab(
    torch_train_dataset,
    max_size=max_size,
    vectors='fasttext.simple.300d',
)
vocab_size = len(TEXT.vocab)

# One iterator per split; examples are bucketed by review length and each
# batch is sorted by that same key.
BATCH_SIZE = 64 #hyperparameter
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (torch_train_dataset, torch_valid_dataset, torch_test_dataset),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.review),
    sort_within_batch=True,
)

In [None]:
class LR(nn.Module):
    """Plain feed-forward classifier operating directly on the input values.

    Four linear layers, with a ReLU after each of the first three. The last
    layer returns raw class scores (no softmax), as expected by
    ``nn.CrossEntropyLoss``.

    Args:
        input_size: number of input features per example.
        hidden_size: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
        hidden_size3: width of the third hidden layer.
        num_classes: number of output scores per example.
    """

    def __init__(self, input_size, hidden_size,hidden_size2,hidden_size3, num_classes):
        super().__init__()
        # input_size -> hidden_size -> hidden_size2 -> hidden_size3 -> num_classes
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.fc4 = nn.Linear(hidden_size3, num_classes)

    def forward(self, text):
        """Return unnormalised class scores of shape ``(batch, num_classes)``."""
        # Linear layers need floating-point input, so integer tensors are cast first.
        hidden = text.float()
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        # No softmax here: the loss function applies it internally.
        return self.fc4(hidden)

In [None]:
class MLP(nn.Module):
    """Embedding layer followed by a four-layer fully connected classifier.

    Token ids are embedded, the per-token embeddings are flattened into one
    vector per example, and that vector goes through three ReLU + dropout
    hidden layers and a final linear layer that returns raw class scores.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimension of each token embedding.
        hidden_size2: width of the first hidden layer.
        hidden_size3: width of the second hidden layer.
        hidden_size4: width of the third hidden layer.
        output_dim: number of output scores per example.
        dropout: probability passed to ``nn.Dropout``.
        max_document_length: number of tokens per example; the first dense
            layer is sized for exactly this many embedded tokens.
    """

    def __init__(self, vocab_size, embed_size, hidden_size2, hidden_size3, hidden_size4, output_dim, dropout, max_document_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # The flattened input to the first dense layer has one embedding per token position.
        flattened_size = embed_size * max_document_length
        self.fc1 = nn.Linear(flattened_size, hidden_size2)
        self.fc2 = nn.Linear(hidden_size2, hidden_size3)
        self.fc3 = nn.Linear(hidden_size3, hidden_size4)
        self.fc4 = nn.Linear(hidden_size4, output_dim)

    def forward(self, text):
        """Return unnormalised class scores for a ``(batch, tokens)`` id tensor."""
        # embedded: (batch, tokens, embed_size)
        embedded = self.embedding(text)
        # Flatten everything except the batch dimension.
        hidden = embedded.view(embedded.shape[0], -1)
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.dropout(self.relu(layer(hidden)))
        # No softmax: the raw scores are returned.
        return self.fc4(hidden)

In [None]:
# Hyperparameters for the MLP model.
lr = 1e-3
batch_size = 64
# NOTE: despite the name, this value is handed to nn.Dropout as the probability
# of *dropping* a unit, not of keeping it.
dropout_keep_prob = 0.3
embedding_size = 300
max_document_length = 300 # number of tokens per review; sizes the first dense layer of the MLP
vocab_size = len(TEXT.vocab)
dev_size = 0.8 # split percentage to train\validation data
max_size = 30000 # maximum vocabulary size
seed = 30
num_classes = 2

num_epochs = 15
hidden_size = 256
hidden_size1 = 300
hidden_size2 = 128
hidden_size3 = 64

to_train = True
# model = LR(max_document_length, hidden_size,hidden_size2,hidden_size3, num_classes)
model = MLP(vocab_size, embedding_size, hidden_size1, hidden_size2, hidden_size3,  num_classes, dropout_keep_prob, max_document_length)

# build_vocab() attached pretrained vectors to the vocabulary, but a fresh
# nn.Embedding starts from random weights. Copy the vectors in so they are
# actually used; the layer stays trainable. Skipped when no vectors are
# attached or their shape does not match the embedding table.
pretrained_vectors = getattr(TEXT.vocab, 'vectors', None)
if pretrained_vectors is not None and tuple(pretrained_vectors.shape) == (vocab_size, embedding_size):
    with torch.no_grad():
        model.embedding.weight.copy_(pretrained_vectors)

In [None]:
# Hyperparameters for the LR model (LR-prefixed copies of the MLP settings).
# Only LRmax_document_length, LRhidden_size, LRhidden_size2, LRhidden_size3 and
# LRnum_classes are passed to the LR constructor below.
LRlr = 1e-3
LRbatch_size = 64
LRdropout_keep_prob = 0.3
LRembedding_size = 300
LRmax_document_length = 300 # number of tokens per review; used as the LR model's input size
LRvocab_size = len(TEXT.vocab)
LRdev_size = 0.8 # split percentage to train\validation data
LRmax_size = 30000 # maximum vocabulary size
LRseed = 30
LRnum_classes = 2

LRnum_epochs = 15
LRhidden_size = 256
LRhidden_size1 = 300
LRhidden_size2 = 128
LRhidden_size3 = 64

to_train = True
# Layer widths: LRmax_document_length -> LRhidden_size -> LRhidden_size2 -> LRhidden_size3 -> LRnum_classes
LRmodel = LR(LRmax_document_length, LRhidden_size,LRhidden_size2,LRhidden_size3, LRnum_classes)

In [None]:
def accuracy(probs, target):
  """Return the fraction of rows whose highest-scoring class equals the target.

  Args:
    probs: tensor of per-class scores, one row per example.
    target: tensor of true class indices, one entry per example.

  Returns:
    0-dim float tensor holding correct predictions / number of targets.
  """
  predicted_classes = probs.argmax(dim=1)
  num_correct = (predicted_classes == target).sum().float()
  return num_correct / float(target.size(0))

In [None]:
# Train the LR model, validate after every epoch, and keep the weights of the
# epoch with the lowest total validation loss.
LRbest_valid_loss = float('inf')
# Use the learning rate defined with the other LR* hyperparameters rather than
# the MLP's `lr`.
LRoptimizer = torch.optim.Adam(LRmodel.parameters(), lr=LRlr)

LRloss_func = nn.CrossEntropyLoss()

for epoch in range(LRnum_epochs):
    # The validation pass below puts the model in eval mode; switch back to
    # training mode at the start of every epoch.
    LRmodel.train()

    LRtrain_epoch_loss = 0
    LRtrain_epoch_acc = 0
    for batch in train_iterator:
        LRoptimizer.zero_grad()
        # The review field yields (token ids, lengths); only the ids are fed to the model.
        text, text_lengths = batch.review

        # Forward pass
        LRpredictions = LRmodel(text).squeeze(1)

        loss = LRloss_func(LRpredictions, batch.sentiment)
        acc = accuracy(LRpredictions, batch.sentiment)

        # Backward pass and parameter update
        loss.backward()
        LRoptimizer.step()

        LRtrain_epoch_loss += loss.item()
        LRtrain_epoch_acc += acc.item()

    LRvalid_epoch_loss = 0
    LRvalid_epoch_acc = 0

    LRmodel.eval()

    # No gradients are needed while scoring the validation split.
    with torch.no_grad():
        for batch in valid_iterator:
            text, text_lengths = batch.review

            LRpredictions = LRmodel(text).squeeze(1)

            loss = LRloss_func(LRpredictions, batch.sentiment)
            acc = accuracy(LRpredictions, batch.sentiment)

            LRvalid_epoch_loss += loss.item()
            LRvalid_epoch_acc += acc.item()

    # Checkpoint whenever the summed validation loss improves.
    if LRvalid_epoch_loss < LRbest_valid_loss:
        LRbest_valid_loss = LRvalid_epoch_loss
        torch.save(LRmodel.state_dict(), 'LRsaved_weights'+'_LRlinear.pt')

    # Report per-batch averages for the epoch.
    print(f'\tTrain Loss: {LRtrain_epoch_loss / len(train_iterator):.3f} | Train Acc: {LRtrain_epoch_acc  / len(train_iterator)* 100:.2f}%')
    print(f'\t Val. Loss: {LRvalid_epoch_loss / len(valid_iterator):.3f} |  Val. Acc: {LRvalid_epoch_acc / len(valid_iterator) * 100:.2f}%')

	Train Loss: 4.126 | Train Acc: 49.80%
	 Val. Loss: 0.760 |  Val. Acc: 50.23%
	Train Loss: 0.750 | Train Acc: 53.17%
	 Val. Loss: 0.739 |  Val. Acc: 50.39%
	Train Loss: 0.688 | Train Acc: 56.65%
	 Val. Loss: 0.804 |  Val. Acc: 50.06%
	Train Loss: 0.677 | Train Acc: 58.56%
	 Val. Loss: 0.748 |  Val. Acc: 50.55%
	Train Loss: 0.634 | Train Acc: 62.13%
	 Val. Loss: 0.779 |  Val. Acc: 51.27%
	Train Loss: 0.589 | Train Acc: 66.60%
	 Val. Loss: 0.818 |  Val. Acc: 51.34%
	Train Loss: 0.537 | Train Acc: 70.82%
	 Val. Loss: 0.841 |  Val. Acc: 50.78%
	Train Loss: 0.523 | Train Acc: 72.66%
	 Val. Loss: 0.879 |  Val. Acc: 50.24%
	Train Loss: 0.489 | Train Acc: 75.19%
	 Val. Loss: 0.954 |  Val. Acc: 50.34%
	Train Loss: 0.466 | Train Acc: 77.30%
	 Val. Loss: 0.993 |  Val. Acc: 50.61%
	Train Loss: 0.443 | Train Acc: 78.81%
	 Val. Loss: 1.013 |  Val. Acc: 50.63%
	Train Loss: 0.409 | Train Acc: 81.05%
	 Val. Loss: 0.999 |  Val. Acc: 50.16%
	Train Loss: 0.382 | Train Acc: 82.62%
	 Val. Loss: 1.055 |  Val

In [None]:
# Train the MLP model, validate after every epoch, and keep the weights of the
# epoch with the lowest total validation loss.
best_valid_loss = float('inf')
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss_func = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    # The validation pass below puts the model in eval mode. Without switching
    # back here, dropout would stay disabled for every epoch after the first.
    model.train()

    train_epoch_loss = 0
    train_epoch_acc = 0
    for batch in train_iterator:
        optimizer.zero_grad()
        # The review field yields (token ids, lengths); only the ids are fed to the model.
        text, text_lengths = batch.review

        # Forward pass
        predictions = model(text).squeeze(1)

        loss = loss_func(predictions, batch.sentiment)
        acc = accuracy(predictions, batch.sentiment)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        train_epoch_loss += loss.item()
        train_epoch_acc += acc.item()

    valid_epoch_loss = 0
    valid_epoch_acc = 0

    model.eval()

    # No gradients are needed while scoring the validation split.
    with torch.no_grad():
        for batch in valid_iterator:
            text, text_lengths = batch.review

            predictions = model(text).squeeze(1)

            loss = loss_func(predictions, batch.sentiment)
            acc = accuracy(predictions, batch.sentiment)

            valid_epoch_loss += loss.item()
            valid_epoch_acc += acc.item()

    # Checkpoint whenever the summed validation loss improves.
    if valid_epoch_loss < best_valid_loss:
        best_valid_loss = valid_epoch_loss
        torch.save(model.state_dict(), 'saved_weights'+'_linear.pt')

    # Report per-batch averages for the epoch.
    print(f'\tTrain Loss: {train_epoch_loss / len(train_iterator):.3f} | Train Acc: {train_epoch_acc  / len(train_iterator)* 100:.2f}%')
    print(f'\t Val. Loss: {valid_epoch_loss / len(valid_iterator):.3f} |  Val. Acc: {valid_epoch_acc / len(valid_iterator) * 100:.2f}%')

	Train Loss: 0.738 | Train Acc: 50.96%
	 Val. Loss: 0.698 |  Val. Acc: 51.92%
	Train Loss: 0.659 | Train Acc: 58.59%
	 Val. Loss: 0.683 |  Val. Acc: 54.59%
	Train Loss: 0.522 | Train Acc: 67.41%
	 Val. Loss: 0.726 |  Val. Acc: 57.16%
	Train Loss: 0.461 | Train Acc: 74.52%
	 Val. Loss: 0.671 |  Val. Acc: 66.46%
	Train Loss: 0.376 | Train Acc: 81.83%
	 Val. Loss: 0.790 |  Val. Acc: 71.13%
	Train Loss: 0.214 | Train Acc: 90.82%
	 Val. Loss: 0.793 |  Val. Acc: 72.54%
