In [None]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 3.9MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 17.2MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     

In [3]:
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import requests
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.utils import class_weight
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as I
from torch.autograd import Variable
from transformers import AutoModel, BertTokenizerFast
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification

### Load data

In [4]:
data_name = 'user_reviews.csv'
path = F"/content/gdrive/My Drive/{data_name}"

reviews_df = pd.read_csv(path,lineterminator='\n')
reviews_df = reviews_df[:5000]
reviews_df.head(5)

Unnamed: 0,userID,id,date_posted,review,recommended,year,month
0,0,1250,November 5 2011,Simple yet with great replayability. In my opi...,True,2011,November
1,0,22200,July 15 2011,Its unique and worth a playthrough.,True,2011,July
2,0,43110,April 21 2011,Great atmosphere. The gunplay can be a bit chu...,True,2011,April
3,1,251610,June 24 2014,I know what you think when you see this title ...,True,2014,June
4,1,227300,September 8 2013,For a simple (its actually not all that simple...,True,2013,September


In [5]:
reviews_df.tail(5)

Unnamed: 0,userID,id,date_posted,review,recommended,year,month
4995,1991,226700,March 19 2014,Alot of bugs and hackers. DONT Buy it even dur...,False,2014,March
4996,1992,440,April 6 2013,Lots of items and Achievements make this fun w...,True,2013,April
4997,1992,730,December 11 2013,Pretty fun game to play :) loved the match mak...,True,2013,December
4998,1992,550,July 17 2013,Very fun game to play with friends. Co-operati...,True,2013,July
4999,1992,570,March 4 2013,Its a good team and individual based game.,True,2013,March


In [6]:
reviews_df.isnull().sum()

userID         0
id             0
date_posted    0
review         2
recommended    0
year           0
month          0
dtype: int64

In [7]:
reviews_df.dropna(inplace = True)
reviews_df.isnull().sum()

userID         0
id             0
date_posted    0
review         0
recommended    0
year           0
month          0
dtype: int64

In [8]:
print("The length of the dataset:",len(reviews_df))

The length of the dataset: 4998


In [9]:
reviews_data = reviews_df[["review","recommended"]]
reviews_data["recommended"].value_counts()

True     4357
False     641
Name: recommended, dtype: int64

In [10]:
labels = reviews_data["recommended"].unique()
label_dict = {}
for idx, labl in enumerate(labels):
    label_dict[labl] = idx

In [11]:
reviews_data["label"] = reviews_data["recommended"].replace(label_dict)
reviews_data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,review,recommended,label
0,Simple yet with great replayability. In my opi...,True,0
1,Its unique and worth a playthrough.,True,0
2,Great atmosphere. The gunplay can be a bit chu...,True,0
3,I know what you think when you see this title ...,True,0
4,For a simple (its actually not all that simple...,True,0


In [12]:
reviews_data["label"].value_counts()

0    4357
1     641
Name: label, dtype: int64

### Split the data into train/val/test

In [13]:
X = reviews_data["review"]
y = reviews_data["label"]

In [14]:
X_train, X_val1, y_train, y_val1 = train_test_split(X, y, test_size = 0.3, random_state = 17, stratify = reviews_data['label'])
X_val, X_test, y_val, y_test = train_test_split(X_val1, y_val1, test_size = 0.12, random_state = 18, stratify = y_val1)

In [15]:
print("X_train shape:",X_train.shape)
print("y_train shape:",y_train.shape)
print("X_val shape:",X_val.shape)
print("y_val shape:",y_val.shape)
print("X_test shape:",X_test.shape)
print("y_test shape:",y_test.shape)

X_train shape: (3498,)
y_train shape: (3498,)
X_val shape: (1320,)
y_val shape: (1320,)
X_test shape: (180,)
y_test shape: (180,)


In [16]:
reviews_data.set_index("review", inplace = True)
reviews_data.loc[X_train,"data_type"] = "train"
reviews_data.loc[X_val,"data_type"] = "val"
reviews_data.loc[X_test,"data_type"] = "test"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [17]:
reviews_data.reset_index(inplace = True)
reviews_data.head(3)

Unnamed: 0,review,recommended,label,data_type
0,Simple yet with great replayability. In my opi...,True,0,train
1,Its unique and worth a playthrough.,True,0,val
2,Great atmosphere. The gunplay can be a bit chu...,True,0,train


In [18]:
reviews_data.groupby(["recommended","label","data_type"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,review
recommended,label,data_type,Unnamed: 3_level_1
False,1,test,24
False,1,train,447
False,1,val,170
True,0,test,170
True,0,train,2977
True,0,val,1210


### Tokenization

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#do_lower_case = True

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [20]:
encoded_data_train = tokenizer.batch_encode_plus(X_train.to_list(),
                                                add_special_tokens = True, 
                                                return_attention_mask = True, 
                                                max_length = 256,
                                                padding = 'max_length', 
                                                truncation = True,  
                                                return_tensors = 'pt' )

encoded_data_val = tokenizer.batch_encode_plus(X_val.to_list(),
                                               add_special_tokens = True, 
                                               return_attention_mask = True, 
                                               max_length = 256,
                                               padding = 'max_length',   
                                               truncation = True, 
                                               return_tensors = 'pt')

encoded_data_test = tokenizer.batch_encode_plus(X_test.to_list(),
                                                add_special_tokens = True, 
                                                return_attention_mask = True, 
                                                max_length = 256, 
                                                padding = 'max_length',                                            
                                                truncation = True, 
                                                return_tensors = 'pt')

In [21]:
input_ids_train = encoded_data_train['input_ids']
#torch.tensor(encoded_data_train['input_ids'])
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(y_train.to_list())

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(y_val.to_list())

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(y_test.to_list())

In [22]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [23]:
len(dataset_train),len(dataset_val),len(dataset_test)

(3498, 1320, 180)

In [24]:
batch_size = 64

dataloader_train = DataLoader(dataset_train, 
                              sampler = RandomSampler(dataset_train), 
                              batch_size = batch_size)

dataloader_val = DataLoader(dataset_val, 
                            sampler = SequentialSampler(dataset_val), 
                            batch_size = batch_size)

dataloader_test = DataLoader(dataset_test, 
                             sampler = SequentialSampler(dataset_test), 
                             batch_size = batch_size)

### BERT model

In [25]:
# import BERT-base pretrained model
#bert = AutoModel.from_pretrained('bert-base-uncased')

bert = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                     num_labels = len(label_dict),
                                                     output_attentions = False,
                                                     output_hidden_states = False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [26]:
# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False

In [27]:
'''
class BERT_Arch(nn.Module):

    def __init__(self, bert):
      
      super(BERT_Arch, self).__init__()

      self.bert = bert 
      
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      
      # relu activation function
      self.relu =  nn.ReLU()

      # dense layer 1
      self.fc1 = nn.Linear(768,512)
      
      # dense layer 2 (Output layer)
      self.fc2 = nn.Linear(512,2)

      #softmax activation function
      self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      _, cls_hs = self.bert(sent_id, attention_mask=mask)
      
      x = self.fc1(cls_hs)

      x = self.relu(x)

      x = self.dropout(x)

      # output layer
      x = self.fc2(x)
      
      # apply softmax activation
      x = self.softmax(x)

      return x

'''

'\nclass BERT_Arch(nn.Module):\n\n    def __init__(self, bert):\n      \n      super(BERT_Arch, self).__init__()\n\n      self.bert = bert \n      \n      # dropout layer\n      self.dropout = nn.Dropout(0.1)\n      \n      # relu activation function\n      self.relu =  nn.ReLU()\n\n      # dense layer 1\n      self.fc1 = nn.Linear(768,512)\n      \n      # dense layer 2 (Output layer)\n      self.fc2 = nn.Linear(512,2)\n\n      #softmax activation function\n      self.softmax = nn.LogSoftmax(dim=1)\n\n    #define the forward pass\n    def forward(self, sent_id, mask):\n\n      #pass the inputs to the model  \n      _, cls_hs = self.bert(sent_id, attention_mask=mask)\n      \n      x = self.fc1(cls_hs)\n\n      x = self.relu(x)\n\n      x = self.dropout(x)\n\n      # output layer\n      x = self.fc2(x)\n      \n      # apply softmax activation\n      x = self.softmax(x)\n\n      return x\n\n'

In [78]:
class bertModel(nn.Module):

  def __init__(self, bert):

    super(bertModel, self).__init__()

    self.bert = bert
    self.dropout1 = nn.Dropout(0.1)
    self.relu =  nn.ReLU()
    self.fc1 = nn.Linear(768, 512)
    #self.dropout2 = nn.Dropout(0.1)
    self.fc2 = nn.Linear(512, 2)
    self.softmax = nn.LogSoftmax(dim = 1)

  def forward(self, **inputs):

    _, x = self.bert(**inputs)

    print("shape after self.bert:",x.shape)

    x = x.view(x.size(0), -1)
 
    print("after flatten",x.shape)

    x = self.fc1(x)
    print("shape after fc1:",x.shape)
    x = self.relu(x)
    x = self.dropout1(x)
    print("shape after dropout of fc1:",x.shape)
    x = self.fc2(x)
    x = self.softmax(x)

    return x

In [79]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cpu


In [80]:
model = bertModel(bert)
#BERT_Arch
model = model.to(device)

In [81]:
model

bertModel(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
  

In [82]:
optimizer = AdamW(model.parameters(), lr = 0.001)

epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = len(dataloader_train)*epochs)

In [83]:
#calculate class weights
class_wts = class_weight.compute_class_weight("balanced", np.unique(y_train), y_train)
print("Class weights:",dict(enumerate(class_wts)))

class_weights = torch.tensor(class_wts, dtype = torch.float)
#class_weights = torch.FloatTensor(class_wts)
#class_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight = class_weights)
criterion = criterion.to(device)

Class weights: {0: 0.5736306985897015, 1: 3.89532293986637}


In [84]:
#criterion = criterion.to(device)
#model = bertModel(bert)
#model.to(device)

train model function

In [85]:
seed_val = 24
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [86]:
'''
# function to train the model
def train():
  
  model.train()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save model predictions
  total_preds=[]
  
  # iterate over batches
  for step,batch in enumerate(dataloader_train):
    
    # progress update after every 50 batches.
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader_train)))

    # push the batch to gpu
    batch = [r.to(device) for r in batch]
 
    sent_id, mask, labels = batch

    # clear previously calculated gradients 
    model.zero_grad()        

    # get model predictions for the current batch
    preds = model(sent_id, mask)

    # compute the loss between actual and predicted values
    loss = criterion(preds, labels)

    # add on to the total loss
    total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
    loss.backward()

    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # update parameters
    optimizer.step()

    # model predictions are stored on GPU. So, push it to CPU
    preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

  # compute the training loss of the epoch
  avg_loss = total_loss / len(dataloader_train)
  
  # predictions are in the form of (no. of batches, size of batch, no. of classes).
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  #returns the loss and predictions
  return avg_loss, total_preds
'''

"\n# function to train the model\ndef train():\n  \n  model.train()\n\n  total_loss, total_accuracy = 0, 0\n  \n  # empty list to save model predictions\n  total_preds=[]\n  \n  # iterate over batches\n  for step,batch in enumerate(dataloader_train):\n    \n    # progress update after every 50 batches.\n    if step % 50 == 0 and not step == 0:\n      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader_train)))\n\n    # push the batch to gpu\n    batch = [r.to(device) for r in batch]\n \n    sent_id, mask, labels = batch\n\n    # clear previously calculated gradients \n    model.zero_grad()        \n\n    # get model predictions for the current batch\n    preds = model(sent_id, mask)\n\n    # compute the loss between actual and predicted values\n    loss = criterion(preds, labels)\n\n    # add on to the total loss\n    total_loss = total_loss + loss.item()\n\n    # backward pass to calculate the gradients\n    loss.backward()\n\n    # clip the the gradients to 1.0. It helps 

In [87]:
'''
# function for evaluating the model
def evaluate():
  
  print("\nEvaluating...")
  
  # deactivate dropout layers
  model.eval()

  total_loss, total_accuracy = 0, 0
  
  # empty list to save the model predictions
  total_preds = []

  # iterate over batches
  for step,batch in enumerate(dataloader_val):
    
    # Progress update every 50 batches.
    if step % 50 == 0 and not step == 0:
      
      # Calculate elapsed time in minutes.
      elapsed = format_time(time.time() - t0)
            
      # Report progress.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader_val)))

    # push the batch to gpu
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch

    # deactivate autograd
    with torch.no_grad():
      
      # model predictions
      preds = model(sent_id, mask)

      # compute the validation loss between actual and predicted values
      loss = criterion(preds,labels)

      total_loss = total_loss + loss.item()

      preds = preds.detach().cpu().numpy()

      total_preds.append(preds)

  # compute the validation loss of the epoch
  avg_loss = total_loss / len(dataloader_val) 

  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds
'''

'\n# function for evaluating the model\ndef evaluate():\n  \n  print("\nEvaluating...")\n  \n  # deactivate dropout layers\n  model.eval()\n\n  total_loss, total_accuracy = 0, 0\n  \n  # empty list to save the model predictions\n  total_preds = []\n\n  # iterate over batches\n  for step,batch in enumerate(dataloader_val):\n    \n    # Progress update every 50 batches.\n    if step % 50 == 0 and not step == 0:\n      \n      # Calculate elapsed time in minutes.\n      elapsed = format_time(time.time() - t0)\n            \n      # Report progress.\n      print(\'  Batch {:>5,}  of  {:>5,}.\'.format(step, len(dataloader_val)))\n\n    # push the batch to gpu\n    batch = [t.to(device) for t in batch]\n\n    sent_id, mask, labels = batch\n\n    # deactivate autograd\n    with torch.no_grad():\n      \n      # model predictions\n      preds = model(sent_id, mask)\n\n      # compute the validation loss between actual and predicted values\n      loss = criterion(preds,labels)\n\n      total_lo

In [88]:

def model_evaluate(dataloader):

  model.eval()

  pred_loss = 0

  for idx, batch in enumerate(dataloader):

    data = tuple(b.to(device) for b in batch)
    inputs = {'input_ids':      data[0],
              'attention_mask': data[1],
              'labels':         data[2],
              'return_dict' : False
             }    
    #data = [b.to(device) for b in batch]
    #inputs_ids, attn_masks, labels = data[0], data[1], data[2]
    
    #print("input shape:",data.shape)

    with torch.no_grad():
      outputs = model(**inputs)
      #inputs_ids = inputs_ids, token_type_ids = None, attention_mask = attn_masks, labels = labels

      loss = criterion(outputs, labels)

      pred_loss += loss.item()

  avg_pred_loss = pred_loss/len(dataloader)

  return avg_pred_loss, outputs, labels


In [89]:

def model_train(model, device, criterion, scheduler, optimizer, n_epochs):

  train_loss = []

  model.train()

  for epoch in range(1, epochs+1):
           
      total_train_loss, training_loss = 0,0 

      #progress_bar = tqdm(dataloader_train, desc = 'Epoch {:1d}'.format(epoch), leave = False, disable = False)
      
      for idx, batch in enumerate(dataloader_train):

          model.zero_grad()
          
          data = tuple(b.to(device) for b in batch)
          inputs = {'input_ids':      data[0],
                    'attention_mask': data[1],
                    'labels':         data[2],
                    'return_dict' : False
                   }
          #data = [b.to(device) for b in batch]
          #input_ids, attn_mask, labels = data[0], data[1], data[2]  
          
          #print("input shape:",data.shape)
          print("input id: {}, attn mask: {}, labels: {}".format(data[0].shape, data[1].shape, data[2].shape))

          outputs = model(**inputs)
          
          loss = criterion(outputs, labels)
          #loss_train_total += loss.item()
          loss.backward()

          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          #update the weights
          optimizer.step()
          scheduler.step()
          
          #progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
          training_loss += loss.item()
          total_train_loss += training_loss

          if idx % 25 == 0:
            print('Epoch: {}, Batch: {}, Training Loss: {}'.format(epoch, idx, training_loss/10))
            training_loss = 0
      
      #avg training loss
      avg_train_loss = total_train_loss/len(dataloader_train)
      #validation data loss
      avg_pred_loss = model_evaluate(dataloader_val)

      #print for every end of epoch
      print('End of Epoch {}, Avg. Training Loss: {}, Avg. validation Loss: {} \n'.format(epoch, avg_train_loss, avg_pred_loss))

  print("Finished Training")


In [90]:

epochs = 3
'''
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')
'''

"\n# set initial loss to infinite\nbest_valid_loss = float('inf')\n\n# empty lists to store training and validation loss of each epoch\ntrain_losses=[]\nvalid_losses=[]\n\n#for each epoch\nfor epoch in range(epochs):\n     \n    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))\n    \n    #train model\n    train_loss, _ = train()\n    \n    #evaluate model\n    valid_loss, _ = evaluate()\n    \n    #save the best model\n    if valid_loss < best_valid_loss:\n        best_valid_loss = valid_loss\n        torch.save(model.state_dict(), 'saved_weights.pt')\n    \n    # append training and validation loss\n    train_losses.append(train_loss)\n    valid_losses.append(valid_loss)\n    \n    print(f'\nTraining Loss: {train_loss:.3f}')\n    print(f'Validation Loss: {valid_loss:.3f}')\n"

In [91]:
model_train(model, device, criterion, scheduler, optimizer, epochs)

input id: torch.Size([64, 256]), attn mask: torch.Size([64, 256]), labels: torch.Size([64])
shape after self.bert: torch.Size([64, 2])
after flatten torch.Size([64, 2])


RuntimeError: ignored