# **SemEval 2017 Task 4 - Training Phase**
---


This is the second piece of code within the series of SemEval 2017 Task 4 challenge.

In this notebook, we will perform hyperparameter tuning in order to obtain the best model architecture.

In [0]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Libraries and Variables

* Reading `config.yaml` which contains all ML parameters as well as filepaths
* Import all dependencies and libraries







In [0]:
# load configuration file that store all the constant and parameters settings
import yaml
CONFIG_YAML_FILEPATH = '/content/drive/My Drive/public/AMLSII_19-20_SN18154195/config.yaml'
with open(CONFIG_YAML_FILEPATH, 'r') as file:
  cfg = yaml.safe_load(file)

TASK_NUMBER = cfg['task_number']

In [0]:
# install additional dependencies
! pip install -r {cfg['paths']['requirements']}

import pandas as pd
import os
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
import csv
import numpy as np
from torch.utils.data import DataLoader, Dataset
from torchtext.data import TabularDataset, Field, LabelField, BucketIterator
from sklearn.preprocessing import LabelEncoder
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.preprocessor import TextPreProcessor



## Task Logistics

In [0]:
# GENERAL TRAIN-TEST VARS
if TASK_NUMBER == 'A':
  train_csv_filepath = cfg['paths']['cleaned_train_a']
  test_csv_filepath = cfg['paths']['cleaned_test_a']
  saved_model_filepath = cfg['paths']['task_a_model']
elif TASK_NUMBER == 'B':
  train_csv_filepath = cfg['paths']['cleaned_train_b']
  test_csv_filepath = cfg['paths']['cleaned_test_b']
  saved_model_filepath = cfg['paths']['task_b_model']

# List of Hyperparameter Trials

In [0]:
params = {
    'vocab': {
        'batch_size' : [32, 64],
	      'max_vocab_size' : [50000, 100000],
  	    'pretrained_embedding': [{
            'url' : 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip',
            'filepath': './crawl-300d-2M.vec',
            'embedding_dim': 300,
        },{
            'url' : 'http://nlp.stanford.edu/data/glove.twitter.27B.zip',
            'filepath': './glove.twitter.27B.200d.txt',
            'embedding_dim' : 200,
        },],
        'preprocessing' : [{
            'normalize' : ['url', 'email', 'percent', 'money', 'phone', 'user',
                'time', 'date', 'number'],
            'annotate' : ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'],
            'spell_correct_elong' : True,
            'to_lowercase': True,    
        },
        ]
    },
    'nn': { 
          'hidden_dim': [128, 256],
          'n_layers': [2, 3],
          'is_bidirectional': [True],
          'dropout': [0.5, 0.8],
          'n_epochs': [20],
    }
}

## Build Torch Dataset


In [0]:
# read csv file as torchtext's TabularDataset
def csv_to_tabular_dataset(filepath, fields):
  tabular_daset = TabularDataset(
      path=filepath,
      fields = fields,
      format='tsv',
      skip_header=True
  )
  return tabular_daset

## RNN-LSTM

three layers:
1. embedding layer (transform one-hot encoding vector into a dense embedding vector)
2. RNN
3. linear layer (output)

In [0]:
import torch.nn as nn
import time
import requests, zipfile, io
from pathlib import Path
from torchtext.vocab import Vectors
import torch.optim as optim
import dill

class RNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
               bidirectional, dropout, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
    # self.rnn = nn.RNN(embedding_dim, hidden_dim)
    self.rnn = nn.LSTM(embedding_dim,
                       hidden_dim,
                       num_layers=n_layers,
                       bidirectional=bidirectional,
                       dropout=dropout,
                       )
    self.fully_connected = nn.Linear(hidden_dim*2, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text, text_length):
    # text dimention is [sentence len, batch size]
    
    # embedded dimention is [sentence len, batch size, embedding_dim]
    embedded = self.dropout(self.embedding(text))

    # pack the sequence
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length)
    # print('packed_embedded:')
    # print(packed_embedded)
    packet_output, (hidden, cell) = self.rnn(packed_embedded)

    # unpack the sequence
    output, output_length = nn.utils.rnn.pad_packed_sequence(packet_output)

    # output dimention is [sentence len, batch size, hidden dim * num directions]
    # output over padding tokens are zero tensors

    # hidden dimension is [num layers*num directions, batch size, hidden dim]
    # cell dimension is also [num layers*num directions, batch size, hidden dim]

    #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
    #and apply dropout

    # hidden dimension is [batch size, hid dim*num directions]
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
    
    return self.fully_connected(hidden)


In [0]:
def count_model_params(model):
  # number of parameters that are trainable
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [0]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
def calculate_performance(y, preds):
  # get the index of the max probability 
  Y_pred = preds.argmax(dim = 1, keepdim = True).squeeze(1)
  Y_pred = Y_pred.detach().cpu().clone().numpy()
  
  Y_true = y.detach().cpu().clone().numpy()

  acc = accuracy_score(Y_true, Y_pred)
  rec = recall_score(Y_true, Y_pred, average='macro')
  prec= precision_score(Y_true, Y_pred, average='macro')
  f1  = f1_score(Y_true, Y_pred, average='macro')
  return acc, rec, prec, f1, Y_true, Y_pred



# Train the Model

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    epoch_rec = 0
    epoch_prec = 0
    epoch_f1 = 0

    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        text, text_length = batch.text
        predictions = model(text, text_length).squeeze(1)
        
        loss = criterion(predictions, batch.sentiment.long())
        
        acc, rec, prec, f1, Y_true, Y_pred = calculate_performance(batch.sentiment, predictions)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_rec += rec.item()
        epoch_prec += prec.item()
        epoch_f1 += f1.item()

        avg_loss = epoch_loss / len(iterator)
        avg_acc = epoch_acc / len(iterator)
        avg_rec = epoch_rec / len(iterator)
        avg_prec = epoch_prec / len(iterator)
        avg_f1 = epoch_f1 / len(iterator)

    return avg_loss, avg_acc, avg_rec, avg_prec, avg_f1, Y_true, Y_pred

In [0]:
def evaluate(model, iterator, criterion):   
    epoch_loss = 0
    epoch_acc = 0
    epoch_rec = 0
    epoch_prec = 0
    epoch_f1 = 0
    
    Y_true = []
    Y_pred = []
    model.eval()

    with torch.no_grad():
    
        for batch in iterator:
            text, text_length = batch.text
            predictions = model(text, text_length).squeeze(1)
            actuals = batch.sentiment
 
            loss = criterion(predictions, batch.sentiment.long())
            
            acc, rec, prec, f1, y_true, y_pred = calculate_performance(batch.sentiment, predictions)
            Y_true = np.concatenate([Y_true,y_true])
            Y_pred = np.concatenate([Y_pred,y_pred])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_rec += rec.item()
            epoch_prec += prec.item()
            epoch_f1 += f1.item()

            avg_loss = epoch_loss / len(iterator)
            avg_acc = epoch_acc / len(iterator)
            avg_rec = epoch_rec / len(iterator)
            avg_prec = epoch_prec / len(iterator)
            avg_f1 = epoch_f1 / len(iterator)
    return avg_loss, avg_acc, avg_rec, avg_prec, avg_f1, Y_true, Y_pred

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
def build_torch_dataset(csv_filepath, fields, vocab_params, is_training_data=True):
	batch_size = vocab_params['batch_size']
	max_vocab_size = vocab_params['max_vocab_size']
	pretrained_embedding_filepath = vocab_params['pretrained_embedding']['filepath']
	pretrained_embedding_url = vocab_params['pretrained_embedding']['url']
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
	data = csv_to_tabular_dataset(
	  csv_filepath,
	  fields = fields,
	)
	if not is_training_data:
		# split into batches
		empty, empty, test_iterator = BucketIterator.splits(
				(None, None, data), 
				batch_sizes = (None, None, batch_size),
				sort_key=lambda x: len(x.text),
				sort_within_batch = True,
				device = device)
		return test_iterator
	else:		
		# split train data as traid:validation
		train_data, valid_data = data.split(split_ratio=0.8)

		# split into batches
		train_iterator, valid_iterator = BucketIterator.splits(
				(train_data, valid_data), 
				batch_sizes = (batch_size, batch_size),
				sort_key=lambda x: len(x.text),
				sort_within_batch = True,
				device = device)

		# build vocabulary
		# check if we need to download vector file
		if not Path(pretrained_embedding_filepath).is_file():
			# download vector file (in .zip)
			r = requests.get(pretrained_embedding_url)
			z = zipfile.ZipFile(io.BytesIO(r.content))
			# unzip the file
			z.extractall()

		ROW_NUM.build_vocab(train_data)
		SENTIMENT.build_vocab(train_data)
		TEXT.build_vocab(train_data,
										max_size = max_vocab_size,
										vectors = Vectors(pretrained_embedding_filepath),
										unk_init = torch.Tensor.normal_
										)

		return ROW_NUM, TEXT, SENTIMENT, train_iterator, valid_iterator

In [0]:
def tune_hyperparams(config, ROW_NUM, TEXT, SENTIMENT, train_iterator, valid_iterator, model_filepath, best_valid_loss):
	UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
	PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

	# build the RNN object
	model = RNN(
		len(TEXT.vocab),
		config['embedding_dim'],
		config['hidden_dim'],
		len(SENTIMENT.vocab),
		config['n_layers'],
		config['is_bidirectional'],
		config['dropout'],
		PAD_IDX,
	)

	# print the model and its number of params
	print(model.parameters)
	print(count_model_params(model))
	print(model)
	for p in model.parameters():
	  print(p.numel())

	# replace the initial weights of the embedding layer with the pretrained embeddings
	pretrained_embeddings = TEXT.vocab.vectors
	model.embedding.weight.data.copy_(pretrained_embeddings)

  # initialise UNK and PAD tokens to zeros
	model.embedding.weight.data[UNK_IDX] = torch.zeros(config['embedding_dim'])
	model.embedding.weight.data[PAD_IDX] = torch.zeros(config['embedding_dim'])

	optimiser = optim.Adam(model.parameters())
	criterion = nn.CrossEntropyLoss()

	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	model = model.to(device)
	criterion = criterion.to(device)

	# --- EARLY STOPPING
	use_early_stopping = True
	early_stopping_patience = 2

	for epoch in range(config['n_epochs']):
		start_time = time.time()
		train_loss, train_acc, train_rec, train_prec, train_f1, train_Y_true, train_Y_pred = train(model, train_iterator, optimiser, criterion)
		valid_loss, valid_acc, valid_rec, valid_prec, valid_f1, valid_Y_true, valid_Y_pred = evaluate(model, valid_iterator, criterion)
		end_time = time.time()

		epoch_mins, epoch_secs = epoch_time(start_time, end_time)

		if valid_loss < best_valid_loss:
			best_valid_loss = valid_loss
			# save the model as a file
			checkpoint = {
					'model' : model,
					'state_dict' : model.state_dict(),
					'optimiser' : optimiser.state_dict(),
			}
			torch.save(checkpoint, model_filepath)
	 		# save the Field(s)
			with open(cfg['paths']['fields']['row_num'], "wb")as f:
					dill.dump(ROW_NUM,f)
			with open(cfg['paths']['fields']['text'], "wb")as f:
					dill.dump(TEXT,f)
			with open(cfg['paths']['fields']['sentiment'], "wb")as f:
					dill.dump(SENTIMENT,f)
		else:
			early_stopping_patience -= 1   
		print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
		print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')
		print(f'\tTrain Acc : {train_acc*100:.2f}% | Val. Acc : {valid_acc*100:.2f}%')
		print(f'\tTrain Rec : {train_rec*100:.2f}% | Val. Rec : {valid_rec*100:.2f}%')
		print(f'\tTrain Prec: {train_prec*100:.2f}% | Val. Prec: {valid_prec*100:.2f}%')
		print(f'\tTrain F1  : {train_f1*100:.2f}% | Val. F1  : {valid_f1*100:.2f}%')

		# check if early stopping is needed
		if use_early_stopping:
			if early_stopping_patience < 0:
				print('Early stopping!' )
				break

	# after completing all epochs, visualise the word vectors
	vectors = model.embedding.weight.data
	labels = [l for l in TEXT.vocab.itos]

	print('best valid loss: ', best_valid_loss)
	return best_valid_loss

In [0]:
def generate_text_processor(preprocessing_params):
  text_processor = TextPreProcessor(
    # normalized these terms (ex: "google.com" into "<url>")
    normalize = preprocessing_params['normalize'],

    # annotate these terms (ex: "#win" into ["<hashtag>", "win", "</hashtag>"])
    annotate = preprocessing_params['annotate'],
    fix_html=True,
    segmenter = 'twitter',
    corrector='twitter',
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=preprocessing_params['spell_correct_elong'],
    tokenizer = SocialTokenizer(lowercase=preprocessing_params['to_lowercase']).tokenize,
    dicts = [emoticons]
  )
  return text_processor

def custom_tokenizer(example):
  return text_processor.pre_process_doc(example)

from itertools import product
def get_cartesian_product(d):
  # get all possible combination of a dictionary containing lists
  return [dict(zip(d, v)) for v in product(*d.values())]


# RUN EXPERIMENTS / MAIN
vocab_params_trials = get_cartesian_product(params['vocab'])
nn_params_trials = get_cartesian_product(params['nn'])
best_valid_loss = float('inf')

for vocab_params in vocab_params_trials:
  # build torch dataset and build the vocab.
  # this should be done only once during hyperparam searching
  print('-'*25  )
  print(vocab_params)

  # text_processor will be used inside the custom_tokenizer
  text_processor = generate_text_processor(vocab_params['preprocessing'])
  TEXT = Field(tokenize=custom_tokenizer,
              include_lengths=True)
  SENTIMENT = LabelField(dtype = torch.int)
  ROW_NUM = Field()
 

  fields = [('row_num', ROW_NUM), ('text', TEXT), ('sentiment', SENTIMENT)]
  ROW_NUM, TEXT, SENTIMENT, train_iterator, valid_iterator = build_torch_dataset(train_csv_filepath, fields, vocab_params, True)

  for nn_params in nn_params_trials:
    # execute the training process
      nn_params['embedding_dim'] = vocab_params['pretrained_embedding']['embedding_dim']
      print('#'*25)
      print(nn_params)
  
      best_valid_loss = tune_hyperparams(nn_params, ROW_NUM, TEXT, SENTIMENT, train_iterator, valid_iterator, saved_model_filepath, best_valid_loss)

-------------------------
{'batch_size': 32, 'max_vocab_size': 50000, 'pretrained_embedding': {'url': 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip', 'filepath': './crawl-300d-2M.vec', 'embedding_dim': 300}, 'preprocessing': {'normalize': ['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'], 'annotate': ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored'], 'spell_correct_elong': True, 'to_lowercase': True}}
Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


  0%|          | 0/1999995 [00:00<?, ?it/s]Skipping token b'1999995' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 1999919/1999995 [04:33<00:00, 8550.43it/s]

#########################
{'hidden_dim': 128, 'n_layers': 2, 'is_bidirectional': True, 'dropout': 0.5, 'n_epochs': 20, 'embedding_dim': 300}
<bound method Module.parameters of RNN(
  (embedding): Embedding(18488, 300, padding_idx=1)
  (rnn): LSTM(300, 128, num_layers=2, dropout=0.5, bidirectional=True)
  (fully_connected): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)>
6382498
RNN(
  (embedding): Embedding(18488, 300, padding_idx=1)
  (rnn): LSTM(300, 128, num_layers=2, dropout=0.5, bidirectional=True)
  (fully_connected): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
5546400
153600
65536
512
512
153600
65536
512
512
131072
65536
512
512
131072
65536
512
512
512
2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  "type " + obj.__name__ + ". It won't be checked "


Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 0.339 | Val. Loss: 0.283
	Train Acc : 86.00% | Val. Acc : 89.04%
	Train Rec : 73.64% | Val. Rec : 80.67%
	Train Prec: 78.07% | Val. Prec: 83.19%
	Train F1  : 73.75% | Val. F1  : 80.99%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 0.246 | Val. Loss: 0.268
	Train Acc : 89.88% | Val. Acc : 89.51%
	Train Rec : 83.32% | Val. Rec : 81.20%
	Train Prec: 85.70% | Val. Prec: 85.51%
	Train F1  : 83.26% | Val. F1  : 81.91%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 0.188 | Val. Loss: 0.314
	Train Acc : 92.53% | Val. Acc : 88.80%
	Train Rec : 87.80% | Val. Rec : 79.60%
	Train Prec: 89.60% | Val. Prec: 84.01%
	Train F1  : 87.78% | Val. F1  : 80.38%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 0.145 | Val. Loss: 0.289
	Train Acc : 94.41% | Val. Acc : 88.32%
	Train Rec : 90.30% | Val. Rec : 80.92%
	Train Prec: 92.02% | Val. Prec: 83.05%
	Train F1  : 90.49% | Val. F1  : 80.66%
Epoch: 05 | Epoch Time: 0m 8s
	Train Loss: 0.110 | Val. Loss: 0.336
	Train Acc : 95.77% 


  0%|          | 0/1193514 [00:00<?, ?it/s][A
  0%|          | 1062/1193514 [00:00<01:52, 10611.46it/s][A
  0%|          | 2126/1193514 [00:00<01:52, 10619.81it/s][A
  0%|          | 3215/1193514 [00:00<01:51, 10698.57it/s][A
  0%|          | 4218/1193514 [00:00<01:53, 10487.93it/s][A
  0%|          | 5364/1193514 [00:00<01:50, 10761.08it/s][A
  1%|          | 6335/1193514 [00:00<01:53, 10420.22it/s][A
  1%|          | 7398/1193514 [00:00<01:53, 10478.87it/s][A
  1%|          | 8537/1193514 [00:00<01:50, 10735.73it/s][A
  1%|          | 9676/1193514 [00:00<01:48, 10923.27it/s][A
  1%|          | 10816/1193514 [00:01<01:46, 11060.88it/s][A
  1%|          | 11956/1193514 [00:01<01:45, 11158.68it/s][A
  1%|          | 13054/1193514 [00:01<01:47, 10939.46it/s][A
  1%|          | 14150/1193514 [00:01<01:47, 10944.14it/s][A
  1%|▏         | 15236/1193514 [00:01<01:51, 10581.60it/s][A
  1%|▏         | 16347/1193514 [00:01<01:49, 10732.91it/s][A
  1%|▏         | 17450/1193514 

#########################
{'hidden_dim': 128, 'n_layers': 2, 'is_bidirectional': True, 'dropout': 0.5, 'n_epochs': 20, 'embedding_dim': 200}
<bound method Module.parameters of RNN(
  (embedding): Embedding(18488, 200, padding_idx=1)
  (rnn): LSTM(200, 128, num_layers=2, dropout=0.5, bidirectional=True)
  (fully_connected): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)>
4431298
RNN(
  (embedding): Embedding(18488, 200, padding_idx=1)
  (rnn): LSTM(200, 128, num_layers=2, dropout=0.5, bidirectional=True)
  (fully_connected): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
3697600
102400
65536
512
512
102400
65536
512
512
131072
65536
512
512
131072
65536
512
512
512
2
Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 0.346 | Val. Loss: 0.276
	Train Acc : 85.40% | Val. Acc : 88.25%
	Train Rec : 72.44% | Val. Rec : 80.20%
	Train Prec: 76.98% | Val. Prec: 82.41%
	Train F1  : 72.67% | Val. F1  : 80.34%


100%|█████████▉| 1193468/1193514 [02:06<00:00, 11421.47it/s][A

Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: 0.265 | Val. Loss: 0.275
	Train Acc : 89.08% | Val. Acc : 89.35%
	Train Rec : 81.36% | Val. Rec : 79.93%
	Train Prec: 84.29% | Val. Prec: 85.63%
	Train F1  : 81.59% | Val. F1  : 81.14%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 0.221 | Val. Loss: 0.294
	Train Acc : 90.95% | Val. Acc : 89.35%
	Train Rec : 84.97% | Val. Rec : 79.02%
	Train Prec: 87.05% | Val. Prec: 86.59%
	Train F1  : 85.06% | Val. F1  : 80.55%
Early stopping!
best valid loss:  0.2607204321490115
#########################
{'hidden_dim': 128, 'n_layers': 2, 'is_bidirectional': True, 'dropout': 0.8, 'n_epochs': 20, 'embedding_dim': 200}
<bound method Module.parameters of RNN(
  (embedding): Embedding(18488, 200, padding_idx=1)
  (rnn): LSTM(200, 128, num_layers=2, dropout=0.8, bidirectional=True)
  (fully_connected): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.8, inplace=False)
)>
4431298
RNN(
  (embedding): Embedding(18488, 200, padding_idx=1)
  (r