<a href="https://colab.research.google.com/github/cmari038/Language-Translator/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install torch torchvision torchaudio
!pip install spacy
#!pip install collections

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [6]:
# Download the small English and Spanish spaCy pipelines; the tokenizers created
# later in this notebook refer to them by these exact names.
# The installer output advises restarting the runtime if the freshly installed
# packages are not picked up.
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m86.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-ne

In [4]:
import torch
import torchtext; torchtext.disable_torchtext_deprecation_warning()
import pandas as pd
import numpy as np
import spacy
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torch.utils.data import random_split
from torchtext.vocab import vocab
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import io
from collections import Counter, OrderedDict

In [7]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Parallel corpus with one sentence pair per row ('english' / 'spanish' columns).
dataset = 'https://raw.githubusercontent.com/cmari038/Language-Translator/main/data.csv'
data = pd.read_csv(dataset)

# processing data

english_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
spanish_tokenizer = get_tokenizer('spacy', language='es_core_news_sm')

# 70 / 10 / 20 train / validate / test split with three disjoint subsets.
# The validation and test rows are both carved out of the 30% that is NOT in
# `train`; dropping only the validation rows from `data` would leave every
# training row inside the test set.
train = data.sample(frac=0.7, random_state=50)
holdout = data.drop(train.index)
validate = holdout.sample(frac=1 / 3, random_state=25)  # one third of 30% = 10% of data
test = holdout.drop(validate.index)

# Token frequencies over the whole corpus, one counter per language.
counter1 = Counter()
counter2 = Counter()

for sentence in data['english']:
  counter1.update(english_tokenizer(sentence))

for sentence in data['spanish']:
  counter2.update(spanish_tokenizer(sentence))

# most_common() orders tokens by descending frequency, so frequent tokens get
# the smallest indices after the special tokens.
en_dict = OrderedDict(counter1.most_common())
es_dict = OrderedDict(counter2.most_common())

SPECIAL_TOKENS = ['<unk>', '<pad>', '<bos>', '<eos>']
vocab1 = vocab(en_dict, specials=SPECIAL_TOKENS)
vocab2 = vocab(es_dict, specials=SPECIAL_TOKENS)

# Map tokens that are not in the vocabulary to <unk> instead of raising.
vocab1.set_default_index(vocab1['<unk>'])
vocab2.set_default_index(vocab2['<unk>'])



In [8]:
class TensorSet(Dataset):
  """Map-style dataset that turns sentence pairs into index tensors.

  Args:
    data: DataFrame with 'english' and 'spanish' text columns.
    en_tokenizer: callable mapping an English sentence to a list of tokens.
    es_tokenizer: callable mapping a Spanish sentence to a list of tokens.
    en_vocab: mapping from English token to integer index; must contain
      '<bos>' and '<eos>'.
    es_vocab: mapping from Spanish token to integer index; must contain
      '<bos>' and '<eos>'.
  """

  def __init__(self, data, en_tokenizer, es_tokenizer, en_vocab, es_vocab):
    self.data = data
    self.en_tokenizer = en_tokenizer
    self.es_tokenizer = es_tokenizer
    self.en_vocab = en_vocab
    self.es_vocab = es_vocab

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.data)

  def __getitem__(self, index):
    """Return the (english, spanish) tensor pair at a positional index.

    DataLoader fetches samples through this method; it delegates to
    build_Tensor so both entry points behave the same.
    """
    return self.build_Tensor(index)

  def build_Tensor(self, index):
    """Encode the sentence pair at positional `index`.

    Returns:
      (en_tensor, es_tensor): 1-D long tensors of token indices, each wrapped
      in the '<bos>' / '<eos>' index of its own language's vocabulary.
    """
    row = self.data.iloc[index]
    en_tensor = self._encode(row['english'], self.en_tokenizer, self.en_vocab)
    es_tensor = self._encode(row['spanish'], self.es_tokenizer, self.es_vocab)
    return en_tensor, es_tensor

  @staticmethod
  def _encode(sentence, tokenizer, vocabulary):
    """Tokenize `sentence` and return <bos> + token indices + <eos> as a long tensor."""
    indices = [vocabulary[token] for token in tokenizer(sentence)]
    return torch.tensor(
        [vocabulary['<bos>']] + indices + [vocabulary['<eos>']],
        dtype=torch.long,
    )

def collate_fn(batch):
  """Merge a list of (english, spanish) tensor pairs into two padded tensors.

  Each language's sequences are padded with that language's '<pad>' index
  (looked up in the module-level vocab1 / vocab2).

  NOTE(review): pad_sequence is called without batch_first, so the result is
  presumably laid out as (max_len, batch) - confirm this matches what the
  consuming model expects.
  """
  # Split the pairs into one list per language.
  english_samples = [en_sample for en_sample, _ in batch]
  spanish_samples = [es_sample for _, es_sample in batch]

  padded_english = pad_sequence(english_samples, padding_value=vocab1['<pad>'])
  padded_spanish = pad_sequence(spanish_samples, padding_value=vocab2['<pad>'])
  return padded_english, padded_spanish

# Number of sentence pairs per batch. Without an explicit batch_size the loader
# yields one pair at a time, so the padding done in collate_fn never applies.
BATCH_SIZE = 32

# NOTE(review): this wraps every row of `data`, including rows that belong to
# the validation/test splits - confirm whether `train` was intended here.
tensors = TensorSet(data, english_tokenizer, spanish_tokenizer, vocab1, vocab2)
dataLoad = DataLoader(tensors, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)



In [None]:
class RNN_Encoder(nn.Module):
    """GRU encoder: embeds source token indices and runs them through a GRU.

    Args:
        input: size of the source vocabulary (number of embedding rows).
        hidden_dimension: size of both the embeddings and the GRU hidden state.
        dropout_p: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input, hidden_dimension, dropout_p=0.1):
        super().__init__()
        self.hidden_dimension = hidden_dimension
        self.embed = nn.Embedding(input, hidden_dimension)
        # batch_first=True: the GRU consumes and produces (batch, seq_len, feature).
        self.gru = nn.GRU(hidden_dimension, hidden_dimension, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Encode a batch of token-index sequences.

        Args:
            x: long tensor of token indices, shape (batch, seq_len).

        Returns:
            output: GRU output for every position, (batch, seq_len, hidden_dimension).
            hidden: final hidden state, (1, batch, hidden_dimension).
        """
        embedded = self.dropout(self.embed(x))
        output, hidden = self.gru(embedded)
        return output, hidden

class RNN_Decoder(nn.Module):
    """GRU decoder: embeds target token indices, runs a GRU, projects to vocabulary scores.

    Args:
        hidden_dimension: size of both the embeddings and the GRU hidden state.
        output: size of the target vocabulary.
    """

    def __init__(self, hidden_dimension, output):
        super().__init__()
        self.embed = nn.Embedding(output, hidden_dimension)
        # batch_first=True: the GRU consumes and produces (batch, seq_len, feature).
        self.gru = nn.GRU(hidden_dimension, hidden_dimension, batch_first=True)
        self.output = nn.Linear(hidden_dimension, output)

    def forward(self, input, hidden):
        """Decode a batch of target token indices.

        Args:
            input: long tensor of token indices, shape (batch, seq_len).
            hidden: initial GRU hidden state, shape (1, batch, hidden_dimension).

        Returns:
            output: unnormalized vocabulary scores, (batch, seq_len, output).
            hidden: final GRU hidden state, (1, batch, hidden_dimension).
        """
        embedded = nn.functional.relu(self.embed(input))
        output, hidden = self.gru(embedded, hidden)
        # Project every GRU output onto the target vocabulary.
        output = self.output(output)
        return output, hidden


# The class was originally declared under this misspelled name; keep it
# available so any reference to the old spelling still resolves.
RNN_Deocder = RNN_Decoder

class Sequence(nn.Module):
    """Sequence-to-sequence wrapper that chains an encoder and a decoder.

    Args:
        encoder: module whose forward(source) returns (output, hidden).
        decoder: module whose forward(target, hidden) returns (output, hidden).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target):
        """Encode `source`, then decode `target` starting from the encoder's hidden state.

        The whole target sequence is fed to the decoder in one call (teacher
        forcing); only the encoder's final hidden state is passed along.

        Args:
            source: source-language batch, passed unchanged to the encoder.
            target: target-language batch, passed unchanged to the decoder.

        Returns:
            The decoder's first return value (its per-position output).
        """
        _, hidden = self.encoder(source)
        output, _ = self.decoder(target, hidden)
        return output





Cuda
