In [None]:
#pip install transformer
#pip install scentencepiece

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [2]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [3]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [5]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [6]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [7]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [8]:
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

In [9]:

# Specify the path to your tab-delimited file
file_path = '/content/ron.txt'

# Read the data from the file
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()[:1000]

# Create a simple vocabulary mapping words to numerical indices
vocab = {"<PAD>": 0, "<UNK>": 1}  # You can extend this vocabulary as needed
current_index = len(vocab)

# Extract three parts from each line
src_sentences = []
tgt_sentences = []
additional_part = []

for line in lines:
    parts = line.strip().split('\t')
    if len(parts) == 3:
        src_sentence, tgt_sentence, additional = parts
        src_sentences.append(src_sentence)
        tgt_sentences.append(tgt_sentence)
        additional_part.append(additional)
    else:
        print(f"Skipping line: {line}")

# Tokenize sentences and convert to numerical indices
def tokenize_and_index(sentence, vocab):
    return [vocab.get(word, vocab["<UNK>"]) for word in sentence.split()]

src_data = [tokenize_and_index(sentence, vocab) for sentence in src_sentences]
tgt_data = [tokenize_and_index(sentence, vocab) for sentence in tgt_sentences]
additional_data = [tokenize_and_index(sentence, vocab) for sentence in additional_part]

# Convert lists of indices to tensors
src_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(seq) for seq in src_data], batch_first=True)
tgt_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(seq) for seq in tgt_data], batch_first=True)
additional_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(seq) for seq in additional_data], batch_first=True)

# Print the loaded data shapes
print("Source Data Shape:", src_data.shape)
print("Target Data Shape:", tgt_data.shape)
print("Additional Data Shape:", additional_data.shape)


Source Data Shape: torch.Size([1000, 5])
Target Data Shape: torch.Size([1000, 6])
Additional Data Shape: torch.Size([1000, 10])


In [18]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for epoch in range(10):
    optimizer.zero_grad()
    output = transformer(src_data, tgt_data[:, :-1])
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 0.014217894524335861
Epoch: 2, Loss: 0.010627048090100288
Epoch: 3, Loss: 0.009530439972877502
Epoch: 4, Loss: 0.008606387302279472
Epoch: 5, Loss: 0.007666945923119783
Epoch: 6, Loss: 0.006816623732447624
Epoch: 7, Loss: 0.006113996729254723
Epoch: 8, Loss: 0.005523843690752983
Epoch: 9, Loss: 0.005033179651945829
Epoch: 10, Loss: 0.004612462129443884


In [12]:
from transformers import pipeline
translator = pipeline("translation_en_to_de")
text = "Hello world! Hugging Face is the best NLP tool."
translation = translator(text)

print(translation)

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


[{'translation_text': 'Hello world, Hugging Face ist das beste NLP-Tool.'}]


In [13]:
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForTokenClassification


In [14]:
import sentencepiece


In [21]:
# Load pre-trained model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-ro"  # Replace with the model you want to use
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenize and encode the source sentences
#src_sentences = ["This is a test sentence."]  # Replace with your actual source sentences
input_ids = tokenizer(src_sentence, return_tensors="pt", padding=True, truncation=True)["input_ids"]

# Forward pass through the model
with torch.no_grad():
    outputs = model.generate(input_ids)

# Decode the generated target sentences
generated_sentences = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the results
print("Source Sentence:", src_sentence)
print("Generated Sentence:", generated_sentences)


Source Sentence: Don't trust Tom.
Generated Sentence: ['Să n-ai încredere în Tom.']


In [41]:
# Assuming 'transformer' is your trained model and 'tgt_vocab_size' is defined

# Set the model to evaluation mode
transformer.eval()

# Generate translations using greedy decoding
def generate_translation(model, src_data, max_length=50):
    with torch.no_grad():
        # Initialize the target sequence with the start-of-sequence (SOS) token index
        sos_token_idx = 2  # Adjust this index based on your vocabulary
        tgt_seq = torch.ones((src_data.shape[0], 1), dtype=torch.long).fill_(sos_token_idx).to(src_data.device)

        for _ in range(max_length):
            # Generate the next token
            output = model(src_data, tgt_seq)
            next_token = torch.argmax(output[:, -1, :], dim=-1).unsqueeze(1)

            # Concatenate the next token to the target sequence
            tgt_seq = torch.cat([tgt_seq, next_token], dim=-1)

            # Stop if the end-of-sequence (EOS) token index is generated for all sequences
            if torch.all(next_token == 1):
                break

        return tgt_seq[:, 1:]  # Exclude the initial SOS token

# Generate translations
translated_output = generate_translation(transformer, src_data)

# Print the generated translations
for i in range(len(src_data)):
    src_sentence = " ".join([str(word) for word in src_data[i].tolist() if word != 0])
    tgt_sentence = " ".join([str(word) for word in translated_output[i].tolist() if word != 0])
    print(f"Source: {src_sentence}\nTranslated: {tgt_sentence}\n")


Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1
Translated: 1

Source: 1 1
Translated: 1

Source: 1