<a href="https://colab.research.google.com/github/elangbijak4/LLM-Research/blob/main/Simple5_GPU_Corpus1M_N_Enkoder_M_Dekoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
# First, mount Google Drive so the corpus file can be read from this runtime
from google.colab import drive
drive.mount('/content/drive')

# Make sure the corpus is stored inside Google Drive
# NOTE(review): the file name looks like a placeholder — point it at the real corpus.
corpus_path = '/content/drive/My Drive/path_to_your_corpus.txt'

In [None]:
# Train the Word2Vec model

# Read the corpus and preprocess the data
def read_corpus(file_path):
    """Read a UTF-8 text corpus and tokenize it line by line.

    Args:
        file_path: Path to a text file; every line is treated as one sentence.

    Returns:
        A list with one entry per line of the file, each entry being whatever
        ``simple_preprocess`` returns for that line.
    """
    # Iterate over the file handle directly so each line is tokenized as it is
    # read, instead of first materializing every raw line with readlines().
    with open(file_path, 'r', encoding='utf-8') as f:
        return [simple_preprocess(line) for line in f]

sentences = read_corpus(corpus_path)

# Train the Word2Vec model on the tokenized corpus
word2vec_model = Word2Vec(
    sentences,
    vector_size=32,
    window=5,
    min_count=1,
    workers=4,
)
word_vectors = word2vec_model.wv

# Build the vocabulary (Word2Vec keys plus an end-of-sentence token)
vocab = [*word_vectors.key_to_index.keys(), '<eos>']
vocab_size = len(vocab)
embed_dim = word_vectors.vector_size

# Lookup tables in both directions between tokens and row indices
idx2word = dict(enumerate(vocab))
word2idx = {token: position for position, token in idx2word.items()}

# Embedding matrix: one row per vocabulary entry. Rows for tokens that
# Word2Vec does not know (here: '<eos>') stay at zero.
embedding_matrix = torch.zeros(vocab_size, embed_dim)
for word, idx in word2idx.items():
    if word not in word_vectors:
        continue
    embedding_matrix[idx] = torch.tensor(word_vectors[word])

In [None]:
# Define the Transformer model
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer over token ids with pretrained embeddings.

    The same embedding table is used for source and target tokens. The
    decoder output is projected to ``output_dim`` logits per position.

    Args:
        embed_dim: Embedding width; used as ``d_model`` of the Transformer.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward sub-layers.
        num_layers: Number of encoder layers and of decoder layers.
        output_dim: Number of output classes (logits) per position.
        pretrained_embeddings: Optional 2-D float tensor of shape
            (vocab_size, embed_dim) used to initialise the embedding table.
            When omitted, the module-level ``embedding_matrix`` is used.

    Raises:
        ValueError: If the embedding width does not equal ``embed_dim``.
    """

    def __init__(self, embed_dim, nhead, dim_feedforward, num_layers, output_dim,
                 pretrained_embeddings=None):
        super().__init__()
        if pretrained_embeddings is None:
            # Backward-compatible default: the matrix built from Word2Vec.
            pretrained_embeddings = embedding_matrix
        if pretrained_embeddings.size(1) != embed_dim:
            raise ValueError(
                f"embedding width {pretrained_embeddings.size(1)} does not match "
                f"embed_dim {embed_dim}"
            )
        # freeze=False keeps the embeddings trainable.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward
        )
        self.fc_out = nn.Linear(embed_dim, output_dim)

    def forward(self, src, tgt):
        """Compute per-position logits for the decoder input.

        Args:
            src: Source token ids, shape (batch_size, src_len).
            tgt: Decoder input token ids, shape (batch_size, tgt_len).

        Returns:
            Tensor of shape (batch_size, tgt_len, output_dim).
        """
        # nn.Transformer is built with its default layout, which expects
        # (seq_len, batch_size, embed_dim).
        src = self.embedding(src).permute(1, 0, 2)
        tgt = self.embedding(tgt).permute(1, 0, 2)

        # Causal mask: position t may only attend to positions <= t. Without
        # it the decoder can read the very token it is trained to predict
        # when the decoder input is the target shifted by one step.
        tgt_len = tgt.size(0)
        causal_mask = torch.triu(
            torch.full((tgt_len, tgt_len), float('-inf'),
                       dtype=tgt.dtype, device=tgt.device),
            diagonal=1,
        )

        output = self.transformer(src, tgt, tgt_mask=causal_mask)
        output = output.permute(1, 0, 2)  # (batch_size, seq_len, embed_dim)
        return self.fc_out(output)

# Hyperparameters
nhead = 2  # Number of attention heads passed to the Transformer
dim_feedforward = 128  # Hidden width of the Transformer feed-forward layers
num_layers = 4  # Used for both the encoder and the decoder stack
output_dim = vocab_size  # One logit per vocabulary entry
seq_length = 10  # Adjust to the sentence length in the corpus
batch_size = 64  # Larger batch size to make use of the GPU
num_epochs = 10  # Adjust to the available resources
learning_rate = 0.001

# Prepare the training data
def encode_sentence(sentence, word2idx, seq_length):
    """Convert a tokenized sentence into exactly ``seq_length`` token ids.

    Args:
        sentence: List of tokens.
        word2idx: Mapping from token to integer id; must contain ``'<eos>'``.
        seq_length: Length of the returned id list.

    Returns:
        A list of ``seq_length`` ids. Tokens missing from ``word2idx`` are
        mapped to the ``'<eos>'`` id, short sentences are right-padded with
        the ``'<eos>'`` id, and long sentences are truncated.
    """
    eos_idx = word2idx['<eos>']
    # Truncate first: without this, sentences longer than seq_length produce
    # rows of differing length, which cannot be stacked into one tensor.
    token_ids = [word2idx.get(word, eos_idx) for word in sentence[:seq_length]]
    token_ids.extend([eos_idx] * (seq_length - len(token_ids)))
    return token_ids

# Build the training data
# Example: use the first 50k sentences for training
src_sentences = sentences[:50000]
# The target is the same sentence with an end-of-sentence token appended
tgt_sentences = [[*tokens, '<eos>'] for tokens in src_sentences]

# Encode every sentence to a fixed-length row of ids and stack into tensors
src = torch.tensor([
    encode_sentence(tokens, word2idx, seq_length) for tokens in src_sentences
])
tgt = torch.tensor([
    encode_sentence(tokens, word2idx, seq_length) for tokens in tgt_sentences
])
# Separate copy of the target ids, used as the labels in the training loop
target_output = tgt.clone()

# Model, loss function, and optimizer
# Use the GPU when one is available; otherwise fall back to the CPU instead
# of crashing inside a hard-coded .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerModel(embed_dim, nhead, dim_feedforward, num_layers, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for i in range(0, len(src), batch_size):
        src_batch = src[i:i+batch_size].to(device)
        # Teacher forcing: the decoder input drops the last token and the
        # labels drop the first one, so the label at step t is the decoder
        # input at step t + 1.
        tgt_batch = tgt[i:i+batch_size, :-1].to(device)
        target_batch = target_output[i:i+batch_size, 1:].to(device)

        optimizer.zero_grad()
        output = model(src_batch, tgt_batch)
        # Flatten (batch, seq, vocab) logits and (batch, seq) labels so the
        # loss is computed per token.
        loss = criterion(output.reshape(-1, output_dim), target_batch.reshape(-1))
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch processed in this epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
# Function to generate a sentence autoregressively
def generate_sentence(model, prompt, max_length, word2idx, idx2word):
    """Greedily generate a continuation for ``prompt``.

    The encoder receives the prompt encoded to ``max_length`` ids. The decoder
    is seeded with the prompt's own token ids and, at every step, the most
    likely token at the last decoder position is appended. This mirrors the
    training loop, where the label at step t is the decoder input at step
    t + 1.

    Args:
        model: Trained model called as ``model(src, tgt)`` and returning
            logits of shape (batch, tgt_len, vocab).
        prompt: List of tokens to condition on.
        max_length: Encoder padding length and maximum number of generated
            tokens.
        word2idx: Mapping from token to id; must contain ``'<eos>'``.
        idx2word: Mapping from id back to token.

    Returns:
        The generated tokens joined by single spaces. The prompt itself and
        the terminating ``'<eos>'`` are not included.
    """
    eos_idx = word2idx['<eos>']
    # Run on whatever device the model lives on rather than assuming CUDA.
    device = next(model.parameters()).device

    model.eval()
    generated = []
    with torch.no_grad():
        src = torch.tensor([encode_sentence(prompt, word2idx, max_length)], device=device)
        # Unknown prompt words map to '<eos>', the same way encode_sentence
        # treats them. An empty prompt falls back to id 0, the start value
        # the previous zero-initialised buffer provided.
        seed = [word2idx.get(word, eos_idx) for word in prompt] or [0]
        tgt = torch.tensor([seed], dtype=torch.long, device=device)

        for _ in range(max_length):
            output = model(src, tgt)
            next_word_idx = output[0, -1].argmax(dim=-1).item()
            if idx2word[next_word_idx] == '<eos>':
                break
            # Collect only tokens that were actually produced, so unused
            # buffer slots can never leak into the result as vocabulary id 0.
            generated.append(next_word_idx)
            next_token = torch.tensor([[next_word_idx]], dtype=torch.long, device=device)
            tgt = torch.cat([tgt, next_token], dim=1)

    return ' '.join(idx2word[idx] for idx in generated)

# Example usage
# The prompt is a list of tokens, not a raw string.
prompt = ['halo']
generated_sentence = generate_sentence(model, prompt, seq_length, word2idx, idx2word)
print("Generated sentence:", generated_sentence)