In [53]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fasttext-vietnamese-word-vectors-full/cc.vi.300.bin
/kaggle/input/fasttext-vietnamese-word-vectors-full/cc.vi.300.vec
/kaggle/input/train-nlp/train.txt


In [54]:
import os
import re
import pickle
import random
import math
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Word2Vec (optional, only if strategy = word2vec)
try:
    import gensim
    from gensim.models import Word2Vec
except:
    gensim = None
    print("‚ö†Ô∏è gensim not available, will use random init")


In [55]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üîß Device: {DEVICE}")
print(f"üîß PyTorch version: {torch.__version__}")

üîß Device: cpu
üîß PyTorch version: 2.6.0+cu124


## 1.DATA ANALYSIS FUNCTION

In [56]:
def analyze_text_file(path, num_lines=1000):
    """
    Ph√¢n t√≠ch data ƒë·ªÉ quy·∫øt ƒë·ªãnh embedding strategy
    """
    print(f"\n{'='*70}")
    print(f"üîç ANALYZING FILE: {path}")
    print(f"{'='*70}\n")
    
    # Read file
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        lines = [line.strip() for line in f.readlines()]
    
    # Show samples
    print("üìÑ FIRST 5 LINES:")
    print("-" * 70)
    for i, line in enumerate(lines[:5], 1):
        display = line[:100] + '...' if len(line) > 100 else line
        print(f"{i}. {display}")
    print()
    
    # Combine text
    text = " ".join(lines)
    total_chars = len(text)
    
    # Tokenize
    words = re.findall(r"\w+", text.lower())
    total_words = len(words)
    vocab = Counter(words)
    vocab_size = len(vocab)
    
    # Calculate stats
    avg_word_len = sum(len(w) for w in words) / max(total_words, 1)
    
    # Detect language
    vi_chars = len(re.findall(r"[ƒÉ√¢ƒë√™√¥∆°∆∞√°√†·∫£√£·∫°√©√®·∫ª·∫Ω·∫π√≥√≤·ªè√µ·ªç√∫√π·ªß≈©·ª•√Ω·ª≥·ª∑·ªπ·ªµ]", text.lower()))
    en_chars = len(re.findall(r"[a-zA-Z]", text))
    
    if vi_chars > 0.05 * en_chars:
        language = "Vietnamese"
    else:
        language = "English"
    
    # Noise detection
    noise = len(re.findall(r"[^a-zA-Z0-9\s.,!?''\-]", text))
    noise_ratio = noise / max(total_chars, 1)
    
    # Sentence length
    sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
    sent_lens = [len(s.split()) for s in sentences]
    avg_sent_len = np.mean(sent_lens) if sent_lens else 0
    
    # Top words
    common_words = vocab.most_common(30)
    
    # Print stats
    print("üìä STATISTICS:")
    print("-" * 70)
    print(f"Total lines:           {len(lines):,}")
    print(f"Total characters:      {total_chars:,}")
    print(f"Total words:           {total_words:,}")
    print(f"Vocab size (unique):   {vocab_size:,}")
    print(f"Avg word length:       {avg_word_len:.2f}")
    print(f"Avg sentence length:   {avg_sent_len:.1f} words")
    print(f"Language:              {language}")
    print(f"Noise ratio:           {noise_ratio:.2%}")
    
    print(f"\nüî§ TOP 30 MOST COMMON WORDS:")
    print("-" * 70)
    print([w for w, _ in common_words])
    
    # =========================================================================
    # DECISION LOGIC
    # =========================================================================
    print(f"\n{'='*70}")
    print("üí° EMBEDDING STRATEGY RECOMMENDATION:")
    print(f"{'='*70}\n")
    config = {}
    
    # Case 1: Vietnamese
    if language == "Vietnamese":
        print("‚úÖ DETECTED: VIETNAMESE TEXT")
        print("\nRECOMMENDATION: FastText Vietnamese pretrained")
        print("\nReasons:")
        print("  ‚Ä¢ Vietnamese c·∫ßn embeddings t·ªët cho d·∫•u thanh")
        print("  ‚Ä¢ FastText Vietnamese 300d r·∫•t hi·ªáu qu·∫£")
        print("  ‚Ä¢ Download: https://fasttext.cc/docs/en/crawl-vectors.html")
        print("\nFallback: Train Word2Vec on data n·∫øu kh√¥ng c√≥ FastText\n")
        
        config = {
            'strategy': 'word_level',
            'embedding_method': 'fasttext_pretrained',
            'fasttext_path': '/kaggle/input/fasttext-vietnamese-word-vectors-full/cc.vi.300.vec',
            'embed_dim': 300,
            'min_word_freq': 2,
            'trainable': True,
            'fallback': 'word2vec'
        }
    
    # Case 2: Small vocab
    elif vocab_size < 10000:
        print("‚úÖ CASE: SMALL VOCABULARY (< 10K)")
        print("\nRECOMMENDATION: Random Init or GloVe 100d")
        print("\nReasons:")
        print("  ‚Ä¢ Vocab nh·ªè ‚Üí c√≥ th·ªÉ h·ªçc t·ª´ scratch")
        print("  ‚Ä¢ N·∫øu c√≥ GloVe ‚Üí s·ª≠ d·ª•ng ƒë·ªÉ tƒÉng t·ªëc converge")
        print("  ‚Ä¢ GloVe 100d nh·∫π, ph√π h·ª£p small vocab\n")
        
        config = {
            'strategy': 'word_level',
            'embedding_method': 'glove_or_random',
            'glove_path': '/kaggle/input/glove6b100d/glove.6B.100d.txt',
            'embed_dim': 100,
            'min_word_freq': 2,
            'trainable': True,
            'fallback': 'random_init'
        }
    
    # Case 3: Medium vocab, enough data
    elif vocab_size < 50000 and total_words > 500000:
        print("‚úÖ CASE: MEDIUM VOCAB (10-50K) + LARGE DATA")
        print("\nRECOMMENDATION: FastText English OR train Word2Vec")
        print("\nReasons:")
        print("  ‚Ä¢ Data ƒë·ªß l·ªõn ‚Üí Word2Vec h·ªçc t·ªët t·ª´ domain")
        print("  ‚Ä¢ FastText 300d n·∫øu mu·ªën t·∫≠n d·ª•ng pretrained")
        print("  ‚Ä¢ Word2Vec t·ª´ scratch n·∫øu domain-specific\n")
        
        config = {
            'strategy': 'word_level',
            'embedding_method': 'fasttext_or_word2vec',
            'fasttext_path': '/kaggle/input/fasttext-en/cc.en.300.bin',
            'embed_dim': 300,
            'min_word_freq': 3,
            'trainable': True,
            'w2v_config': {
                'vector_size': 300,
                'window': 5,
                'min_count': 3,
                'epochs': 30,
                'workers': 4
            }
        }
    
    # Case 4: Medium vocab, less data
    elif vocab_size < 50000:
        print("‚úÖ CASE: MEDIUM VOCAB (10-50K) + LESS DATA")
        print("\nRECOMMENDATION: GloVe 300d or FastText")
        print("\nReasons:")
        print("  ‚Ä¢ Data kh√¥ng ƒë·ªß train Word2Vec t·ªët")
        print("  ‚Ä¢ Pretrained embeddings gi√∫p generalization")
        print("  ‚Ä¢ GloVe/FastText coverage t·ªët cho English\n")
        
        config = {
            'strategy': 'word_level',
            'embedding_method': 'glove_or_fasttext',
            'glove_path': '/kaggle/input/glove840b300d/glove.840B.300d.txt',
            'fasttext_path': '/kaggle/input/fasttext-en/cc.en.300.bin',
            'embed_dim': 300,
            'min_word_freq': 3,
            'trainable': True,
            'max_vocab_size': 30000
        }
    
    # Case 5: Large vocab
    else:
        print("‚úÖ CASE: LARGE VOCABULARY (> 50K)")
        print("\nRECOMMENDATION: FastText 300d + Vocab Limit")
        print("\nReasons:")
        print("  ‚Ä¢ Vocab l·ªõn ‚Üí c·∫ßn gi·ªõi h·∫°n")
        print("  ‚Ä¢ FastText handle OOV t·ªët")
        print("  ‚Ä¢ Gi·ªØ top frequent words\n")
        
        config = {
            'strategy': 'word_level',
            'embedding_method': 'fasttext',
            'fasttext_path': '/kaggle/input/fasttext-en/cc.en.300.bin',
            'embed_dim': 300,
            'min_word_freq': 5,
            'trainable': True,
            'max_vocab_size': 40000
        }
    
    # Print config
    print("üìã CONFIGURATION:")
    print("-" * 70)
    for k, v in config.items():
        if isinstance(v, dict):
            print(f"{k}:")
            for k2, v2 in v.items():
                print(f"  {k2:20s}: {v2}")
        else:
            print(f"{k:20s}: {v}")
    
    print("\n" + "="*70)
    
    return text, config, vocab_size, total_words

In [57]:
DATA_PATH = "/kaggle/input/train-nlp/train.txt"

# Uncomment khi c√≥ data th·∫≠t:
text, EMBEDDING_CONFIG, vocab_size, total_words = analyze_text_file(DATA_PATH)


üîç ANALYZING FILE: /kaggle/input/train-nlp/train.txt

üìÑ FIRST 5 LINES:
----------------------------------------------------------------------
1. "6 ContributorsCh√∫ng Ta C·ªßa T∆∞∆°ng Lai Lyrics  Li·ªáu mai sau phai v·ªôi mau kh√¥ng b∆∞·ªõc b√™n c·∫°nh nhau (Kh...
2. "2 ContributorsChuy·ªán H·ª£p Tan Lyricsƒê√™m nay l·∫∑ng l·∫Ω S∆∞∆°ng m√π v·ªÅ giƒÉng tr√™n m·∫£nh t√¨nh qu√™ C√≥ ai ƒë·ªÉ bu...
3. "2 ContributorsT·∫øt N√†y Con S·∫Ω V·ªÅ LyricsT·∫øt n√†y con s·∫Ω v·ªÅ, d·∫´u ·ªü ƒë√¢u con c≈©ng s·∫Ω v·ªÅ V·ªÅ ƒëem h·∫øt chuy·ªán...
4. "1 ContributorT·∫øt Nh·ªõ T·ªõi Gi√† LyricsIntro: M·ªói ƒë·ªùi ng∆∞·ªùi ch·ªâ s·ªëng 1 l·∫ßn Vi·ªác g√¨ ng·ªìi ƒë√≥ ph√¢n v√¢n T·∫øt...
5. "2 ContributorsV√¨ Ch√≠nh L√† Em Lyrics  D√°ng ai qua, cho anh ng∆° ng·∫©n G√≥t ki√™u sa, cho anh v∆∞∆°ng v·∫•n N...

üìä STATISTICS:
----------------------------------------------------------------------
Total lines:           1,078
Total characters:      1,821,664
Total words:           414,751
Vocab size (uniqu

## 2.TEXT PREPROCESSING

In [58]:
import re

def clean_text(text):
    """Lo·∫°i b·ªè metadata, chu·∫©n h√≥a kho·∫£ng tr·∫Øng"""
    text = re.sub(r'(?i)(contributors|lyrics|intro[:\-])', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def split_into_sentences(text):
    """T√°ch c√¢u m·ªÅm cho ti·∫øng Vi·ªát"""
    sentences = re.split(r'[.!?]+', text)
    return [s.strip() for s in sentences if len(s.strip()) > 0]

def tokenize_word_level(sentence):
    """Tokenize gi·ªØ d·∫•u ti·∫øng Vi·ªát v√† d·∫•u c√¢u"""
    tokens = re.findall(r"[A-Za-z√Ä-·ªπ]+|[^\w\s]", sentence)
    return [t.lower() for t in tokens]

def load_and_tokenize_data(path, max_len=50):
    """Load v√† tokenize corpus lyric"""
    print("\nüìö LOADING & TOKENIZING DATA...")
    print("-" * 70)
    
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        text = f.read()

    text = clean_text(text)
    sentences = split_into_sentences(text)
    
    tokenized_sentences = []
    for sent in sentences:
        tokens = tokenize_word_level(sent)
        if 2 <= len(tokens) <= max_len:
            tokenized_sentences.append(tokens)
    
    print(f"‚úÖ Sentences: {len(tokenized_sentences):,}")
    print(f"‚úÖ Example: {' '.join(tokenized_sentences[0][:15])}")
    
    return tokenized_sentences


In [59]:
tokenized_sentences = load_and_tokenize_data(DATA_PATH)


üìö LOADING & TOKENIZING DATA...
----------------------------------------------------------------------
‚úÖ Sentences: 2,780
‚úÖ Example: " ch√∫ng ta c·ªßa t∆∞∆°ng lai li·ªáu mai sau phai v·ªôi mau kh√¥ng b∆∞·ªõc b√™n


##  3.BUILD VOCABULARY

In [60]:
def build_vocabulary(sentences, min_freq=2, max_vocab_size=None):
    """Build vocabulary"""
    print("\nüìñ BUILDING VOCABULARY...")
    print("-" * 70)
    
    counter = Counter()
    for sent in sentences:
        counter.update(sent)
    
    # Filter by frequency
    vocab_words = [w for w, c in counter.items() if c >= min_freq]
    
    # Limit vocab size
    if max_vocab_size and len(vocab_words) > max_vocab_size:
        print(f"‚ö†Ô∏è  Limiting vocab: {len(vocab_words)} ‚Üí {max_vocab_size}")
        vocab_words = [w for w, _ in counter.most_common(max_vocab_size)]
    
    # Special tokens
    special = ["<pad>", "<unk>"]
    vocab = special + sorted(vocab_words)
    
    stoi = {w: i for i, w in enumerate(vocab)}
    itos = vocab
    
    print(f"‚úÖ Vocab size: {len(vocab):,}")
    print(f"   Min frequency: {min_freq}")
    
    # Coverage
    total = sum(counter.values())
    covered = sum(counter[w] for w in vocab_words)
    print(f"   Coverage: {covered/total:.2%}")
    
    return stoi, itos, counter

In [61]:
MIN_FREQ = 2
MAX_VOCAB_SIZE = None # Kh√¥ng gi·ªõi h·∫°n

stoi, itos, counter = build_vocabulary(
    sentences=tokenized_sentences, 
    min_freq=MIN_FREQ, 
    max_vocab_size=MAX_VOCAB_SIZE
)

if stoi and itos:
    print("\n--- K·∫øt qu·∫£ tr·∫£ v·ªÅ ---")
    print(f"K√≠ch th∆∞·ªõc Vocab (itos): {len(itos)}")
    print(f"V√≠ d·ª• 10 t·ª´ ƒë·∫ßu ti√™n trong vocab: {itos[:10]}")
    
    # Ki·ªÉm tra √°nh x·∫° t·ª´-sang-s·ªë (string-to-index)
    word_example = itos[10] # L·∫•y m·ªôt t·ª´ ng·∫´u nhi√™n
    index_example = stoi[word_example]
    print(f"V√≠ d·ª• √°nh x·∫°: '{word_example}' -> {index_example}")


üìñ BUILDING VOCABULARY...
----------------------------------------------------------------------
‚úÖ Vocab size: 2,039
   Min frequency: 2
   Coverage: 97.29%

--- K·∫øt qu·∫£ tr·∫£ v·ªÅ ---
K√≠ch th∆∞·ªõc Vocab (itos): 2039
V√≠ d·ª• 10 t·ª´ ƒë·∫ßu ti√™n trong vocab: ['<pad>', '<unk>', '"', '&', "'", '(', ')', '*', ',', '-']
V√≠ d·ª• √°nh x·∫°: '/' -> 10


## 4. LOAD PRETRAINED EMBEDDINGS

In [62]:
import os
import numpy as np

def load_fasttext_embeddings(fasttext_path, stoi, embed_dim=300):
    """Load FastText pretrained .vec embeddings (text format)"""
    print(f"\nüé® LOADING FASTTEXT (.vec): {fasttext_path}")
    print("-" * 70)
    
    if not os.path.exists(fasttext_path):
        print(f"‚ùå File not found, using random init")
        return None
    
    vocab_size = len(stoi)
    embedding_matrix = np.random.randn(vocab_size, embed_dim).astype(np.float32) * 0.01
    
    found = 0
    with open(fasttext_path, 'r', encoding='utf8', errors='ignore') as f:
        first_line = f.readline()
        # M·ªôt s·ªë file .vec c√≥ header: "vocab_size dim"
        if len(first_line.split()) != embed_dim + 1:
            f.seek(0)
        else:
            print(f"Header detected: {first_line.strip()}")
        
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) < embed_dim + 1:
                continue
            word = parts[0]
            if word in stoi:
                vec = np.asarray(parts[1:], dtype=np.float32)
                embedding_matrix[stoi[word]] = vec
                found += 1
    
    embedding_matrix[0] = 0.0  # padding token
    print(f"‚úÖ Found {found:,}/{vocab_size:,} pretrained vectors ({found/vocab_size:.1%} coverage)")
    print(f"‚úÖ Embedding matrix shape: {embedding_matrix.shape}")
    
    return embedding_matrix


def initialize_embeddings(stoi, config):
    """
    Initialize embedding matrix based on config
    """
    embed_dim = config.get('embed_dim', 300)
    embedding_method = config.get('embedding_method', 'random')
    vocab_size = len(stoi)

    print(f"\nüöÄ Initializing embeddings using method: {embedding_method}")

    if embedding_method == 'fasttext_pretrained':
        fasttext_path = config.get('fasttext_path')
        embedding_matrix = load_fasttext_embeddings(fasttext_path, stoi, embed_dim)
        if embedding_matrix is None:
            print("‚ö†Ô∏è Falling back to random initialization")
            embedding_matrix = np.random.randn(vocab_size, embed_dim).astype(np.float32) * 0.01
    else:
        print("‚ÑπÔ∏è Using random initialization (no pretrained embeddings)")
        embedding_matrix = np.random.randn(vocab_size, embed_dim).astype(np.float32) * 0.01

    # ƒê·∫∑t vector cho padding token (n·∫øu c√≥ index 0)
    embedding_matrix[0] = 0.0
    print(f"‚úÖ Final embedding matrix shape: {embedding_matrix.shape}")

    return embedding_matrix


In [63]:
config = {
    'embedding_method': 'fasttext_pretrained',
    'fasttext_path': '/kaggle/input/fasttext-vietnamese-word-vectors-full/cc.vi.300.vec',
    'embed_dim': 300,
    'min_word_freq': 2,
    'trainable': True
}



In [64]:
embedding_matrix = initialize_embeddings(stoi, config)





üöÄ Initializing embeddings using method: fasttext_pretrained

üé® LOADING FASTTEXT (.vec): /kaggle/input/fasttext-vietnamese-word-vectors-full/cc.vi.300.vec
----------------------------------------------------------------------
‚úÖ Found 2,005/2,039 pretrained vectors (98.3% coverage)
‚úÖ Embedding matrix shape: (2039, 300)
‚úÖ Final embedding matrix shape: (2039, 300)


In [65]:
print(embedding_matrix.shape)
print(embedding_matrix[:2])


(2039, 300)
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+0

## 5.DATASET (Sliding Window)

In [66]:
def create_sliding_window_dataset(sentences, stoi, max_len=40):
    """Create sliding window training examples"""
    print(f"\nü™ü CREATING SLIDING WINDOWS (max_len={max_len})...")
    print("-" * 70)
    
    PAD_ID = stoi["<pad>"]
    UNK_ID = stoi["<unk>"]
    
    inputs, targets = [], []
    
    for sent in sentences:
        ids = [stoi.get(w, UNK_ID) for w in sent]
        
        for start in range(len(ids)):
            window = ids[start:start + max_len + 1]
            if len(window) < 2:
                continue
            
            inp = window[:-1]
            tgt = window[1:]
            
            # Left-pad
            inp_padded = [PAD_ID] * (max_len - len(inp)) + inp
            tgt_padded = [PAD_ID] * (max_len - len(tgt)) + tgt
            
            inputs.append(inp_padded)
            targets.append(tgt_padded)
    
    X = np.array(inputs, dtype=np.int64)
    Y = np.array(targets, dtype=np.int64)
    
    print(f"‚úÖ Examples: {len(X):,}")
    print(f"   Shape: {X.shape}")
    
    return X, Y


class TextDataset(Dataset):
    def __init__(self, X, Y):
        self.X = torch.from_numpy(X).long()
        self.Y = torch.from_numpy(Y).long()
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]


In [68]:
# X√°c ƒë·ªãnh chi·ªÅu d√†i t·ªëi ƒëa c·ªßa c·ª≠a s·ªï tr∆∞·ª£t (ng·ªØ c·∫£nh)
# Gi√° tr·ªã n√†y n√™n kh·ªõp v·ªõi SEQUENCE_LENGTH b·∫°n ƒë·ªãnh d√πng trong Model (CELL 8)
SEQUENCE_LENGTH = 40
# L·∫•y c√°c bi·∫øn t·ª´ c√°c b∆∞·ªõc tr∆∞·ªõc:
# 1. tokenized_sentences (t·ª´ load_and_tokenize_data)
# 2. stoi (t·ª´ build_vocabulary)

X, Y = create_sliding_window_dataset(
    sentences=tokenized_sentences, 
    stoi=stoi, 
    max_len=SEQUENCE_LENGTH # S·ª≠ d·ª•ng bi·∫øn SEQUENCE_LENGTH ƒë√£ khai b√°o
)

# T·∫°o PyTorch Dataset
full_dataset = TextDataset(X, Y)

print(f"‚úÖ ƒê√£ t·∫°o Dataset PyTorch v·ªõi {len(full_dataset):,} v√≠ d·ª• hu·∫•n luy·ªán.")


ü™ü CREATING SLIDING WINDOWS (max_len=40)...
----------------------------------------------------------------------
‚úÖ Examples: 39,665
   Shape: (39665, 40)
‚úÖ ƒê√£ t·∫°o Dataset PyTorch v·ªõi 39,665 v√≠ d·ª• hu·∫•n luy·ªán.


## 5.5.Train,val,test

In [69]:
# =============================================================================
# ‚öôÔ∏è CHIA T·∫¨P TRAIN / VAL / TEST T·ª™ T·∫¨P TRAIN BAN ƒê·∫¶U
# =============================================================================
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

SEED = 42
BATCH_SIZE = 64
SEQUENCE_LENGTH = 40

# -------------------------------------------------------------------------
# B∆∞·ªõc 1: t√°ch Test tr∆∞·ªõc, r·ªìi t√°ch Val t·ª´ ph·∫ßn c√≤n l·∫°i
# -------------------------------------------------------------------------
X_temp, X_test, Y_temp, Y_test = train_test_split(
    X, Y,
    test_size=0.1,   # 10% cho test
    random_state=SEED,
    shuffle=True
)

X_train, X_val, Y_train, Y_val = train_test_split(
    X_temp, Y_temp,
    test_size=0.1,   # 10% c·ªßa ph·∫ßn c√≤n l·∫°i ‚Üí ~9% t·ªïng
    random_state=SEED,
    shuffle=True
)

print(f"üìä Dataset Split:")
print(f"   - Train samples: {len(X_train):,}")
print(f"   - Val samples:   {len(X_val):,}")
print(f"   - Test samples:  {len(X_test):,}")

# -------------------------------------------------------------------------
# B∆∞·ªõc 2: T·∫°o Dataset v√† DataLoader
# -------------------------------------------------------------------------
train_dataset = TextDataset(X_train, Y_train)
val_dataset = TextDataset(X_val, Y_val)
test_dataset = TextDataset(X_test, Y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, pin_memory=True)

print(f"‚úÖ Train loader: {len(train_loader)} batches")
print(f"‚úÖ Val loader:   {len(val_loader)} batches")
print(f"‚úÖ Test loader:  {len(test_loader)} batches")


üìä Dataset Split:
   - Train samples: 32,128
   - Val samples:   3,570
   - Test samples:  3,967
‚úÖ Train loader: 502 batches
‚úÖ Val loader:   56 batches
‚úÖ Test loader:  62 batches


## 6.LSTM MODEL

In [70]:
class LSTMTextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2,
                 dropout=0.3, embedding_matrix=None, trainable_emb=True):
        super().__init__()
        
        # Embedding
        if embedding_matrix is not None:
            weights = torch.tensor(embedding_matrix, dtype=torch.float32)
            self.embedding = nn.Embedding.from_pretrained(
                weights, freeze=not trainable_emb, padding_idx=0
            )
        else:
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        self.embed_dim = self.embedding.embedding_dim
        
        # Layers
        self.input_ln = nn.LayerNorm(self.embed_dim)
        self.input_dropout = nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(
            self.embed_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )
        
        self.hidden_ln = nn.LayerNorm(hidden_dim)
        self.hidden_dropout = nn.Dropout(dropout)
        
        # Project to embedding space for weight tying
        self.hidden_to_embed = nn.Linear(hidden_dim, self.embed_dim)
        nn.init.xavier_uniform_(self.hidden_to_embed.weight)
        
        self.output_ln = nn.LayerNorm(self.embed_dim)
        self.output_dropout = nn.Dropout(dropout)
        self.output_bias = nn.Parameter(torch.zeros(vocab_size))
    
    def forward(self, x):
        # Embedding
        emb = self.embedding(x)  # (B, T, E)
        emb = self.input_ln(emb)
        emb = self.input_dropout(emb)
        
        # LSTM
        lstm_out, _ = self.lstm(emb)  # (B, T, H)
        lstm_out = self.hidden_ln(lstm_out)
        lstm_out = self.hidden_dropout(lstm_out)
        
        # Project + residual
        proj = self.hidden_to_embed(lstm_out)
        proj = proj + emb
        proj = self.output_ln(proj)
        proj = self.output_dropout(proj)
        
        # Weight tying
        logits = torch.matmul(proj, self.embedding.weight.t()) + self.output_bias
        
        return logits

In [71]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üñ•Ô∏è  S·ª≠ d·ª•ng thi·∫øt b·ªã: {DEVICE}") 

üñ•Ô∏è  S·ª≠ d·ª•ng thi·∫øt b·ªã: cpu


In [72]:
model = LSTMTextGenerator(
    vocab_size=len(stoi),
    embed_dim=300,
    hidden_dim=256,
    num_layers=2,
    dropout=0.3,
    embedding_matrix=embedding_matrix,
    trainable_emb=True
).to(device)


## 7.Training

In [73]:
class EarlyStopping:
    def __init__(self, patience=5, path='best_model.pt'):
        self.patience = patience
        self.path = path
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
    
    def __call__(self, val_loss, model):
        if self.best_loss is None or val_loss < self.best_loss:
            self.best_loss = val_loss
            torch.save(model.state_dict(), self.path)
            self.counter = 0
            print(f"   üíæ Checkpoint saved")
        else:
            self.counter += 1
            print(f"   ‚è≥ EarlyStopping: {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True


def train_model(model, train_loader, val_loader, num_epochs=50,
                lr=3e-4, clip_norm=5.0, patience=5, device='cuda'):
    """Training loop"""
    print(f"\n{'='*70}")
    print("üöÄ TRAINING START")
    print(f"{'='*70}\n")
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', factor=0.5, patience=3, verbose=True
    )
    
    early_stopping = EarlyStopping(patience=patience)
    history = {'train_loss': [], 'train_ppl': [], 'val_loss': [], 'val_ppl': []}
    
    for epoch in range(1, num_epochs + 1):
        # Train
        model.train()
        total_loss = 0.0
        
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            
            optimizer.zero_grad()
            logits = model(x)
            
            B, T, V = logits.size()
            loss = criterion(logits.view(B*T, V), y.view(B*T))
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
            optimizer.step()
            
            total_loss += loss.item()
        
        train_loss = total_loss / len(train_loader)
        train_ppl = math.exp(min(train_loss, 100))
        
        # Validation
        model.eval()
        total_val = 0.0
        
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                logits = model(x)
                B, T, V = logits.size()
                total_val += criterion(logits.view(B*T, V), y.view(B*T)).item()
        
        val_loss = total_val / len(val_loader)
        val_ppl = math.exp(min(val_loss, 100))
        
        history['train_loss'].append(train_loss)
        history['train_ppl'].append(train_ppl)
        history['val_loss'].append(val_loss)
        history['val_ppl'].append(val_ppl)
        
        print(f"Epoch {epoch:3d}/{num_epochs} | "
              f"Train: {train_loss:.4f} (PPL {train_ppl:7.2f}) | "
              f"Val: {val_loss:.4f} (PPL {val_ppl:7.2f})")
        
        scheduler.step(val_loss)
        early_stopping(val_loss, model)
        
        if early_stopping.early_stop:
            print("\n‚ö†Ô∏è  Early stopping!")
            break
    
    model.load_state_dict(torch.load('best_model.pt'))
    return model, history

In [74]:
# KH·ªûI T·∫†O TR·ªåNG S·ªê (T√πy ch·ªçn, nh∆∞ng n√™n l√†m)
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight.data)
        if m.bias is not None:
            m.bias.data.zero_()
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)
model.apply(initialize_weights)

# Chuy·ªÉn model sang DEVICE (CPU trong tr∆∞·ªùng h·ª£p n√†y)
model.to(DEVICE)

LSTMTextGenerator(
  (embedding): Embedding(2039, 300, padding_idx=0)
  (input_ln): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (input_dropout): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.3)
  (hidden_ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (hidden_dropout): Dropout(p=0.3, inplace=False)
  (hidden_to_embed): Linear(in_features=256, out_features=300, bias=True)
  (output_ln): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (output_dropout): Dropout(p=0.3, inplace=False)
)

In [75]:
# ƒê·ªãnh nghƒ©a c√°c tham s·ªë Hu·∫•n luy·ªán
NUM_EPOCHS = 15
LEARNING_RATE = 1e-3
CLIP_VALUE = 5.0 # Gi√° tr·ªã cho Gradient Clipping
PATIENCE = 5     # S·ªë epoch ch·ªù tr∆∞·ªõc khi d·ª´ng s·ªõm (Early Stopping)

print("üöÄ B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN M√î H√åNH...")
print("-" * 50)

# >> G·ªåI H√ÄM HU·∫§N LUY·ªÜN:
best_model, history = train_model(
    model=model, 
    train_loader=train_loader, 
    val_loader=val_loader, 
    num_epochs=NUM_EPOCHS, 
    lr=LEARNING_RATE, 
    clip_norm=CLIP_VALUE, 
    patience=PATIENCE, 
    device=DEVICE # Bi·∫øn DEVICE ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a ·ªü B∆∞·ªõc 1
)

print("-" * 50)
print("‚úÖ HU·∫§N LUY·ªÜN HO√ÄN T·∫§T.")
print(f"M√¥ h√¨nh t·ªët nh·∫•t (Best Model) ƒë√£ ƒë∆∞·ª£c l∆∞u.")

üöÄ B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN M√î H√åNH...
--------------------------------------------------

üöÄ TRAINING START





Epoch   1/15 | Train: 5.3382 (PPL  208.13) | Val: 3.5907 (PPL   36.26)
   üíæ Checkpoint saved
Epoch   2/15 | Train: 3.4337 (PPL   30.99) | Val: 2.2240 (PPL    9.24)
   üíæ Checkpoint saved
Epoch   3/15 | Train: 2.5451 (PPL   12.74) | Val: 1.5571 (PPL    4.75)
   üíæ Checkpoint saved
Epoch   4/15 | Train: 2.0663 (PPL    7.90) | Val: 1.2498 (PPL    3.49)
   üíæ Checkpoint saved
Epoch   5/15 | Train: 1.7792 (PPL    5.93) | Val: 1.0781 (PPL    2.94)
   üíæ Checkpoint saved
Epoch   6/15 | Train: 1.6054 (PPL    4.98) | Val: 0.9751 (PPL    2.65)
   üíæ Checkpoint saved
Epoch   7/15 | Train: 1.4792 (PPL    4.39) | Val: 0.9107 (PPL    2.49)
   üíæ Checkpoint saved
Epoch   8/15 | Train: 1.3893 (PPL    4.01) | Val: 0.8616 (PPL    2.37)
   üíæ Checkpoint saved
Epoch   9/15 | Train: 1.3218 (PPL    3.75) | Val: 0.8343 (PPL    2.30)
   üíæ Checkpoint saved
Epoch  10/15 | Train: 1.2662 (PPL    3.55) | Val: 0.7978 (PPL    2.22)
   üíæ Checkpoint saved
Epoch  11/15 | Train: 1.2233 (PPL    3.4

## 8.TEXT GENERATION

In [84]:
import torch

import torch
import torch.nn.functional as F
import random

import torch
import torch.nn.functional as F
import random

def generate_text(
    model, seed, stoi, itos,
    max_len=50, device="cpu",
    temperature=0.8, top_k=10,
    repetition_penalty=1.2,
    stop_tokens=["<eos>"],
    verbose=False
):
    """Generate text with top-k sampling, repetition penalty, and early stopping."""
    model.eval()
    
    UNK_ID = stoi.get("<unk>", 0)
    seed = seed.lower().strip()
    tokens = [stoi.get(w, UNK_ID) for w in seed.split()]
    generated = tokens[:]

    with torch.no_grad():
        for step in range(max_len):
            x = torch.tensor([generated], dtype=torch.long, device=device)
            logits = model(x)[0, -1, :] / temperature

            # Repetition penalty
            for t in set(generated):
                logits[t] /= repetition_penalty

            probs = F.softmax(logits, dim=-1)
            top_k_probs, top_k_ids = torch.topk(probs, k=top_k)
            top_k_probs = top_k_probs / top_k_probs.sum()

            next_id = random.choices(
                top_k_ids.cpu().tolist(),
                weights=top_k_probs.cpu().tolist()
            )[0]

            generated.append(next_id)

            # Early stop if meet stop token
            if itos[next_id] in stop_tokens:
                break

            if verbose:
                print(f"Step {step}: {itos[next_id]}")

    words = [itos[i] if i < len(itos) else "<unk>" for i in generated]
    return ' '.join(words)




# ==============================
# üå∏ V√≠ d·ª• s·ª≠ d·ª•ng:
# ==============================
seeds = [
    "Em y√™u anh",
    "Tr·ªùi ƒë√™m nay",
    "Gi·ªçt m∆∞a r∆°i",
    "Anh v·∫´n nh·ªõ",
    "M·ªôt ng√†y n√†o ƒë√≥"
]

for s in seeds:
    print("Seed:", s)
    try:
        generated_text = generate_text(
            model=best_model,
            seed=s,
            stoi=stoi,
            itos=itos,
            max_len=50,
            device=device
        )
        print("Generated:", generated_text)
    except Exception as e:
        print("‚ö†Ô∏è Please adapt generate_text() to your model:", e)
    print("-" * 60)


Seed: Em y√™u anh
Generated: em y√™u anh , yeah li·ªáu m√¨nh c√≤n y√™u nhau th√¨ kh√¥ng th·∫•m bi·∫øt ai c√≤n nh·ªõ ai , khu√¥n m·∫∑t ƒë√°ng th∆∞∆°ng c√°nh hoa √∫a t√†n b·ª©c tranh v√©n m√†n b√≥ng ai xa ng√∫t ng√†n n∆∞·ªõc m·∫Øt r∆°i ·ª©a tr√†n s·∫ßu l√†n mi kh√©p t√¨nh bu·ªìn ai √©p bi·∫øt ƒëi v·ªÅ ch·ªën
------------------------------------------------------------
Seed: Tr·ªùi ƒë√™m nay
Generated: tr·ªùi ƒë√™m nay sao em kh√¥ng vui l√™n ti·∫øng y√™u ·∫•m √™m nh√¨n l·∫°i ni·ªÅm tin t·ª´ng trao gi·ªù sao sau bao ngu mu·ªôi sai l·∫ßm anh v·∫´n y·∫øu m·ªÅm l√† v·∫øt th∆∞∆°ng l√≤ng x√≥t xa ƒë·∫øn n∆°i ch√∫ng ta , nh∆∞ng m√† v∆∞·ª£t qua h·∫øt nh·ªØng nghƒ© suy con ƒë∆∞·ªùng sau n√†y
------------------------------------------------------------
Seed: Gi·ªçt m∆∞a r∆°i
Generated: gi·ªçt m∆∞a r∆°i tr√™n <unk> tr∆∞·ªùng tu·ªïi th∆° nh∆∞ n·∫Øng ·∫•m √°p m√πa ƒë√¥ng " " t·ª± t√¨nh c√≥ ng∆∞·ªùi y√™u ch∆∞a v∆°i nh·ªõ nh·ªØng l√∫c h·ª©a h·∫πn , l√∫c h·ª©a h·∫πn m√† th√¥i ch·∫≥ng c√≤n g√¨ ƒë√¢u , righ

## 9.VISUALIZATION & REPORT

In [88]:
for x, y in test_loader:
    print("x shape:", x.shape)
    print("x sample:", x[0][:10])  # 10 token ƒë·∫ßu c·ªßa m·∫´u ƒë·∫ßu ti√™n
    print("y shape:", y.shape)
    break


x shape: torch.Size([64, 40])
x sample: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y shape: torch.Size([64, 40])


In [95]:
"""
H√†m ƒë√°nh gi√° model tr√™n test set v·ªõi metric Perplexity
"""
import torch
import torch.nn as nn
import math
import time
from tqdm import tqdm


def evaluate_on_test(model, test_loader, device='cuda', verbose=True):
    """
    ƒê√°nh gi√° model tr√™n test set
    
    Args:
        model: LSTM model ƒë√£ train
        test_loader: DataLoader cho test set
        device: 'cuda' or 'cpu'
        verbose: In chi ti·∫øt hay kh√¥ng
    
    Returns:
        dict: {
            'test_loss': float,
            'test_perplexity': float,
            'num_batches': int,
            'num_samples': int
        }
    """
    
    if verbose:
        print(f"\n{'='*70}")
        print("üìä EVALUATING ON TEST SET")
        print(f"{'='*70}\n")
    
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')  # sum ƒë·ªÉ t√≠nh ch√≠nh x√°c
    
    total_loss = 0.0
    total_tokens = 0
    num_batches = 0
    
    start_time = time.time()
    
    with torch.no_grad():
        # Wrap v·ªõi tqdm n·∫øu verbose
        iterator = tqdm(test_loader, desc="Testing") if verbose else test_loader
        
        for batch_idx, (x, y) in enumerate(iterator):
            x, y = x.to(device), y.to(device)
            
            # Forward pass
            logits = model(x)  # (B, T, V)
            B, T, V = logits.size()
            
            # Flatten
            logits_flat = logits.view(B * T, V)
            targets_flat = y.view(B * T)
            
            # Calculate loss (sum, not mean)
            loss = criterion(logits_flat, targets_flat)
            
            # Count non-padding tokens
            non_pad_tokens = (targets_flat != 0).sum().item()
            
            total_loss += loss.item()
            total_tokens += non_pad_tokens
            num_batches += 1
    
    elapsed_time = time.time() - start_time
    
    # Calculate metrics
    avg_loss = total_loss / total_tokens  # Loss per token
    perplexity = math.exp(min(avg_loss, 100))  # Cap ƒë·ªÉ tr√°nh overflow
    
    # Calculate samples (approximate - m·ªói sample c√≥ th·ªÉ c√≥ s·ªë tokens kh√°c nhau)
    num_samples = num_batches * test_loader.batch_size
    
    # Print results
    if verbose:
        print(f"\n{'='*70}")
        print("‚úÖ TEST RESULTS")
        print(f"{'='*70}")
        print(f"Test Loss:        {avg_loss:.6f}")
        print(f"Test Perplexity:  {perplexity:.4f}")
        print(f"Num Batches:      {num_batches:,}")
        print(f"Num Samples:      {num_samples:,}")
        print(f"Total Tokens:     {total_tokens:,}")
        print(f"Time Elapsed:     {elapsed_time:.2f}s")
        print(f"{'='*70}\n")
    
    results = {
        'test_loss': avg_loss,
        'test_perplexity': perplexity,
        'num_batches': num_batches,
        'num_samples': num_samples,
        'total_tokens': total_tokens,
        'time_elapsed': elapsed_time
    }
    
    return results
results = evaluate_on_test(model, test_loader, device=DEVICE)
print(f"Final Test Perplexity: {results['test_perplexity']:.2f}")


üìä EVALUATING ON TEST SET



Testing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:06<00:00,  9.11it/s]


‚úÖ TEST RESULTS
Test Loss:        0.680008
Test Perplexity:  1.9739
Num Batches:      62
Num Samples:      3,968
Total Tokens:     52,442
Time Elapsed:     6.84s

Final Test Perplexity: 1.97



