In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import SparseTopKCategoricalAccuracy
import numpy as np
import json
import pickle

2025-11-01 05:24:03.002457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761974643.258895      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761974643.333090      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load dữ liệu
import pickle
import json
import numpy as np

# Load metadata
with open('/kaggle/input/training_metadata.json', 'r') as f:
    metadata = json.load(f)

# Load mappings
with open('/kaggle/input/word_to_id.json', 'r') as f:
    word_to_id = json.load(f)

with open('/kaggle/input/id_to_word.json', 'r') as f:
    id_to_word = json.load(f)

# Load sequences
with open('/kaggle/input/training_sequences.pkl', 'rb') as f:
    sequences = pickle.load(f)

print(f"Loaded {len(sequences)} sequences")
print(f"Vocab size: {metadata['vocab_size']}")

Loaded 338084 sequences
Vocab size: 15004


In [3]:
class AdvancedRNNModel:
    def __init__(self, vocab_size, sequence_length=30):
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        self.model = None
        
    def create_model(self):
        """Tạo model RNN phức tạp với nhiều layer"""
        model = Sequential([
            # Embedding layer với dropout
            Embedding(
                input_dim=self.vocab_size,
                output_dim=256,
                input_length=self.sequence_length,
                mask_zero=True
            ),
            Dropout(0.3),
            
            # LSTM layer 1
            LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),
            
            # LSTM layer 2
            LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),
            
            # LSTM layer 3
            LSTM(256, dropout=0.2, recurrent_dropout=0.2),
            BatchNormalization(),
            
            # Dense layers
            Dense(512, activation='relu'),
            Dropout(0.3),
            BatchNormalization(),
            
            Dense(256, activation='relu'),
            Dropout(0.2),
            
            # Output layer
            Dense(self.vocab_size, activation='softmax')
        ])
        
        # Compile với optimizer tối ưu (sparse labels)
        model.compile(
            optimizer=Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
            loss='sparse_categorical_crossentropy',
            metrics=[
                'accuracy',
                SparseTopKCategoricalAccuracy(k=3, name='sparse_top_3_accuracy'),
                SparseTopKCategoricalAccuracy(k=5, name='sparse_top_5_accuracy'),
            ]
        )
        
        return model
    
    def prepare_data(self, sequences):
        """Chuẩn bị dữ liệu training"""
        X = sequences[:, :-1]  # Input sequences
        y = sequences[:, -1].astype('int32')   # Target word ids (sparse)
        
        return X, y

print("✅ AdvancedRNNModel class đã được định nghĩa")

✅ AdvancedRNNModel class đã được định nghĩa


In [4]:
# Tạo model instance
rnn_model = AdvancedRNNModel(
    vocab_size=metadata['vocab_size'],
    sequence_length=metadata['sequence_length']
)

# Tạo model architecture
rnn_model.model = rnn_model.create_model()

# Build model với input shape để có thể đếm parameters
rnn_model.model.build(input_shape=(None, metadata['sequence_length']))

# Hiển thị model summary
rnn_model.model.summary()

# Đếm parameters sau khi build
total_params = rnn_model.model.count_params()
print(f"✅ Model đã được tạo với {total_params:,} parameters")

# Hiển thị thông tin chi tiết
print(f"\n📊 Model Architecture:")
print(f"- Input shape: (batch_size, {metadata['sequence_length']})")
print(f"- Output shape: (batch_size, {metadata['vocab_size']})")
print(f"- Total parameters: {total_params:,}")
print(f"- Trainable parameters: {total_params:,}")

I0000 00:00:1761925749.191092      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1761925749.191832      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


✅ Model đã được tạo với 12,428,700 parameters

📊 Model Architecture:
- Input shape: (batch_size, 30)
- Output shape: (batch_size, 15004)
- Total parameters: 12,428,700
- Trainable parameters: 12,428,700


In [5]:
# Chuẩn bị dữ liệu
X, y = rnn_model.prepare_data(sequences)

print(f"📈 Training Data:")
print(f"- X shape: {X.shape}")
print(f"- y shape: {y.shape}")
print(f"- Memory usage: {(X.nbytes + y.nbytes) / 1024**2:.1f} MB")

# Kiểm tra một sample
print(f"\n🔍 Sample data:")
print(f"- Input sequence: {X[0][:10]}...")
print(f"- Target word ID: {int(y[0])}")
print(f"- Target word: {id_to_word[str(int(y[0]))]}")

📈 Training Data:
- X shape: (338084, 30)
- y shape: (338084,)
- Memory usage: 78.7 MB

🔍 Sample data:
- Input sequence: [   0  506 2110 1510    6    4  124  346  921    5]...
- Target word ID: 509
- Target word: tiết


In [6]:
# Callbacks cho training
callbacks = [
    # Lưu model tốt nhất
    ModelCheckpoint(
        'best_rnn_model.h5',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        verbose=1
    ),
    
    # Early stopping
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    
    # Giảm learning rate
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-6,
        verbose=1
    )
]

print("🚀 Bắt đầu training...")
print("⏰ Thời gian training dự kiến: 2-4 giờ")

# Training
history = rnn_model.model.fit(
    X, y,
    batch_size=32,
    epochs=30,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)

print("✅ Training hoàn thành!")

🚀 Bắt đầu training...
⏰ Thời gian training dự kiến: 2-4 giờ
Epoch 1/30
[1m8453/8453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 245ms/step - accuracy: 0.0177 - loss: 7.3244 - sparse_top_3_accuracy: 0.0530 - sparse_top_5_accuracy: 0.0814
Epoch 1: val_loss improved from inf to 7.08148, saving model to best_rnn_model.h5
[1m8453/8453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2181s[0m 256ms/step - accuracy: 0.0177 - loss: 7.3244 - sparse_top_3_accuracy: 0.0530 - sparse_top_5_accuracy: 0.0814 - val_accuracy: 0.0188 - val_loss: 7.0815 - val_sparse_top_3_accuracy: 0.0548 - val_sparse_top_5_accuracy: 0.0868 - learning_rate: 0.0010
Epoch 2/30
[1m8453/8453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 247ms/step - accuracy: 0.0203 - loss: 6.9191 - sparse_top_3_accuracy: 0.0570 - sparse_top_5_accuracy: 0.0876
Epoch 2: val_loss improved from 7.08148 to 6.96107, saving model to best_rnn_model.h5
[1m8453/8453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2177s[0m 258ms/s

In [7]:
def save_model_complete(model, model_path='rnn_model_complete'):
    """Lưu model hoàn chỉnh để sử dụng sau"""
    # Lưu model architecture và weights
    model.save(f'{model_path}.h5')
    
    # Lưu thông tin model
    model_info = {
        'vocab_size': metadata['vocab_size'],
        'sequence_length': metadata['sequence_length'],
        'model_type': 'AdvancedRNN',
        'architecture': 'LSTM_3layers_512_512_256',
        'total_params': int(model.count_params()),
        'training_epochs': len(history.history['loss'])
    }
    
    with open(f'{model_path}_info.json', 'w') as f:
        json.dump(model_info, f, indent=2)
    
    print(f"✅ Model đã được lưu:")
    print(f"  - {model_path}.h5 (model file)")
    print(f"  - {model_path}_info.json (model info)")

# Lưu model
save_model_complete(rnn_model.model, 'flora_rnn_model')

✅ Model đã được lưu:
  - flora_rnn_model.h5 (model file)
  - flora_rnn_model_info.json (model info)


In [8]:
def generate_text(model, id_to_word, word_to_id, seed_text, length=50):
    """Tạo văn bản mới từ model"""
    # Chuyển seed text thành sequence
    words = seed_text.split()
    if len(words) < 30:
        words = ['<PAD>'] * (30 - len(words)) + words
    
    generated_text = seed_text
    
    for _ in range(length):
        # Chuyển thành sequence số
        sequence = []
        for word in words[-30:]:
            word_id = word_to_id.get(word, word_to_id['<UNK>'])
            sequence.append(word_id)
        
        # Predict next word
        sequence = np.array(sequence).reshape(1, -1)
        prediction = model.predict(sequence, verbose=0)
        
        # Lấy từ có xác suất cao nhất
        next_word_id = np.argmax(prediction[0])
        next_word = id_to_word[str(next_word_id)]
        
        generated_text += " " + next_word
        words.append(next_word)
    
    return generated_text

print("✅ Text generation function đã được định nghĩa")

✅ Text generation function đã được định nghĩa


In [8]:
import json

# Load word_to_id mapping
with open('/kaggle/input/next-word-pred/tensorflow2/default/1/word_to_id.json', 'r') as f:
    word_to_id = json.load(f)

with open('/kaggle/input/next-word-pred/tensorflow2/default/1/id_to_word.json', 'r') as f:
    id_to_word = json.load(f)

In [3]:


# Test generation
print(f"🎯 Test text generation:")
sample_texts = [
    "tôi không chắc liệu",
    "co ay có tin rằng",
    "thời gian sẽ chứng minh"
]

for seed in sample_texts:
    generated = generate_text(rnn_model.model, id_to_word, word_to_id, seed, length=20)
    print(f"\nSeed: '{seed}'")
    print(f"Generated: {generated}")
    print("-" * 50)

🎯 Test text generation:


NameError: name 'generate_text' is not defined

In [10]:
import numpy as np

In [11]:
# Load model và predict next word
from tensorflow.keras.models import load_model

# Load saved model
print("📥 Loading model...")
loaded_model = load_model('/kaggle/input/next-word-pred/tensorflow2/default/1/flora_rnn_model.h5')
print("✅ Model đã được load thành công!")

def predict_next_word(model, id_to_word, word_to_id, seed_text, top_k=5):
    """
    Predict next word(s) từ seed text
    
    Args:
        model: Trained model
        id_to_word: Dictionary mapping id to word
        word_to_id: Dictionary mapping word to id
        seed_text: Input text để predict
        top_k: Số lượng top predictions muốn lấy
    
    Returns:
        List of tuples (word, probability)
    """
    # Chuyển seed text thành words
    words = seed_text.split()
    
    # Pad hoặc truncate để có đúng sequence_length (30)
    sequence_length = 30
    if len(words) < sequence_length:
        words = ['<PAD>'] * (sequence_length - len(words)) + words
    else:
        words = words[-sequence_length:]
    
    # Chuyển words thành sequence số
    sequence = []
    for word in words:
        word_id = word_to_id.get(word, word_to_id.get('<UNK>', 0))
        sequence.append(word_id)
    
    # Reshape cho model input
    sequence = np.array(sequence).reshape(1, -1)
    
    # Predict
    prediction = model.predict(sequence, verbose=0)[0]
    
    # Lấy top_k predictions
    top_indices = np.argsort(prediction)[-top_k:][::-1]
    top_predictions = [
        (id_to_word[str(idx)], float(prediction[idx]))
        for idx in top_indices
    ]
    
    return top_predictions

# Test predictions
print("\n🎯 Testing next word predictions:\n")
sample_texts = [
    "tôi không chắc liệu",
    "cô ấy có tin rằng",
    "thời gian sẽ chứng minh",
    "người dân trong thành phố",
    "anh ấy đã quyết định"
]

for seed in sample_texts:
    predictions = predict_next_word(loaded_model, id_to_word, word_to_id, seed, top_k=5)
    print(f"Input: '{seed}'")
    print("Top 5 predictions:")
    for i, (word, prob) in enumerate(predictions, 1):
        print(f"  {i}. {word:15s} (prob: {prob:.4f})")
    print("-" * 60)



📥 Loading model...
✅ Model đã được load thành công!

🎯 Testing next word predictions:

Input: 'tôi không chắc liệu'
Top 5 predictions:
  1. tôi             (prob: 0.0177)
  2. không           (prob: 0.0151)
  3. có              (prob: 0.0145)
  4. là              (prob: 0.0143)
  5. một             (prob: 0.0133)
------------------------------------------------------------
Input: 'cô ấy có tin rằng'
Top 5 predictions:
  1. tôi             (prob: 0.0147)
  2. của             (prob: 0.0131)
  3. một             (prob: 0.0115)
  4. không           (prob: 0.0106)
  5. có              (prob: 0.0100)
------------------------------------------------------------
Input: 'thời gian sẽ chứng minh'
Top 5 predictions:
  1. tôi             (prob: 0.0123)
  2. của             (prob: 0.0113)
  3. một             (prob: 0.0096)
  4. không           (prob: 0.0089)
  5. có              (prob: 0.0084)
------------------------------------------------------------
Input: 'người dân trong thành phố'
Top 5 pre