In [None]:
# Show the interpreter version of the runtime. A later cell hard-codes a
# python3.11 dist-packages path for cuDNN, so the version printed here matters.
!python --version

In [None]:
import os

In [None]:
# Root folder of the MIND dataset; the train/valid/test/utils paths used by the
# following cells are joined onto it.
mind_data_path = "/kaggle/input/mind-data/mind_data"

# The dataset is an input, so it must already exist. Creating the folder when it
# is missing would only postpone the failure until the first file is opened
# (and /kaggle/input is presumably read-only -- TODO confirm). Fail fast with a
# message that names the real problem instead.
if not os.path.isdir(mind_data_path):
    raise FileNotFoundError(
        f"MIND dataset directory not found: {mind_data_path}. "
        "Attach the dataset to the notebook before running the remaining cells."
    )

In [None]:
# Pin TensorFlow 2.15.0 and keep NumPy below 2.0.
# NOTE(review): the next cell uninstalls tensorflow and installs the same
# version again, so the tensorflow part of this command looks redundant --
# confirm before removing it.
!pip install tensorflow==2.15.0 "numpy<2.0"

In [None]:
# Install cuDNN 8.9 from pip.
# Sequence: remove TensorFlow, install the pinned cuDNN wheel, then install
# the pinned TensorFlow again.
!pip uninstall tensorflow -y
!pip install nvidia-cudnn-cu12==8.9.7.29
!pip install tensorflow==2.15.0

# Set up LD_LIBRARY_PATH so that it starts with the pip-installed cuDNN folder.
import os
cudnn_path = '/usr/local/lib/python3.11/dist-packages/nvidia/cudnn/lib'

# Put cudnn_path first exactly once and drop empty entries. A plain
# "<path>:<old value>" concatenation leaves a trailing ':' when the variable is
# unset (an empty entry stands for the current directory) and adds another copy
# of the path every time the cell is re-run.
_ld_entries = [
    entry
    for entry in os.environ.get('LD_LIBRARY_PATH', '').split(':')
    if entry and entry != cudnn_path
]
os.environ['LD_LIBRARY_PATH'] = ':'.join([cudnn_path, *_ld_entries])

# NOTE(review): os.environ only lives in the current kernel process, so this
# value does not survive the restart requested below; the following cell sets
# it again. Whether a value set after interpreter start-up is honoured when the
# libraries are loaded in-process should be confirmed.
print("‚úÖ cuDNN 8.9 installed. Please RESTART KERNEL!")

In [None]:
import os
cudnn_path = '/usr/local/lib/python3.11/dist-packages/nvidia/cudnn/lib'

# Prepend the cuDNN folder to LD_LIBRARY_PATH before TensorFlow is imported.
# Build the value from its non-empty entries so that (a) an unset variable does
# not produce a trailing ':' (an empty entry stands for the current directory)
# and (b) re-running the cell does not add cudnn_path a second time.
_ld_entries = [
    entry
    for entry in os.environ.get('LD_LIBRARY_PATH', '').split(':')
    if entry and entry != cudnn_path
]
os.environ['LD_LIBRARY_PATH'] = ':'.join([cudnn_path, *_ld_entries])

import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see and show the list;
# an empty list means the GPU runtime was not picked up.
_visible_gpus = tf.config.list_physical_devices('GPU')
print("GPUs:", _visible_gpus)

In [None]:
# Cell 1: Monkey patch to force embeddings onto the GPU
import os
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

import tensorflow as tf

# Monkey patch the Embedding layer to force GPU placement.
# The marker check makes the cell safe to re-run: without it the already
# patched __init__ would be saved as "original" and wrapped again on every
# execution, stacking device scopes.
if not getattr(tf.keras.layers.Embedding.__init__, "_forces_gpu_device", False):
    original_embedding_init = tf.keras.layers.Embedding.__init__

    def patched_embedding_init(self, *args, **kwargs):
        """Run the stock Embedding.__init__ inside a '/GPU:0' device scope."""
        # NOTE(review): Keras layers normally create their weights in build(),
        # not in __init__, so this scope may not decide where the embedding
        # matrix is placed -- confirm that the patch has the intended effect.
        with tf.device('/GPU:0'):
            original_embedding_init(self, *args, **kwargs)

    # Tag the wrapper so a later execution of this cell can recognise it.
    patched_embedding_init._forces_gpu_device = True
    tf.keras.layers.Embedding.__init__ = patched_embedding_init

print("‚úÖ Patched Embedding layer to use GPU")

In [None]:
# Cell 2: Setup
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.npa import NPAModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator

# Dataset layout under mind_data_path: one folder per split plus a shared
# "utils" folder holding embeddings, vocabularies and the model config.
_train_dir = os.path.join(mind_data_path, "train")
_valid_dir = os.path.join(mind_data_path, "valid")
_utils_dir = os.path.join(mind_data_path, "utils")

train_news_file = os.path.join(_train_dir, "news.tsv")
train_behaviors_file = os.path.join(_train_dir, "behaviors.tsv")
valid_news_file = os.path.join(_valid_dir, "news.tsv")
valid_behaviors_file = os.path.join(_valid_dir, "behaviors.tsv")

wordEmb_file = os.path.join(_utils_dir, "embedding.npy")
userDict_file = os.path.join(_utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(_utils_dir, "word_dict.pkl")
yaml_file = os.path.join(_utils_dir, "npa.yaml")

# Values passed on top of the YAML configuration: resource files plus the
# training schedule used for this run (one epoch, batches of 384).
_hparam_overrides = {
    "wordEmb_file": wordEmb_file,
    "wordDict_file": wordDict_file,
    "userDict_file": userDict_file,
    "epochs": 1,
    "batch_size": 384,
}
hparams = prepare_hparams(yaml_file, **_hparam_overrides)

# Create model with GPU: the iterator *class* is handed to the model, which is
# constructed inside a '/GPU:0' device scope with a fixed seed.
iterator = MINDIterator
with tf.device('/GPU:0'):
    model = NPAModel(hparams, iterator, seed=42)

print("‚úÖ Model created on GPU")

In [None]:
def disable_quick_scoring(m):
    """Switch off ``support_quick_scoring`` wherever ``m`` exposes it.

    The flag is set to ``False`` on ``m`` itself and on ``m.hparams`` (when
    that attribute exists), but only on objects that already have the flag;
    it is never created where it is absent.

    Args:
        m: Object to update in place.

    Returns:
        None.
    """
    holders = [m]
    if hasattr(m, "hparams"):
        holders.append(m.hparams)

    for holder in holders:
        if hasattr(holder, "support_quick_scoring"):
            holder.support_quick_scoring = False

# Turn quick scoring off on the model created above (wrapper flag and, when
# present, the flag on its hparams).
disable_quick_scoring(model)

# Build encoders required by fast eval
def _get_submodules(m):
    try:
        return list(getattr(m, "submodules", []))
    except Exception:
        return []

# --- Locate the embedding layers inside the inner Keras model ---------------
_submodules = _get_submodules(model.model)
_embedding_layers = list(
    filter(lambda layer: isinstance(layer, tf.keras.layers.Embedding), _submodules)
)

# First Embedding found is used as the word embedding, the second one as the
# user embedding (falling back to the first when there is only one).
# NOTE(review): this relies on the order in which submodules are listed;
# nothing in this cell verifies which layer is which -- confirm.
embedding_layer = next(iter(_embedding_layers), None)
user_embedding_layer = (
    _embedding_layers[1] if len(_embedding_layers) > 1 else embedding_layer
)

# --- News encoder: build it on the wrapper only when it is missing ----------
# NOTE(review): the encoder returned by _build_newsencoder is attached to the
# wrapper object only; whether it shares trained weights with model.model
# cannot be told from this cell -- confirm before scoring with it.
if hasattr(model, "_build_newsencoder") and not hasattr(model, "newsencoder"):
    model.newsencoder = model._build_newsencoder(embedding_layer, user_embedding_layer)

# --- Pick a title encoder among the Keras sub-models ------------------------
_submodules = _get_submodules(model.model)


def _keras_models_named(name_matches):
    """Return the Keras sub-models whose lower-cased name satisfies ``name_matches``.

    Entries that raise while being inspected are skipped.
    """
    selected = []
    for candidate in _submodules:
        try:
            if isinstance(candidate, tf.keras.Model):
                lowered = getattr(candidate, "name", "").lower()
                if name_matches(lowered):
                    selected.append(candidate)
        except Exception:
            pass
    return selected


# Preferred: a sub-model with "title" in its name. Otherwise: any "encoder"
# sub-model that is not a "user" encoder.
_titleencoder_candidates = _keras_models_named(lambda lowered: "title" in lowered)
if not _titleencoder_candidates:
    _titleencoder_candidates = _keras_models_named(
        lambda lowered: "encoder" in lowered and "user" not in lowered
    )

titleencoder = next(iter(_titleencoder_candidates), None)

# --- User encoder: same "only when missing" rule ----------------------------
if hasattr(model, "_build_userencoder") and not hasattr(model, "userencoder"):
    model.userencoder = model._build_userencoder(titleencoder, user_embedding_layer)

print("‚úÖ Build encoders success")

In [None]:
# Check model dir: list every attribute whose name mentions "encod" on the
# wrapper and on the inner Keras model.
def _encoder_like_attrs(obj):
    """Names from ``dir(obj)`` containing 'encod' (case-insensitive)."""
    return [attr for attr in dir(obj) if "encod" in attr.lower()]


print("Wrapper encoder attrs:", _encoder_like_attrs(model))
print("Inner model type:", type(model.model))
print("Inner model encoder attrs:", _encoder_like_attrs(model.model))

# Smoke test (fail-fast): both encoder attributes must exist on the wrapper.
print("üß™ Smoke test: checking encoders...")
_has_newsencoder = hasattr(model, "newsencoder")
_has_userencoder = hasattr(model, "userencoder")
print("has newsencoder:", _has_newsencoder)
print("has userencoder:", _has_userencoder)
assert _has_newsencoder, "Missing model.newsencoder (needed for fast eval)"
assert _has_userencoder, "Missing model.userencoder (needed for fast eval)"

print("‚úÖ Test passed")

In [None]:
# Cell 3: Load checkpoint
# Checkpoint restore is switched off for this run. To resume from saved
# weights, enable the three statements below:
#     checkpoint_path = "/kaggle/input/model-epoch-4-6/model/npa_ckpt"
#     model.model.load_weights(checkpoint_path)
#     print a confirmation that the checkpoint was loaded

print("‚úÖ First time")

In [None]:
# Cell 4: Training
print("üî• Starting training...")

# fit() receives, positionally: train news, train behaviors, validation news,
# validation behaviors.
_fit_files = (
    train_news_file,
    train_behaviors_file,
    valid_news_file,
    valid_behaviors_file,
)
model.fit(*_fit_files)

print("‚úÖ Training completed!")

In [None]:
# Evaluate on the validation set and print every returned metric with four
# decimals.
eval_results = model.run_eval(valid_news_file, valid_behaviors_file)
print("K·∫øt qu·∫£ ƒë√°nh gi√°:")
for metric in eval_results:
    value = eval_results[metric]
    print(f"{metric}: {value:.4f}")

In [None]:
# Persist the trained weights of the inner Keras model under
# /kaggle/working/model, using "npa_ckpt" as the checkpoint prefix.
model_path = os.path.join("/kaggle/working/", "model")
os.makedirs(model_path, exist_ok=True)

_checkpoint_prefix = os.path.join(model_path, "npa_ckpt")
model.model.save_weights(_checkpoint_prefix)

In [None]:
from tqdm import tqdm
import numpy as np

# The test split sits next to train/valid under the dataset root, so build its
# paths from mind_data_path like the other splits instead of repeating the
# absolute prefix (the resulting strings are unchanged for the current root).
test_behaviors_file = os.path.join(mind_data_path, "test", "behaviors.tsv")
test_news_file = os.path.join(mind_data_path, "test", "news.tsv")

print("üîß Patching iterator for test set (no labels)...")

# Backup original method. When this cell is re-run the iterator already
# carries the no-label replacement; taking the backup again would overwrite
# the saved original with the patched function, so skip it in that case.
if getattr(model.test_iterator.init_behaviors, "__name__", "") != "init_behaviors_no_labels":
    original_init = model.test_iterator.init_behaviors

def init_behaviors_no_labels(behaviors_file, iterator=None):
    """Load a behaviors file whose impressions may carry no click labels.

    Each line is split on tabs and its last four columns are read as
    user id, time, click history and impression list. Impression items may be
    written as ``N12345`` (no label) or ``N12345-1`` (label suffix, ignored).

    Every impression item yields exactly one candidate index, so the number of
    candidates always matches the impression list in the file. Items that are
    not in ``nid2index`` are mapped to index 0 (the same value used to pad the
    history) instead of being dropped; dropping them would shorten the rank
    list and misalign it with the impression.

    Args:
        behaviors_file: Path to the tab-separated behaviors file (UTF-8).
        iterator: Object to populate. It must expose ``nid2index``,
            ``uid2index`` and ``his_size``. Defaults to ``model.test_iterator``.

    Returns:
        None. The lists ``histories``, ``imprs``, ``labels``, ``impr_indexes``
        and ``uindexes`` on the iterator are reset and refilled, one entry per
        line of the file. Labels are all-zero placeholders.
    """
    if iterator is None:
        iterator = model.test_iterator

    nid2index = iterator.nid2index
    uid2index = iterator.uid2index
    his_size = iterator.his_size

    iterator.histories = []
    iterator.imprs = []
    iterator.labels = []
    iterator.impr_indexes = []
    iterator.uindexes = []

    with open(behaviors_file, "r", encoding="utf-8") as rd:
        # impr_index is the zero-based position of the line in the file.
        for impr_index, line in enumerate(rd):
            uid, _time, history, impr = line.strip("\n").split("\t")[-4:]

            # History: keep known news ids, left-pad with 0 up to his_size and
            # keep at most his_size entries.
            history = [nid2index[nid] for nid in history.split() if nid in nid2index]
            history = [0] * (his_size - len(history)) + history[:his_size]

            # Impressions: strip an optional "-<label>" suffix; unknown ids
            # become 0 so positions stay aligned with the file.
            impr_news = [nid2index.get(item.split("-")[0], 0) for item in impr.split()]

            # Dummy labels (not used for the test set).
            label = [0] * len(impr_news)

            uindex = uid2index.get(uid, 0)

            iterator.histories.append(history)
            iterator.imprs.append(impr_news)
            iterator.labels.append(label)
            iterator.impr_indexes.append(impr_index)
            iterator.uindexes.append(uindex)

# Apply patch
model.test_iterator.init_behaviors = init_behaviors_no_labels

print("‚úÖ Iterator patched for test set")

# Load the test split into the iterator explicitly before scoring.
# NOTE(review): recommenders' MINDIterator appears to parse news/behaviors
# only while the corresponding attributes are still unset, so after training
# and validation it would keep serving the cached validation data and never
# call the patched loader -- confirm against the installed version. Loading
# explicitly is correct either way.
model.test_iterator.init_news(test_news_file)
model.test_iterator.init_behaviors(test_behaviors_file)

# Run evaluation
print("\nüîç Running evaluation on test set...")
print("   Model will use trained weights to generate rankings")


def _score_impressions(news_file, behaviors_file):
    """Return ``(group_impr_indexes, group_labels, group_preds)`` for a split.

    run_fast_eval is used when the model offers the hooks it needs; otherwise,
    or when it fails with AttributeError, run_slow_eval is used. run_eval is
    not a valid fallback here: it returns a mapping of metric values (see the
    validation cell), which carries no per-impression predictions.
    """
    fast_ready = hasattr(model, 'run_fast_eval') and hasattr(model, '_get_news_feature_from_iter')
    if fast_ready:
        try:
            print("   Using run_fast_eval (fast method)...")
            return model.run_fast_eval(news_file, behaviors_file)
        except AttributeError as e:
            print(f"   Fast eval failed: {e}")
            print("   Falling back to run_slow_eval...")
    else:
        print("   Using run_slow_eval (compatible method)...")
    return model.run_slow_eval(news_file, behaviors_file)


group_impr_indexes, group_labels, group_preds = _score_impressions(
    test_news_file, test_behaviors_file
)

# An empty result would silently produce an empty submission file.
if len(group_impr_indexes) == 0:
    raise RuntimeError(
        "No predictions were generated for the test set; "
        "check the test files and the iterator state."
    )

print(f"\n‚úÖ Generated predictions for {len(group_impr_indexes)} impressions")

# Write predictions: one line per impression, "<impression id> [r1,r2,...]".
print("\nüíæ Writing predictions to file...")
prediction_file = "/kaggle/working/prediction.txt"

with open(prediction_file, 'w') as f:
    _scored = zip(group_impr_indexes, group_preds)
    for impr_index, preds in tqdm(_scored, total=len(group_impr_indexes), desc="Writing"):
        # MIND competition format: impression ids start at 1.
        impr_id = impr_index + 1

        # Rank of every candidate, highest score = rank 1: positions sorted by
        # descending score, then inverted back to candidate order.
        _descending = np.argsort(preds)[::-1]
        pred_rank = (np.argsort(_descending) + 1).tolist()
        pred_rank_str = '[' + ','.join(map(str, pred_rank)) + ']'

        f.write(f"{impr_id} {pred_rank_str}\n")

print(f"\n‚úÖ Prediction file saved to: {prediction_file}")

# Validate output: report the number of lines and preview the first three,
# truncating each ranking to 50 characters.
print("\nüîç Validating output format...")
with open(prediction_file, 'r') as f:
    lines = f.readlines()
    print(f"   Total predictions: {len(lines)}")
    print("   First 3 lines:")
    for _line in lines[:3]:
        parts = _line.strip().split()
        _suffix = '...' if len(parts[1]) > 50 else ''
        print(f"      ImprID={parts[0]}, Rankings={parts[1][:50]}{_suffix}")

print("\n‚úÖ File ready to submit to MIND competition!")
print(f"   Download: {prediction_file}")

In [None]:
import zipfile
import os

# Input file path (created in the previous step) and output archive path.
source_file = "/kaggle/working/prediction.txt"
zip_output_path = "/kaggle/working/prediction.zip"

# Name the real problem when the prediction step has not produced its file.
if not os.path.isfile(source_file):
    raise FileNotFoundError(
        f"Prediction file not found: {source_file}. Run the prediction cell first."
    )

try:
    # Create the archive with ZIP_DEFLATED compression.
    with zipfile.ZipFile(zip_output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # arcname='prediction.txt' stores the file under that bare name,
        # without the parent directory path (/kaggle/working/...).
        zipf.write(source_file, arcname='prediction.txt')

    # (Optional) Inspect the contents of the archive that was just written.
    with zipfile.ZipFile(zip_output_path, 'r') as zipf:
        for info in zipf.infolist():
            print(f"File: {info.filename} | Size: {info.file_size / 1024:.2f} KB")

except Exception as e:
    # Show the error, then re-raise: the archive is the deliverable of this
    # notebook, so a failure here must not look like a successful run.
    print(e)
    raise