In [1]:
# make sure that the kernel is continually updated with changes in the functions on-the-fly
%load_ext autoreload

In [2]:
from comet import download_model, load_from_checkpoint

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Baseline run: score each sample on its own, before any SLIDE window settings
# are applied to the model.

# Choose your model from Hugging Face Hub; the returned value is passed
# straight to `load_from_checkpoint` below.
model_path = download_model("Unbabel/wmt22-cometkiwi-da")

# Load the model checkpoint. `model` is reused (and reconfigured) by the SLIDE
# cells further down, which do not load a model of their own.
model = load_from_checkpoint(model_path)

# Input format: a list with one dict per segment, keyed by "src" (source text),
# "mt" (machine translation) and "ref" (reference translation).
# NOTE(review): wmt22-cometkiwi-da is presumably a reference-free model, in
# which case "ref" would be ignored — TODO confirm.
data = [
    {
        "src": "10 到 15 分钟可以送到吗",
        "mt": "Can I receive my food in 10 to 15 minutes?",
        "ref": "Can it be delivered between 10 to 15 minutes?"
    },
    {
        "src": "Pode ser entregue dentro de 10 a 15 minutos?",
        "mt": "Can you send it for 10 to 15 minutes?",
        "ref": "Can it be delivered between 10 to 15 minutes?"
    }
]
# Run inference. `gpus=1` is hard-coded — assumes a GPU is available on the
# machine running the notebook.
model_output = model.predict(data, batch_size=8, gpus=1)
print(model_output)
print(model_output.scores) # one score per input sample
print(model_output.system_score) # single aggregate score for the whole input

Lightning automatically upgraded your loaded checkpoint from v1.8.2 to v2.1.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../cache/models--Unbabel--wmt22-cometkiwi-da/snapshots/b3a8aea5a5fc22db68a554b92b3d96eb6ea75cc9/checkpoints/model.ckpt`
Encoder model frozen.
/mnt/data/nunomg/COMET/comet-env/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:177: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_floa

Prediction([('scores', [0.7836741805076599, 0.6854020357131958]), ('system_score', 0.7345381081104279)])
[0.7836741805076599, 0.6854020357131958]
0.7345381081104279


## SLIDE

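Setting `model.window_size` and `model.stride_size` before calling `predict` makes COMET compute SLIDE: consecutive sentences that share a `doc_id` are joined into windows, and each window is scored as a single segment.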

In [4]:
from comet import download_model, load_from_checkpoint

# SLIDE run. The model-loading lines below are intentionally commented out, so
# this cell reuses the `model` object loaded in the earlier cell.
# Uncomment them to score with a different checkpoint instead.
# Choose your model from Hugging Face Hub
# model_path = download_model("Unbabel/XCOMET-XL")

# # Load the model checkpoint:
# model = load_from_checkpoint(model_path)

# Input format: same as before, plus a "doc_id" on every sample. Samples that
# share a "doc_id" are listed contiguously here. The "ref" entries are
# commented out, so only "src" and "mt" are passed to the model.
data = [
    {
        "doc_id": "doc1",
        "src": "10 到 15 分钟可以送到吗",
        "mt": "Can I receive my food in 10 to 15 minutes?",
        # "ref": "Can it be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc1",
        "src": "你能在10到15分钟内送到吗",
        "mt": "Can you deliver it in 10 to 15 minutes?",
        # "ref": "Can it be delivered within 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "10 到 15 分钟可以送到吗",
        "mt": "Can I receive my food in 10 to 15 minutes?",
        # "ref": "Can it be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "Pode ser entregue dentro de 10 a 15 minutos?",
        "mt": "Can you send it for 10 to 15 minutes?",
        # "ref": "Can it be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "O pedido pode ser entregue em 10 a 15 minutos?",
        "mt": "The order can be delivered in 10 to 15 minutes?",
        # "ref": "Can the order be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "Sou documento 2",
        "mt": "I am document 2",
        # "ref": "I am document 2"
    },
    {
        "doc_id": "doc3",
        "src": "Sou documento 3",
        "mt": "I am document 3",
        # "ref": "I am document 3"
    }
]

# Configure SLIDE on the model instance. These attributes stay set on `model`
# after this cell, so any later `predict` call on it is windowed as well.
model.window_size = 2  # number of sentences joined into one window
model.stride_size = 2  # number of sentences to advance between window starts
model.include_partial_docs = False  # whether windows shorter than window_size are kept

# Run inference on the windowed input.
model_output = model.predict(data, batch_size=8, gpus=1)
print(model_output)
print(model_output.scores)  # one score per window, not per input sentence (4 scores for the 7 samples above)
print(model_output.system_score)  # single aggregate score for the whole input


Length batching is disabled when using window_size > 1 (using SLIDE). Setting length_batching to False.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  8.88it/s]


Prediction([('scores', [0.7709373831748962, 0.7166000604629517, 0.7629761695861816, 0.78694748878479]), ('system_score', 0.7593652755022049)])
[0.7709373831748962, 0.7166000604629517, 0.7629761695861816, 0.78694748878479]
0.7593652755022049


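The cell below rebuilds the windows for the same SLIDE settings so that the text actually scored for each window can be inspected.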

In [5]:
from typing import List

def create_windows(self, sentences: List[str], doc_ids: List[str], window_size: int, stride_size: int, include_partial_docs: bool) -> List[str]:
    """Build sliding windows over ``sentences`` without crossing document boundaries.

    Sentences are grouped into documents by runs of consecutive equal
    ``doc_ids``; a doc id that reappears after a different one starts a new
    document. Each document is windowed independently through
    ``self._create_windows_for_doc`` and the results are concatenated in
    input order.

    Args:
        sentences: Sentences in corpus order.
        doc_ids: Document id of each sentence, aligned with ``sentences``.
        window_size: Number of sentences per window.
        stride_size: Number of sentences to advance between window starts.
        include_partial_docs: Forwarded unchanged to
            ``self._create_windows_for_doc``.

    Returns:
        The windows of all documents, in order. An empty list when
        ``sentences`` is empty.

    Raises:
        IndexError: If ``doc_ids`` is shorter than ``sentences``.
    """
    # Empty input has no documents; previously this crashed on doc_ids[0].
    if not sentences:
        return []

    windows = []
    current_doc = []
    current_id = doc_ids[0]
    for i, sentence in enumerate(sentences):
        doc_id = doc_ids[i]
        if doc_id != current_id:
            # Document boundary: window the finished document, then start a new one.
            windows.extend(self._create_windows_for_doc(current_doc, window_size, stride_size, include_partial_docs))
            current_doc = []
            current_id = doc_id
        current_doc.append(sentence)

    # Flush the last document (non-empty here because sentences is non-empty).
    if current_doc:
        windows.extend(self._create_windows_for_doc(current_doc, window_size, stride_size, include_partial_docs))
    return windows

def _create_windows_for_doc(self, sentences: List[str], window_size: int, stride_size: int, include_partial_docs: bool) -> List[str]:
    windows = []
    num_sentences = len(sentences)
    for i in range(0, num_sentences - window_size + 1, stride_size):
        window = sentences[i:i + window_size]
        if len(window) == window_size or (include_partial_docs and window):
            windows.append(" ".join(window))
    return windows

# Example usage: rebuild the windows for a small multi-document input and
# display them.
# NOTE(review): the calls below go through `model.create_windows`, i.e. the
# method on the loaded model — the `create_windows` / `_create_windows_for_doc`
# functions defined earlier in this cell are never called.
data = [
    {
        "doc_id": "doc1",
        "src": "10 到 15 分钟可以送到吗",
        "mt": "Can I receive my food in 10 to 15 minutes?",
        # "ref": "Can it be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc1",
        "src": "你能在10到15分钟内送到吗",
        "mt": "Can you deliver it in 10 to 15 minutes?",
        # "ref": "Can it be delivered within 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "Sou documento 2 #1",
        "mt": "Can I receive my food in 10 to 15 minutes?",
        # "ref": "Can it be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "Pode ser entregue dentro de 10 a 15 minutos?",
        "mt": "Can you send it for 10 to 15 minutes?",
        # "ref": "Can it be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "O pedido pode ser entregue em 10 a 15 minutos?",
        "mt": "The order can be delivered in 10 to 15 minutes?",
        # "ref": "Can the order be delivered between 10 to 15 minutes?"
    },
    {
        "doc_id": "doc2",
        "src": "Sou documento 2",
        "mt": "I am document 2",
        # "ref": "I am document 2"
    },
    {
        "doc_id": "doc3",
        "src": "Sou documento 3",
        "mt": "I am document 3",
        # "ref": "I am document 3"
    }
]

# Set SLIDE parameters on the shared `model` instance (same values as the
# prediction cell above):
model.window_size = 2  # number of sentences joined into one window
model.stride_size = 2  # number of sentences to advance between window starts
model.include_partial_docs = False  # Do not include partial documents
# NOTE(review): the recorded output still contains a window for the
# single-sentence doc3, which the helper functions defined in this cell would
# not produce with this flag off — model.create_windows appears to behave
# differently; confirm against the library source.

# Split the samples into parallel per-field lists (ref is disabled here).
src_sentences = [d["src"] for d in data]
mt_sentences = [d["mt"] for d in data]
# ref_sentences = [d["ref"] for d in data]
doc_ids = [d["doc_id"] for d in data]

# Window source and translation with identical settings so that the i-th
# source window lines up with the i-th translation window.
src_windows = model.create_windows(src_sentences, doc_ids, model.window_size, model.stride_size, model.include_partial_docs)
mt_windows = model.create_windows(mt_sentences, doc_ids, model.window_size, model.stride_size, model.include_partial_docs)
# ref_windows = model.create_windows(ref_sentences, doc_ids, model.window_size, model.stride_size, model.include_partial_docs)

# Pair the windows back up into the dict format used for prediction inputs.
windowed_data = []
for src, mt in zip(src_windows, mt_windows):
    windowed_data.append({
        "src": src,
        "mt": mt,
        # "ref": ref
    })

# Last expression of the cell: displays the windowed samples.
windowed_data

[{'src': '10 到 15 分钟可以送到吗 你能在10到15分钟内送到吗',
  'mt': 'Can I receive my food in 10 to 15 minutes? Can you deliver it in 10 to 15 minutes?'},
 {'src': 'Sou documento 2 #1 Pode ser entregue dentro de 10 a 15 minutos?',
  'mt': 'Can I receive my food in 10 to 15 minutes? Can you send it for 10 to 15 minutes?'},
 {'src': 'O pedido pode ser entregue em 10 a 15 minutos? Sou documento 2',
  'mt': 'The order can be delivered in 10 to 15 minutes? I am document 2'},
 {'src': 'Sou documento 3', 'mt': 'I am document 3'}]