In [1]:
import argparse
from datasets import load_dataset
from datasets.utils.logging import disable_progress_bar
import evaluate
from src.clustsum import clustsum
from src.embeddings import forward_fn_pooler, forward_fn_cls
import torch
import time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Silence the `datasets` progress bars so the cell output stays readable.
disable_progress_bar()

from src.config import Configuration
args = Configuration()
# Number of leading sentences joined into each summary.
args.sum_size = 3
# Prefer the GPU, but fall back to CPU so the notebook still runs on
# machines without CUDA instead of failing when the model is moved.
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of test examples to evaluate; values <= 0 keep the full split.
args.subset = 500
# Summarisation method: 'transformer' or 'compression'.
args.method = 'compression'

# Number of articles per batch on the transformer path.
batch_size = 4

def process_batched(texts, method, args, tokenizer=None, model=None, forward_fn=None, return_scores=False):
    """Summarise pre-batched texts and return one summary string per document.

    Args:
        texts: Iterable of batches; each batch is handed to ``clustsum`` as-is.
        method: Method name forwarded to ``clustsum``.
        args: Configuration object; ``args.sum_size`` caps how many sentences
            are joined into each summary.
        tokenizer: Forwarded unchanged to ``clustsum``.
        model: Forwarded unchanged to ``clustsum``.
        forward_fn: Forwarded unchanged to ``clustsum``.
        return_scores: Forwarded unchanged to ``clustsum``.

    Returns:
        A flat list of summaries, in the order the batches were processed.
    """
    collected = []
    for chunk in tqdm(texts, desc="Processing batches"):
        per_document = clustsum(
            chunk,
            method,
            args,
            tokenizer=tokenizer,
            model=model,
            forward_fn=forward_fn,
            return_scores=return_scores,
        )
        # Keep the first `sum_size` sentences of every document in the batch.
        collected.extend('. '.join(doc_sents[:args.sum_size]) for doc_sents in per_document)
    return collected

# Evaluation data: the CNN/DailyMail test split. Articles are the inputs and
# the human-written highlights are the reference summaries.
dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')
text_column = 'article'
summary_column = 'highlights'

# Restrict to the first `args.subset` examples when a positive size is set.
if args.subset > 0:
    dataset = dataset.select(range(args.subset))

# Produce one summary per article. The timer started here also covers
# tokenizer/model loading on the transformer path.
print(f"Getting the summaries with the {args.method}...")
summaries = []
start_time = time.time()

if args.method not in ('transformer', 'compression'):
    raise Exception("Please provide a valid method, either 'transformer' or 'compression'.")

if args.method == 'compression':
    # NOTE(review): unlike process_batched, this call does not forward `args`
    # to clustsum -- confirm that relying on clustsum's default is intended.
    sents = clustsum(dataset[text_column], 'compression', return_scores=False)
    summaries = ['. '.join(doc_sents[:args.sum_size]) for doc_sents in sents]
else:
    # Transformer path: load the encoder and its tokenizer, then move the
    # model to the configured device.
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint)
    model = AutoModel.from_pretrained(args.checkpoint)
    model.to(args.device)

    # Choose the forward function matching the configured embedding source.
    if args.embedding_from == 'pooler':
        forward_fn = forward_fn_pooler
    elif args.embedding_from == 'cls':
        forward_fn = forward_fn_cls
    else:
        raise Exception("Please provide a valid method for the forward pass.")

    # Slice the articles into consecutive batches of `batch_size` and
    # summarise them batch by batch.
    batches = [
        dataset[start:start + batch_size][text_column]
        for start in range(0, len(dataset), batch_size)
    ]
    summaries = process_batched(
        batches,
        'transformer',
        args,
        tokenizer=tokenizer,
        model=model,
        forward_fn=forward_fn,
        return_scores=False,
    )

end_time = time.time()

# Score the generated summaries against the reference highlights.
print("Computing the ROUGE scores...")
rouge = evaluate.load('rouge')
scores = rouge.compute(
    predictions=summaries,
    references=dataset[summary_column],
    use_aggregator=True,
)

# Report the aggregated scores and the summarisation wall-clock time.
print(f"ROUGE-1: {scores['rouge1']}")
print(f"ROUGE-2: {scores['rouge2']}")
print(f"ROUGE-L: {scores['rougeL']}")
print(f"Time elapsed: {end_time - start_time} seconds.")

ImportError: cannot import name 'clustsum_batched' from 'src.clustsum' (/home/ereverter/projects/clustsum/src/clustsum.py)

In [None]:
# ROUGE-1: 0.26491231863673464
# ROUGE-2: 0.0899528567537094
# ROUGE-L: 0.18028466915316618
# Time elapsed: 53.298954486846924 seconds.

In [None]:
import gzip
import os

from joblib import Parallel, delayed
import torch

def get_distance(s1, s2, cs1, cs2):
    """Return the normalized compression distance between two sentences.

    Args:
        s1: First sentence.
        s2: Second sentence.
        cs1: Compressed form of ``s1`` (only its length is used).
        cs2: Compressed form of ``s2`` (only its length is used).

    Returns:
        ``(C(s1 + ' ' + s2) - min(C(s1), C(s2))) / max(C(s1), C(s2))`` where
        ``C`` is the compressed length.
    """
    size_first, size_second = len(cs1), len(cs2)
    # Compress the two sentences together, separated by a single space.
    joint_size = len(get_compression(' '.join((s1, s2))))
    smaller = min(size_first, size_second)
    larger = max(size_first, size_second)
    return (joint_size - smaller) / larger

def get_compression(sent):
    """Return the gzip-compressed bytes of ``sent`` encoded as UTF-8."""
    raw_bytes = sent.encode('utf-8')
    return gzip.compress(raw_bytes)

def compute_sample_distances(samples_chunk):
    """Compute the pairwise distance matrix of every sample in a chunk.

    Args:
        samples_chunk: Iterable of samples, each passed to
            ``compute_single_sample_distance``.

    Returns:
        List with one distance matrix per sample, in input order.
    """
    chunk_matrices = []
    for doc in samples_chunk:
        chunk_matrices.append(compute_single_sample_distance(doc))
    return chunk_matrices

def compute_single_sample_distance(sample):
    """Build the full pairwise distance matrix for one sample.

    Args:
        sample: Sequence of sentences.

    Returns:
        Nested list ``m`` with ``m[i][j] = get_distance(sample[i], sample[j], ...)``
        for every ordered pair of sentences, including ``i == j``.
    """
    # Compress each sentence once up front; get_distance only needs the sizes.
    compressed = [get_compression(sentence) for sentence in sample]

    matrix = []
    for left, left_compressed in zip(sample, compressed):
        row = [
            get_distance(left, right, left_compressed, right_compressed)
            for right, right_compressed in zip(sample, compressed)
        ]
        matrix.append(row)
    return matrix

def get_compression_distance(samples):
    """Compute one pairwise compression-distance matrix per sample, in parallel.

    Args:
        samples: List of samples, each a list of sentences.

    Returns:
        List of float tensors, one per sample and in input order, each built
        from the nested-list matrix produced for that sample. An empty input
        yields an empty list.
    """
    n_samples = len(samples)
    if n_samples == 0:
        # Nothing to compute; a zero chunk size would also make range() raise.
        return []

    n_cores = -1  # this uses all the available cores

    # -1 is joblib's "all cores" sentinel, not a worker count: dividing by
    # abs(-1) would put every sample in a single chunk and run serially.
    # Size the chunks from the real CPU count instead.
    n_workers = os.cpu_count() or 1
    chunk_size = max(1, -(-n_samples // n_workers))  # ceiling division, never 0

    # Here, samples is a list of list of sentences
    # Split samples into roughly one chunk per worker
    samples_chunks = [samples[i:i + chunk_size] for i in range(0, n_samples, chunk_size)]

    results = Parallel(n_jobs=n_cores)(
        delayed(compute_sample_distances)(samples_chunk) for samples_chunk in samples_chunks
    )

    # Flatten the per-chunk results back into one matrix per sample
    results = [item for sublist in results for item in sublist]

    # Convert each result to a torch tensor
    tensors = [torch.tensor(distances_2d, dtype=torch.float) for distances_2d in results]

    return tensors

# Smoke test: two toy samples (three and two short strings) yield one
# distance matrix each.
samples = [
    ["some strings", "to test", "gzip compression"],
    ["another text", "for testing"],
]
print(get_compression_distance(samples))


In [6]:
from src.embeddings import get_batched_embeddings
from transformers import AutoTokenizer, AutoModel
from src.embeddings import forward_fn_pooler
from src.config import Configuration
    
# Default project configuration plus the bert-base-uncased encoder and its
# tokenizer.
config = Configuration()
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Two toy documents with two and three sentences respectively.
sents = [
    ['This is a sentence.', 'This is another sentence.'],
    ['This is a third sentence.', 'This is a fourth sentence.', 'This is a fifth sentence.'],
]
# Embed the sentences, passing forward_fn_pooler as the forward function.
x = get_batched_embeddings(sents, tokenizer, model, forward_fn_pooler, config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Show the value returned by get_batched_embeddings in the previous cell.
x

(tensor([[-0.9458, -0.4702, -0.8006,  ..., -0.7225, -0.6905,  0.9400],
         [-0.9115, -0.4632, -0.6918,  ..., -0.4507, -0.6873,  0.9250]]),
 tensor([[-0.8959, -0.4844, -0.7792,  ..., -0.6493, -0.7214,  0.8954],
         [-0.9014, -0.4858, -0.7883,  ..., -0.6132, -0.7288,  0.8970]]))

In [1]:
from src.embeddings import get_compression_distance

# Two toy documents with two and three sentences respectively.
sents = [
    ['This is a sentence.', 'This is another sentence.'],
    ['This is a third sentence.', 'This is a fourth sentence.', 'This is a fifth sentence.'],
]
# Pairwise compression distances, computed per document.
y = get_compression_distance(sents)

In [2]:
# Show the value returned by get_compression_distance in the previous cell.
y

[tensor([[0.1081, 0.2791],
         [0.2791, 0.0930]]),
 tensor([[0.0930, 0.2727, 0.2558],
         [0.2727, 0.0909, 0.2045],
         [0.2558, 0.2045, 0.0930]])]

In [5]:
# Shape of the first document's distance matrix.
# NOTE(review): the saved output for this cell is a TypeError ('generator'
# object is not subscriptable), so `y` was a generator when it last ran --
# re-run the cell that assigns `y` first and confirm what
# get_compression_distance returns.
y[0].shape

TypeError: 'generator' object is not subscriptable

In [1]:
# Run the standalone evaluation script with --subset 100, --method
# 'compression' and --sum_size 3 on the cnn_dailymail dataset.
# NOTE(review): --checkpoint, --max_length and --embedding_from look
# transformer-specific; presumably unused by the compression method --
# confirm in eval.py.
!python eval.py \
    --subset 100 \
    --device 'cuda' \
    --alpha 1 \
    --beta 1 \
    --gamma 1 \
    --tau 0.95 \
    --method 'compression' \
    --checkpoint 'microsoft/deberta-v3-base' \
    --max_length 512 \
    --embedding_from 'cls' \
    --dataset 'cnn_dailymail' \
    --sum_size 3

Loading the cnn_dailymail dataset...
Found cached dataset cnn_dailymail (/home/ereverter/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Getting the summaries with the compression...
100%|█████████████████████████████████████████| 100/100 [01:10<00:00,  1.43it/s]
Computing the ROUGE scores...
ROUGE-1: 0.26727255751607787
ROUGE-2: 0.09335021273243681
ROUGE-L: 0.1829428013795184
