In [2]:
import numpy as np
import pandas as pd 
import sklearn
from sklearn.model_selection import train_test_split
import torch

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    )

class Preprocessing:
    """Data preprocessing"""
    def __init__(self):
        self.model_name = "facebook/bart-large-cnn"
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    
    def data_tokenize(self, data, batch_size = 2, max_source_length=None, max_target_length=None):
        source_tokenized = []
        target_tokenized = []
        
        num_batches = (len(data) + batch_size - 1) // batch_size  # Calculate the number of batches
    
        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(data))
            batch_data = data[start_idx:end_idx]  # Extract a batch of data
            
            for _, row in batch_data.iterrows():
                source, target = row["original_text"], row["reference_summary"]
                source_encoded = self.tokenizer(
                    source, padding="max_length", truncation=True, max_length=max_source_length, return_tensors="pt"
                )["input_ids"]
                target_encoded = self.tokenizer(
                    target, padding="max_length", truncation=True, max_length=max_target_length, return_tensors="pt"
                )["input_ids"]
                
                source_tokenized.append(source_encoded)
                target_tokenized.append(target_encoded)
        
        return source_tokenized, target_tokenized
    
    def data_embed(self, source_tokenized, target_tokenized):
        source_embeddings = []
        target_embeddings = []
        
        for source_input_ids, target_input_ids in zip(source_tokenized, target_tokenized):
            eos_token_id_tensor = torch.tensor([[self.tokenizer.eos_token_id]])
            
            source_outputs = self.model(input_ids=source_input_ids, decoder_input_ids=eos_token_id_tensor)
            target_outputs = self.model(input_ids=target_input_ids, decoder_input_ids=eos_token_id_tensor)
            
            source_embeddings.append(source_outputs.encoder_last_hidden_state)
            target_embeddings.append(target_outputs.encoder_last_hidden_state)
        
        return source_embeddings, target_embeddings

In [4]:
source_tokens, target_tokens = Preprocessing().data_tokenize(data, batch_size=2, max_source_length=128, max_target_length=64)
source_embeds, target_embeds = Preprocessing().data_embed(source_tokens, target_tokens)

In [5]:
import os
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

class Training:
    """Model training"""
    def __init__(self, source_embeddings, target_embeddings, output_dir="results", logging_dir=None):
        # Initialize the class attributes and specify the output and logging directories
        self.output_dir = output_dir
        
        os.makedirs(self.output_dir, exist_ok=True)  # Create output directory if it doesn't exist
        
        self.source_embeddings = source_embeddings
        self.target_embeddings = target_embeddings
        
        self.model_name = "facebook/bart-large-cnn"
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=self.model)
        
        # Combine source and target embeddings into datasets
        self.train_dataset = [{"source": source, "target": target} for source, target in zip(source_embeddings, target_embeddings)]
        self.eval_dataset = self.train_dataset  # For simplicity, use the same data for evaluation
        
        self.training_args = Seq2SeqTrainingArguments(
            output_dir=self.output_dir,
            num_train_epochs=1,
            do_train=True,
            do_eval=True,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=500,
            weight_decay=0.1,
            label_smoothing_factor=0.1,
            predict_with_generate=True,
            logging_dir=logging_dir,
            logging_steps=50,
            save_total_limit=3,
        )
        
        self.trainer = Seq2SeqTrainer(
            model=self.model,
            args=self.training_args,
            data_collator=self.data_collator,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self.tokenizer
        )
        
    def training(self):
        try:
            self.trainer.train()
        except Exception as e:
            print(f"An error occurred during training: {e}")
        
    def saving(self, model_name='GGM'):
        try:
            self.trainer.save_model(model_name)
            print(f"Model saved successfully as '{model_name}'")
        except Exception as e:
            print(f"An error occurred while saving the model: {e}")


In [6]:

# Example usage:
# Instantiate the Training class with source and target embeddings
training = Training(source_embeds, target_embeds, logging_dir= "C:\\Users\\tmanz\\Bureau\\M2_DS\\Hackathon\\logs")

# Train the model
training.training()

# Save the trained model
training.saving()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl

In [46]:
class CustomSummarizer:
    def __init__(self, model_name_or_path, device):
        self.model_name_or_path = "C:\\Users\\tmanz\\OneDrive - Aix-Marseille Université\\Bureau\\M2_DS\\Hackathon\\GGM"
        self.device = 'cpu'
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name_or_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        
    
    def summarize(self, text, max_length=128, min_length=10, num_beams=4):
        input_ids = self.tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding="max_length").input_ids.to(self.device)
        
        # Generate summary
        summary_ids = self.model.generate(
            input_ids=input_ids,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=3,
            length_penalty=2.0,
            decoder_start_token_id=self.model.config.decoder.pad_token_id,
            eos_token_id=self.model.config.eos_token_id,
            pad_token_id=self.model.config.pad_token_id
        )
        
        # Decode the generated summary
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        return summary


In [47]:

# Example usage:
summarizer = CustomSummarizer("GGM", device="cuda" if torch.cuda.is_available() else "cpu")
summary = summarizer.summarize("Your input text goes here.")
print("Summary:", summary)

AttributeError: 'BartConfig' object has no attribute 'decoder'

In [48]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class CustomSummarizer:
    def __init__(self, model_name_or_path, device):
        self.model_name_or_path = model_name_or_path
        self.device = device
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name_or_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        
    
    def summarize(self, text, max_length=128, min_length=10, num_beams=4):
        input_ids = self.tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding="max_length").input_ids.to(self.device)
        
        # Generate summary
        summary_ids = self.model.generate(
            input_ids=input_ids,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=3,
            length_penalty=2.0,
            )
        
        # Decode the generated summary
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        return summary

Summary: Your input text goes here. Your input text will be used in the next edition of this column.


In [50]:
summarizer.summarize("AH owns all foreground intellectual property and know-how, if any, generated by the outcome of the analysis of the Data, Processed data and/or Anonymous data.")

'AH owns all foreground intellectual property and know-how, if any, generated by the outcome of the analysis of the Data, Processed data and/or Anonymous data.'

In [None]:

# Example usage:
summarizer = CustomSummarizer("C:\\Users\\tmanz\\OneDrive - Aix-Marseille Université\\Bureau\\M2_DS\\Hackathon\\GGM", device="cpu")
summary = summarizer.summarize("Your input text goes here.")
print("Summary:", summary)


In [16]:
import pandas as pd
data = pd.read_excel("C:\\Users\\tmanz\\OneDrive - Aix-Marseille Université\\Bureau\\M2_DS\\Hackathon\\data\\train_set.xlsx")
# data = data[0]

In [17]:
data =data.drop(['Unnamed: 0', 'uid'], axis =1)

In [19]:
data[0]

KeyError: 0

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")


# def tokenize_function(examples):
#     return tokenizer(examples["original_text"], padding="max_length", truncation=True)


# tokenized_datasets = data.map(tokenize_function, batched=True)

In [4]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["original_text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["reference_summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [5]:
preprocess_function(data)

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).