In [1]:
import os
# Point Hugging Face downloads at hf-mirror.com (presumably a Hub mirror).
# Set here, ahead of the cells that import `datasets` / `transformers`.
os.environ.update(HF_ENDPOINT="https://hf-mirror.com")

In [2]:
# Switch the working directory to the parent of the folder the kernel started
# in (presumably the project root, so the relative paths used by later cells
# resolve from there).
#
# The starting directory is recorded only the first time this cell runs, so
# executing the cell again lands in the same place instead of climbing one
# more level on every run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Identifier passed to `AutoTokenizer.from_pretrained`.
    tokenizer_name: str
    # Location of the saved dataset that `load_from_disk` reads.
    dataset_dir: Path
    # Destination passed to `save_to_disk` for the tokenized features.
    save_to_dir: Path

In [4]:
from pathlib import Path
from textSummarizer.constants import CONFIG_FILE, PARAM_FILE
from textSummarizer.utils import read_yaml


class ConfigManager:
    """Reads the project's YAML settings and turns them into typed stage configs."""

    def __init__(self, config_file: Path = CONFIG_FILE, param_file: Path = PARAM_FILE):
        """Load both YAML files and make sure the artifact root directory exists.

        Args:
            config_file: Path of the main configuration YAML.
            param_file: Path of the parameters YAML.
        """
        self.config = read_yaml(config_file)
        self.param = read_yaml(param_file)

        # Create the artifact root up front; an existing directory is fine.
        artifact_root = Path(self.config.artifact_root)
        artifact_root.mkdir(parents=True, exist_ok=True)

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded configuration.

        Returns:
            A `DataTransformationConfig` populated from the
            `data_transformation` section, with both directory entries
            wrapped in `Path`.
        """
        section = self.config.data_transformation

        dataset_dir = Path(section.dataset_dir)
        tokenizer_name = section.tokenizer_name
        save_to_dir = Path(section.save_to_dir)

        return DataTransformationConfig(
            tokenizer_name=tokenizer_name,
            dataset_dir=dataset_dir,
            save_to_dir=save_to_dir,
        )

In [5]:
import os
import urllib.request as request
import zipfile
from textSummarizer.logging import logger
from datasets import load_from_disk
from transformers import AutoTokenizer

class DataTransformation:
    """Tokenizes a saved dialogue/summary dataset and writes the features to disk.

    The tokenizer named in the config is loaded once at construction time and
    reused for every batch.
    """

    # Truncation limits, in tokens, for the dialogue inputs and summary labels.
    # Kept as class attributes so a subclass or instance can override them.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage settings and load the tokenizer.

        Args:
            config: Settings providing `tokenizer_name`, `dataset_dir` and
                `save_to_dir`.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_example_to_feature(self, examples):
        """Tokenize one batch of examples into model inputs and labels.

        Args:
            examples: Mapping with "dialogue" and "summary" entries. Because
                `transform` maps with `batched=True`, each entry is expected
                to hold a list of strings.

        Returns:
            A dict with "input_ids" and "attention_mask" from the tokenized
            dialogues and "labels" from the tokenized summaries.
        """
        dialogue_encoding = self.tokenizer(
            examples["dialogue"],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )
        # `text_target=` replaces the deprecated `as_target_tokenizer()`
        # context manager for encoding labels.
        summary_encoding = self.tokenizer(
            text_target=examples["summary"],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            "input_ids": dialogue_encoding["input_ids"],
            "labels": summary_encoding["input_ids"],
            "attention_mask": dialogue_encoding["attention_mask"],
        }

    def transform(self):
        """Load the dataset, tokenize it, and save the result.

        Reads from `config.dataset_dir`, applies `convert_example_to_feature`
        in batches, creates the parent directory of `config.save_to_dir` if
        needed, and writes the features to `config.save_to_dir`.
        """
        logger.info("Tranforming samples to features...")
        dataset = load_from_disk(self.config.dataset_dir)
        feature = dataset.map(self.convert_example_to_feature, batched=True)

        # Only the parent is created here; `save_to_disk` is handed the target path itself.
        self.config.save_to_dir.parent.mkdir(parents=True, exist_ok=True)
        logger.info(f"Saving features to {self.config.save_to_dir}...")
        feature.save_to_disk(self.config.save_to_dir)
        


  from .autonotebook import tqdm as notebook_tqdm


2024-03-01 13:38:43,687 - INFO - config - PyTorch version 2.2.1 available.


In [7]:

# Load the YAML settings using the default file locations.
config_manager = ConfigManager()

# Pull out the settings for this stage and build the stage object from them.
data_transformation_config = config_manager.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

# Tokenize the saved dataset and write the features to disk.
data_transformation.transform()

2024-03-01 13:39:36,130 - INFO - __init__ - Reading configs\config.yaml ......
2024-03-01 13:39:36,133 - INFO - __init__ - Reading params\param.yaml ......
2024-03-01 13:39:37,452 - INFO - 1506944596 - Tranforming samples to features...


Map: 100%|██████████| 14732/14732 [00:04<00:00, 2989.05 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 2934.14 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 2735.51 examples/s]

2024-03-01 13:39:43,095 - INFO - 1506944596 - Saving features to artifacts\data_transformation\samsum_dataset...



Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 186270.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 58512.21 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 58436.79 examples/s]
