In [1]:
import os

In [2]:
# Move the working directory one level up so the relative paths used below
# (e.g. "artifacts/...") resolve from the parent directory.
# NOTE(review): this is relative to the current directory, so re-running the
# cell moves up one more level each time — run it exactly once per kernel.
os.chdir("../")

In [3]:
%pwd

'/Users/macbookpro/Documents/semantic_preprocessor_model/semantic_preprocessor_model'

# Config

In [None]:
# Configuration related to data transformation
data_transformation:
  # Directory where data transformation results and artifacts are stored
  root_dir: artifacts/data_transformation
  
  # Path to the ingested data file that will be used for transformation
  data_source_file: artifacts/data_ingestion/data.csv

  # Path to the data validation status file, read before the transformation stage runs
  data_validation: artifacts/data_validation/status.txt

# Entity

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Configuration for the data transformation process.
    
    Immutable (frozen) container for the paths the data transformation stage
    needs: where to write its artifacts, which ingested file to transform, and
    which validation status file to consult.
    
    Attributes:
    - root_dir: Directory where data transformation results and artifacts are stored.
    - data_source_file: Path to the file where the ingested data is stored that needs to be transformed.
    - data_validation: Path to the status file written by the data validation stage.
    """
    
    root_dir: Path  # Directory for storing transformation results and related artifacts
    data_source_file: Path  # Path to the ingested data file for transformation
    data_validation: Path # Path to the data validation status file (not a data output)

# Configuration Manager

In [5]:
from src.semantic_preprocessor_model.constants import *
from src.semantic_preprocessor_model.utils.common import read_yaml, create_directories
from src.semantic_preprocessor_model import logger
from src.semantic_preprocessor_model.entity.config_entity import DataValidationConfig, DataTransformationConfig
import os

class ConfigurationManager:
    """
    The ConfigurationManager manages configuration settings needed throughout the data 
    pipeline processes, such as data transformation.

    It reads configuration, parameter, and schema settings from specified files and provides 
    a set of methods to access these settings. Additionally, it ensures that the required 
    directories specified in the configurations are created.
    """
    
    def __init__(self, 
                 config_filepath=CONFIG_FILE_PATH, 
                 params_filepath=PARAMS_FILE_PATH, 
                 schema_filepath=SCHEMA_FILE_PATH) -> None:
        """
        Initialize ConfigurationManager with configurations, parameters, and schema.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the schema file.

        Creates:
        - The artifacts root directory named in the configuration, if it doesn't exist.

        Raises:
        - Exception: Whatever `read_yaml` raises when one of the files cannot be read.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "initial_schema")

        # Ensure the directory for storing artifacts exists
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read a configuration file and return its content.

        Args:
        - filepath (str): Path to the configuration file.
        - config_name (str): Name of the configuration (for logging purposes).

        Returns:
        - dict: Configuration settings. The result is accessed by attribute
          elsewhere in this class, so `read_yaml` presumably returns an
          attribute-access mapping rather than a plain dict — TODO confirm.

        Raises:
        - Exception: If there's an error reading the file (logged, then re-raised).
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Extract and return data transformation configurations as a DataTransformationConfig object.

        This method fetches settings related to data transformation, like directories and file paths,
        creates the stage's root directory, and returns the settings as a
        DataTransformationConfig object.

        Returns:
        - DataTransformationConfig: Object containing data transformation configuration settings.

        Raises:
        - AttributeError: If the 'data_transformation' section, or one of its
          'root_dir' / 'data_source_file' / 'data_validation' keys, is missing.
        """
        # Look up the section on its own so the "section missing" message is
        # only logged when the section itself is what is missing.
        try:
            config = self.config.data_transformation
        except AttributeError:
            logger.error("The 'data_transformation' attribute does not exist in the config file.")
            raise

        # Read every required key before touching the filesystem, so an
        # incomplete section is reported accurately and leaves no directory behind.
        try:
            root_dir = config.root_dir
            data_source_file = config.data_source_file
            data_validation = config.data_validation
        except AttributeError as e:
            logger.error(f"Missing key in the 'data_transformation' config section: {e}")
            raise

        # Ensure the root directory for data transformation exists
        create_directories([root_dir])

        # Construct and return the DataTransformationConfig object
        return DataTransformationConfig(
            root_dir=Path(root_dir),
            data_source_file=Path(data_source_file),
            data_validation=Path(data_validation),
        )


# Component

In [16]:
from src.semantic_preprocessor_model import logger
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, save_npz
import nltk


class DataTransformation:
    """
    Class responsible for transforming the ingested dataset through various processes:
    - Text preprocessing
    - Handling missing values
    - TF-IDF vectorization of text features
    - Data filtering
    - Dataset splitting
    - Saving the transformed datasets
    """

    def __init__(self, config: "DataTransformationConfig"):
        """
        Initializes the DataTransformation component, loads data, and sets up prerequisites.

        Construction performs I/O: it reads the source CSV, makes sure the NLTK
        resources are available, and fills missing 'upper_works' values.
        
        Args:
        - config (DataTransformationConfig): Configuration settings for data transformation.
        """
        self.config = config
        self.df = self._load_data()
        self._download_nltk_resources()
        self._initialize_stop_words()
        self._handle_missing_values()

    def _load_data(self) -> pd.DataFrame:
        """
        Load data from the specified source file.

        Returns:
        - pd.DataFrame: Loaded data.

        Raises:
        - FileNotFoundError: If the source file does not exist (logged, then re-raised).
        """
        try:
            return pd.read_csv(self.config.data_source_file)
        except FileNotFoundError:
            logger.error(f"File not found: {self.config.data_source_file}")
            raise

    def _download_nltk_resources(self):
        """Download necessary NLTK resources if they aren't present."""
        required_resources = (
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
        )
        for resource_path, package_name in required_resources:
            # nltk.data.find signals a missing resource by raising LookupError,
            # not by returning a falsy value, so the check must be a try/except.
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package_name)

    def _initialize_stop_words(self):
        """Initialize a set of Russian stop words."""
        self.stop_words = set(stopwords.words('russian'))

    def _handle_missing_values(self):
        """Handle missing values by replacing NaN in 'upper_works' column with 'Unknown'."""
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the in-place form acts on an intermediate object and
        # does not update self.df when pandas Copy-on-Write is active.
        self.df['upper_works'] = self.df['upper_works'].fillna('Unknown')

    def preprocess_text(self, text: str) -> str:
        """
        Tokenize, convert to lowercase, and filter out stop words from the text.

        Only purely alphabetic tokens are kept, so numbers and punctuation are
        dropped along with the stop words.
        
        Args:
        - text (str): Input text. Must be a string: a NaN cell would fail on
          `.lower()`.

        Returns:
        - str: Processed text (remaining tokens joined by single spaces).
        """
        tokens = word_tokenize(text.lower(), language='russian')
        tokens = [word for word in tokens if word.isalpha() and word not in self.stop_words]
        return ' '.join(tokens)

    def apply_text_preprocessing(self):
        """Apply text preprocessing to 'work_name' and 'upper_works' columns."""
        # NOTE(review): only 'upper_works' has its NaNs filled in __init__;
        # 'work_name' is assumed to contain no missing values — confirm upstream.
        self.df['processed_work_name'] = self.df['work_name'].apply(self.preprocess_text)
        self.df['processed_upper_works'] = self.df['upper_works'].apply(self.preprocess_text)

    def vectorize_text_features(self, max_features: int = 5000) -> csr_matrix:
        """
        Vectorize text features using TF-IDF and combine them.

        Each processed column gets its own vectorizer; the two matrices are
        stacked side by side, so row i of the result corresponds to row i
        (by position) of self.df.

        Args:
        - max_features (int): Vocabulary size cap per vectorizer (default: 5000).

        Returns:
        - csr_matrix: Combined TF-IDF features.
        """
        # NOTE(review): both vectorizers are fitted on the full dataset before
        # the train/validation split and are not kept after this call.
        vectorizer_work_name = TfidfVectorizer(max_features=max_features)
        tfidf_work_name = vectorizer_work_name.fit_transform(self.df['processed_work_name'])

        vectorizer_upper_works = TfidfVectorizer(max_features=max_features)
        tfidf_upper_works = vectorizer_upper_works.fit_transform(self.df['processed_upper_works'])

        return hstack([tfidf_work_name, tfidf_upper_works]).tocsr()

    def filter_data(self, combined_tfidf_features_csr) -> tuple:
        """
        Filter out singleton classes and rows with missing 'generalized_work_class' values.

        Singleton classes (exactly one labelled row) are removed because the
        later stratified split is requested on these labels.

        Args:
        - combined_tfidf_features_csr (csr_matrix): Combined TF-IDF features, one
          row per row of self.df in the same order.

        Returns:
        - pd.DataFrame, csr_matrix: Filtered data and corresponding TF-IDF features.
        """
        labels = self.df['generalized_work_class']

        # value_counts ignores NaN by default, so this counts labelled rows only.
        class_counts = labels.value_counts()
        singleton_classes = class_counts[class_counts == 1].index

        keep_mask = labels.notna() & ~labels.isin(singleton_classes)
        filtered_data = self.df[keep_mask]

        # Select matrix rows by position, not by index label: the labels only
        # coincide with positions when self.df has a default RangeIndex.
        row_positions = keep_mask.to_numpy().nonzero()[0]
        return filtered_data, combined_tfidf_features_csr[row_positions, :]

    def split_data(self, X, y, test_size: float = 0.2, random_state: int = 42) -> tuple:
        """
        Split data into training and test sets with stratification.

        Args:
        - X (csr_matrix): Features.
        - y (pd.Series): Labels.
        - test_size (float): Fraction of rows held out (default: 0.2).
        - random_state (int): Seed for a reproducible split (default: 42).

        Returns:
        - tuple: (X_train, X_test, y_train, y_test).
        """
        return train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

    def _save_datasets(self, X_train, y_train, X_val, y_val, train_features_filename: str, train_labels_filename: str, val_features_filename: str, val_labels_filename: str):
        """
        Save transformed datasets to specified paths in sparse format.

        Args:
        - X_train (csr_matrix): Training features.
        - y_train (pd.Series): Training labels.
        - X_val (csr_matrix): Validation features.
        - y_val (pd.Series): Validation labels.
        - train_features_filename (str): Name of the file for saving training features.
        - train_labels_filename (str): Name of the file for saving training labels.
        - val_features_filename (str): Name of the file for saving validation features.
        - val_labels_filename (str): Name of the file for saving validation labels.

        Raises:
        - Exception: Any error raised while writing the files (logged, then re-raised).
        """
        train_features_output_path = self.config.root_dir / train_features_filename
        train_labels_output_path = self.config.root_dir / train_labels_filename
        val_features_output_path = self.config.root_dir / val_features_filename
        val_labels_output_path = self.config.root_dir / val_labels_filename
        
        try:
            # Save the training and validation features as csr matrices
            save_npz(train_features_output_path, X_train)
            save_npz(val_features_output_path, X_val)
            logger.info("Training and validation features saved in NPZ format.")

            # Save the training and validation labels as CSV files
            y_train.to_csv(train_labels_output_path, index=False)
            y_val.to_csv(val_labels_output_path, index=False)
            logger.info("Training and validation labels saved in CSV format.")

        except Exception as e:
            logger.error(f"Error while saving datasets: {e}")
            # Re-raise: a stage whose artifacts were not written must not be
            # reported as successful by the caller.
            raise

    def transform(self, 
                train_features_filename: str = "train_features.npz", 
                train_labels_filename: str = "train_labels.csv", 
                val_features_filename: str = "val_features.npz", 
                val_labels_filename: str = "val_labels.csv") -> tuple:
        """
        Execute entire data transformation pipeline.

        Steps, in order: text preprocessing, TF-IDF vectorization, filtering,
        stratified splitting, and saving the four resulting datasets under
        the configured root directory.

        Args:
        - train_features_filename (str): Name for saving training features (default: "train_features.npz").
        - train_labels_filename (str): Name for saving training labels (default: "train_labels.csv").
        - val_features_filename (str): Name for saving validation features (default: "val_features.npz").
        - val_labels_filename (str): Name for saving validation labels (default: "val_labels.csv").

        Returns:
        - tuple: (X_train, X_val, y_train, y_val).
        """
        logger.info("Applying text processing")
        self.apply_text_preprocessing()

        logger.info("Vectorizing text features")
        combined_tfidf_features_csr = self.vectorize_text_features()

        logger.info("Filtering combined features")
        filtered_data, filtered_features = self.filter_data(combined_tfidf_features_csr)

        logger.info("Splitting data")
        X_train, X_val, y_train, y_val = self.split_data(filtered_features, filtered_data['generalized_work_class'])

        logger.info("Saving to artifacts")
        self._save_datasets(X_train, y_train, X_val, y_val, train_features_filename, train_labels_filename, val_features_filename, val_labels_filename)
        
        return X_train, X_val, y_train, y_val




# Pipeline

In [18]:
from src.semantic_preprocessor_model import logger

class DataTransformationPipeline:
    """
    Orchestrates data transformation processes:
    - Text preprocessing
    - Missing value handling
    - Text feature vectorization
    - Data filtering and splitting
    - Saving transformed datasets
    """

    STAGE_NAME = "Data Transformation Pipeline"

    def __init__(self):
        """Initialize the pipeline with a configuration manager."""
        self.config_manager = ConfigurationManager()

    def run_data_transformation(self):
        """
        Execute data transformation steps and log each stage.

        Raises:
            Exception: If any error occurs during the data transformation process
                (the error is logged and then re-raised).
        """
        try:
            logger.info("Fetching data transformation configuration...")
            data_transformation_config = self.config_manager.get_data_transformation_config()

            logger.info("Initializing data transformation...")
            data_transformer = DataTransformation(config=data_transformation_config)

            logger.info("Executing data transformation pipeline...")
            X_train, X_val, y_train, y_val = data_transformer.transform()
            
            logger.info(f"Shape of X_train: {X_train.shape}")
            logger.info(f"Shape of X_val: {X_val.shape}")
            logger.info(f"Shape of y_train: {y_train.shape}")
            logger.info(f"Shape of y_val: {y_val.shape}")

            logger.info("Data Transformation Pipeline completed successfully.")

        except Exception as e:
            logger.error(f"Error during data transformation: {e}")
            # Re-raise so the caller does not log the stage as completed
            # after a failure.
            raise

    def run_pipeline(self):
        """
        Run the entire Data Transformation Pipeline, checking data validations before proceeding.

        The stage only runs when the validation status file contains the
        "all validations passed" marker; otherwise an error is logged and the
        method returns without transforming anything.

        Raises:
            Exception: If any error occurs during the pipeline execution, including
                a missing or unreadable validation status file.
        """
        try:
            status_file = self.config_manager.get_data_transformation_config().data_validation
            with open(status_file, "r") as f:
                content = f.read()

            # Ensure the validations have passed before running the pipeline
            if "Overall Validation Status: All validations passed." in content:
                logger.info("Starting the Data Transformation Pipeline.")
                logger.info(f">>>>>> Stage: {DataTransformationPipeline.STAGE_NAME} started <<<<<<")
                self.run_data_transformation()
                logger.info(f">>>>>> Stage: {DataTransformationPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
            else:
                logger.error("Pipeline aborted due to validation errors.")

        except Exception as e:
            logger.error(f"Error during {DataTransformationPipeline.STAGE_NAME}: {e}")
            raise

# Entry point: build the pipeline (which loads the configuration files) and run
# it. run_pipeline only performs the transformation when the validation status
# file reports that all validations passed.
if __name__ == '__main__':
    pipeline = DataTransformationPipeline()
    pipeline.run_pipeline()


[2023-10-23 01:35:03,481: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-10-23 01:35:03,484: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-10-23 01:35:03,488: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-10-23 01:35:03,490: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts]
[2023-10-23 01:35:03,492: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts/data_transformation]
[2023-10-23 01:35:03,493: 59: semantic_preprocessor_model_logger: INFO: 2594316775:  Starting the Data Transformation Pipeline.]
[2023-10-23 01:35:03,495: 60: semantic_preprocessor_model_logger: INFO: 2594316775:  >>>>>> Stage: Data Transformation Pipeline started <<<<<<]
[2023-10-23 01:35:03,496: 27: semantic_preprocessor_model_logger: INFO: 2594316775:  Fetching dat