In [1]:
import os

In [2]:
!pwd

/home/gourav/ML/Text_Classification_Model_Builder/research


In [3]:
# Step up from the notebook folder to the project root so that `src` is
# importable and the relative config paths resolve. The guard makes the cell
# idempotent: a plain os.chdir("../") climbs one level higher on every re-run.
if not os.path.isdir("src"):
    os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataProcessingConfig:
    """Immutable bundle of settings consumed by the data-processing stage."""

    # Checkpoint name handed to AutoTokenizer.from_pretrained.
    model_name: str
    # Directory that is listed to find the raw CSV file.
    raw_data_dir: Path
    # Directory where processed_data.json is written and later re-read.
    processed_data_dir: Path
    # Directory that receives train_data.json, val_data.json and test_data.json.
    split_data_dir: Path

    # CSV column holding the text to tokenise.
    text_col: str
    # CSV column holding the class labels.
    label_col: str

    # Fractions of the row count; the split code multiplies them by the
    # number of rows to get the train and validation slice lengths.
    train_data_size: float
    # NOTE(review): carried in the config, but the split code in this notebook
    # gives the test set whatever is left after train and val - confirm intent.
    test_data_size: float
    val_data_size: float

In [5]:
from src.constants import *
from src.utils.common import read_yaml, create_directories


In [6]:
class ConfigurationManager:
    """Loads the project's two YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file read into ``self.config``.
            params_filepath: YAML file read into ``self.params``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_processing_config(self) -> DataProcessingConfig:
        """Assemble the settings for the data-processing stage.

        Directories come from the ``data_processing`` section of the config
        file, column names and split sizes from the ``data_processing``
        section of the params file, and the model name from the params
        file's ``train_model`` section. The processed-data and split-data
        directories are passed to ``create_directories`` before returning.

        Returns:
            DataProcessingConfig: the populated, frozen settings object.
        """
        paths_section = self.config.data_processing
        params_section = self.params.data_processing
        tokenizer_name = self.params.train_model.model_name

        output_dirs = [paths_section.processed_data_dir, paths_section.split_data_dir]
        create_directories(output_dirs)

        field_values = {
            "model_name": tokenizer_name,
            "raw_data_dir": paths_section.raw_data_dir,
            "processed_data_dir": paths_section.processed_data_dir,
            "split_data_dir": paths_section.split_data_dir,
            "text_col": params_section.text_col,
            "label_col": params_section.label_col,
            "train_data_size": params_section.train_data_size,
            "test_data_size": params_section.test_data_size,
            "val_data_size": params_section.val_data_size,
        }
        return DataProcessingConfig(**field_values)
       

In [7]:
# Build the manager (this reads both YAML files) and derive the settings
# object for the data-processing stage from it.
config = ConfigurationManager()
data_processing_config = config.get_data_processing_config()

[2024-05-25 15:33:09,550: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-05-25 15:33:09,570: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-25 15:33:09,576: INFO: common: already created directory: artifacts]
[2024-05-25 15:33:09,579: INFO: common: already created directory: artifacts/processed_data]
[2024-05-25 15:33:09,583: INFO: common: already created directory: artifacts/split_data]


In [11]:
import os
import pandas as pd
from src.utils.common import join_path, save_json, load_json
from transformers import AutoTokenizer

In [20]:
class DataProcessing:
    """Tokenises a raw CSV, encodes its labels and writes train/val/test JSON files.

    Paths, column names, the tokenizer checkpoint and the split fractions are
    all read from the ``DataProcessingConfig`` given to the constructor.
    """

    def __init__(self, config: "DataProcessingConfig") -> None:
        """Store the stage settings.

        Args:
            config: Settings object for this stage. It is required: the
                previous signature used the ``DataProcessingConfig`` class
                itself as a default value, which could never yield a usable
                instance.
        """
        self.config = config

    def get_encoded_labels(self, data):
        """Convert a label column into integer category codes.

        Args:
            data: pandas Series of labels.

        Returns:
            pandas Series of integer codes, one per row. Codes come from
            pandas' categorical dtype, so missing labels are encoded as -1.
        """
        return data.astype("category").cat.codes

    def get_encoded_text(self, data_list):
        """Tokenise a list of strings with the configured checkpoint's tokenizer.

        Args:
            data_list: List of text strings.

        Returns:
            The tokenizer output, produced with padding and truncation enabled.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        return tokenizer(data_list, padding=True, truncation=True)

    def get_processed_data(self, nrows=100):
        """Encode the raw CSV and save the result as ``processed_data.json``.

        Args:
            nrows: Maximum number of CSV rows to read. Defaults to 100, the
                value that used to be hard-coded; pass ``None`` to read the
                whole file.

        Raises:
            FileNotFoundError: If the raw data directory contains no files.
        """
        # Sort so the chosen file is deterministic; os.listdir order is arbitrary.
        raw_files = sorted(os.listdir(self.config.raw_data_dir))
        if not raw_files:
            raise FileNotFoundError(
                f"no raw data file found in {self.config.raw_data_dir}"
            )
        file_path = join_path(self.config.raw_data_dir, raw_files[0])

        df = pd.read_csv(file_path, nrows=nrows)

        # Encode the text column.
        text_list = df[self.config.text_col].to_list()
        encoded_text = self.get_encoded_text(text_list)

        # Encode the label column.
        encoded_labels = self.get_encoded_labels(df[self.config.label_col]).to_list()

        # Keep the original key order. token_type_ids is optional because not
        # every tokenizer returns it, and indexing it blindly raised KeyError.
        processed_data_dict = {"input_ids": encoded_text["input_ids"]}
        if "token_type_ids" in encoded_text.keys():
            processed_data_dict["token_type_ids"] = encoded_text["token_type_ids"]
        processed_data_dict["attention_mask"] = encoded_text["attention_mask"]
        processed_data_dict["labels"] = encoded_labels

        save_json(
            Path(join_path(self.config.processed_data_dir, "processed_data.json")),
            processed_data_dict,
        )

    def get_range_data(self, data, start_range, end_range):
        """Return a new dict holding ``value[start_range:end_range]`` for every key.

        The input mapping is left untouched, so callers no longer need to
        pass a copy to protect their data.

        Args:
            data: Mapping whose values are sliceable sequences.
            start_range: Inclusive start index of the slice.
            end_range: Exclusive end index of the slice.

        Returns:
            dict with the same keys and sliced values.
        """
        return {key: values[start_range:end_range] for key, values in data.items()}

    def get_split_data(self):
        """Split ``processed_data.json`` into train/val/test JSON files.

        Rows are split in stored order as contiguous slices: train first, then
        validation, then test. The train and validation lengths are the
        configured fractions of the row count, truncated to integers; the
        test split receives every remaining row.
        """
        processed_data = load_json(
            Path(join_path(self.config.processed_data_dir, "processed_data.json"))
        )

        data_len = len(processed_data["labels"])
        train_end_index = int(self.config.train_data_size * data_len)
        val_end_index = train_end_index + int(self.config.val_data_size * data_len)

        train_data = self.get_range_data(processed_data, 0, train_end_index)
        val_data = self.get_range_data(processed_data, train_end_index, val_end_index)
        test_data = self.get_range_data(processed_data, val_end_index, data_len)

        # Save the train split.
        save_json(Path(join_path(self.config.split_data_dir, "train_data.json")), train_data)

        # Save the validation split.
        save_json(Path(join_path(self.config.split_data_dir, "val_data.json")), val_data)

        # Save the test split.
        save_json(Path(join_path(self.config.split_data_dir, "test_data.json")), test_data)




In [21]:
# Run the stage end to end. Order matters: get_split_data() reads the
# processed_data.json file that get_processed_data() writes.
config = ConfigurationManager()
data_processing_config = config.get_data_processing_config()
data_processing = DataProcessing(config=data_processing_config)

data_processing.get_processed_data()
data_processing.get_split_data()

[2024-05-25 15:57:13,794: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-05-25 15:57:13,804: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-25 15:57:13,807: INFO: common: already created directory: artifacts]
[2024-05-25 15:57:13,809: INFO: common: already created directory: artifacts/processed_data]
[2024-05-25 15:57:13,812: INFO: common: already created directory: artifacts/split_data]


[2024-05-25 15:57:13,852: INFO: common: json file loaded succesfully from: artifacts/processed_data/processed_data.json]
[2024-05-25 15:57:14,003: INFO: common: json file saved at: artifacts/split_data/train_data.json]
[2024-05-25 15:57:14,031: INFO: common: json file saved at: artifacts/split_data/val_data.json]
[2024-05-25 15:57:14,082: INFO: common: json file saved at: artifacts/split_data/test_data.json]
