In [1]:
import os

In [2]:
!pwd

/home/gourav/ML/QA_Models_Builder/research


In [3]:
os.chdir("../")

In [4]:
!pwd

/home/gourav/ML/QA_Models_Builder


In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen= True)
class DataProcessing:
    raw_data_dir: Path
    processed_data_dir : Path
    split_data_dir : Path

    context_col: str
    question_col : str
    answer_col : str
    answer_start_col : str

    train_data_size: float
    val_data_size: float



In [6]:
from src.BERT.constants import *
from src.BERT.utils.common import read_yaml, create_directories, save_json

In [7]:
class ConfigurationManager:

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_processing_config(self) ->DataProcessing:

        config = self.config.data_processing
        params = self.params.data_processing

        create_directories([config.processed_data_dir, config.split_data_dir])

        data_processing_config = DataProcessing(
            raw_data_dir= config.raw_data_dir,
            processed_data_dir= config.processed_data_dir,
            split_data_dir= config.split_data_dir,

            context_col= params.context_col,
            question_col= params.question_col,
            answer_col= params.answer_col,
            answer_start_col= params.answer_start_col,

            train_data_size= params.train_data_size,
            val_data_size= params.val_data_size
        )


        return data_processing_config


In [8]:
import pandas as pd
from src.BERT.utils.common import save_json, load_json, join_path

In [20]:
class DataPreprocessing():
    def __init__(self, config = DataProcessing):
        self.config = config

    def find_word_index(self, context, answer):
        for word in answer.split(" "):
            res = context.find(word)
            if res != -1:
                return res
        return -1
    
    def create_json_dataset(self, context_list, question_list, answer_list , ans_start_list):

        data = []
        ids = 1
        
        for i in range(len(context_list)):
            context = context_list[i]
            question = question_list[i]
            answer = answer_list[i]

            if ans_start_list == None:
                if context.find(answer) == -1:
                    ans_start = self.find_word_index(context, answer)
                else:
                    ans_start = context.find(answer)
            else:
                ans_start = ans_start_list[i]

            
            q_a_dict = {
                            "id": str(ids).zfill(5),
                            "is_impossible": False,
                            "question": question,
                            "answers": [
                                {
                                    "text": answer,
                                    "answer_start": int(ans_start),
                                }
                            ]
                    }
            ids = ids + 1

            row_dict = {
                "context" : context,
                "qas": q_a_dict
            }

            data.append(row_dict)

        return data

    
    def get_processed_data(self):

        data_file = os.listdir(self.config.raw_data_dir)[0]
        data_file_path = join_path(self.config.raw_data_dir, data_file)

        df = pd.read_csv(data_file_path, nrows= 10)

        context_list = df[self.config.context_col].to_list()
        question_list = df[self.config.question_col].to_list()
        answer_list = df[self.config.answer_col].to_list()
        answer_start_list = df[self.config.answer_start_col].to_list()

        processed_data = self.create_json_dataset(context_list,question_list, answer_list, answer_start_list)

        
        save_json(Path(os.path.join(self.config.processed_data_dir, "processed_data.json")), processed_data)

    
    def get_split_data(self):
        
        processed_data = load_json(Path(join_path(self.config.processed_data_dir, "processed_data.json")))

        train_range = int((100*self.config.train_data_size)/ len(processed_data))
        val_range = int((100*self.config.val_data_size)/ len(processed_data))

        train_data = processed_data[:train_range]
        val_data = processed_data[-(val_range):]

        
        # save training data
        save_json(Path(join_path(self.config.split_data_dir, "train_data.json")), train_data)

        # save val data
        save_json(Path(join_path(self.config.split_data_dir, "val_data.json")), val_data)


        


In [21]:
config = ConfigurationManager()

data_processing_config = config.get_data_processing_config()

data_preprocessing = DataPreprocessing(data_processing_config)
data_preprocessing.get_processed_data()

data_preprocessing.get_split_data()

[2024-05-15 14:08:17,264: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-05-15 14:08:17,273: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-15 14:08:17,282: INFO: common: already created directory: artifacts]
[2024-05-15 14:08:17,285: INFO: common: already created directory: artifacts/processed_data]
[2024-05-15 14:08:17,295: INFO: common: already created directory: artifacts/split_data]
Index(['Unnamed: 0', 'context', 'question', 'id', 'answer_start', 'text'], dtype='object')
context
<class 'str'>
Processing data started
artifacts/processed_data
artifacts/processed_data/processed_data.json
[2024-05-15 14:08:17,312: INFO: common: json file saved at: artifacts/processed_data/processed_data.json]
[2024-05-15 14:08:17,316: INFO: common: json file loaded succesfully from: artifacts/processed_data/processed_data.json]
[2024-05-15 14:08:17,321: INFO: common: json file saved at: artifacts/split_data/train_data.json]
[2024-05-15 14:08:17,324: INFO: c

In [12]:
try:
    config = ConfigurationManager()

    data_processing_config = config.get_data_processing_config()

    data_preprocessing = DataPreprocessing(data_processing_config)
    data_preprocessing.get_processed_data()

    data_preprocessing.get_split_data()

except Exception as e:
    print(e)

[2024-05-15 14:04:29,394: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-05-15 14:04:29,400: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-15 14:04:29,402: INFO: common: already created directory: artifacts]
[2024-05-15 14:04:29,404: INFO: common: already created directory: artifacts/processed_data]
[2024-05-15 14:04:29,405: INFO: common: already created directory: artifacts/split_data]
'context'
