In [11]:
import os

In [12]:
%pwd

'c:\\Users\\Dheer\\OneDrive\\Desktop\\MLOps\\MediWatch_project'

In [13]:
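# os.chdir("../")  # left disabled: %pwd already reports the project root, which the relative config/artifact paths appear to expect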

In [14]:
%pwd

'c:\\Users\\Dheer\\OneDrive\\Desktop\\MLOps\\MediWatch_project'

In [15]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """
    # Directory the stage writes its outputs (train.csv / test.csv) into.
    root_dir: Path
    # Location of the CSV file the stage reads as its input.
    data_path: Path

In [16]:
from Mediwatch_project.constants import *
from Mediwatch_project.utils.common import read_yaml, create_directories

In [17]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects.

    The three YAML files (config, params, schema) are read once in
    ``__init__`` and kept on the instance as ``config``, ``params`` and
    ``schema``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the YAML files and prepare the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # create_directories is a project helper; presumably it creates the
        # listed directories when missing -- confirm against utils.common.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config object for the data-transformation stage.

        Reads the ``data_transformation`` section of the loaded config,
        passes its ``root_dir`` to ``create_directories`` and returns the
        section's values as a ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig whose ``root_dir`` and ``data_path`` are
            ``pathlib.Path`` objects, as the dataclass annotations declare.
        """
        config = self.config.data_transformation

        create_directories([config.root_dir])

        # The YAML values arrive as plain strings; wrap them so the fields
        # really hold Path objects, matching the dataclass annotations.
        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
        )

In [18]:
import os
from Mediwatch_project import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [19]:
class DataTransformation:
    """Cleans and encodes the raw encounter CSV and writes train/test splits.

    The input is read from ``config.data_path``; ``train.csv`` and
    ``test.csv`` are written into ``config.root_dir``.
    """

    # Per-drug columns whose string values are mapped to integer codes and
    # then one-hot encoded. Kept in one place so both steps stay in sync.
    _MEDICATION_COLS = [
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
        'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
        'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
        'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
        'metformin-rosiglitazone', 'metformin-pioglitazone']

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (``root_dir`` and ``data_path``)."""
        self.config = config

    @staticmethod
    def _map_diag(code):
        """Map one raw diagnosis code to a coarse category name.

        Codes starting with 'V' or 'E' map to 'Other'. Values that cannot be
        parsed as a number (for example '?') map to 'Unknown'. Numeric codes
        are bucketed by range; anything outside the listed ranges, including
        NaN, maps to 'Other'.
        """
        code = str(code)
        if code.startswith(('V', 'E')):
            return 'Other'
        try:
            code = float(code)
        except ValueError:
            # Only a failed numeric parse means "unknown"; other errors
            # should surface instead of being silently swallowed.
            return 'Unknown'

        if 390 <= code <= 459 or code == 785:
            return 'Circulatory'
        elif 460 <= code <= 519 or code == 786:
            return 'Respiratory'
        elif 520 <= code <= 579 or code == 787:
            return 'Digestive'
        elif 250 <= code < 251:
            return 'Diabetes'
        elif 800 <= code <= 999:
            return 'Injury'
        elif 710 <= code <= 739:
            return 'Musculoskeletal'
        elif 580 <= code <= 629 or code == 788:
            return 'Genitourinary'
        elif 140 <= code <= 239:
            return 'Neoplasms'
        else:
            return 'Other'

    def train_test_spliting(self, test_size=0.2, random_state=42):
        """Clean, encode and split the dataset, then write both parts to CSV.

        Args:
            test_size: Fraction of rows placed in the test split
                (passed to ``train_test_split``). Defaults to 0.2.
            random_state: Seed passed to ``train_test_split``. Defaults to 42.

        Side effects:
            Writes ``train.csv`` and ``test.csv`` into ``config.root_dir``,
            logs and prints the shapes of both splits.
        """
        df = pd.read_csv(self.config.data_path)

        # Step 1: Basic cleaning and encoding.
        # .copy() makes the filtered frame independent, so the column
        # assignments below do not write into a slice of the original
        # (avoids pandas' SettingWithCopyWarning).
        df = df[df['gender'] != 'Unknown/Invalid'].copy()
        df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

        df['race'] = df['race'].replace('?', 'Other')
        # '[0-10)' -> 0, '[10-20)' -> 1, ... '[90-100)' -> 9
        age_map = {f'[{10*i}-{10*(i+1)})': i for i in range(10)}
        df['age'] = df['age'].map(age_map)

        for col in ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']:
            df[col] = df[col].fillna(0).astype(int)

        df['change'] = df['change'].map({'No': 0, 'Ch': 1, 'Yes': 1}).fillna(0)
        df['diabetesMed'] = df['diabetesMed'].map({'No': 0, 'Yes': 1}).fillna(0)

        med_map = {'No': 0, 'Steady': 1, 'Up': 2, 'Down': 3}
        # Track which medication columns really exist: absent ones are only
        # warned about here and must also be left out of the one-hot step,
        # otherwise pd.get_dummies raises KeyError for them.
        present_medication_cols = []
        for col in self._MEDICATION_COLS:
            if col in df.columns:
                df[col] = df[col].map(med_map).fillna(0).astype(int)
                present_medication_cols.append(col)
            else:
                logger.warning(f"Warning: Column '{col}' not found in dataset.")

        # Binary target: only '<30' counts as the positive class.
        readmitted_mapping = {'NO': 0, '>30': 0, '<30': 1}
        df['readmitted'] = df['readmitted'].map(readmitted_mapping)

        # Step 2: Diagnosis mapping (diag_1..3 -> diag_1_cat..3_cat).
        for position in (1, 2, 3):
            df[f'diag_{position}_cat'] = df[f'diag_{position}'].apply(self._map_diag)

        # Step 3: Drop columns (errors='ignore' tolerates already-missing ones).
        cols_to_drop = ['encounter_id', 'patient_nbr', 'payer_code', 'weight', 'medical_specialty',
                        'max_glu_serum', 'A1Cresult', 'examide', 'citoglipton',
                        'diag_1', 'diag_2', 'diag_3']
        df = df.drop(columns=cols_to_drop, errors='ignore')

        df['gender'] = df['gender'].astype('category')
        df['readmitted'] = df['readmitted'].astype('category')

        # Step 4: One-hot encoding. The list order determines the order of
        # the generated dummy columns, so it is kept as before.
        one_hot_encode_columns = [
            'race', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
            *present_medication_cols,
            'change', 'diabetesMed',
            'diag_1_cat', 'diag_2_cat', 'diag_3_cat']

        df_encoded = pd.get_dummies(df, columns=one_hot_encode_columns, drop_first=True, dtype=int)

        # Step 5: Train-test split (80/20 with the default arguments).
        train, test = train_test_split(df_encoded, test_size=test_size, random_state=random_state)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Split data into training and test sets")
        logger.info(f"Train shape: {train.shape}")
        logger.info(f"Test shape: {test.shape}")

        print("Train shape:", train.shape)
        print("Test shape:", test.shape)



In [20]:
# Run the data-transformation stage end to end: load the YAML configuration,
# build the stage config, then clean, encode and split the dataset.
# No try/except wrapper: catching an exception only to re-raise it adds
# nothing, so failures simply propagate with their original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.train_test_spliting()

[2025-08-06 19:35:45,729: INFO: common: Raw YAML content from WindowsPath('config/config.yaml'): {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://raw.githubusercontent.com/dheeraj24797/MediWatch_project/main/diabetic_data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'unzip_data_dir': 'artifacts/data_ingestion/diabetic_data.csv', 'STATUS_FILE': 'artifacts/data_validation/status.txt'}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/diabetic_data.csv'}} (type=<class 'dict'>)]
[2025-08-06 19:35:45,729: INFO: common: YAML file: config\config.yaml loaded successfully]
[2025-08-06 19:35:45,729: INFO: common: Raw YAML content from WindowsPath('params.yaml'): {'model_params': {'learning_rate': 0.01, 'n_estimators': 100}} (type=<class 'dict'>)]
[2025-08-