In [10]:
from collections import namedtuple
from cementStrength.constants import *
from cementStrength.utils import read_yaml, create_directories
from cementStrength.entity import *
import os
import sys

In [11]:
# Immutable record of the filesystem locations used by the data validation
# stage. Instances are built by ConfigurationManager.get_data_validation_config.
DataValidationConfig = namedtuple(
    "DataValidationConfig",
    (
        "data_validation_dir",    # root directory of the validation stage
        "schema_file_path",       # schema YAML read by validate_dataset_schema
        "report_file_path",       # destination of the JSON drift report
        "report_page_file_path",  # destination of the saved drift dashboard page
    ),
)

In [None]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    On construction the config, params and schema YAML files are read with
    ``read_yaml`` and the artifacts root directory named in the config is
    created via ``create_directories``.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self)-> DataIngestionConfig:
        """Build the data ingestion config, creating its directories first.

        Returns:
            DataIngestionConfig populated from the ``data_ingestion`` section
            of the main configuration.
        """
        ingestion = self.config.data_ingestion

        # The stage root is created first, then the train/test output folders.
        create_directories([ingestion.root_dir])
        create_directories([
            ingestion.ingested_data_train_dir,
            ingestion.ingested_data_test_dir,
        ])

        return DataIngestionConfig(
            root_dir=ingestion.root_dir,
            source_URL=ingestion.source_URL,
            local_data_file=ingestion.local_data_file,
            unzip_dir=ingestion.unzip_dir,
            ingested_data_train_dir=ingestion.ingested_data_train_dir,
            ingested_data_test_dir=ingestion.ingested_data_test_dir,
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data validation config from the main configuration.

        Both report paths are placed directly inside the validation root
        directory. No directory is created by this method.

        Returns:
            DataValidationConfig with the validation directory, schema path
            and the two report file paths.
        """
        # NOTE(review): "data_vaidation" looks like a misspelling of
        # "data_validation"; it is kept as-is because it must match the key
        # used in the config YAML -- confirm against that file.
        section = self.config.data_vaidation

        validation_root = section.root_dir
        report_path = os.path.join(validation_root, section.report_file_name)
        report_page_path = os.path.join(validation_root, section.report_page_file_name)

        # NOTE(review): the schema *file* path is taken from a key named
        # "schema_dir" -- verify it really holds a file path.
        return DataValidationConfig(
            data_validation_dir=validation_root,
            schema_file_path=section.schema_dir,
            report_file_path=report_path,
            report_page_file_path=report_page_path,
        )

In [1]:
from evidently.model_profile import Profile
from evidently.model_profile.sections import DataDriftProfileSection
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab

In [None]:
from cementStrength import logger
from cementStrength.entity.config_entity import DataIngestionConfig
import pandas as pd


class DataValidation:
    """Validate the ingested train/test CSV files and produce drift reports.

    The class reads ``train.csv`` / ``test.csv`` from the directories named in
    the data ingestion config, checks the training columns against the schema
    YAML, and writes an Evidently data drift profile (JSON) and dashboard page
    to the paths named in the data validation config.
    """

    def __init__(self, config: DataValidationConfig, data_ingestion_config: DataIngestionConfig):
        """Store the two configuration objects.

        Args:
            config: Data validation settings (schema and report paths).
            data_ingestion_config: Data ingestion settings (train/test dirs).
        """
        self.config = config
        self.data_ingestion_config = data_ingestion_config

    def _get_train_test_filepaths(self):
        """Return the ``(train.csv, test.csv)`` paths inside the ingestion dirs."""
        train_filepath = os.path.join(self.data_ingestion_config.ingested_data_train_dir, "train.csv")
        test_filepath = os.path.join(self.data_ingestion_config.ingested_data_test_dir, "test.csv")
        return train_filepath, test_filepath

    def validate_dataset_schema(self) -> bool:
        """Check every training column's dtype against the schema file.

        The schema YAML is expected to expose a ``columns`` mapping of column
        name to dtype. Validation passes only when the training file has at
        least one column and every column is listed in the schema with an
        equal dtype.

        Returns:
            True when all columns match the schema, False otherwise.
        """
        train_filepath, _ = self._get_train_test_filepaths()

        schema = read_yaml(self.config.schema_file_path)
        expected_dtypes = schema.columns

        train_df = pd.read_csv(train_filepath)

        # A file without columns cannot satisfy any schema.
        if len(train_df.columns) == 0:
            return False

        for column in train_df.columns:
            # Look the dtype up by the column's actual name; an unknown column
            # is a schema violation rather than an error.
            if column not in expected_dtypes:
                return False
            # is_dtype_equal accepts dtype objects or dtype strings and
            # returns False (instead of raising) for unrecognised names.
            if not pd.api.types.is_dtype_equal(train_df[column].dtype, expected_dtypes[column]):
                return False

        return True

    def get_train_test_df(self):
        """Load the ingested train and test CSV files.

        Returns:
            Tuple ``(train_df, test_df)`` of pandas DataFrames.
        """
        train_filepath, test_filepath = self._get_train_test_filepaths()

        train_df = pd.read_csv(train_filepath)
        test_df = pd.read_csv(test_filepath)

        return train_df, test_df

    def is_train_test_file_exist(self):
        """Verify that both the train and the test CSV files exist.

        Returns:
            True when both files are present.

        Raises:
            Exception: If either file is missing.
        """
        train_filepath, test_filepath = self._get_train_test_filepaths()

        is_train_file_exist = os.path.exists(train_filepath)
        is_test_file_exist = os.path.exists(test_filepath)

        is_available = is_train_file_exist and is_test_file_exist

        if not is_available:
            message = (
                f"Training file: {train_filepath} or Testing file: {test_filepath} "
                "is not present"
            )
            raise Exception(message)

        return is_available

    def get_and_save_data_drift_report(self):
        """Compute the data drift profile and save it as JSON.

        Returns:
            The drift profile parsed into Python objects via ``json.loads``.
        """
        # Imported here so the method does not depend on a later notebook
        # cell having been executed first.
        import json

        # Sections are passed as instances, consistent with the instantiated
        # DataDriftTab used for the dashboard below.
        profile = Profile(sections=[DataDriftProfileSection()])

        train_df, test_df = self.get_train_test_df()

        profile.calculate(train_df, test_df)

        report = json.loads(profile.json())

        report_file_path = self.config.report_file_path
        report_dir = os.path.dirname(report_file_path)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if report_dir:
            os.makedirs(report_dir, exist_ok=True)

        with open(report_file_path, "w") as report_file:
            json.dump(report, report_file, indent=6)
        return report

    def save_data_drift_report_page(self):
        """Compute the data drift dashboard and save it to the report page path."""
        dashboard = Dashboard(tabs=[DataDriftTab()])
        train_df, test_df = self.get_train_test_df()
        dashboard.calculate(train_df, test_df)

        report_page_file_path = self.config.report_page_file_path
        report_page_dir = os.path.dirname(report_page_file_path)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if report_page_dir:
            os.makedirs(report_page_dir, exist_ok=True)

        dashboard.save(report_page_file_path)

    def is_data_drift_found(self):
        """Generate and save both drift artifacts.

        Returns:
            Always True.
        """
        # NOTE(review): the return value is not derived from the generated
        # report; it is unconditionally True. Confirm whether the drift flag
        # in the report should be returned instead.
        self.get_and_save_data_drift_report()
        self.save_data_drift_report_page()
        return True

    def initiate_data_validation(self):
        """Run the validation steps in order: file check, schema check, drift.

        Raises:
            Exception: If the train or test file is missing.
        """
        self.is_train_test_file_exist()
        # NOTE(review): the schema validation result is computed but not acted
        # upon -- decide whether a failed schema check should stop the run.
        self.validate_dataset_schema()
        self.is_data_drift_found()




In [4]:
import json

In [None]:
# Build both stage configs from the YAML settings, then run data validation.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_ingestion_config = config.get_data_ingestion_config()

# DataValidation needs the config *instances* (not the config classes): it
# reads the train/test directories from the ingestion config and the schema
# and report paths from the validation config.
data_validation = DataValidation(
    config=data_validation_config,
    data_ingestion_config=data_ingestion_config,
)
data_validation.initiate_data_validation()