In [1]:
import os
# Switch the working directory to the parent folder so the relative paths and
# `src.` imports used by the following cells resolve from there (presumably the
# project root - confirm). The path is relative, so re-running this cell moves
# up one more level each time.
os.chdir("../")

In [2]:
from src.movie_predictor.constants import *

In [3]:
from src.movie_predictor.entity.artifact_entity import DataIngestionArtifact

In [4]:
from src.movie_predictor.utils.common import *

In [5]:
from src.movie_predictor.entity.config_entity import DataValidationConfig

In [6]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Construction reads the main config file, the params file and the
    data-validation schema file through ``read_yaml`` and hands the configured
    ``artifacts_root`` to ``create_directories``.
    """

    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(DATA_VALIDATION_FILE)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` for the data-validation stage.

        The stage's ``root_dir`` is passed to ``create_directories`` before the
        config object is assembled from the ``data_validation_config`` section
        of the main config.

        Returns:
            A ``DataValidationConfig`` carrying the root directory, the report
            file name and the report page file name.
        """
        section = self.config.data_validation_config
        create_directories([section.root_dir])

        return DataValidationConfig(
            root_dir=section.root_dir,
            data_report_file_name=section.report_file_name,
            report_page_name=section.report_page_file_name,
        )

In [7]:

from src.movie_predictor.entity.artifact_entity import DataIngestionArtifact,DataValidationArtifact


In [10]:
from evidently.report import Report
from evidently.test_suite import TestSuite
from evidently.test_preset import DataStabilityTestPreset
from evidently.test_preset import DataQualityTestPreset
from evidently.metric_preset import DataDriftPreset

import json
from movie_predictor.entity.artifact_entity import DataValidationArtifact
from movie_predictor import logging
import pandas as pd
import numpy as np
class DataValidation:
    """Validate the ingested train/test files and write Evidently stability reports.

    Failures are raised to the caller. Earlier revisions returned the caught
    exception object from every method, which let ``initiate_data_validation``
    report success even when a step had failed.
    """

    def __init__(self, data_validation_config: DataValidationConfig, data_ingestion_artifact: DataIngestionArtifact):
        """Store the stage config and ingestion artifact and load the schema YAML.

        Args:
            data_validation_config: Supplies ``root_dir``,
                ``data_report_file_name`` and ``report_page_name``.
            data_ingestion_artifact: Supplies ``train_file_path`` and
                ``test_file_path``.
        """
        self.schema_config = read_yaml(path_to_yaml=DATA_VALIDATION_FILE)
        self.data_validation_config = data_validation_config
        self.data_ingestion_artifact = data_ingestion_artifact

    def is_train_test_file_exists(self) -> bool:
        """Check that both the train and the test file exist on disk.

        Returns:
            ``True`` when both paths exist.

        Raises:
            FileNotFoundError: If either file is missing.
        """
        train_file_path = self.data_ingestion_artifact.train_file_path
        test_file_path = self.data_ingestion_artifact.test_file_path

        if not (os.path.exists(train_file_path) and os.path.exists(test_file_path)):
            messages = f"Training_file {train_file_path} or Testing file :{test_file_path} is not present "
            raise FileNotFoundError(messages)

        return True

    def get_train_test_df(self):
        """Load the train and test CSV files.

        Returns:
            A ``(train_df, test_df)`` tuple of pandas DataFrames.

        Raises:
            Whatever ``pd.read_csv`` raises (missing file, parse error, ...).
        """
        train_df = pd.read_csv(self.data_ingestion_artifact.train_file_path)
        test_df = pd.read_csv(self.data_ingestion_artifact.test_file_path)
        return train_df, test_df

    def validate_schema_columns_dataset(self) -> bool:
        """Placeholder for validating the datasets against the schema YAML.

        No checks are performed yet; the method always returns ``True`` so the
        annotated ``bool`` contract holds.

        TODO: implement the real checks against ``self.schema_config``. An
        earlier draft compared the DataFrame column count with a
        'NumberofColumns' entry and read 'target_column' / 'domain_value'
        entries - confirm these keys against the schema file before relying
        on them.
        """
        return True

    def _run_data_stability_suite(self):
        """Run Evidently's data-stability test suite on train vs. test data."""
        train_df, test_df = self.get_train_test_df()
        data_stability = TestSuite(tests=[DataStabilityTestPreset()])
        # Pass the datasets by keyword, as the Evidently API documents them:
        # train is the reference data and test is the current data.
        data_stability.run(
            reference_data=train_df,
            current_data=test_df,
            column_mapping=None,
        )
        return data_stability

    def get_save_data_drift_report(self):
        """Run the stability suite and save its result as a JSON file.

        The report is written to ``data_validation_config.data_report_file_name``.

        Returns:
            The report as a Python object parsed from the suite's JSON output.
        """
        data_stability = self._run_data_stability_suite()
        # The suite serialises itself to a JSON string; parse it so it can be
        # re-dumped with indentation and returned to the caller.
        report = json.loads(data_stability.json())

        with open(self.data_validation_config.data_report_file_name, 'w') as report_file:
            json.dump(report, report_file, indent=6)

        return report

    def save_data_drift_report_page(self):
        """Run the stability suite and save its HTML page.

        The page is written to ``data_validation_config.report_page_name``.
        """
        data_stability = self._run_data_stability_suite()
        data_stability.save_html(self.data_validation_config.report_page_name)

    def is_data_drift_found(self) -> bool:
        """Generate and save both the JSON report and the HTML report page.

        Returns:
            Always ``True`` once both reports were written.
            NOTE(review): the report content is not inspected, so despite the
            name this does not tell whether drift was actually detected.
        """
        self.get_save_data_drift_report()
        self.save_data_drift_report_page()
        return True

    def initiate_data_validation(self) -> DataValidationArtifact:
        """Run every validation step and describe the outcome as an artifact.

        Steps: file-existence check, schema validation (placeholder) and
        report generation. Any failure in a step propagates as an exception,
        so an artifact is only returned when every step completed.

        Returns:
            A ``DataValidationArtifact`` pointing at the validation root
            directory and the two report files.
        """
        self.is_train_test_file_exists()
        self.validate_schema_columns_dataset()
        self.is_data_drift_found()

        data_validation_artifact = DataValidationArtifact(
            schema_file_path=self.data_validation_config.root_dir,
            report_file_path=self.data_validation_config.data_report_file_name,
            report_page_file_path=self.data_validation_config.report_page_name,
            is_validated=True,
            message="Data Validation performed successully."
        )
        logging.info(f"Data validation artifact: {data_validation_artifact}")
        return data_validation_artifact

  
              

In [11]:
# Build the data-validation config from the YAML files and run the validation stage.
schema = ConfigurationManager()  # note: despite the name, this holds the configuration manager
validation_config = schema.get_data_validation_config()
# NOTE(review): DataIngestionArtifact is passed here as the class itself, not as
# an instance - confirm that train_file_path / test_file_path resolve to real
# paths this way; otherwise pass the artifact produced by the ingestion stage.
data_validation = DataValidation(data_validation_config=validation_config,data_ingestion_artifact=DataIngestionArtifact)
data_validation.initiate_data_validation()

DataValidationArtifact(schema_file_path='artifacts/data_validation', report_page_file_path='artifacts/data_validation/report.html', report_file_path='artifacts/data_validation/report.json', is_validated=True, message='Data Validation performed successully.')

In [None]:
# datatset_schema = read_yaml(DATA_VALIDATION_FILE)

In [None]:
# datatset_schema

In [None]:
# data_ingestion_artifact = DataIngestionArtifact
# data_ingestion_artifact.train_file_path

In [None]:
# is_train_file_exist = False  
# is_test_file_exist = False
# #our output data_ingestion_artifact has the data so
# train_file_path = data_ingestion_artifact.train_file_path
# test_file_path = data_ingestion_artifact.test_file_path

# #os.path.exists return boolean value of path exists or not
# os.path.exists(train_file_path) 
# # is_test_file_exist = os.path.exists(test_file_path)

# # # and operator to make it sure only returns True is both are true otherwise False
# # is_exists =  is_train_file_exist and is_test_file_exist

In [None]:
# train_file_path