## Data Validation

In [1]:
import os
# IPython magic (not plain Python): print the directory the kernel started in.
%pwd
# Move one level up; presumably to the project root so that the relative
# imports (src.NVDNLP) and relative paths (artifacts/...) used in later cells resolve.
# NOTE(review): re-running this cell moves up one more level each time.
os.chdir("../")
# Print the working directory again to confirm the change (see the cell output).
%pwd


'd:\\Data Science\\END to END Proj\\NVDNLP'

In [17]:
# ============================================
#     ENTITY: DATA VALIDATION CONFIG
# ============================================
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    from the ``data_validation`` section of the project configuration.
    """
    # Directory where the validation report (validation_report.txt) is written.
    root_dir: Path
    # Path of the text file that stores the line "Validation status: <True|False>".
    STATUS_FILE: str
    # File names whose existence is checked by DataValidation.validate_all_files_exist.
    ALL_REQUIRED_FILES: list
    # Column names that the ingested CSV must contain.
    REQUIRED_COLUMNS: list

In [18]:
# ============================================
# ⚙️ CONFIGURATION MANAGER
# ============================================
from src.NVDNLP.constant import *
from src.NVDNLP.utils.common import read_yaml, create_directories 

class ConfigurationManager:
    """Load the project YAML files and build per-stage configuration entities."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Create the artifacts root directory named in the config.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Return the data-validation settings, creating their root directory.

        Returns:
            A ``DataValidationConfig`` populated from the ``data_validation``
            section of the loaded configuration.
        """
        section = self.config.data_validation
        create_directories([section.root_dir])

        # The entity's field names mirror the keys of the config section one-to-one.
        field_names = (
            "root_dir",
            "STATUS_FILE",
            "ALL_REQUIRED_FILES",
            "REQUIRED_COLUMNS",
        )
        return DataValidationConfig(
            **{name: getattr(section, name) for name in field_names}
        )

In [19]:
# ============================================
# 🔍 DATA VALIDATION COMPONENT
# ============================================
import os
import pandas as pd
from NVDNLP import logger

class DataValidation:
    """Check that ingested files exist and that the dataset has the expected schema.

    The outcome is persisted in ``config.STATUS_FILE`` as the single line
    ``Validation status: <True|False>``; ``validate_all`` uses that file to
    skip work that already succeeded on an earlier run.
    """

    # Directory checked for the required files when the caller does not pass one.
    DEFAULT_INGESTION_DIR = os.path.join("artifacts", "data_ingestion")

    def __init__(self, config: DataValidationConfig):
        self.config = config

    def _write_status(self, status: bool) -> None:
        """Overwrite the status file with the given validation outcome."""
        with open(self.config.STATUS_FILE, 'w', encoding="utf-8") as f:
            f.write(f"Validation status: {status}")

    def is_validation_completed(self) -> bool:
        """Return True only if the status file records a successful validation.

        A missing, unreadable, or unrecognised status file is treated as
        "not completed" so that validation simply runs again.
        """
        try:
            if not os.path.exists(self.config.STATUS_FILE):
                return False
            with open(self.config.STATUS_FILE, 'r', encoding="utf-8") as f:
                status_content = f.read().strip()
        except Exception as e:
            # Best effort: a broken status file must not block re-validation.
            logger.warning(f"Error checking validation status: {e}")
            return False

        if "Validation status: True" in status_content:
            logger.info(" Data validation already completed successfully. Skipping...")
            return True
        if "Validation status: False" in status_content:
            logger.info("Previous validation failed. Re-running validation...")
        return False

    def validate_all_files_exist(self, ingestion_dir: str = DEFAULT_INGESTION_DIR) -> bool:
        """Check that every file in ``config.ALL_REQUIRED_FILES`` exists.

        Args:
            ingestion_dir: Directory the file names are resolved against.
                Defaults to ``artifacts/data_ingestion``.

        Returns:
            True if all required files exist, otherwise False. The result is
            also written to the status file.

        Raises:
            Exception: Any error raised while checking or writing the status
                file is logged and re-raised.
        """
        try:
            # Report every missing file, not just the first one found.
            missing_files = [
                file
                for file in self.config.ALL_REQUIRED_FILES
                if not os.path.exists(os.path.join(ingestion_dir, file))
            ]
            for file in missing_files:
                logger.info(f"Missing required file: {file}")

            validation_status = not missing_files
            self._write_status(validation_status)
            return validation_status

        except Exception as e:
            logger.error(f"Error in file validation: {e}")
            raise

    def validate_dataset_columns(self, data_path: str) -> bool:
        """Check that the CSV at ``data_path`` has all required columns.

        Null values in the critical columns are logged but do not fail the
        validation. A report is written to ``<root_dir>/validation_report.txt``
        and the result is written to the status file.

        Args:
            data_path: Path of the CSV file to inspect.

        Returns:
            True if no column from ``config.REQUIRED_COLUMNS`` is missing.

        Raises:
            Exception: Any error raised while reading the CSV or writing the
                outputs is logged and re-raised.
        """
        try:
            df = pd.read_csv(data_path)

            missing_columns = [
                column
                for column in self.config.REQUIRED_COLUMNS
                if column not in df.columns
            ]
            validation_status = not missing_columns

            if missing_columns:
                logger.info(f"Missing required columns: {missing_columns}")
            else:
                logger.info(" All required columns are present!")

                # Advisory check only: nulls are reported, not treated as failure.
                critical_columns = ['Description', 'Severity']
                null_check_status = True
                for col in critical_columns:
                    if col not in df.columns:
                        # REQUIRED_COLUMNS may not list this column; skipping
                        # avoids a KeyError on the lookup below.
                        continue
                    null_count = df[col].isnull().sum()
                    if null_count > 0:
                        logger.info(f"Column '{col}' has {null_count} null values")
                        null_check_status = False

                if null_check_status:
                    logger.info(" No null values in critical columns!")
                else:
                    logger.info("  Null values found in critical columns")

            columns_present = list(df.columns)
            severity_distribution = (
                df['Severity'].value_counts().to_dict()
                if 'Severity' in df.columns
                else {}
            )

            report_path = os.path.join(self.config.root_dir, "validation_report.txt")
            with open(report_path, 'w', encoding="utf-8") as f:
                f.write("=== DATA VALIDATION REPORT ===\n")
                f.write(f"Overall Status: {'PASS' if validation_status else 'FAIL'}\n")
                f.write(f"Total Rows: {len(df)}\n")
                f.write(f"Missing Columns: {missing_columns}\n")
                f.write(f"Columns Present: {columns_present}\n")
                f.write(f"Severity Distribution: {severity_distribution}\n")

            self._write_status(validation_status)
            return validation_status

        except Exception as e:
            logger.error(f"Error in dataset validation: {e}")
            raise

    def validate_all(self, data_path: str) -> bool:
        """Run the file and column checks unless a previous run already passed.

        Args:
            data_path: Path of the CSV file passed to ``validate_dataset_columns``.

        Returns:
            True if validation was already completed or if both checks pass.

        Raises:
            Exception: Any error from the checks is logged, the status file is
                marked as failed (best effort), and the error is re-raised.
        """
        try:
            if self.is_validation_completed():
                return True

            logger.info(" Starting Data Validation Process...")

            files_valid = self.validate_all_files_exist()
            dataset_valid = self.validate_dataset_columns(data_path)
            overall_status = files_valid and dataset_valid

            # Each check above overwrote the status file with its own result.
            # Persist the combined result so a missing file cannot be hidden
            # by a passing column check and skipped on the next run.
            self._write_status(overall_status)

            if overall_status:
                logger.info(" Data Validation PASSED - All checks completed successfully!")
            else:
                logger.info(" Data Validation FAILED - Check validation report for details!")

            return overall_status

        except Exception as e:
            logger.error(f"Error in complete validation: {e}")
            try:
                self._write_status(False)
            except OSError as write_error:
                # Do not let a failed status write mask the original error.
                logger.error(f"Could not record failed validation status: {write_error}")
            raise

In [21]:
# ============================================
# 🔄 DATA VALIDATION PIPELINE
# ============================================

from NVDNLP import logger

try:
    # Build the stage configuration (this also creates the stage directories).
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()

    data_validation = DataValidation(config=data_validation_config)

    # validate_all() already checks the status file and returns True when a
    # previous run succeeded, so no separate is_validation_completed() call is
    # made here; calling both read the file twice and logged the same message twice.
    data_path = "artifacts/data_ingestion/nvd_combined_2010_2025.csv"
    validation_status = data_validation.validate_all(data_path)

    if validation_status:
        logger.info(" Data Validation Completed Successfully!")
    else:
        logger.info(" Data Validation Failed!")

except Exception as e:
    # Log at error level and re-raise unchanged so the notebook cell still fails.
    logger.error(f"Data Validation Pipeline Failed: {e}")
    raise

[2025-10-22 19:51:10,312: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-22 19:51:10,315: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-22 19:51:10,316: INFO: common: created directory at: artifacts]
[2025-10-22 19:51:10,319: INFO: common: created directory at: artifacts/data_validation]
[2025-10-22 19:51:10,350: INFO: 3267598641: Previous validation failed. Re-running validation...]
[2025-10-22 19:51:10,350: INFO: 3267598641: Previous validation failed. Re-running validation...]
[2025-10-22 19:51:10,356: INFO: 3267598641:  Starting Data Validation Process...]
[2025-10-22 19:51:13,345: INFO: 3267598641:  All required columns are present!]
[2025-10-22 19:51:13,454: INFO: 3267598641: Column 'Severity' has 13516 null values]
[2025-10-22 19:51:13,456: INFO: 3267598641:   Null values found in critical columns]
[2025-10-22 19:51:13,623: INFO: 3267598641:  Data Validation PASSED - All checks completed successfully!]
[2025-10-22 19:51:13,626: INF