# In [3]:
import os
import pandas as pd
import logging
from datetime import datetime

# Logging setup: all messages at INFO and above go to logs/data_validation.log.
# The directory is created up front so the log file can be opened.
LOGS_PATH = "logs/"
os.makedirs(LOGS_PATH, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=os.path.join(LOGS_PATH, 'data_validation.log'),
)

# Raw-data locations: each source has its own sub-folder, with one dated
# (YYYY-MM-DD, local time at import) folder inside it holding the CSV.
LOCAL_STORAGE_PATH = "raw_data/"
LATEST_DATE = f"{datetime.now():%Y-%m-%d}"
KAGGLE_DATA_PATH = os.path.join(
    LOCAL_STORAGE_PATH, "kaggle", LATEST_DATE, "kaggle_churn.csv"
)
HUGGINGFACE_DATA_PATH = os.path.join(
    LOCAL_STORAGE_PATH, "huggingface", LATEST_DATE, "huggingface_churn.csv"
)

# Validate one raw dataset and write a per-column summary report
def validate_data(file_path, source, report_dir=None):
    """Load a CSV dataset, log basic quality checks, and save a summary report.

    The checks are informational only: missing values and duplicate rows are
    logged as warnings but never cause the dataset to be rejected.

    Args:
        file_path: Path of the CSV file to validate.
        source: Human-readable dataset label used in log messages and in the
            report file name (``"{source}_validation_report.csv"``).
        report_dir: Directory that receives the report. Defaults to the
            module-level ``LOGS_PATH`` when omitted; created if missing.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file is missing
        or any step raises (the error is logged, not propagated).
    """
    try:
        # isfile (rather than exists) so that a directory sitting at this path
        # is reported as a missing dataset instead of failing inside read_csv.
        if not os.path.isfile(file_path):
            logging.error(f"{source} dataset not found at {file_path}")
            return None

        df = pd.read_csv(file_path)
        logging.info(f"Validating {source} dataset with {df.shape[0]} rows and {df.shape[1]} columns.")

        # Check for missing values (per-column null counts; only non-zero ones are logged)
        missing_values = df.isnull().sum()
        missing_values_report = missing_values[missing_values > 0]
        if not missing_values_report.empty:
            logging.warning(f"Missing values found in {source}:\n{missing_values_report}")
        else:
            logging.info(f"No missing values in {source}")

        # Check for duplicates (rows identical to an earlier row)
        duplicate_rows = df.duplicated().sum()
        if duplicate_rows > 0:
            logging.warning(f"{source} dataset contains {duplicate_rows} duplicate rows.")
        else:
            logging.info(f"No duplicate rows in {source} dataset.")

        # Check data types
        logging.info(f"Data types for {source} dataset:\n{df.dtypes}")

        # Save a validation report: one row per column, indexed by column name
        if report_dir is None:
            report_dir = LOGS_PATH
        os.makedirs(report_dir, exist_ok=True)
        report_path = os.path.join(report_dir, f"{source}_validation_report.csv")
        validation_report = pd.DataFrame({
            'Missing Values': missing_values,
            'Data Type': df.dtypes.astype(str)
        })
        validation_report.to_csv(report_path)
        logging.info(f"Validation report saved: {report_path}")

        return df

    except Exception as e:
        # Deliberate best-effort boundary: one bad dataset must not abort the
        # run. logging.exception keeps the traceback so the failure can be
        # diagnosed from the log file.
        logging.exception(f"Error validating {source} dataset: {e}")
        return None

if __name__ == "__main__":
    # Script entry point: validate each source's CSV for today's date.
    # Each result is the loaded DataFrame, or None if validation failed.
    kaggle_df = validate_data(file_path=KAGGLE_DATA_PATH, source="Kaggle")
    hf_df = validate_data(file_path=HUGGINGFACE_DATA_PATH, source="Hugging Face")

    logging.info("Data validation completed.")
