# Creating Automated Data Cleaning Pipelines Using Python and Pandas
## Table of Contents  
1. [Project Description](#project-description)  
2. [Standardize Your Data Import Process](#standardize-your-data-import-process)  
3. [Implement Automated Data Validation](#implement-automated-data-validation)  
4. [Create a Data Cleaning Pipeline](#create-a-data-cleaning-pipeline)    
5. [Automate String Cleaning and Standardization](#automate-string-cleaning-and-standardization)  
6. [Monitor Data Quality Over Time](#monitor-data-quality-over-time)  
7. [Conclusion](#conclusion)  


In [None]:
import pandas as pd
from pathlib import Path

def load_dataset(file_path, **kwargs):
    """
    Load data from various file formats while handling common issues.

    The reader is chosen from the file extension (case-insensitive):
    ``.csv``, ``.xlsx``, ``.json`` and ``.parquet`` are supported.

    Args:
        file_path: Path (str or path-like) of the file to load.
        **kwargs: Forwarded unchanged to the underlying pandas reader.

    Returns:
        A DataFrame whose string column labels are stripped and lower-cased
        and whose empty-string cells are replaced with ``pd.NA``.

    Raises:
        ValueError: If the file extension has no registered reader.
    """
    file_type = Path(file_path).suffix.lower()
    handlers = {
        '.csv': pd.read_csv,
        '.xlsx': pd.read_excel,
        '.json': pd.read_json,
        '.parquet': pd.read_parquet
    }
    reader = handlers.get(file_type)
    if reader is None:
        raise ValueError(f"Unsupported file type: {file_type}")
    df = reader(file_path, **kwargs)

    # Column labels are not always strings (e.g. ``header=None`` yields
    # integer labels). The ``.str`` accessor raises on those, so normalize
    # label by label and leave non-string labels untouched.
    def _normalize_label(label):
        return label.strip().lower() if isinstance(label, str) else label

    df = df.rename(columns=_normalize_label)
    df = df.replace('', pd.NA)
    return df

In [None]:
def validate_dataset(df, validation_rules=None):
    """
    Apply validation rules to a dataframe and return validation results.

    Args:
        df: DataFrame to validate.
        validation_rules: Mapping of column name -> rule dict. Recognized
            rule keys are ``'check_type'`` (only ``'numeric'`` is checked),
            ``'min_value'`` and ``'max_value'`` (inclusive bounds). When
            omitted, a single default rule set keyed ``'numeric_columns'``
            is used.

    Returns:
        Dict mapping each rule column that exists in ``df`` to a list of
        issue messages (empty list when the column is clean). Columns
        named in the rules but absent from ``df`` are skipped.
    """
    if validation_rules is None:
        validation_rules = {
            'numeric_columns': {
                'check_type': 'numeric',
                'min_value': 0,
                'max_value': 1000000
            }
        }
    validation_results = {}
    for column, rules in validation_rules.items():
        if column not in df.columns:
            continue
        rules = rules or {}  # tolerate a None/empty rule entry
        series = df[column]
        issues = []

        missing_count = series.isna().sum()
        if missing_count > 0:
            issues.append(f"Found {missing_count} missing values")

        is_numeric = pd.api.types.is_numeric_dtype(series)
        if rules.get('check_type') == 'numeric' and not is_numeric:
            issues.append("Column should be numeric")

        # Range checks only make sense on numeric data; comparing text
        # against a number would raise instead of reporting an issue.
        if is_numeric:
            min_value = rules.get('min_value')
            if min_value is not None:
                below_min = int((series < min_value).sum())
                if below_min > 0:
                    issues.append(
                        f"Found {below_min} values below minimum {min_value}"
                    )
            max_value = rules.get('max_value')
            if max_value is not None:
                above_max = int((series > max_value).sum())
                if above_max > 0:
                    issues.append(
                        f"Found {above_max} values above maximum {max_value}"
                    )

        validation_results[column] = issues
    return validation_results

In [None]:
class DataCleaningPipeline:
    """
    Ordered collection of named cleaning steps applied to a dataframe.

    Each step is stored in ``self.steps`` as a dict with the keys
    ``'name'`` and ``'function'``.
    """

    def __init__(self):
        self.steps = []

    def add_step(self, name, function):
        """Register ``function`` under ``name`` at the end of the pipeline."""
        entry = {'name': name, 'function': function}
        self.steps.append(entry)

    def execute(self, df):
        """
        Run every registered step, in order, on a copy of ``df``.

        Execution stops at the first step that raises. The dataframe
        produced by the last successful step is returned together with a
        per-step log of ``{'step', 'status'}`` dicts (plus ``'error'`` for
        the failing step).
        """
        log = []
        working = df.copy()
        for entry in self.steps:
            try:
                transform = entry['function']
                working = transform(working)
                outcome = {'step': entry['name'], 'status': 'success'}
            except Exception as exc:
                # Record the failure and stop; later steps are not run.
                outcome = {
                    'step': entry['name'],
                    'status': 'failed',
                    'error': str(exc),
                }
                log.append(outcome)
                break
            log.append(outcome)
        return working, log

In [None]:
def clean_text_columns(df, columns=None):
    """
    Apply standardized text cleaning to specified columns.

    For each selected column the values are converted to text,
    lower-cased, stripped of punctuation (anything that is neither a word
    character nor whitespace), have runs of whitespace collapsed to a
    single space, and are trimmed. Missing values are left missing rather
    than being turned into literal strings such as ``'nan'``.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to clean. Defaults to every
            object-dtype column. Names not present in ``df`` are ignored.

    Returns:
        A cleaned copy of ``df``.
    """
    if columns is None:
        columns = df.select_dtypes(include=['object']).columns
    df = df.copy()
    for column in columns:
        if column not in df.columns:
            continue
        original = df[column]
        # Punctuation is removed before whitespace is collapsed and
        # trimmed, so gaps left behind by removed characters
        # ("a - b" -> "a  b") are normalized as well.
        cleaned = (original
                   .astype(str)
                   .str.lower()
                   .str.replace(r'[^\w\s]', '', regex=True)
                   .str.replace(r'\s+', ' ', regex=True)
                   .str.strip())
        # astype(str) stringifies missing values; restore them.
        df[column] = cleaned.where(original.notna(), original)
    return df

In [None]:
def generate_quality_metrics(df, baseline_metrics=None):
    """
    Summarize the quality of ``df`` and optionally diff against a baseline.

    The returned dict holds the row count plus per-column missing-value
    counts, distinct-value counts and dtype names. When a non-empty
    ``baseline_metrics`` mapping is supplied, a ``'changes'`` entry is
    added with the row-count difference (current minus baseline).
    """
    row_total = len(df)
    null_counts = df.isna().sum().to_dict()
    distinct_counts = df.nunique().to_dict()
    dtype_names = df.dtypes.astype(str).to_dict()

    report = {
        'row_count': row_total,
        'missing_values': null_counts,
        'unique_values': distinct_counts,
        'data_types': dtype_names,
    }

    # No baseline (None or empty): nothing to compare against.
    if not baseline_metrics:
        return report

    delta = row_total - baseline_metrics['row_count']
    report['changes'] = {'row_count_change': delta}
    return report