
## Utility Functions used in Notebooks


In [None]:
def cap_outliers(df_cleaned, column, lower_bound, upper_bound):
    """
    Clamp one column of a DataFrame into the range [lower_bound, upper_bound].

    The capped values are written back onto the DataFrame that was passed
    in, so the caller's object is modified; that same object is returned.

    Parameters:
    - df_cleaned: pandas DataFrame holding the column to cap (modified in place)
    - column: str, name of the column to process
    - lower_bound: values below this are raised to it
    - upper_bound: values above this are lowered to it

    Returns:
    - The same DataFrame object, with `column` capped

    Note:
    - The "has outliers" message is printed on every call; this function
      does not itself check whether any value lies outside the bounds.
    """
    notice = f"\nColumn {column} has outliers greater than upper bound ({upper_bound}) or lower than lower bound ({lower_bound}). Capping them now."
    print(notice)

    # Build the clipped Series first, then replace the column with it.
    capped_values = df_cleaned[column].clip(lower=lower_bound, upper=upper_bound)
    df_cleaned[column] = capped_values
    return df_cleaned

In [None]:
def handle_duplicates(df):
    """
    Report the duplicate rows of a DataFrame and return a copy without them.

    Prints, in order: the duplicate rows themselves, how many there are,
    and the duplicate count re-computed on the de-duplicated result.
    The input DataFrame is not modified.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - New DataFrame with duplicate rows removed (the first occurrence of
      each row is kept, per the pandas defaults; original index labels
      are retained)
    """
    # Boolean mask: True for every row that repeats an earlier row.
    dup_mask = df.duplicated()
    n_duplicates = dup_mask.sum()

    print("\nActual Duplicate Rows:")
    print(df[dup_mask])
    print("\nNumber of Duplicate Rows:", n_duplicates)

    deduplicated = df.drop_duplicates()

    # Re-check the result so the printed output confirms nothing is left.
    remaining = deduplicated.duplicated().sum()
    print("\nAfter dropping duplicate rows, the count of duplicate rows now: ", remaining)

    return deduplicated
