In [1]:
import pandas as pd

In [2]:
def drop_null_cols(df):
    """Remove the 10- and 20-mile ``la*share`` columns and rows lacking ``lasnap1share``.

    Args:
        df: Dataframe containing all 26 ``la<measure><10|20>share`` columns
            and a ``lasnap1share`` column.

    Returns:
        A new dataframe without those 26 columns and without any row whose
        ``lasnap1share`` value is missing.
    """
    measures = (
        'snap', 'hunv', 'hisp', 'omultir', 'aian', 'nhopi', 'asian',
        'black', 'white', 'seniors', 'kids', 'lowi', 'pop',
    )
    # Column names follow the pattern la<measure><distance>share; the
    # 20-mile group is listed first, then the 10-mile group.
    share_cols = [
        f'la{measure}{distance}share'
        for distance in ('20', '10')
        for measure in measures
    ]
    trimmed = df.drop(columns=share_cols)
    # A missing 'lasnap1share' is used as the marker for rows to discard
    # (presumably such rows are empty across the board -- TODO confirm).
    return trimmed.dropna(subset=['lasnap1share'])

def cap_outliers(series: pd.Series, multiplier: float = 1.5) -> pd.Series:
    """Clamp values lying outside the IQR fences of a series.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``;
    any value beyond a fence is replaced by that fence.

    Args:
        series (pd.Series): Values to clamp.
        multiplier (float, optional): Number of IQRs the fences sit beyond
            the quartiles. Defaults to 1.5.

    Returns:
        pd.Series: A series of the same length with out-of-fence values
        replaced by the nearest fence.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    # Distance from each quartile to its fence.
    fence_width = multiplier * (third_quartile - first_quartile)
    return series.clip(
        lower=first_quartile - fence_width,
        upper=third_quartile + fence_width,
    )

def drop_outliers(df: pd.DataFrame):
    """Cap outliers in every numeric column using the IQR rule.

    Despite the name, no rows are removed: each numeric column is passed
    through ``cap_outliers`` (default multiplier), so extreme values are
    clamped rather than dropped.

    Args:
        df (pd.DataFrame): The dataframe to process; it is not modified.

    Returns:
        pd.DataFrame: A copy of ``df`` whose numeric columns have been capped.
    """
    result = df.copy()
    numeric_cols = result.select_dtypes(include=['number']).columns
    # Non-numeric columns are left exactly as they were.
    capped = result[numeric_cols].apply(cap_outliers, axis=0)
    result[numeric_cols] = capped
    return result

def drop_const_cols(df: pd.DataFrame):
    """Remove columns that hold at most one distinct non-null value.

    The names of the removed columns are printed before they are dropped.

    Args:
        df (pd.DataFrame): The dataframe to inspect; it is not modified.

    Returns:
        pd.DataFrame: A new dataframe without the constant columns.
    """
    # nunique() ignores NaN, so all-NaN columns count as constant too.
    constant = [name for name in df.columns if df[name].nunique() < 2]
    print(constant)
    return df.drop(columns=constant)

def drop_manual_cols(df: pd.DataFrame, drop_cols=("MTW Status",)):
    """Drop a manually chosen set of columns from a dataframe.

    Args:
        df (pd.DataFrame): The dataframe to reduce; it is not modified.
        drop_cols: Column name or iterable of column names to remove.
            Defaults to ``("MTW Status",)``, the list this function
            previously hard-coded.

    Returns:
        pd.DataFrame: A new dataframe without the listed columns.

    Raises:
        KeyError: If any listed column is absent from ``df``.
    """
    # A bare string would otherwise be split into single characters below.
    if isinstance(drop_cols, str):
        drop_cols = [drop_cols]
    return df.drop(columns=list(drop_cols))

# Load the merged dataset, run the cleaning steps in order, and save the result.
df = pd.read_csv(r'data/merged_data.csv')
df_clean = (
    df.pipe(drop_null_cols)       # remove sparse share columns / empty rows
    .pipe(drop_outliers)          # cap numeric outliers (IQR rule)
    .pipe(drop_const_cols)        # remove columns with a single value
    .pipe(drop_manual_cols)       # remove hand-picked columns
)
df_clean.to_csv(r'data/merged_data_clean.csv', index=False)

['lanhopi10', 'lapop20', 'lalowi20', 'lakids20', 'laseniors20', 'lawhite20', 'lablack20', 'laasian20', 'lanhopi20', 'laaian20', 'laomultir20', 'lahisp20', 'lahunv20', 'lasnap20', 'GroupQuartersFlag', 'LILATracts_1And10', 'LILATracts_1And20', 'LILATracts_Vehicle', 'HUNVFlag', 'LA1and20', 'LATracts1', 'LATracts10', 'LATracts20', 'LATractsVehicle_20']
