In [None]:
# Normalise the column labels: lower case, underscores instead of spaces.
_lowered_labels = df.columns.str.lower()
df.columns = _lowered_labels.str.replace(' ', '_')

# Apply the same normalisation to the text of every object-dtype column.
_object_mask = df.dtypes == 'object'
strings = list(df.dtypes[_object_mask].index)
for col in strings:
    _lowered_text = df[col].str.lower()
    df[col] = _lowered_text.str.replace(' ', '_')

# Inspect each column: its name, a sample of distinct values, and how many
# distinct values it has.
for col in df.columns:
    _column = df[col]
    print(col)
    print(_column.unique()[:5])  # first 5 unique values
    print(_column.nunique())  # number of unique values
    print()


# check missing values

# check target variable distribution
# If the target variable is skewed, apply a log transform.
# log(0) is undefined, so use log1p, which computes log(1 + x).
price_logs = np.log1p(df['msrp'])

## Check relationship between variables

- **FOR CATEGORICAL** Use mutual_info_score
- **FOR NUMERICAL** Use Pearson correlation (note: it captures only linear relationships). Rule of thumb: strong if |r| > 0.6, moderate if |r| is between 0.2 and 0.6

| Feature               | Mutual Information (MI)        | Correlation (e.g., Pearson)   |
|-----------------------|---------------------------------|--------------------------------|
| **Type of Relationship** | Measures **any** kind of dependency (linear & nonlinear) | Measures **only linear** relationships |
| **Range of Values**   | Always **≥ 0** (0 = no relation, higher = stronger dependency) | Between **-1 and 1** (-1 = strong negative, 1 = strong positive, 0 = no correlation) |
| **Interpretation**    | Measures **how much knowing one variable reduces uncertainty** about the other | Measures **how much one variable changes proportionally** with another |
| **Handles Categorical Data?** | ✅ Yes (works for both categorical & numerical data) | ❌ No (only works well for numerical data) |
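| **Sensitivity to Scale** | Not affected by scaling | Not affected by linear rescaling or shifting of either variable (no normalization needed); sensitive to outliers and to nonlinear transforms such as log |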


## Check data quality

In [None]:
def check_data_quality(df):
    """Collect basic data-quality metrics for a DataFrame.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        A dict with four entries:
        ``missing_values`` (per-column null counts as a dict),
        ``duplicates`` (number of rows flagged by ``df.duplicated()``),
        ``total_rows`` (``len(df)``) and
        ``memory_usage`` (sum of ``df.memory_usage()`` in megabytes).
    """
    bytes_per_megabyte = 1024**2

    null_counts = df.isnull().sum()
    duplicate_flags = df.duplicated()
    total_bytes = df.memory_usage().sum()

    return {
        'missing_values': null_counts.to_dict(),
        'duplicates': duplicate_flags.sum(),
        'total_rows': len(df),
        'memory_usage': total_bytes / bytes_per_megabyte,  # in MB
    }

## Standardise data types

In [None]:
def standardise_datatypes(df):
    """Convert object-dtype columns to datetime or numeric where parsing succeeds.

    Each object column is first passed to ``pd.to_datetime``. If that raises
    ``ValueError``, every ``$`` and ``,`` character is stripped from the
    values and the result is passed to ``pd.to_numeric``. Columns that fit
    neither conversion are left unchanged. A message is printed for every
    column that is converted.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in df.columns:
        if df[column].dtype != 'object':
            continue

        # Try converting string dates to datetime
        try:
            df[column] = pd.to_datetime(df[column])
        except ValueError:
            pass
        else:
            print(f"Converted {column} to datetime")
            continue

        # Try converting to numeric if datetime fails
        try:
            # regex=False makes '$' a literal character. Under the regex
            # default of older pandas releases '$' is an end-of-string anchor,
            # so the dollar sign would be left in place and the numeric
            # conversion would fail.
            stripped = (
                df[column]
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False)
            )
            df[column] = pd.to_numeric(stripped)
        except (AttributeError, TypeError, ValueError):
            # AttributeError: the column holds non-string objects, so the
            # .str accessor is unavailable. TypeError / ValueError: the values
            # are not parseable as numbers. Leave the column as it is.
            continue
        print(f"Converted {column} to numeric")
    return df

## Handling missing values

In [None]:
from sklearn.impute import SimpleImputer

def handle_missing_values(df):
    """Impute missing values with scikit-learn's ``SimpleImputer``.

    Columns of dtype ``int64``/``float64`` are imputed with the ``median``
    strategy, then columns of dtype ``object`` with the ``most_frequent``
    strategy. A group with no matching columns is skipped.

    Args:
        df: DataFrame to impute. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (dtypes to select, imputation strategy), processed in this order.
    imputation_plan = (
        (['int64', 'float64'], 'median'),      # numeric columns
        (['object'], 'most_frequent'),         # categorical columns
    )

    for selected_dtypes, strategy in imputation_plan:
        targets = df.select_dtypes(include=selected_dtypes).columns
        if len(targets) == 0:
            continue
        imputer = SimpleImputer(strategy=strategy)
        df[targets] = imputer.fit_transform(df[targets])

    return df

## Handling outliers

In [None]:
def remove_outliers(df):
    """Cap outliers in numeric columns at the 1.5 * IQR fences.

    Despite the name, no rows are dropped: for every ``int64``/``float64``
    column, values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are
    clipped to the nearest fence.

    Args:
        df: DataFrame to process. It is modified in place.

    Returns:
        A tuple ``(df, outliers_removed)`` where ``df`` is the same DataFrame
        object and ``outliers_removed`` maps each column that had at least
        one value outside the fences to the number of such values.
    """
    capped_counts = {}

    for name in df.select_dtypes(include=['int64', 'float64']).columns:
        series = df[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        # Count the offending values before they are overwritten.
        outside = (series < low_fence) | (series > high_fence)
        n_outside = int(outside.sum())

        # Cap the values instead of removing them.
        df[name] = series.clip(lower=low_fence, upper=high_fence)

        if n_outside > 0:
            capped_counts[name] = n_outside

    return df, capped_counts

## Validate Cleaning

In [None]:
def validate_cleaning(df, original_shape, cleaning_report):
    """Record post-cleaning validation metrics in the cleaning report.

    Args:
        df: The cleaned DataFrame.
        original_shape: Shape of the data before cleaning; only the first
            element (the row count) is used.
        cleaning_report: Dict to extend. It is modified in place: the
            metrics are stored under the ``'validation'`` key.

    Returns:
        The same ``cleaning_report`` dict, whose ``'validation'`` entry holds
        ``rows_remaining``, ``missing_values_remaining``,
        ``duplicates_remaining`` and ``data_loss_percentage``.
    """
    original_rows = original_shape[0]
    rows_remaining = len(df)

    # An empty original dataset cannot lose rows; report 0% instead of
    # raising ZeroDivisionError.
    if original_rows:
        data_loss_percentage = (1 - rows_remaining / original_rows) * 100
    else:
        data_loss_percentage = 0.0

    validation_results = {
        'rows_remaining': rows_remaining,
        'missing_values_remaining': df.isnull().sum().sum(),
        'duplicates_remaining': df.duplicated().sum(),
        'data_loss_percentage': data_loss_percentage,
    }

    # Add validation results to the cleaning report
    cleaning_report['validation'] = validation_results
    return cleaning_report