In [10]:
import pandas as pd

# Load dataset
full_data_path = "../data/processed/Car_Sales_Clean.csv"
df = pd.read_csv(full_data_path)

# Exclude phone column from numeric checks
numeric_columns = df.select_dtypes(include=['number']).columns
numeric_columns = [col for col in numeric_columns if col != 'Phone']

# Store summary and all outlier rows
outlier_summary = []
all_outliers = pd.DataFrame()

for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].copy()
    outliers['Outlier_Column'] = col
    outliers['Outlier_Value'] = outliers[col]

    # Add to summary
    outlier_summary.append({
        'Column': col,
        'Q1': round(Q1, 2),
        'Q3': round(Q3, 2),
        'IQR': round(IQR, 2),
        'Lower Bound': round(lower_bound, 2),
        'Upper Bound': round(upper_bound, 2),
        'Outlier Count': len(outliers)
    })

    # Append all outliers
    all_outliers = pd.concat([all_outliers, outliers], ignore_index=True)

# Save summary and outlier records
outlier_summary_df = pd.DataFrame(outlier_summary)
outlier_summary_df.to_csv("../data/processed/Car_Sales_Outlier_Summary.csv", index=False)
all_outliers.to_csv("../data/processed/Car_Sales_All_Outliers.csv", index=False)

# Print results
print(outlier_summary_df)
print("\nAll outliers saved to '../data/processed/Car_Sales_All_Outliers.csv'")


          Column         Q1          Q3         IQR  Lower Bound  Upper Bound  \
0  annual_income   386000.0  1175750.00   789750.00   -798625.00   2360375.00   
1          price    18001.0    34000.00    15999.00     -5997.50     57998.50   
2          phone  6746495.0  8248146.25  1501651.25   4494018.12  10500623.12   

   Outlier Count  
0            816  
1           1449  
2              0  

All outliers saved to '../data/processed/Car_Sales_All_Outliers.csv'
