# **Data Cleaning for Carlist**

Step 1: Import Pandas and Load the CSV File



In [None]:
import pandas as pd

# Read the raw export; the file already carries its own header row.
df_raw = pd.read_csv("raw_data.csv", encoding='utf-8', low_memory=False)

# Lowercase every column label so later cells can rely on a single spelling.
df_raw.columns = df_raw.columns.str.lower()

# --- Initial Data Inspection ---
# Report the size of the frame as loaded.
n_rows, n_cols = df_raw.shape
print(f"[INITIAL DATA SHAPE] -> Rows: {n_rows}, Columns: {n_cols}")

# Show the column labels before any cleaning is applied.
print("\n[INITIAL COLUMN NAMES] ->")
print(df_raw.columns.values)


[INITIAL DATA SHAPE] -> Rows: 175545, Columns: 17

[INITIAL COLUMN NAMES] ->
['car name' 'price (myr)' 'currency' 'location' 'region' 'brand' 'model'
 'year' 'mileage' 'fuel type' 'color' 'body type' 'seating capacity'
 'condition' 'image' 'description' 'url']


Step 2: Attempt to Fix Encoding Issues

In [None]:
# Attempt to fix mojibake in description
def fix_encoding(text):
    """Try to undo UTF-8 text that was mis-decoded as Latin-1.

    The value is re-encoded as Latin-1 and the resulting bytes are decoded
    as UTF-8. If either step fails, the value is returned unchanged.

    Parameters
    ----------
    text : object
        Value to repair. Anything that is not a ``str`` (for example the
        float NaN pandas uses for missing cells) is returned untouched.

    Returns
    -------
    object
        The repaired string, or ``text`` itself when no repair applies.
    """
    # Missing cells and other non-string values have nothing to repair.
    if not isinstance(text, str):
        return text
    try:
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Encode fails when the text has characters outside Latin-1; decode
        # fails when the bytes are not valid UTF-8. Either way the text was
        # not mojibake of this kind, so keep the original.
        return text

# Run the mojibake repair over every value in the description column.
raw_descriptions = df_raw['description']
df_raw['description'] = raw_descriptions.apply(fix_encoding)


Step 3: Drop Unnecessary Column ('currency')

In [None]:
# Drop the 'currency' column.
# errors='ignore' keeps this cell re-runnable: df_raw is rebound here, so on
# a second run the column is already gone and a plain drop would raise
# KeyError.
df_raw = df_raw.drop(columns=['currency'], errors='ignore')

# Make a working copy so the cleaning steps below operate on df, not df_raw
df = df_raw.copy()


Step 4: Check for Duplicates

In [None]:
# Flag rows that repeat an earlier row on all of the key columns below
dedupe_keys = ["car name", "price (myr)", "location", "year", "mileage", "url"]
dupes = df.duplicated(subset=dedupe_keys)
n_dupes = dupes.sum()
print(f"\n[NUMBER OF DUPLICATES] -> {n_dupes}")



[NUMBER OF DUPLICATES] -> 0


Step 5: Check Missing Values

In [None]:
# Count the NaN entries in each column and print the tally
missing_per_column = df.isna().sum()
print("\n[MISSING VALUES PER COLUMN]:")
print(missing_per_column)



[MISSING VALUES PER COLUMN]:
car name              0
price (myr)           0
location            143
region                0
brand                 0
model                 0
year                  0
mileage               0
fuel type            39
color                 0
body type            31
seating capacity      0
condition             0
image                 0
description           0
url                   0
dtype: int64


Step 6: Standardize String Columns and Handle Missing Values

In [None]:
# --- Cleaning ---
# Standardize string columns: lowercase + strip.
# Missing entries are deliberately kept as NaN in this step. Calling
# astype(str) on the whole column turns NaN into the literal text 'nan',
# which fillna() further down cannot see, so the 'unknown' placeholder
# would never be applied.
text_cols = df.select_dtypes(include='object').columns
for col in text_cols:
    is_present = df[col].notna()
    normalized = df[col].astype(str).str.strip().str.lower()
    # where() keeps the normalized text for present cells and puts NaN back
    # for the cells that were missing.
    df[col] = normalized.where(is_present)

# Fix 'condition' values
df['condition'] = df['condition'].replace({
    'usedcondition': 'used',
    'refurbishedcondition': 'refurbished',
    'newcondition': 'new'
})

# Fill missing text columns with "unknown"
df[text_cols] = df[text_cols].fillna('unknown')

# Re-check missing values
print("\n[MISSING VALUES PER COLUMN AFTER REPLACE WITH UNKNOWN]:")
print(df.isna().sum())



[MISSING VALUES PER COLUMN AFTER REPLACE WITH UNKNOWN]:
car name            0
price (myr)         0
location            0
region              0
brand               0
model               0
year                0
mileage             0
fuel type           0
color               0
body type           0
seating capacity    0
condition           0
image               0
description         0
url                 0
dtype: int64


Step 7: Filter Out Invalid URLs

In [None]:
# Keep only the rows whose URL is present and begins with "http"
url_present = df['url'].notna()
url_is_http = df['url'].str.startswith("http")
df = df[url_present & url_is_http]
print(f"\n[VALID URL ROWS ONLY] -> {df.shape}")



[VALID URL ROWS ONLY] -> (175545, 16)


Step 8: Drop Duplicates

In [None]:
# Remove rows that exactly repeat an earlier row (all columns compared),
# then renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)


Step 9: Convert Price, Year, and Mileage to Numeric and Handle Missing Values

In [None]:

# Parse the three numeric columns; values that cannot be parsed become NaN.
for numeric_col in ('price (myr)', 'year', 'mileage'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Impute the gaps: median for price and mileage, most frequent value for year.
price_median = df['price (myr)'].median()
df['price (myr)'] = df['price (myr)'].fillna(price_median)
mileage_median = df['mileage'].median()
df['mileage'] = df['mileage'].fillna(mileage_median)
most_common_year = df['year'].mode()[0]
df['year'] = df['year'].fillna(most_common_year)

# Discard rows with a negative price or a negative mileage.
non_negative = (df['price (myr)'] >= 0) & (df['mileage'] >= 0)
df = df[non_negative]

# Discard rows whose year lies after the current calendar year.
current_year = pd.to_datetime('today').year
not_in_future = df['year'] <= current_year
df = df[not_in_future]


Step 10: Handle Condition Column

In [None]:
# The 'condition' labels were normalized in an earlier step; here we only
# keep the rows whose label is one of the accepted values.
valid_conditions = ['used', 'new', 'refurbished']
has_valid_condition = df['condition'].isin(valid_conditions)
df = df[has_valid_condition]


Step 11: Final Data Inspection

In [None]:
# --- Final Inspection ---
# Re-check the data shape after cleaning
print("\n[CLEANED DATA SHAPE] ->", df.shape)

# List the columns that remain after cleaning. The label reads FINAL so this
# printout cannot be mistaken for the pre-cleaning column listing.
print("\n[FINAL COLUMN NAMES] ->")
print(df.columns.values)



[CLEANED DATA SHAPE] -> (175545, 16)

[INITIAL COLUMN NAMES] ->
['car name' 'price (myr)' 'location' 'region' 'brand' 'model' 'year'
 'mileage' 'fuel type' 'color' 'body type' 'seating capacity' 'condition'
 'image' 'description' 'url']


Step 12: Save Cleaned Data to CSV

In [None]:
# --- Save Cleaned File ---
# index=False leaves the row index out of the written file.
output_path = "cleaned_data.csv"
df.to_csv(output_path, index=False)
print("\n✅ Cleaned dataset saved as 'cleaned_data.csv'")



✅ Cleaned dataset saved as 'cleaned_data.csv'
