---
# Overview

In [85]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re 

In [86]:
# Work out where the project root is, starting from the notebook's working
# directory. The notebook sits in 'notebooks/', so the root is its parent.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)

# Register the project root on the import path (once) so that project-local
# packages can be imported from inside the notebook.
if project_root not in sys.path:
    sys.path.append(project_root)

# Data loading / cleaning helpers from the project's utils package
from utils.preprocess import (
    load_and_concat_raw_data,
    preprocess_dataframe,
)

In [87]:
# Load raw data
raw_df = load_and_concat_raw_data()

# Fail fast when nothing was loaded. Merely printing a message would leave
# `df` undefined (or, worse, still bound to the result of an earlier run),
# so every later cell would either crash with a NameError or silently
# analyse stale data.
if raw_df.empty:
    raise ValueError("Raw DataFrame is empty. Cannot proceed with preprocessing.")

print(f"Shape before preprocessing: {raw_df.shape}")
print(f"Null values in raw_df:\n{raw_df.isnull().sum()}")

# Preprocess and save the data.
# A copy is passed so `raw_df` stays available, unchanged, for comparison
# with the cleaned result regardless of what the helper does to its argument.
df = preprocess_dataframe(raw_df.copy())

print(f"Final DataFrame shape after preprocessing: {df.shape}")
# Make the amount of data discarded by the cleaning step explicit, so an
# unexpectedly aggressive filter is noticed right here.
print(f"Rows removed by preprocessing: {len(raw_df) - len(df)} of {len(raw_df)}")
print("\n--- Final Info After Preprocessing ---")
df.info()
print("\n--- Final Missing Values After Preprocessing ---")
print(df.isnull().sum())

Shape before preprocessing: (574, 15)
Null values in raw_df:
full_title        0
brand             0
model             0
year              0
trim_version      0
price             0
location          1
mileage           0
fuel_type         0
gearbox           0
body_condition    0
body_color        0
interior_color    0
ad_url            0
scrape_date       0
dtype: int64

Cleaned data saved to: data/cleaned/bama_cleaned_data_20250608_195915.csv
Final DataFrame shape after preprocessing: (197, 15)

--- Final Info After Preprocessing ---
<class 'pandas.core.frame.DataFrame'>
Index: 197 entries, 501 to 521
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand           197 non-null    object 
 1   model           197 non-null    object 
 2   year            197 non-null    int64  
 3   trim_version    197 non-null    object 
 4   price           197 non-null    float64
 5   location        197 non-null    obje

In [88]:
# Preview the first five rows of the cleaned data
df.head(n=5)

Unnamed: 0,brand,model,year,trim_version,price,location,mileage,fuel_type,gearbox,body_condition,body_color,interior_color,ad_url,scrape_date,car_age
501,کی,ام سی، T9,1404,2لیترتوربو,2750000000.0,تهران/17شهریور,0,بنزینی,اتوماتیک,بدون رنگ,مشکی,داخل مشکی,https://bama.ir/car/detail-ivr1bewx-kmc-t9-2li...,2025-06-08 18:58:37.966920,0
500,زامیاد,وانت نیسان,1404,دوگانهسوز,431000000.0,تهران/جمهوری,0,دوگانهسوز,دنده‌ای,بدون رنگ,آبی,داخل مشکی,https://bama.ir/car/detail-gbfit7nl-zamyad-pic...,2025-06-08 18:58:35.924288,0
498,ام,وی ام، X55 PRO,1403,IE,950000000.0,تهران/یوسفآباد,0,بنزینی,اتوماتیک,بدون رنگ,سفید,داخل مشکی,https://bama.ir/car/detail-oh9k3uhd-mvm-x55pro...,2025-06-08 18:58:31.603706,1
496,بی,وای دی، F3,1397,دنده‌ای,990000000.0,کرمان,0,بنزینی,دنده‌ای,بدون رنگ,نقرهای,داخل مشکی,https://bama.ir/car/detail-sbcvdlga-byd-f3-mt-...,2025-06-08 18:58:27.651656,7
495,فیدلیتی,پرایم,1404,تیپ2هفتنفره,1100000000.0,تهران/جمهوری,0,بنزینی,اتوماتیک,بدون رنگ,سفید,داخل مشکی,https://bama.ir/car/detail-dlurbvxi-fidelity-p...,2025-06-08 18:58:25.696925,0


In [89]:
# Distribution of mileage values in the cleaned data (sanity check on the
# preprocessing result)
cleaned_mileage_counts = df["mileage"].value_counts()
cleaned_mileage_counts

mileage
0    197
Name: count, dtype: int64

In [78]:
# Read one raw scrape straight from disk so it can be compared with the
# cleaned data. The path is relative to the notebook's working directory.
raw_csv_path = '../data/raw/bama_raw_data_all_cars_20250608_185840.csv'
testdf = pd.read_csv(raw_csv_path)

In [79]:
# Distribution of mileage values in the raw file, before any cleaning
raw_mileage_counts = testdf["mileage"].value_counts()
raw_mileage_counts

mileage
صفر         334
101000km      4
180000km      4
60000km       4
20000km       4
           ... 
300000km      1
88000km       1
163000km      1
54000km       1
25000km       1
Name: count, Length: 123, dtype: int64