---
# Overview

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import re 

In [21]:
# Resolve the project root from the kernel's working directory.
# Assumes the kernel was started in '<project_root>/notebooks/', so the root
# is exactly one level above the current directory.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)

# Put the project root at the FRONT of sys.path so the project's own `utils`
# package is found before any installed package that happens to share the name.
# The membership check keeps re-runs of this cell from adding duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import functions from your utils module
from utils.preprocess import load_and_concat_raw_data, preprocess_dataframe

In [22]:
# Raw input files are expected under '<project_root>/data/raw'.
# Building the path from project_root (instead of a path relative to the
# notebook) keeps it valid no matter which sub-folder the notebook lives in.
data_raw_folder_path = os.path.join(project_root, 'data', 'raw')

In [23]:
# Load the raw data from the folder resolved in the previous cell.
raw_df = load_and_concat_raw_data(data_raw_path=data_raw_folder_path)

# Fail fast: merely printing a message here would leave `df` undefined, and
# every later cell would then die with an unrelated-looking NameError.
if raw_df.empty:
    raise ValueError(
        f"Raw DataFrame is empty (nothing loaded from {data_raw_folder_path!r}). "
        "Cannot proceed with preprocessing."
    )

print(f"Shape before preprocessing: {raw_df.shape}")
print(f"Null values in raw_df:\n{raw_df.isnull().sum()}")

# Preprocess a copy so `raw_df` itself stays untouched and this cell can be
# re-run safely. No output directory is passed here.
# NOTE(review): in the recorded run preprocess_dataframe also wrote a cleaned
# CSV on its own -- confirm where it saves and whether that is intended.
df = preprocess_dataframe(raw_df.copy())

print(f"Final DataFrame shape after preprocessing: {df.shape}")
print("\n--- Final Info After Preprocessing ---")
df.info()
print("\n--- Final Missing Values After Preprocessing ---")
print(df.isnull().sum())

Shape before preprocessing: (574, 15)
Null values in raw_df:
full_title        0
brand             0
model             0
year              0
trim_version      0
price             0
location          1
mileage           0
fuel_type         0
gearbox           0
body_condition    0
body_color        0
interior_color    0
ad_url            0
scrape_date       0
dtype: int64
Shape before preprocessing: (574, 15)
Shape after deduplication: (533, 15)
Shape after price cleaning: (397, 15)
Shape after mileage cleaning: (395, 15)
Shape after year/car_age cleaning: (318, 16)

Cleaned data saved to: c:\Users\fatii\OneDrive\Desktop\car-price-predictor\data/cleaned/bama_cleaned_data_20250703_225834.csv
Final DataFrame shape after preprocessing: (318, 15)

--- Final Info After Preprocessing ---
<class 'pandas.core.frame.DataFrame'>
Index: 318 entries, 501 to 506
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand      

---
# **Overview of the Dataset**

In [34]:
# (rows, columns) of the preprocessed DataFrame
df.shape

(318, 15)

In [35]:
# Column names, non-null counts and dtypes of the preprocessed DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 318 entries, 501 to 506
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand           318 non-null    object 
 1   model           318 non-null    object 
 2   year            318 non-null    int64  
 3   trim_version    318 non-null    object 
 4   price           318 non-null    float64
 5   location        318 non-null    object 
 6   mileage         318 non-null    int64  
 7   fuel_type       318 non-null    object 
 8   gearbox         318 non-null    object 
 9   body_condition  318 non-null    object 
 10  body_color      318 non-null    object 
 11  interior_color  318 non-null    object 
 12  ad_url          318 non-null    object 
 13  scrape_date     318 non-null    object 
 14  car_age         318 non-null    int64  
dtypes: float64(1), int64(3), object(11)
memory usage: 39.8+ KB


In [36]:
# Number of missing values left in each column after preprocessing
df.isnull().sum()

brand             0
model             0
year              0
trim_version      0
price             0
location          0
mileage           0
fuel_type         0
gearbox           0
body_condition    0
body_color        0
interior_color    0
ad_url            0
scrape_date       0
car_age           0
dtype: int64

In [24]:
# Peek at the first rows (DataFrame.head defaults to n=5)
df.head()

Unnamed: 0,brand,model,year,trim_version,price,location,mileage,fuel_type,gearbox,body_condition,body_color,interior_color,ad_url,scrape_date,car_age
501,کی,ام سی، T9,1404,2لیترتوربو,2750000000.0,تهران/17شهریور,0,بنزینی,اتوماتیک,بدون رنگ,مشکی,داخل مشکی,https://bama.ir/car/detail-ivr1bewx-kmc-t9-2li...,2025-06-08 18:58:37.966920,0
500,زامیاد,وانت نیسان,1404,دوگانهسوز,431000000.0,تهران/جمهوری,0,دوگانهسوز,دنده ای,بدون رنگ,آبی,داخل مشکی,https://bama.ir/car/detail-gbfit7nl-zamyad-pic...,2025-06-08 18:58:35.924288,0
499,فونیکس,تیگو 7 پرو,1403,پریمیوم,1980000000.0,تهران/کامرانیه,25000,بنزینی,اتوماتیک,دو لکه رنگ,مشکی,داخل مشکی,https://bama.ir/car/detail-7vxu3asw-fownix-tig...,2025-06-08 18:58:33.573617,1
498,ام,وی ام، X55 PRO,1403,IE,950000000.0,تهران/یوسفآباد,0,بنزینی,اتوماتیک,بدون رنگ,سفید,داخل مشکی,https://bama.ir/car/detail-oh9k3uhd-mvm-x55pro...,2025-06-08 18:58:31.603706,1
496,بی,وای دی، F3,1397,دنده ای,990000000.0,کرمان,0,بنزینی,دنده ای,بدون رنگ,نقرهای,داخل مشکی,https://bama.ir/car/detail-sbcvdlga-byd-f3-mt-...,2025-06-08 18:58:27.651656,7


In [25]:
# Peek at the last rows (DataFrame.tail defaults to n=5)
df.tail()

Unnamed: 0,brand,model,year,trim_version,price,location,mileage,fuel_type,gearbox,body_condition,body_color,interior_color,ad_url,scrape_date,car_age
521,پژو,207,1404,پانوراما دنده ای,530000000.0,تهران/صادقیه,0,بنزینی,دنده ای,بدون رنگ,سفید,داخل مشکی,https://bama.ir/car/detail-uipqpula-peugeot-20...,2025-06-08 18:26:40.378456,0
516,پژو,207,1390,اتوماتیکTU5,535000000.0,بوشهر,320000,بنزینی,اتوماتیک,دو لکه رنگ,سفید,داخل مشکی,https://bama.ir/car/detail-m5jclusu-peugeot-20...,2025-06-08 18:26:30.414523,14
512,پژو,207,1399,اتوماتیکTU5,800000000.0,تهران/تهرانپارسشرقی,30000,بنزینی,اتوماتیک,بدون رنگ,سفید,داخل نوک مدادی,https://bama.ir/car/detail-y2fpairz-peugeot-20...,2025-06-08 18:26:22.401885,5
511,پژو,206,1401,تیپ2,620000000.0,اصفهان/آینهخانه,16000,بنزینی,دنده ای,بدون رنگ,سفید,داخل مشکی,https://bama.ir/car/detail-itgfr1yh-peugeot-20...,2025-06-08 18:26:20.206141,3
506,پژو,207,1402,TU3,695000000.0,تهران/علی‌آباد,19800,بنزینی,دنده ای,بدون رنگ,سفید,داخل مشکی,https://bama.ir/car/detail-5sqwre6r-peugeot-20...,2025-06-08 18:26:08.207662,2


In [26]:
# How often each distinct mileage value occurs, most frequent first.
# NOTE(review): in the recorded run 0 is by far the most common value --
# confirm whether 0 means a genuinely new car or a missing/unparsed mileage.
df["mileage"].value_counts()

mileage
0         197
400000      4
7000        3
47000       3
74000       3
         ... 
260000      1
127000      1
320000      1
30000       1
19800       1
Name: count, Length: 96, dtype: int64

In [37]:
# Object-dtype (text) columns are treated as the categorical features.
categorical_columns = df.select_dtypes(include='object').columns

# Walk the categorical columns as (name, Series) pairs and print each
# column's frequency table under its own banner line.
for col, col_values in df[categorical_columns].items():
    print(f"--- {col} ---")
    print(col_values.value_counts(), "\n")

--- brand ---
brand
پژو        66
فونیکس     42
کی         31
ام         30
جک         24
فیدلیتی    10
کوییک       9
دنا         9
شاهین       9
پراید       7
چری         7
هایما       7
اکستریم     6
لاماری      6
پیش         6
رنو         5
بی          5
زامیاد      4
ساینا       4
تیبا        4
لوکانو      3
اطلس        3
ریسپکت      3
سمند        3
رانا        2
تارا        2
فوتون       2
برلیانس     2
مزدا        1
حواله       1
پاژن        1
دیگنیتی     1
نیسان       1
دانگ        1
کیا         1
Name: count, dtype: int64 

--- model ---
model
207              28
206              16
پارس             12
S5               12
پرایم            10
                 ..
GL                1
H230              1
H330              1
فنگ، H30 کراس     1
ریو (مونتاژ)      1
Name: count, Length: 89, dtype: int64 

--- trim_version ---
trim_version
اتوماتیک             51
دنده ای              30
IE                   25
1.5لیتر              13
تیپ2                 11
                     ..
2.0ل

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the recorded run shows a maximum mileage of 3,000,000, far above
# the 75th percentile -- looks like an outlier or entry error worth checking.
df.describe()

Unnamed: 0,year,price,mileage,car_age
count,318.0,318.0,318.0,318.0
mean,1401.028302,1114964000.0,50026.19,2.971698
std,4.923627,766875400.0,185329.1,4.923627
min,1378.0,170000000.0,0.0,0.0
25%,1401.0,552500000.0,0.0,0.0
50%,1403.0,914000000.0,0.0,1.0
75%,1404.0,1475000000.0,47000.0,3.0
max,1404.0,4500000000.0,3000000.0,26.0
