Import and concatenate

In [5]:
import pandas as pd
import os

# File paths
base_path = os.path.join("..","data","raw")
file_paths = {
	"Bangalore": os.path.join(base_path, "bangalore_cars.xlsx"),
	"Chennai": os.path.join(base_path, "chennai_cars.xlsx"),
	"Delhi": os.path.join(base_path, "delhi_cars.xlsx"),
	"Hyderabad": os.path.join(base_path, "hyderabad_cars.xlsx"),
	"Jaipur": os.path.join(base_path, "jaipur_cars.xlsx"),
	"Kolkata": os.path.join(base_path, "kolkata_cars.xlsx"),
}

# Read and structure each dataset and add a 'City' column
dataframes = []
for city, file_path in file_paths.items():
    try:
        df = pd.read_excel(file_path, engine="openpyxl")
        df['City'] = city
        dataframes.append(df)
        print(f"Loaded {city} - {df.shape}")
    except Exception as e:
        print(f"Failed to load {city} from {file_path}: {e}")

# combine all dataframes into a single dataframe
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"\n✅ combined DataFrame shape: {combined_df.shape}")
    print(f"Combined DataFrame columns: {combined_df.columns.tolist()}")
    print(f"Combined DataFrame head:\n{combined_df.head()}")

    

Loaded Bangalore - (1481, 6)
Loaded Chennai - (1419, 6)
Loaded Delhi - (1485, 6)
Loaded Hyderabad - (1483, 6)
Loaded Jaipur - (1120, 6)
Loaded Kolkata - (1381, 6)

✅ combined DataFrame shape: (8369, 6)
Combined DataFrame columns: ['new_car_detail', 'new_car_overview', 'new_car_feature', 'new_car_specs', 'car_links', 'City']
Combined DataFrame head:
                                      new_car_detail  \
0  {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...   
1  {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...   
2  {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...   
3  {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...   
4  {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...   

                                    new_car_overview  \
0  {'heading': 'Car overview', 'top': [{'key': 'R...   
1  {'heading': 'Car overview', 'top': [{'key': 'R...   
2  {'heading': 'Car overview', 'top': [{'key': 'R...   
3  {'heading': 'Car overview', 'top': [{'key': 'R...   
4  {'heading': 'Car overview', '

In [6]:
import os
print(os.getcwd())


d:\Education\Data Science\Project\car-dheko-used-car-price-prediction\notebooks


Handline Missing Values

Step 1: Identify missing values

In [7]:
# summary of the missing values
missing_summary = combined_df.isnull().sum().sort_values(ascending=False)
missing_percentage = (missing_summary / len(combined_df)) * 100

missing_df = pd.DataFrame({
    'Missing Values': missing_summary,
    'Percentage': missing_percentage.round(2)
})
print("🔍 Missing Values Summary:")
print(missing_df[missing_df['Missing Values'] > 0])

🔍 Missing Values Summary:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [8]:
combined_df.dtypes

new_car_detail      object
new_car_overview    object
new_car_feature     object
new_car_specs       object
car_links           object
City                object
dtype: object

In [9]:
combined_df['new_car_detail'].iloc[0:10]

0    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
1    {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...
2    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
3    {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...
4    {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...
5    {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...
6    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
7    {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...
8    {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...
9    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
Name: new_car_detail, dtype: object

In [10]:
# Convert string representation of dictionaries to actual dictionaries
import ast

# Convert 'new_car_detail' column to dictionaries
combined_df["new_car_detail"] = combined_df["new_car_detail"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)  
print("✅ Converted 'new_car_detail' to dictionaries.")
print(combined_df['new_car_detail'].head())  

✅ Converted 'new_car_detail' to dictionaries.
0    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
1    {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...
2    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
3    {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...
4    {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...
Name: new_car_detail, dtype: object


In [11]:
# Convert new_car_detail dictionary column into a DataFrame
detail_df = pd.json_normalize(combined_df['new_car_detail'])

# Preview the flattened detail data
print("🔍 Flattened 'new_car_detail' columns:")
print(detail_df.columns.tolist())
print("✅ Preview of flattened new_car_detail:")
print(detail_df.head())
print(detail_df.columns)
# Concatenate the detail DataFrame with the main DataFrame
combined_df = pd.concat([combined_df.drop(columns=['new_car_detail']), detail_df], axis
=1)
print("✅ Combined DataFrame shape after concatenation:", combined_df.shape)

🔍 Flattened 'new_car_detail' columns:
['it', 'ft', 'bt', 'km', 'transmission', 'ownerNo', 'owner', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName', 'price', 'priceActual', 'priceSaving', 'priceFixedText', 'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc']
✅ Preview of flattened new_car_detail:
   it      ft         bt        km transmission  ownerNo      owner      oem  \
0   0  Petrol  Hatchback  1,20,000       Manual        3  3rd Owner   Maruti   
1   0  Petrol        SUV    32,706       Manual        2  2nd Owner     Ford   
2   0  Petrol  Hatchback    11,949       Manual        1  1st Owner     Tata   
3   0  Petrol      Sedan    17,794       Manual        1  1st Owner  Hyundai   
4   0  Diesel        SUV    60,000       Manual        1  1st Owner   Maruti   

                model  modelYear  centralVariantId               variantName  \
0      Maruti Celerio       2015              3979                       VXI   
1       Ford Ecosport       2018

In [12]:
print(combined_df.columns)

Index(['new_car_overview', 'new_car_feature', 'new_car_specs', 'car_links',
       'City', 'it', 'ft', 'bt', 'km', 'transmission', 'ownerNo', 'owner',
       'oem', 'model', 'modelYear', 'centralVariantId', 'variantName', 'price',
       'priceActual', 'priceSaving', 'priceFixedText', 'trendingText.imgUrl',
       'trendingText.heading', 'trendingText.desc'],
      dtype='object')


In [None]:
# Function to flatten any nested dictionary column
import ast
from pandas import json_normalize

def flatten_dict_column(df, column_name):
    """
    Converts a column of stringfield dictionaries to a actual dictionaries flatten them into a single column
    
    Parameters:
        df (DataFrame): The DataFrame containing the column to be flattened
        column_name (str): The name of the column containing the nested dictionaries    
    
    Returns:
        DataFrame: The original DataFrame with the specified column flattened appended to it
    """
    print(f"🔍 Flattening column: {column_name} in DataFrame with shape {df.shape}...")
    
    # Safely evaluate the stringified dictionary
    df[column_name] = df[column_name].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    
    # Flatten the nested dictionaries
    nested_df = json_normalize(df[column_name])

    # Rename the columns to keep them identifiable
    nested_df.columns = [f"{column_name}_{col}" for col in nested_df.columns] 

    # Drop the original nested column
    df = df.drop(columns=[column_name], inplace=False)   

    # Concatenate the flattened columns to the original DataFrame
    df = pd.concat([df.reset_index(drop=True), nested_df.reset_index(drop=True)], axis=1)
    
    print(f"✅ Added columns: {nested_df.columns.tolist()}")
    return df


In [34]:
# Apply it all 3 remaining columns

# Apply to the other nested fields
for col in ['new_car_overview', 'new_car_feature', 'new_car_specs']:
    if col in combined_df.columns:
        combined_df = flatten_dict_column(combined_df, col)
    else:
        print(f"⚠️ Column '{col}' not found in DataFrame. Skipping...")

⚠️ Column 'new_car_overview' not found in DataFrame. Skipping...
⚠️ Column 'new_car_feature' not found in DataFrame. Skipping...
⚠️ Column 'new_car_specs' not found in DataFrame. Skipping...


In [19]:
print("🗒️ Final columns after all flattening:")
print(combined_df.columns.tolist())

print("sample rows:")
print(combined_df.head(2))

🗒️ Final columns after all flattening:
['car_links', 'City', 'it', 'ft', 'bt', 'km', 'transmission', 'ownerNo', 'owner', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName', 'price', 'priceActual', 'priceSaving', 'priceFixedText', 'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc', 'Mileage', 'Engine', 'Max Power', 'Torque', 'Seats', 'Registration Year', 'Insurance Validity', 'Fuel Type', 'Kms Driven', 'RTO', 'Ownership', 'Engine Displacement', 'Transmission', 'Year of Manufacture', 'new_car_overview_heading', 'new_car_overview_top', 'new_car_overview_bottomData', 'new_car_feature_heading', 'new_car_feature_top', 'new_car_feature_data', 'new_car_feature_commonIcon', 'new_car_specs_heading', 'new_car_specs_top', 'new_car_specs_data', 'new_car_specs_commonIcon']
sample rows:
                                           car_links       City  it      ft  \
0  https://www.cardekho.com/used-car-details/used...  Bangalore   0  Petrol   
1  https://www.cardekho.com/buy

We still have nested lists/dictionaries in our data. Let's flatten them using a function.  
1. new_car_overview_top (list of dicts)
2. new_car_feature_top (list of dicts)
3. new_car_feture_data (list of dicts)
4. new_car_specs_top (list of dicts)
5. new_car_specs_data (dict of dicts)  

Further flatten or extract key values for each list of dicts.

In [20]:
# Reuseable extract function for nested dictionaries
def extract_spec_value(specs_list, key):
    """
    Extracts the value of a given key from a list of dictionaries.
    
    Args:
        specs_list (list): A list of dictionaries containing specifications.
        key (str): The key for which the value needs to be extracted.
    
    Returns:
        str or none: A list of values corresponding to the given key. else none.
    """
    if isinstance(specs_list, list):
        for item in specs_list:
            if isinstance(item, dict) and item.get("key") == key:
                return item.get("Value")
    return None

# 1.------ new_car_specs (dict with 'top' as list of dicts) --------
# Apply for some key specs
if 'new_car_specs_top' in combined_df.columns:
    combined_df['Mileage'] = combined_df['new_car_specs_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Mileage') if isinstance(x, dict) else None)
    combined_df['Engine'] = combined_df['new_car_specs_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Engine') if isinstance(x, dict) else None)
    combined_df['Max Power'] = combined_df['new_car_specs_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Max Power') if isinstance(x, dict) else None)
    combined_df['Torque'] = combined_df['new_car_specs_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Torque') if isinstance(x, dict) else None)
    combined_df['Seats'] = combined_df['new_car_specs_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Seats') if isinstance(x, dict) else None)
    print("Successfully extracted the specs from the new_car_specs_top column.")
else:
    print("⚠️ Column 'new_car_specs_top' not found in DataFrame. Skipping...")
    # Initialize new_car_specs_top column with null values if new_car_specs_top does not exist

    combined_df['Mileage'] = pd.NA
    combined_df['Engine'] = pd.NA
    combined_df['Max Power'] = pd.NA
    combined_df['Torque'] = pd.NA
    combined_df['Seats'] = pd.NA
     
# 2.------- new_car_overview (dict with 'top' as list of dicts) --------
# Apply for some key specs
if 'new_car_overview_top' in combined_df.columns:
    combined_df['Registration Year'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Registration Year') if isinstance(x, dict) else None)
    combined_df['Insurance Validity'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Insurance Validity') if isinstance(x, dict) else None)
    combined_df['Fuel Type'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Fuel Type') if isinstance(x, dict) else None)
    combined_df['Kms Driven'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Kms Driven') if isinstance(x, dict) else None)
    combined_df['RTO'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'RTO') if isinstance(x, dict) else None)
    combined_df['Ownership'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Ownership') if isinstance(x, dict) else None)
    combined_df['Engine Displacement'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Engine Displacement') if isinstance(x, dict) else None)
    combined_df['Transmission'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Transmission') if isinstance(x, dict) else None)
    combined_df['Year of Manufacture'] = combined_df['new_car_overview_top'].apply(lambda x: extract_spec_value(x.get('top', []), 'Year of Manufacture') if isinstance(x, dict) else None)
    print("Successfully extracted the specs from the new_car_overview_top column.")
else:
    print("⚠️ Column 'new_car_overview_top' not found in DataFrame. Skipping...")
    # Initialize new_car_specs_top column with null values if new_car_specs_top does not exist      
    combined_df['Registration Year'] = pd.NA
    combined_df['Insurance Validity'] = pd.NA
    combined_df['Fuel Type'] = pd.NA
    combined_df['Kms Driven'] = pd.NA
    combined_df['RTO'] = pd.NA
    combined_df['Ownership'] = pd.NA
    combined_df['Engine Displacement'] = pd.NA
    combined_df['Transmission'] = pd.NA
    combined_df['Year of Manufacture'] = pd.NA
    


Successfully extracted the specs from the new_car_specs_top column.
Successfully extracted the specs from the new_car_overview_top column.


In [21]:
print(combined_df.columns)

Index(['car_links', 'City', 'it', 'ft', 'bt', 'km', 'transmission', 'ownerNo',
       'owner', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName',
       'price', 'priceActual', 'priceSaving', 'priceFixedText',
       'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc',
       'Mileage', 'Engine', 'Max Power', 'Torque', 'Seats',
       'Registration Year', 'Insurance Validity', 'Fuel Type', 'Kms Driven',
       'RTO', 'Ownership', 'Engine Displacement', 'Transmission',
       'Year of Manufacture', 'new_car_overview_heading',
       'new_car_overview_top', 'new_car_overview_bottomData',
       'new_car_feature_heading', 'new_car_feature_top',
       'new_car_feature_data', 'new_car_feature_commonIcon',
       'new_car_specs_heading', 'new_car_specs_top', 'new_car_specs_data',
       'new_car_specs_commonIcon'],
      dtype='object')


In [22]:
# dropping the original nested columns
combined_df.drop(['new_car_specs_top', 'new_car_overview_top'], axis=1, inplace=True)

# concate the extracted columns to the original DataFrame
combined_df = pd.concat([combined_df, combined_df[['Mileage', 'Engine', 'Max Power', 'Torque', 'Seats', 'Registration Year', 'Insurance Validity', 'Fuel Type', 'Kms Driven', 'RTO', 'Ownership', 'Engine Displacement', 'Transmission', 'Year of Manufacture']]], axis=1)


In [23]:
print(combined_df.columns)

Index(['car_links', 'City', 'it', 'ft', 'bt', 'km', 'transmission', 'ownerNo',
       'owner', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName',
       'price', 'priceActual', 'priceSaving', 'priceFixedText',
       'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc',
       'Mileage', 'Engine', 'Max Power', 'Torque', 'Seats',
       'Registration Year', 'Insurance Validity', 'Fuel Type', 'Kms Driven',
       'RTO', 'Ownership', 'Engine Displacement', 'Transmission',
       'Year of Manufacture', 'new_car_overview_heading',
       'new_car_overview_bottomData', 'new_car_feature_heading',
       'new_car_feature_top', 'new_car_feature_data',
       'new_car_feature_commonIcon', 'new_car_specs_heading',
       'new_car_specs_data', 'new_car_specs_commonIcon', 'Mileage', 'Engine',
       'Max Power', 'Torque', 'Seats', 'Registration Year',
       'Insurance Validity', 'Fuel Type', 'Kms Driven', 'RTO', 'Ownership',
       'Engine Displacement', 'Transmission', 

In [None]:
# Flatten Nested Features() Function
import ast
import pandas as pd

def extract_feature_flags(feature_data): 
    feature_flags = {}
    if isinstance(feature_data, list):
        for section in feature_data:
            if isinstance(section, dict) and 'value' in section:
                for feature in section['value']:
                    if feature:
                        feature_flags[f"has_{feature}"]=1
    return feature_flags

def extract_top_features(top_data):
    flags = {}
    if isinstance(top_data, list):
        for item in top_data:
            if isinstance(item, dict):
                feature = item.get('value')
                if feature:
                    flags[f"hasTop{feature}"] = 1
    return flags

def extract_spec_fields(spec_data):
    flat_spec = {}
    if isinstance(spec_data, list):
        for section in spec_data:
            if isinstance(section, dict) and 'value' in section:
                for item in section['value']:
                    if isinstance(item, dict) and 'key' in item and 'Value' in item:
                        flat_spec[item['key']] = item['Value']
    return flat_spec

def flatten_nested_features(df):
    print("🚀 Flattening new_car_feature_top...")
    df['new_car_feature_top']=df['new_car_feature_top'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
        )
    top_flags_df = df['new_car_feature_top'].apply(extract_top_features)    
    top_flags_df = pd.DataFrame(top_flags_df.tolist()).fillna(0).astype(int)

    print("🔍 Flattening new_car_feature_data...")
    df['new_car_feature_data']=df['new_car_feature_data'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    feature_flags_df = df['new_car_feature_data'].apply(extract_feature_flags)
    feature_flags_df = pd.DataFrame(feature_flags_df.tolist()).fillna(0).astype(int)

    print("⚙️ Flatting new_car_specs_data...")
    df['new_car_specs_data'] = df['new_car_specs_data'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    specs_df = df['new_car_specs_data'].apply(extract_spec_fields)
    specs_df = pd.DataFrame(specs_df.tolist()).fillna(0).astype(int)

    print("✅ Flattening completed. Merging all flattened data...")

    # Drop only if present (avoids KeyError)
    columns_to_drop = ['new_car_feature_top','new_car_feature_data',
        'new_car_specs_data',
    ]
    df =df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    # Concatenate the original DataFrame with the flattened DataFrames  

    df = pd.concat([
        df.reset_index(drop=True), 
        top_flags_df.reset_index(drop=True), 
        feature_flags_df.reset_index(drop=True), 
        specs_df.reset_index(drop=True)
    ], axis=1)
    
    print("✅ All fields flattened and merged successfully.")
    return df

In [30]:
print(combined_df.columns)

Index(['car_links', 'City', 'it', 'ft', 'bt', 'km', 'transmission', 'ownerNo',
       'owner', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName',
       'price', 'priceActual', 'priceSaving', 'priceFixedText',
       'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc',
       'Mileage', 'Engine', 'Max Power', 'Torque', 'Seats',
       'Registration Year', 'Insurance Validity', 'Fuel Type', 'Kms Driven',
       'RTO', 'Ownership', 'Engine Displacement', 'Transmission',
       'Year of Manufacture', 'new_car_overview_heading',
       'new_car_overview_bottomData', 'new_car_feature_heading',
       'new_car_feature_top', 'new_car_feature_data',
       'new_car_feature_commonIcon', 'new_car_specs_heading',
       'new_car_specs_data', 'new_car_specs_commonIcon', 'Mileage', 'Engine',
       'Max Power', 'Torque', 'Seats', 'Registration Year',
       'Insurance Validity', 'Fuel Type', 'Kms Driven', 'RTO', 'Ownership',
       'Engine Displacement', 'Transmission', 

In [31]:
print("Available Columns:")
print(combined_df.columns.tolist())

Available Columns:
['car_links', 'City', 'it', 'ft', 'bt', 'km', 'transmission', 'ownerNo', 'owner', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName', 'price', 'priceActual', 'priceSaving', 'priceFixedText', 'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc', 'Mileage', 'Engine', 'Max Power', 'Torque', 'Seats', 'Registration Year', 'Insurance Validity', 'Fuel Type', 'Kms Driven', 'RTO', 'Ownership', 'Engine Displacement', 'Transmission', 'Year of Manufacture', 'new_car_overview_heading', 'new_car_overview_bottomData', 'new_car_feature_heading', 'new_car_feature_top', 'new_car_feature_data', 'new_car_feature_commonIcon', 'new_car_specs_heading', 'new_car_specs_data', 'new_car_specs_commonIcon', 'Mileage', 'Engine', 'Max Power', 'Torque', 'Seats', 'Registration Year', 'Insurance Validity', 'Fuel Type', 'Kms Driven', 'RTO', 'Ownership', 'Engine Displacement', 'Transmission', 'Year of Manufacture']


In [32]:
combined_df = flatten_nested_features(combined_df)

🚀 Flattening new_car_feature_top...
🔍 Flattening new_car_feature_data...
⚙️ Flatting new_car_specs_data...
✅ Flattening completed. Merging all flattened data...
✅ All fields flattened and merged successfully.


In [33]:
print("Available Columns:")
print(combined_df.columns.tolist())

Available Columns:


**✅ Key Highlights of Progress:**  
- ⬇️ Loaded and merged multi-city car data
- 🧼 Cleaned and standardized nested dictionary fields
- 🔄 Converted new_car_feature_top into one-hot encoded flags
- 🛠 Flattened complex features into ML-ready columns