Import and concatenate

In [35]:
import pandas as pd
import os

# File paths: one Excel workbook per city, located under ../data/raw
# (resolved relative to the notebook's working directory).
base_path = os.path.join("..", "data", "raw")
file_paths = {
    "Bangalore": os.path.join(base_path, "bangalore_cars.xlsx"),
    "Chennai": os.path.join(base_path, "chennai_cars.xlsx"),
    "Delhi": os.path.join(base_path, "delhi_cars.xlsx"),
    "Hyderabad": os.path.join(base_path, "hyderabad_cars.xlsx"),
    "Jaipur": os.path.join(base_path, "jaipur_cars.xlsx"),
    "Kolkata": os.path.join(base_path, "kolkata_cars.xlsx"),
}

# Read each city's workbook and tag its rows with a 'City' column.
# Loading is best-effort per city: a failing file is reported and skipped
# so the remaining cities can still be combined.
dataframes = []
for city, file_path in file_paths.items():
    try:
        df = pd.read_excel(file_path, engine="openpyxl")
        df['City'] = city
        dataframes.append(df)
        print(f"Loaded {city} - {df.shape}")
    except Exception as e:
        print(f"Failed to load {city} from {file_path}: {e}")

# Fail fast when nothing was loaded: otherwise `combined_df` would never be
# defined and later cells would break with an unrelated NameError.
if not dataframes:
    raise RuntimeError(
        f"No city files could be loaded from '{base_path}'; cannot build combined_df."
    )

# Combine all per-city frames into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)
print(f"\n✅ combined DataFrame shape: {combined_df.shape}")
print(f"Combined DataFrame columns: {combined_df.columns.tolist()}")
print(f"Combined DataFrame head:\n{combined_df.head()}")

    

Loaded Bangalore - (1481, 6)
Loaded Chennai - (1419, 6)
Loaded Delhi - (1485, 6)
Loaded Hyderabad - (1483, 6)
Loaded Jaipur - (1120, 6)
Loaded Kolkata - (1381, 6)

✅ combined DataFrame shape: (8369, 6)
Combined DataFrame columns: ['new_car_detail', 'new_car_overview', 'new_car_feature', 'new_car_specs', 'car_links', 'City']
Combined DataFrame head:
                                      new_car_detail  \
0  {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...   
1  {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...   
2  {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...   
3  {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...   
4  {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...   

                                    new_car_overview  \
0  {'heading': 'Car overview', 'top': [{'key': 'R...   
1  {'heading': 'Car overview', 'top': [{'key': 'R...   
2  {'heading': 'Car overview', 'top': [{'key': 'R...   
3  {'heading': 'Car overview', 'top': [{'key': 'R...   
4  {'heading': 'Car overview', '

In [36]:
import os
# Print the kernel's current working directory; relative paths built with
# os.path.join("..", ...) are resolved against this location.
print(os.getcwd())


d:\Education\Data Science\Project\car-dheko-used-car-price-prediction\notebooks


Handling Missing Values

Step 1: Identify missing values

In [37]:
# Per-column null counts (largest first) and the same counts as a share of all rows
missing_summary = combined_df.isna().sum().sort_values(ascending=False)
missing_percentage = missing_summary.div(len(combined_df)).mul(100)

# Two-column report: absolute count next to the rounded percentage
missing_df = missing_summary.to_frame('Missing Values').assign(
    Percentage=missing_percentage.round(2)
)
print("🔍 Missing Values Summary:")
# Only show columns that actually contain missing values
print(missing_df.loc[missing_df['Missing Values'] > 0])

🔍 Missing Values Summary:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []


In [38]:
# Inspect the dtype of every column in the combined frame
combined_df.dtypes

new_car_detail      object
new_car_overview    object
new_car_feature     object
new_car_specs       object
car_links           object
City                object
dtype: object

In [39]:
# Peek at the first ten raw 'new_car_detail' entries
combined_df['new_car_detail'].head(10)

0    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
1    {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...
2    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
3    {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...
4    {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...
5    {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...
6    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
7    {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...
8    {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...
9    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
Name: new_car_detail, dtype: object

In [41]:
# Convert string representation of dictionaries to actual dictionaries
import ast

# Parse every stringified dict in 'new_car_detail' into a real dict;
# cells that are not strings are passed through unchanged.
combined_df["new_car_detail"] = combined_df["new_car_detail"].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)
print("✅ Converted 'new_car_detail' to dictionaries.")
print(combined_df['new_car_detail'].head())

✅ Converted 'new_car_detail' to dictionaries.
0    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
1    {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...
2    {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...
3    {'it': 0, 'ft': 'Petrol', 'bt': 'Sedan', 'km':...
4    {'it': 0, 'ft': 'Diesel', 'bt': 'SUV', 'km': '...
Name: new_car_detail, dtype: object


In [42]:
# Expand the parsed 'new_car_detail' dicts into one column per key
# (nested keys are joined with '.', e.g. 'trendingText.heading')
detail_df = pd.json_normalize(combined_df['new_car_detail'])

# Show the resulting column names, a preview of the rows, and the column Index
print("🔍 Flattened 'new_car_detail' columns:", detail_df.columns.tolist(), sep="\n")
print("✅ Preview of flattened new_car_detail:", detail_df.head(), sep="\n")
print(detail_df.columns)


🔍 Flattened 'new_car_detail' columns:
['it', 'ft', 'bt', 'km', 'transmission', 'ownerNo', 'owner', 'oem', 'model', 'modelYear', 'centralVariantId', 'variantName', 'price', 'priceActual', 'priceSaving', 'priceFixedText', 'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc']
✅ Preview of flattened new_car_detail:
   it      ft         bt        km transmission  ownerNo      owner      oem  \
0   0  Petrol  Hatchback  1,20,000       Manual        3  3rd Owner   Maruti   
1   0  Petrol        SUV    32,706       Manual        2  2nd Owner     Ford   
2   0  Petrol  Hatchback    11,949       Manual        1  1st Owner     Tata   
3   0  Petrol      Sedan    17,794       Manual        1  1st Owner  Hyundai   
4   0  Diesel        SUV    60,000       Manual        1  1st Owner   Maruti   

                model  modelYear  centralVariantId               variantName  \
0      Maruti Celerio       2015              3979                       VXI   
1       Ford Ecosport       2018

In [45]:
# Function to flatten any nested dictionary column
import ast
from pandas import json_normalize

def flatten_dict_column(df, column_name):
    """
    Expand a column of (possibly stringified) dictionaries into one column per key.

    String cells are parsed with ``ast.literal_eval``; cells of any other type
    are used as they are. The parsed values are flattened with
    ``json_normalize``, every resulting column is prefixed with
    ``"<column_name>_"``, and the original nested column is dropped.

    The DataFrame passed in is left unmodified; all work happens on copies.

    Parameters:
        df (DataFrame): The DataFrame containing the column to be flattened
        column_name (str): The name of the column containing the nested dictionaries

    Returns:
        DataFrame: A new DataFrame (index reset to 0..n-1) holding the remaining
        original columns followed by the flattened, prefixed columns.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
    """
    print(f"🔍 Flattening column: {column_name} in DataFrame with shape {df.shape}...")

    # Parse into a local Series instead of assigning back into `df`, so the
    # caller's DataFrame is not mutated as a side effect.
    parsed = df[column_name].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

    # Flatten the nested dictionaries
    nested_df = json_normalize(parsed)

    # Prefix the new columns with the source column name to keep them identifiable
    nested_df.columns = [f"{column_name}_{col}" for col in nested_df.columns]

    # Drop the original nested column (returns a new frame)
    remaining = df.drop(columns=[column_name])

    # Align both parts positionally on a fresh index before joining side by side
    result = pd.concat(
        [remaining.reset_index(drop=True), nested_df.reset_index(drop=True)],
        axis=1,
    )

    print(f"✅ Added columns: {nested_df.columns.tolist()}")
    return result


In [46]:
# Apply the flattening helper to the three remaining nested columns.
# Columns that are absent (e.g. already flattened by an earlier run of this
# cell) are reported and skipped.
for col in ('new_car_overview', 'new_car_feature', 'new_car_specs'):
    if col not in combined_df.columns:
        print(f"⚠️ Column '{col}' not found in DataFrame. Skipping...")
        continue
    combined_df = flatten_dict_column(combined_df, col)

🔍 Flattening column: new_car_overview in DataFrame with shape (8369, 6)...
✅ Added columns: ['new_car_overview_heading', 'new_car_overview_top', 'new_car_overview_bottomData']
🔍 Flattening column: new_car_feature in DataFrame with shape (8369, 8)...
✅ Added columns: ['new_car_feature_heading', 'new_car_feature_top', 'new_car_feature_data', 'new_car_feature_commonIcon']
🔍 Flattening column: new_car_specs in DataFrame with shape (8369, 11)...
✅ Added columns: ['new_car_specs_heading', 'new_car_specs_top', 'new_car_specs_data', 'new_car_specs_commonIcon']


In [47]:
# Report the column list after flattening, followed by the first two rows
print("🗒️ Final columns after all flattening:", combined_df.columns.tolist(), sep="\n")

print("sample rows:", combined_df.head(2), sep="\n")

🗒️ Final columns after all flattening:
['new_car_detail', 'car_links', 'City', 'new_car_overview_heading', 'new_car_overview_top', 'new_car_overview_bottomData', 'new_car_feature_heading', 'new_car_feature_top', 'new_car_feature_data', 'new_car_feature_commonIcon', 'new_car_specs_heading', 'new_car_specs_top', 'new_car_specs_data', 'new_car_specs_commonIcon']
sample rows:
                                      new_car_detail  \
0  {'it': 0, 'ft': 'Petrol', 'bt': 'Hatchback', '...   
1  {'it': 0, 'ft': 'Petrol', 'bt': 'SUV', 'km': '...   

                                           car_links       City  \
0  https://www.cardekho.com/used-car-details/used...  Bangalore   
1  https://www.cardekho.com/buy-used-car-details/...  Bangalore   

  new_car_overview_heading                               new_car_overview_top  \
0             Car overview  [{'key': 'Registration Year', 'value': '2015',...   
1             Car overview  [{'key': 'Registration Year', 'value': 'Feb 20...   

  new_car_

We still have nested lists/dictionaries in our data. Let's flatten them using a function.  
1. new_car_overview_top (list of dicts)
2. new_car_feature_top (list of dicts)
3. new_car_feature_data (list of dicts)
4. new_car_specs_top (list of dicts)
5. new_car_specs_data (dict of dicts)  

Further flatten or extract key values for each list of dicts.

In [49]:
# Reusable extract function for lists of {'key': ..., 'value': ...} dictionaries
def extract_spec_value(specs_list, key):
    """
    Look up a single entry by name in a list of key/value dictionaries.

    Args:
        specs_list (list): A list of dictionaries, each expected to carry a
            'key' entry (the spec name) and a 'value' entry (its content).
            Anything that is not a list (e.g. None or NaN) yields None.
        key (str): The spec name to match against each item's 'key' entry.

    Returns:
        The 'value' of the first dictionary whose 'key' equals ``key``,
        or None when no item matches.
    """
    if not isinstance(specs_list, list):
        return None

    for item in specs_list:
        if isinstance(item, dict) and item.get("key") == key:
            # The records store their content under lowercase 'value'; looking
            # up 'Value' alone always returned None. 'Value' is kept as a
            # fallback for any input that does use the capitalised key.
            return item.get("value", item.get("Value"))
    return None

# Pull selected entries out of the nested "top" lists into flat columns.
# Each pair is (nested source column, spec names to extract); every spec name
# becomes a new column of the same name, in the order listed here.
for source_col, spec_names in (
    # 1. new_car_specs_top (list of dicts)
    ('new_car_specs_top', ('Mileage', 'Engine', 'Max Power', 'Torque', 'Seats')),
    # 2. new_car_overview_top (list of dicts)
    ('new_car_overview_top', (
        'Registration Year',
        'Insurance Validity',
        'Fuel Type',
        'Kms Driven',
        'RTO',
        'Ownership',
        'Engine Displacement',
        'Transmission',
        'Year of Manufacture',
    )),
):
    for spec_name in spec_names:
        combined_df[spec_name] = combined_df[source_col].apply(
            extract_spec_value, args=(spec_name,)
        )

# Drop the original nested columns now that the wanted entries are extracted
# (note: new_car_feature_top is dropped here without anything being extracted from it)
combined_df.drop(
    columns=['new_car_specs_top', 'new_car_overview_top', 'new_car_feature_top'],
    inplace=True,
)

