## Loading the train dataset

In [1]:
import pandas as pd
# Read the training data from disk into the working DataFrame.
TRAIN_CSV_PATH = 'train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

## Identifying the missing values

In [2]:
# Count the null entries of every column and report the totals.
missing_values = df.isna().sum()
print("Missing Values in Each Column:", missing_values, sep="\n")

Missing Values in Each Column:
Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5032
Price                   0
dtype: int64


## Handling missing values

In [3]:
# Handle missing values: parse the unit-suffixed columns to floats, impute the
# sparsely missing ones (median / mode), then drop the rows without a New_Price.
def _leading_number(value):
    """Return the first whitespace-separated token of a string as a float.

    Non-string values (already numeric, or NaN) are returned unchanged, so the
    conversion is safe to re-run. Empty strings and strings whose first token
    is not a number (for example a placeholder such as 'null bhp') yield NaN
    instead of raising.
    """
    if not isinstance(value, str):
        return value
    tokens = value.split()
    if not tokens:
        return float('nan')
    try:
        return float(tokens[0])
    except ValueError:
        return float('nan')


for column in ['Mileage', 'Engine', 'Power', 'Seats']:
    # pd.to_numeric guarantees a numeric dtype so median()/mode() work below.
    df[column] = pd.to_numeric(df[column].map(_leading_number), errors='coerce')

# Impute the continuous measurements with their column median ...
for column in ['Mileage', 'Engine', 'Power']:
    df[column] = df[column].fillna(df[column].median())

# ... and the discrete seat count with its most frequent value.
seats_mode = df['Seats'].mode()
if not seats_mode.empty:  # mode() is empty when the whole column is NaN
    df['Seats'] = df['Seats'].fillna(seats_mode.iloc[0])

# Rows without a New_Price are dropped rather than imputed. The original index
# labels are kept; .copy() detaches the result so later column assignments do
# not trigger SettingWithCopyWarning.
df = df.dropna(subset=['New_Price']).copy()

In [4]:
# Remove the unit from New_Price and express every value in lakh.
# Mileage, Engine and Power are already parsed to floats in the preceding
# cell, so only New_Price needs handling here.
# NOTE(review): assumes New_Price strings look like '8.61 Lakh' and may also
# use 'Cr' (1 crore = 100 lakh) -- confirm against train.csv.
_LAKH_PER_UNIT = {
    'lakh': 1.0,
    'lakhs': 1.0,
    'cr': 100.0,
    'crore': 100.0,
    'crores': 100.0,
}


def _price_in_lakh(value):
    """Convert a price string such as '8.61 Lakh' or '1.28 Cr' to a float in lakh.

    Non-string values (already numeric, or NaN) are returned unchanged, so the
    conversion is safe to re-run. A missing or unrecognised unit leaves the
    amount unscaled. Empty strings and non-numeric amounts yield NaN instead
    of raising.
    """
    if not isinstance(value, str):
        return value
    tokens = value.split()
    if not tokens:
        return float('nan')
    try:
        amount = float(tokens[0])
    except ValueError:
        return float('nan')
    unit = tokens[1].lower().rstrip('.') if len(tokens) > 1 else ''
    return amount * _LAKH_PER_UNIT.get(unit, 1.0)


df['New_Price'] = pd.to_numeric(df['New_Price'].map(_price_in_lakh), errors='coerce')


## Converting categorical variables into numerical features

In [5]:
# One-hot encode the listed categorical columns; every category value
# becomes its own indicator column and the source columns are replaced.
categorical_columns = ['Fuel_Type', 'Transmission']
df = pd.get_dummies(data=df, columns=categorical_columns)


## Calculating the current age of the car by subtracting the year value from the current year

In [6]:
# Take the calendar year of the moment this cell is executed.
reference_date = pd.to_datetime('today')
current_year = reference_date.year

# Car_Age = current year minus the value in the Year column.
df['Car_Age'] = df['Year'].rsub(current_year)

# Show the first rows to check the new column.
print(df.head(n=5))

    Unnamed: 0                               Name Location  Year  \
1            2                       Honda Jazz V  Chennai  2011   
5            7  Toyota Innova Crysta 2.8 GX AT 8S   Mumbai  2016   
8           10                   Maruti Ciaz Zeta    Kochi  2018   
13          15        Mitsubishi Pajero Sport 4X4    Delhi  2014   
18          20                  BMW 3 Series 320d    Kochi  2014   

    Kilometers_Driven Owner_Type  Mileage  Engine   Power  Seats  New_Price  \
1               46000      First    13.00  1199.0   88.70    5.0       8.61   
5               36000      First    11.36  2755.0  171.50    8.0      21.00   
8               25692      First    21.56  1462.0  103.25    5.0      10.65   
13             110000      First    13.50  2477.0  175.56    7.0      32.01   
18              32982      First    22.69  1995.0  190.00    5.0      47.87   

    Price  Fuel_Type_Diesel  Fuel_Type_Electric  Fuel_Type_Petrol  \
1    4.50                 0                   0