In [181]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [182]:
# Load the used-car listings from the Excel workbook into a DataFrame.
data = pd.read_excel('used_cars_data.xlsx')

In [183]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,name,city,year,mileage,transmission,fuel type,engine capacity,price
0,Suzuki Wagon R 2022 Hybrid FZ,Lahore,2022.0,"9,019 km",Automatic,Hybrid,660 cc,37.95 lacs
1,Toyota Yaris Hatchback 2020,Karachi,2020.0,"13,250 km",Automatic,Petrol,1000 cc,50 lacs
2,Toyota Surf 1993,Lahore,1993.0,"120,000 km",Automatic,Petrol,2700 cc,25.6 lacs
3,BMW 3 Series 2013 316i,Gujranwala,2013.0,"75,000 km",Automatic,Petrol,1600 cc,73 lacs
4,Suzuki Wagon R 2020 Hybrid FX,Multan,2020.0,"8,500 km",Automatic,Hybrid,660 cc,36.75 lacs


## 1. ANALYSIS

### Objective: Create a regression model that predicts the price from the given features.

### FEATURE ANALYSIS

'name': The full listing name of the car (brand, model, year, and variant, e.g. "Suzuki Wagon R 2022 Hybrid FZ"). It is suggested to extract the brand or manufacturer from it, as the brand can influence buyer perception and, therefore, affect pricing.

'city': The city where the car is sold could impact its price, with differences between rural and urban areas. It is proposed to gather numerical data representing the average purchasing power of each city, such as average income and other economic indicators.

'year': The year of manufacture indicates how recent the car is. Subtracting it from the current year gives the car's age as a numeric feature, which can influence pricing.

'mileage': This feature represents the distance the car has traveled, which can affect its condition and, therefore, its price. It is suggested to retain this feature, converting the text values (e.g. "9,019 km") to numbers.

'transmission': The type of transmission (e.g., manual, automatic) can be a preference for buyers and may influence pricing. It is recommended to keep this feature unchanged.

'fuel type': The type of fuel used by the car can impact the type of engine, its cost, and buyer perception. It is proposed to retain this feature without modification.

'engine capacity': This feature indicates the size of the engine, which can directly influence the price of the car. It is suggested to retain this feature, converting the text values (e.g. "660 cc") to numbers.

### FEATURE ENGINEERING STEPS

1. Transform 'name' to the equivalent brand of the car.
2. Transform 'city' into a feature that provides a numeric indicator of its economic position. This could include indicators such as 'average income', 'GDP', and other economic metrics.
3. Transform the 'year' by calculating the difference between the model year and the current year (2024).
4. Transform 'transmission' using sklearn's OneHotEncoder.
5. Transform 'fuel type' using sklearn's OneHotEncoder.

### 1. Transform 'name' to the equivalent brand of the car.


In [184]:
# Derive 'brand' from the first whitespace-separated token of 'name', then
# drop 'name'. The guard keeps this cell re-runnable: once 'name' has been
# dropped, a second run would otherwise raise KeyError on data['name'].
if 'name' in data.columns:
    data['brand'] = data['name'].str.split().str[0]
    data = data.drop(columns=['name'])

# Column order for the working frame; reindex keeps only the columns listed here.
new_column_order = ['brand', 'city', 'year', 'mileage', 'transmission', 'fuel type', 'engine capacity', 'price']

# Build the working frame `df` with the columns in that order.
df = data.reindex(columns=new_column_order)


#REMOVING CURRENCY STRINGS
def convert_price(price):
    """Convert a price string such as '37.95 lacs' or '1.5 crore' to a float in lacs.

    Parameters
    ----------
    price : object
        A single cell value from the 'price' column.

    Returns
    -------
    float
        For strings containing 'lacs' or 'crore' (case-insensitive); crore
        amounts are multiplied by 100 so that every result is in lacs.
    object
        The input unchanged when it is not a string, or when it is a string
        containing neither unit.

    Raises
    ------
    ValueError
        If a unit is present but the remaining text is not a valid number.
    """
    if not isinstance(price, str):
        return price

    text = price.strip().lower()
    # 'lacs' is tested before 'crore', the same precedence as the if/elif it replaces.
    for unit, lacs_per_unit in (('lacs', 1), ('crore', 100)):
        if unit in text:
            # Remove the unit wherever it sits, plus thousands separators and
            # stray whitespace, so '1,250 lacs' and '50lacs' both parse.
            number = text.replace(unit, '').replace(',', '').strip()
            return float(number) * lacs_per_unit
    return price

# Run convert_price over every 'price' entry, then give 'engine capacity'
# an underscore name ('engine_capacity').
df = (
    df
    .assign(price=df['price'].apply(convert_price))
    .rename(columns={'engine capacity': 'engine_capacity'})
)




#REMOVING 'cc' STRING ON engine_capacity
def cc_remover(engine_capacity):
    """Convert an engine-capacity string such as '660 cc' to an int.

    Parameters
    ----------
    engine_capacity : object
        A single cell value from the 'engine_capacity' column.

    Returns
    -------
    int
        For strings containing 'cc' (case-insensitive), the number that
        remains after removing the unit and any thousands separators.
    object
        The input unchanged when it is not a string, or when it is a string
        without 'cc'.

    Raises
    ------
    ValueError
        If 'cc' is present but the remaining text is not a valid integer.
    """
    # Pass non-strings (already-numeric values, missing values) through
    # untouched so that applying this function twice does not blank the column.
    if not isinstance(engine_capacity, str):
        return engine_capacity

    text = engine_capacity.strip().lower()
    if 'cc' in text:
        # Remove the unit with or without a preceding space, plus commas.
        return int(text.replace('cc', '').replace(',', '').strip())
    return engine_capacity

# Run cc_remover over each 'engine_capacity' entry and store the result back.
df = df.assign(engine_capacity=df['engine_capacity'].apply(cc_remover))


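# TODO: 'mileage' is still a string such as "9,019 km". The draft below is
# disabled; note that float() cannot parse the thousands separator, so the
# comma has to be removed together with the ' km' suffix.
# def km_remover(mileage):
#     if isinstance(mileage, str):
#         if 'km' in mileage:
#             return float(mileage.replace(' km', '').replace(',', ''))
#     return mileage
#
# # Apply the custom function to the 'mileage' column
# df['mileage'] = df['mileage'].apply(km_remover)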


# Display the first rows of df to verify the changes made above.
df.head()

Unnamed: 0,brand,city,year,mileage,transmission,fuel type,engine_capacity,price
0,Suzuki,Lahore,2022.0,"9,019 km",Automatic,Hybrid,660,37.95
1,Toyota,Karachi,2020.0,"13,250 km",Automatic,Petrol,1000,50.0
2,Toyota,Lahore,1993.0,"120,000 km",Automatic,Petrol,2700,25.6
3,BMW,Gujranwala,2013.0,"75,000 km",Automatic,Petrol,1600,73.0
4,Suzuki,Multan,2020.0,"8,500 km",Automatic,Hybrid,660,36.75


# TODO: START HERE — remove the ' km' suffix (and the thousands separators) from 'mileage'; every value is still a string.

### 2. Transform 'city' into a feature that provides a numeric indicator of its economic position. This could include indicators such as 'average income', 'GDP', and other economic metrics.


### 3. Transform the 'year' by calculating the difference between the model year and the current year (2024).
