# Imports and configurations

In [1]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [2]:
# Set Plotly as Pandas plotting backend

pd.options.plotting.backend = "plotly"

In [3]:
# prompt: pandas set display settings to display wihtout e+ and 2 numbers after decimal
pd.options.display.float_format = '{:.2f}'.format


# Load data

In [4]:
data = "https://storage.googleapis.com/edulabs-public-datasets/CAR%20DETAILS%20FROM%20CAR%20DEKHO.csv"

In [5]:
df = pd.read_csv(data)

In [6]:
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [7]:
print(list(df['name'])[:20])

['Maruti 800 AC', 'Maruti Wagon R LXI Minor', 'Hyundai Verna 1.6 SX', 'Datsun RediGO T Option', 'Honda Amaze VX i-DTEC', 'Maruti Alto LX BSIII', 'Hyundai Xcent 1.2 Kappa S', 'Tata Indigo Grand Petrol', 'Hyundai Creta 1.6 VTVT S', 'Maruti Celerio Green VXI', 'Chevrolet Sail 1.2 Base', 'Tata Indigo Grand Petrol', 'Toyota Corolla Altis 1.8 VL CVT', 'Maruti 800 AC', 'Maruti Wagon R LXI Minor', 'Hyundai Verna 1.6 SX', 'Datsun RediGO T Option', 'Honda Amaze VX i-DTEC', 'Maruti Alto LX BSIII', 'Hyundai Xcent 1.2 Kappa S']


# Correlations

## Numerical

In [8]:
px.scatter(df, x='year', y='selling_price')

In [17]:
df[(df['year'] == 2010) & (df['selling_price'] > 4_000_000)]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
3875,Land Rover Range Rover 4.4 Diesel LWB Vogue SE,2010,4200000,100000,Diesel,Dealer,Automatic,First Owner


In [18]:
df[(df['year'] == 2010)]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
41,Mercedes-Benz E-Class E 200 CGI Elegance,2010,850000,119000,Petrol,Dealer,Automatic,First Owner
69,Chevrolet Tavera Neo LS B3 - 7(C) seats BSIII,2010,280000,350000,Diesel,Individual,Manual,Second Owner
80,Hyundai i10 Sportz 1.2,2010,248000,63000,Petrol,Individual,Manual,Second Owner
107,Hyundai Sonata CRDi M/T,2010,600000,100000,Diesel,Individual,Manual,First Owner
113,Maruti Wagon R LXI Minor,2010,100000,60000,Petrol,Individual,Manual,Fourth & Above Owner
...,...,...,...,...,...,...,...,...
4280,Hyundai i10 Sportz 1.2,2010,250000,110000,Petrol,Individual,Manual,Second Owner
4286,Fiat Punto 1.3 Emotion,2010,130000,210000,Diesel,Individual,Manual,Second Owner
4303,Hyundai Verna 1.6 VTVT,2010,190000,38000,Petrol,Dealer,Manual,First Owner
4329,Tata Manza Aura Safire BS IV,2010,160000,60000,Petrol,Individual,Manual,Second Owner


In [9]:
px.scatter(df, x='km_driven', y='selling_price')

In [10]:
px.box(df, y='fuel', x='selling_price', color='fuel')

In [11]:
px.box(df, y='seller_type', x='selling_price', color='seller_type')

In [12]:
px.box(df, y='transmission', x='selling_price', color='transmission')

In [13]:
px.box(df, y='owner', x='selling_price', color='owner')

- How should we handle Test Drive Car in terms of ordinality?
- It looks like the price is much higher than all other hands, but we would suggest that Test Drive Car should reduce the price
- This might happen because of these cars are with later years


In [19]:
df['owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,2832
Second Owner,1106
Third Owner,304
Fourth & Above Owner,81
Test Drive Car,17


In [20]:
df[df['owner'] == 'Test Drive Car'].sort_values('year')

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
2782,Honda Jazz 1.5 VX i DTEC,2017,700000,24585,Diesel,Dealer,Manual,Test Drive Car
3615,Honda City i-VTEC CVT ZX,2018,1165000,13000,Petrol,Dealer,Automatic,Test Drive Car
2362,Ford Figo Aspire Titanium Plus Diesel,2019,894999,13000,Diesel,Dealer,Manual,Test Drive Car
2359,Volkswagen Vento 1.5 TDI Highline BSIV,2019,1350000,5400,Diesel,Dealer,Manual,Test Drive Car
1795,Ford Ecosport 1.5 Petrol Titanium Plus,2019,1100000,5166,Petrol,Dealer,Manual,Test Drive Car
2360,Renault KWID Climber 1.0 MT Opt BSIV,2020,541000,1000,Petrol,Dealer,Manual,Test Drive Car
1781,Ford Freestyle Titanium Plus,2020,852000,1010,Petrol,Dealer,Manual,Test Drive Car
1780,Ford Ecosport Thunder Edition Diesel,2020,1331000,1010,Diesel,Dealer,Manual,Test Drive Car
1714,Ford Freestyle Titanium Diesel,2020,784000,101,Diesel,Dealer,Manual,Test Drive Car
1777,Ford Ecosport 1.5 Petrol Trend,2020,1030000,1010,Petrol,Dealer,Manual,Test Drive Car


In [21]:
px.box(df, y='owner', x='year', color='owner')

In [22]:
df['owner'].unique()

array(['First Owner', 'Second Owner', 'Fourth & Above Owner',
       'Third Owner', 'Test Drive Car'], dtype=object)

In [23]:
ordinal_encoder = OrdinalEncoder(categories=[[
   'First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car'
]], dtype='int8')
df['owner_encoded'] = ordinal_encoder.fit_transform(df[['owner']]) + 1

In [24]:
df.select_dtypes('number').corr()

Unnamed: 0,year,selling_price,km_driven,owner_encoded
year,1.0,0.41,-0.42,-0.41
selling_price,0.41,1.0,-0.19,-0.2
km_driven,-0.42,-0.19,1.0,0.27
owner_encoded,-0.41,-0.2,0.27,1.0


## Lets get manufacturer

In [25]:
df['manufacturer'] = df['name'].str.split(' ').str[0]

In [26]:
px.box(df, y='manufacturer', x='selling_price', color='manufacturer')

In [None]:
df['manufacturer'].value_counts()

# Lets inspect and remove outliers

In [27]:
df[df['selling_price'] > 7_000_000]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,owner_encoded,manufacturer
89,Mercedes-Benz S-Class S 350d Connoisseurs Edition,2017,8150000,6500,Diesel,Dealer,Automatic,First Owner,1,Mercedes-Benz
3872,Audi RS7 2015-2019 Sportback Performance,2016,8900000,13000,Petrol,Dealer,Automatic,First Owner,1,Audi


In [28]:
df[df['km_driven'] > 500_000]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,owner_encoded,manufacturer
525,Maruti SX4 S Cross DDiS 320 Delta,2016,665000,560000,Diesel,Dealer,Manual,First Owner,1,Maruti
1243,Maruti Swift VXI BSIII,2009,250000,806599,Petrol,Dealer,Manual,First Owner,1,Maruti
4184,Maruti SX4 S Cross DDiS 320 Delta,2016,665000,560000,Diesel,Dealer,Manual,First Owner,1,Maruti


In [29]:
print(f"rows before drop: {len(df)}")

rows before drop: 4340


In [30]:
df.drop(df[df['selling_price'] > 7_000_000].index, inplace=True)
df.drop(df[df['km_driven'] > 500_000].index, inplace=True)

In [31]:
print(f"rows after drop: {len(df)}")

rows after drop: 4335


# Inspect distribtions

In [32]:
df['selling_price'].plot(kind='hist')

Looks like we might need log transformation

In [35]:
np.log(df['selling_price']).plot(kind='hist', bins=150)

# Train the model

In [None]:
df.shape

In [38]:
df1 = df.drop(columns=['name','owner_encoded'])

In [46]:
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        (
            'manufacturer-one-hot',
            OneHotEncoder(drop='first',
                          categories=[list(df['manufacturer'].unique())]
                          ),
            ['manufacturer']
        ),
        (
            'one-hot-cat',
            OneHotEncoder(drop='first'),
            ['fuel', 'seller_type', 'transmission']
        ),
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [47]:
# Split dataset
X = df1.drop(columns=['selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = np.exp(pipeline.predict(X_test)) # Reverse log transformation


# Model evaluation
mse = metrics.mean_squared_error(np.exp(y_test), y_pred)  # Compare in original scale
r2 = metrics.r2_score(np.exp(y_test), y_pred)
mape = metrics.mean_absolute_percentage_error(np.exp(y_test), y_pred)

### TRAIN

# Predictions
y_pred_train = np.exp(pipeline.predict(X_train)) # Reverse log transformation


# Model evaluation
mse_train = metrics.mean_squared_error(np.exp(y_train), y_pred_train)  # Compare in original scale
r2_train = metrics.r2_score(np.exp(y_train), y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(np.exp(y_train), y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}



pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

Unnamed: 0,Test,Train
MSE,108775379840.04,113231500597.59
MAPE,0.34,0.35
R2,0.64,0.63


# What happens if we remove manufacturer?

In [None]:
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        # (
        #     'manufacturer-one-hot',
        #     OneHotEncoder(drop='first', categories=[list(df['manufacturer'].unique())]),
        #     ['manufacturer']
        # ),
        (
            'one-hot-cat',
            OneHotEncoder(drop='first'),
            ['fuel', 'seller_type', 'transmission']
        ),
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split dataset
X = df1.drop(columns=['manufacturer', 'selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = np.exp(pipeline.predict(X_test)) # Reverse log transformation


# Model evaluation
mse = metrics.mean_squared_error(np.exp(y_test), y_pred)  # Compare in original scale
r2 = metrics.r2_score(np.exp(y_test), y_pred)
mape = metrics.mean_absolute_percentage_error(np.exp(y_test), y_pred)

### TRAIN

# Predictions
y_pred_train = np.exp(pipeline.predict(X_train)) # Reverse log transformation


# Model evaluation
mse_train = metrics.mean_squared_error(np.exp(y_train), y_pred_train)  # Compare in original scale
r2_train = metrics.r2_score(np.exp(y_train), y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(np.exp(y_train), y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}



pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

# What happens if we remove all the outliers according to IQR?

In [None]:
def drop_outliers(df, col):
    """user iqr to drop outliers"""

    # remove outlier prices using iqr
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

    return df

In [None]:
len(df)

In [None]:
df['selling_price'].shape

In [None]:
price_no_outliers = drop_outliers(df, 'selling_price')['selling_price']

In [None]:
price_no_outliers.shape

In [None]:
t1 = drop_outliers(df, 'selling_price')
t2 = drop_outliers(t1, 'km_driven')
len(t2)

In [None]:
price_no_outliers.plot(kind='hist')

In [None]:
np.log(price_no_outliers).plot(kind='hist')

In [None]:
df2 = drop_outliers(df1, 'selling_price')
df2 = drop_outliers(df2, 'km_driven')

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        (
            'manufacturer-one-hot',
            OneHotEncoder(drop='first', categories=[list(df['manufacturer'].unique())]),
            ['manufacturer']
        ),
        (
            'one-hot-cat',
            OneHotEncoder(drop='first'),
            ['fuel', 'seller_type', 'transmission']
        ),
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split dataset
X = df2.drop(columns=['selling_price'])
y = np.log(df2['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = np.exp(pipeline.predict(X_test)) # Reverse log transformation


# Model evaluation
mse = metrics.mean_squared_error(np.exp(y_test), y_pred)  # Compare in original scale
r2 = metrics.r2_score(np.exp(y_test), y_pred)
mape = metrics.mean_absolute_percentage_error(np.exp(y_test), y_pred)

### TRAIN

# Predictions
y_pred_train = np.exp(pipeline.predict(X_train)) # Reverse log transformation


# Model evaluation
mse_train = metrics.mean_squared_error(np.exp(y_train), y_pred_train)  # Compare in original scale
r2_train = metrics.r2_score(np.exp(y_train), y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(np.exp(y_train), y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}



pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

# Add scaling

In [48]:
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        (
            'numerical',
            StandardScaler(),
            ['km_driven', 'year']
        ),
        (
            'manufacturer-one-hot',
            OneHotEncoder(drop='first', categories=[list(df['manufacturer'].unique())]),
            ['manufacturer']
        ),
        (
            'one-hot-cat',
            OneHotEncoder(drop='first'),
            ['fuel', 'seller_type', 'transmission']
        ),
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split dataset
X = df1.drop(columns=['selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = np.exp(pipeline.predict(X_test)) # Reverse log transformation


# Model evaluation
mse = metrics.mean_squared_error(np.exp(y_test), y_pred)  # Compare in original scale
r2 = metrics.r2_score(np.exp(y_test), y_pred)
mape = metrics.mean_absolute_percentage_error(np.exp(y_test), y_pred)

### TRAIN

# Predictions
y_pred_train = np.exp(pipeline.predict(X_train)) # Reverse log transformation


# Model evaluation
mse_train = metrics.mean_squared_error(np.exp(y_train), y_pred_train)  # Compare in original scale
r2_train = metrics.r2_score(np.exp(y_train), y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(np.exp(y_train), y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}



pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

Unnamed: 0,Test,Train
MSE,52933874324.41,57718972095.08
MAPE,0.31,0.31
R2,0.82,0.81


# Add scaling to all the features

In [49]:
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        (
            'manufacturer-one-hot',
            OneHotEncoder(sparse_output=False, drop='first', categories=[list(df['manufacturer'].unique())]),
            ['manufacturer']
        ),
        (
            'one-hot-cat',
            OneHotEncoder(sparse_output=False, drop='first'),
            ['fuel', 'seller_type', 'transmission']
        ),
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Split dataset
X = df1.drop(columns=['selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = np.exp(pipeline.predict(X_test)) # Reverse log transformation


# Model evaluation
mse = metrics.mean_squared_error(np.exp(y_test), y_pred)  # Compare in original scale
r2 = metrics.r2_score(np.exp(y_test), y_pred)
mape = metrics.mean_absolute_percentage_error(np.exp(y_test), y_pred)

### TRAIN

# Predictions
y_pred_train = np.exp(pipeline.predict(X_train)) # Reverse log transformation


# Model evaluation
mse_train = metrics.mean_squared_error(np.exp(y_train), y_pred_train)  # Compare in original scale
r2_train = metrics.r2_score(np.exp(y_train), y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(np.exp(y_train), y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}



pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

Unnamed: 0,Test,Train
MSE,52933747156.42,57718971370.2
MAPE,0.31,0.31
R2,0.82,0.81


# Check correlations after encoding

In [50]:
df1.shape

(4335, 8)

In [51]:
len(pipeline[-1].coef_)

38

In [52]:
len(preprocessor.get_feature_names_out())

38

In [54]:
encoded_df_with_price = pd.concat([
    pd.DataFrame(preprocessor.fit_transform(df1), columns=preprocessor.get_feature_names_out(), index=df1.index),
    np.log(df1['selling_price'])], axis=1)

In [55]:
encoded_df_with_price

Unnamed: 0,numerical__km_driven,numerical__year,manufacturer-one-hot__manufacturer_Hyundai,manufacturer-one-hot__manufacturer_Datsun,manufacturer-one-hot__manufacturer_Honda,manufacturer-one-hot__manufacturer_Tata,manufacturer-one-hot__manufacturer_Chevrolet,manufacturer-one-hot__manufacturer_Toyota,manufacturer-one-hot__manufacturer_Jaguar,manufacturer-one-hot__manufacturer_Mercedes-Benz,...,manufacturer-one-hot__manufacturer_Kia,one-hot-cat__fuel_Diesel,one-hot-cat__fuel_Electric,one-hot-cat__fuel_LPG,one-hot-cat__fuel_Petrol,one-hot-cat__seller_type_Individual,one-hot-cat__seller_type_Trustmark Dealer,one-hot-cat__transmission_Manual,ordinal-cat__owner,selling_price
0,70000.00,2007.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,1.00,0.00,1.00,0.00,11.00
1,50000.00,2007.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,1.00,0.00,1.00,0.00,11.81
2,100000.00,2012.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,13.30
3,46000.00,2017.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,1.00,0.00,1.00,0.00,12.43
4,141000.00,2014.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,1.00,1.00,13.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,80000.00,2014.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,1.00,1.00,12.92
4336,80000.00,2014.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,1.00,1.00,12.92
4337,83000.00,2009.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,1.00,0.00,1.00,1.00,11.61
4338,90000.00,2016.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00,13.67


In [56]:
encoded_df_with_price.corr()['selling_price'].abs().sort_values(ascending=False)

Unnamed: 0,selling_price
selling_price,1.0
numerical__year,0.7
one-hot-cat__transmission_Manual,0.41
one-hot-cat__fuel_Diesel,0.39
one-hot-cat__fuel_Petrol,0.37
ordinal-cat__owner,0.32
one-hot-cat__seller_type_Individual,0.28
numerical__km_driven,0.26
manufacturer-one-hot__manufacturer_BMW,0.22
manufacturer-one-hot__manufacturer_Audi,0.22


**❓ What happens if we leave only highly corelated features?**

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 24766 stored elements and shape (4335, 38)>

**❓What happens if we add 2nd order polynomial features after leaving only hoghly correlated features?**