# Imports and configurations

In [None]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [None]:
# Set Plotly as Pandas plotting backend

pd.options.plotting.backend = "plotly"

In [None]:
# Show floats with exactly two decimals and no scientific (e+) notation.
pd.set_option("display.float_format", "{:.2f}".format)


# Load data

In [None]:
# Public CSV with the CarDekho used-car listings (URL-encoded file name).
data = (
    "https://storage.googleapis.com/edulabs-public-datasets/"
    "CAR%20DETAILS%20FROM%20CAR%20DEKHO.csv"
)

In [None]:
df = pd.read_csv(data)

In [None]:
df

In [None]:
# Peek at the first 20 raw car names (slice first, then convert to a list).
print(df['name'].iloc[:20].tolist())

# Correlations

## Numerical

In [None]:
px.scatter(df, x='year', y='selling_price')

In [None]:
px.scatter(df, x='km_driven', y='selling_price')

In [None]:
px.box(df, y='fuel', x='selling_price', color='fuel')

In [None]:
px.box(df, y='seller_type', x='selling_price', color='seller_type')

In [None]:
px.box(df, y='transmission', x='selling_price', color='transmission')

In [None]:
px.box(df, y='owner', x='selling_price', color='owner')

- How should we handle "Test Drive Car" in terms of ordinality?
- Its price looks much higher than for every other owner category, although we would expect a test-drive history to reduce the price.
- This might be because these cars are from later model years.


In [None]:
df['owner'].value_counts()

In [None]:
df[df['owner'] == 'Test Drive Car'].sort_values('year')

In [None]:
px.box(df, y='owner', x='year', color='owner')

In [None]:
df['owner'].unique()

In [None]:
# Owner levels in the order used for the ordinal codes; 'Test Drive Car' is
# placed last (see the open question about its ordinality above).
owner_levels = [
    'First Owner',
    'Second Owner',
    'Third Owner',
    'Fourth & Above Owner',
    'Test Drive Car',
]
ordinal_encoder = OrdinalEncoder(categories=[owner_levels], dtype='int8')

# The encoder emits 0-based codes; shift by one so 'First Owner' maps to 1.
owner_codes = ordinal_encoder.fit_transform(df[['owner']])
df['owner_encoded'] = owner_codes + 1

In [None]:
df.select_dtypes('number').corr()

## Let's extract the manufacturer

In [None]:
df['manufacturer'] = df['name'].str.split(' ').str[0]

In [None]:
px.box(df, y='manufacturer', x='selling_price', color='manufacturer')

In [None]:
df['manufacturer'].value_counts()

# Let's inspect and remove outliers

In [None]:
df[df['selling_price'] > 7_000_000]

In [None]:
df[df['km_driven'] > 500_000]

In [None]:
print(f"rows before drop: {len(df)}")

In [None]:
# Flag rows beyond either hard cutoff, then remove them in one in-place drop.
price_too_high = df['selling_price'] > 7_000_000
km_too_high = df['km_driven'] > 500_000
df.drop(index=df.index[price_too_high | km_too_high], inplace=True)

In [None]:
print(f"rows after drop: {len(df)}")

# Inspect distributions

In [None]:
df['selling_price'].plot(kind='hist')

It looks like we might need a log transformation.

In [None]:
np.log(df['selling_price']).plot(kind='hist')

# Train the model

In [None]:
df.shape

In [None]:
df1 = df.drop(columns=['name','owner_encoded'])

In [None]:
# Preprocessing: one ColumnTransformer entry per feature group.
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns are handed to the model unchanged.
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        # Categories are pinned to every manufacturer present in df, so this
        # encoder does not depend on which rows land in the training split.
        (
            'manufacturer-one-hot',
            OneHotEncoder(drop='first', categories=[list(df['manufacturer'].unique())]),
            ['manufacturer']
        ),
        # These categories are learned from the rows the encoder is fitted on.
        # handle_unknown='ignore' encodes a level that only shows up at
        # transform time (e.g. a rare fuel type that fell into the test split)
        # as all zeros instead of raising.
        (
            'one-hot-cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['fuel', 'seller_type', 'transmission']
        ),
        # Owner is encoded as an integer code following the listed order.
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline: encode the features, then fit a linear regression.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# Hold out 20% of the rows; the target is modelled on a log scale.
X = df1.drop(columns=['selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training split only.
pipeline.fit(X_train, y_train)

# Undo the log transform so predictions and targets are in original price units.
y_pred = np.exp(pipeline.predict(X_test))
y_pred_train = np.exp(pipeline.predict(X_train))
price_test = np.exp(y_test)
price_train = np.exp(y_train)

# Held-out metrics, compared in the original scale.
mse, mape, r2 = (
    metrics.mean_squared_error(price_test, y_pred),
    metrics.mean_absolute_percentage_error(price_test, y_pred),
    metrics.r2_score(price_test, y_pred),
)

# Same metrics on the training rows, to compare against the test column.
mse_train, mape_train, r2_train = (
    metrics.mean_squared_error(price_train, y_pred_train),
    metrics.mean_absolute_percentage_error(price_train, y_pred_train),
    metrics.r2_score(price_train, y_pred_train),
)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train],
}

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

# What happens if we remove manufacturer?

In [None]:
# Preprocessing: the manufacturer column is deliberately not encoded in this
# experiment (it is also dropped from X below).
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns are handed to the model unchanged.
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        # Categories are learned from the training rows. handle_unknown='ignore'
        # encodes a level first seen at predict time (e.g. a rare fuel type that
        # fell into the test split) as all zeros instead of raising.
        (
            'one-hot-cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['fuel', 'seller_type', 'transmission']
        ),
        # Owner is encoded as an integer code following the listed order.
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split dataset (manufacturer removed from the features; log-scale target)
X = df1.drop(columns=['manufacturer', 'selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions, with the log transformation reversed
y_pred = np.exp(pipeline.predict(X_test))
y_pred_train = np.exp(pipeline.predict(X_train))

# Targets back in the original scale, computed once per split
price_test = np.exp(y_test)
price_train = np.exp(y_train)

# Model evaluation on the held-out rows
mse = metrics.mean_squared_error(price_test, y_pred)
r2 = metrics.r2_score(price_test, y_pred)
mape = metrics.mean_absolute_percentage_error(price_test, y_pred)

# Model evaluation on the training rows
mse_train = metrics.mean_squared_error(price_train, y_pred_train)
r2_train = metrics.r2_score(price_train, y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(price_train, y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

# What happens if we remove all the outliers according to IQR?

In [None]:
def drop_outliers(df: pd.DataFrame, col: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, both
    inclusive, where Q1/Q3 are the 25th/75th percentiles of ``df[col]``.
    Rows whose ``col`` value is NaN fail the range check and are dropped too.
    The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    col : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pd.DataFrame
        The subset of ``df`` (original index labels kept) inside the fences.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Series.between is inclusive on both ends by default.
    return df[values.between(lower_bound, upper_bound)]

In [None]:
len(df)

In [None]:
df['selling_price'].shape

In [None]:
price_no_outliers = drop_outliers(df, 'selling_price')['selling_price']

In [None]:
price_no_outliers.shape

In [None]:
t1 = drop_outliers(df, 'selling_price')
t2 = drop_outliers(t1, 'km_driven')
len(t2)

In [None]:
price_no_outliers.plot(kind='hist')

In [None]:
np.log(price_no_outliers).plot(kind='hist')

In [None]:
# Apply the IQR filter to the price first, then to the mileage of what remains.
df2 = drop_outliers(df1, 'selling_price')
df2 = drop_outliers(df2, 'km_driven')

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns are handed to the model unchanged.
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        # Categories are pinned to every manufacturer present in df, so this
        # encoder does not depend on which rows survive filtering or splitting.
        (
            'manufacturer-one-hot',
            OneHotEncoder(drop='first', categories=[list(df['manufacturer'].unique())]),
            ['manufacturer']
        ),
        # Categories are learned from the training rows. handle_unknown='ignore'
        # encodes a level first seen at predict time (e.g. a rare fuel type that
        # fell into the test split) as all zeros instead of raising.
        (
            'one-hot-cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['fuel', 'seller_type', 'transmission']
        ),
        # Owner is encoded as an integer code following the listed order.
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split dataset (IQR-filtered rows; log-scale target)
X = df2.drop(columns=['selling_price'])
y = np.log(df2['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions, with the log transformation reversed
y_pred = np.exp(pipeline.predict(X_test))
y_pred_train = np.exp(pipeline.predict(X_train))

# Targets back in the original scale, computed once per split
price_test = np.exp(y_test)
price_train = np.exp(y_train)

# Model evaluation on the held-out rows
mse = metrics.mean_squared_error(price_test, y_pred)
r2 = metrics.r2_score(price_test, y_pred)
mape = metrics.mean_absolute_percentage_error(price_test, y_pred)

# Model evaluation on the training rows
mse_train = metrics.mean_squared_error(price_train, y_pred_train)
r2_train = metrics.r2_score(price_train, y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(price_train, y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

# Add scaling

In [None]:
# Preprocessing: as before, except the numeric columns are standardized.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the two numeric columns; the scaler is fitted on the
        # training split only because it lives inside the pipeline.
        (
            'numerical',
            StandardScaler(),
            ['km_driven', 'year']
        ),
        # Categories are pinned to every manufacturer present in df, so this
        # encoder does not depend on which rows land in the training split.
        (
            'manufacturer-one-hot',
            OneHotEncoder(drop='first', categories=[list(df['manufacturer'].unique())]),
            ['manufacturer']
        ),
        # Categories are learned from the training rows. handle_unknown='ignore'
        # encodes a level first seen at predict time (e.g. a rare fuel type that
        # fell into the test split) as all zeros instead of raising.
        (
            'one-hot-cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['fuel', 'seller_type', 'transmission']
        ),
        # Owner is encoded as an integer code following the listed order.
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split dataset (log-scale target)
X = df1.drop(columns=['selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions, with the log transformation reversed
y_pred = np.exp(pipeline.predict(X_test))
y_pred_train = np.exp(pipeline.predict(X_train))

# Targets back in the original scale, computed once per split
price_test = np.exp(y_test)
price_train = np.exp(y_train)

# Model evaluation on the held-out rows
mse = metrics.mean_squared_error(price_test, y_pred)
r2 = metrics.r2_score(price_test, y_pred)
mape = metrics.mean_absolute_percentage_error(price_test, y_pred)

# Model evaluation on the training rows
mse_train = metrics.mean_squared_error(price_train, y_pred_train)
r2_train = metrics.r2_score(price_train, y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(price_train, y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

# Add scaling to all the features

In [None]:
# Preprocessing: encoders emit dense arrays (sparse_output=False) so the
# StandardScaler step that follows in the pipeline receives dense input.
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns pass through here; they are scaled by the pipeline's
        # 'scaler' step together with every encoded column.
        (
            'numerical',
            'passthrough',
            ['km_driven', 'year']
        ),
        # Categories are pinned to every manufacturer present in df, so this
        # encoder does not depend on which rows land in the training split.
        (
            'manufacturer-one-hot',
            OneHotEncoder(sparse_output=False, drop='first', categories=[list(df['manufacturer'].unique())]),
            ['manufacturer']
        ),
        # Categories are learned from the training rows. handle_unknown='ignore'
        # encodes a level first seen at predict time (e.g. a rare fuel type that
        # fell into the test split) as all zeros instead of raising.
        (
            'one-hot-cat',
            OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
            ['fuel', 'seller_type', 'transmission']
        ),
        # Owner is encoded as an integer code following the listed order.
        (
            'ordinal-cat',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above Owner', 'Test Drive Car' ]], dtype='int8'),
            ['owner']
        ),
    ]
)

# Define model pipeline: encode, standardize every resulting column, regress.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Split dataset (log-scale target)
X = df1.drop(columns=['selling_price'])
y = np.log(df1['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
pipeline.fit(X_train, y_train)

# Predictions, with the log transformation reversed
y_pred = np.exp(pipeline.predict(X_test))
y_pred_train = np.exp(pipeline.predict(X_train))

# Targets back in the original scale, computed once per split
price_test = np.exp(y_test)
price_train = np.exp(y_train)

# Model evaluation on the held-out rows
mse = metrics.mean_squared_error(price_test, y_pred)
r2 = metrics.r2_score(price_test, y_pred)
mape = metrics.mean_absolute_percentage_error(price_test, y_pred)

# Model evaluation on the training rows
mse_train = metrics.mean_squared_error(price_train, y_pred_train)
r2_train = metrics.r2_score(price_train, y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(price_train, y_pred_train)

metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])

# Check correlations after encoding

In [None]:
df1.shape

In [None]:
len(pipeline[-1].coef_)

In [None]:
len(preprocessor.get_feature_names_out())

In [None]:
# Encode every row of df1 with the preprocessor currently in scope.
# NOTE(review): fit_transform re-fits `preprocessor` on all rows of df1; if
# `pipeline` holds this same object, its fitted state changes too - confirm
# that is acceptable at this point in the notebook.
encoded_features = preprocessor.fit_transform(df1)

# The result is a SciPy sparse matrix or a dense ndarray depending on how the
# encoders are configured; only the sparse form has .toarray(), so calling it
# unconditionally fails when the encoders use sparse_output=False.
if hasattr(encoded_features, "toarray"):
    encoded_features = encoded_features.toarray()

# Encoded features side by side with the log-transformed price, aligned on
# df1's index so the correlation below pairs the right rows.
encoded_df_with_price = pd.concat([
    pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out(), index=df1.index),
    np.log(df1['selling_price'])], axis=1)

In [None]:
encoded_df_with_price

In [None]:
encoded_df_with_price.corr()['selling_price'].abs().sort_values(ascending=False)

**❓ What happens if we keep only the highly correlated features?**

**❓ What happens if we add 2nd-order polynomial features after keeping only the highly correlated features?**