# Imports and configurations

In [None]:
import datetime
from math import sqrt
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer, OrdinalEncoder, PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler


from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression, RFE, RFECV, mutual_info_regression

#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats


In [None]:
# Route every pandas `.plot()` call in this notebook through Plotly.

pd.set_option("plotting.backend", "plotly")

# Load data

In [None]:
# URL of the public "CAR DETAILS FROM CAR DEKHO" CSV; despite the name, `data` holds the URL string, not the table.
data = "https://storage.googleapis.com/edulabs-public-datasets/CAR%20DETAILS%20FROM%20CAR%20DEKHO.csv"

In [None]:
# Download and parse the CSV at the URL stored in `data`; `df1` is kept as the untouched raw frame.
df1 = pd.read_csv(data)

In [None]:
# Display the raw frame (the notebook shows a truncated head/tail preview).
df1

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [None]:
# Peek at the first 20 raw `name` strings to see how they are structured.
print(df1['name'].iloc[:20].tolist())

['Maruti 800 AC', 'Maruti Wagon R LXI Minor', 'Hyundai Verna 1.6 SX', 'Datsun RediGO T Option', 'Honda Amaze VX i-DTEC', 'Maruti Alto LX BSIII', 'Hyundai Xcent 1.2 Kappa S', 'Tata Indigo Grand Petrol', 'Hyundai Creta 1.6 VTVT S', 'Maruti Celerio Green VXI', 'Chevrolet Sail 1.2 Base', 'Tata Indigo Grand Petrol', 'Toyota Corolla Altis 1.8 VL CVT', 'Maruti 800 AC', 'Maruti Wagon R LXI Minor', 'Hyundai Verna 1.6 SX', 'Datsun RediGO T Option', 'Honda Amaze VX i-DTEC', 'Maruti Alto LX BSIII', 'Hyundai Xcent 1.2 Kappa S']


# Topics to discuss

- `name` column: how should we handle it? (One option: LLMs with structured output.)
- owner column

In [None]:
# List the distinct values of the `owner` column.
df1['owner'].unique()

array(['First Owner', 'Second Owner', 'Fourth & Above Owner',
       'Third Owner', 'Test Drive Car'], dtype=object)

In [None]:
# Count the distinct values of the `name` column.
df1['name'].nunique()

1491

In [None]:
# Per-column non-null counts for the listings whose name contains "Renault".
renault_rows = df1.loc[df1['name'].str.contains('Renault')]
renault_rows.count()

Unnamed: 0,0
name,153
year,153
selling_price,153
km_driven,153
fuel,153
seller_type,153
transmission,153
owner,153


In [None]:
# Per-column non-null counts for the listings whose name contains "Honda".
honda_rows = df1.loc[df1['name'].str.contains('Honda')]
honda_rows.count()

Unnamed: 0,0
name,252
year,252
selling_price,252
km_driven,252
fuel,252
seller_type,252
transmission,252
owner,252


In [None]:
# Distribution of the target variable on its original (untransformed) scale.
px.histogram(df1, x="selling_price")

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_feature_target_scatter(df, features, target_variable):
    """
    Displays a figure with one scatter subplot per feature, each plotting the
    feature (x axis) against the target variable (y axis).

    Grid layout: a single row for up to two features, 2 x 2 for three or four,
    and three columns with as many rows as needed for five or more.

    Args:
        df (pd.DataFrame): DataFrame containing features and target variable.
        features (list): Column names to be considered as features. Any
            iterable of column labels is accepted (e.g. a pandas Index).
        target_variable (str): Name of the target variable column.

    Returns:
        None. The figure is shown via ``fig.show()``. When ``features`` is
        empty, a message is printed and nothing is drawn.
    """

    # Materialise once so generators and pandas Index objects behave like a list.
    features = list(features)
    num_features = len(features)
    if num_features == 0:
        print("No features provided to plot.")
        return

    # Determine subplot grid layout
    if num_features <= 2:
        rows, cols = 1, num_features
    elif num_features <= 4:
        rows, cols = 2, 2
    else:
        cols = 3
        # Ceiling division. The former `(n + 1) // 3` allocated one row too few
        # whenever n % 3 == 1 (e.g. 7 features -> 2 rows -> only 6 cells).
        rows = (num_features + cols - 1) // cols

    fig = make_subplots(rows=rows, cols=cols,
                        subplot_titles=[f'Feature vs. Target: {feature}' for feature in features])

    for i, feature in enumerate(features):
        # Fill the grid row by row; subplot coordinates are 1-based.
        row_index = (i // cols) + 1
        col_index = (i % cols) + 1

        scatter_trace = go.Scatter(
            x=df[feature],
            y=df[target_variable],
            mode='markers',
            marker=dict(color='blue', size=5),
            name=feature
        )
        fig.add_trace(scatter_trace, row=row_index, col=col_index)

        fig.update_xaxes(title_text=feature, row=row_index, col=col_index)
        fig.update_yaxes(title_text=target_variable, row=row_index, col=col_index)

    fig.update_layout(title_text="Feature vs Target Variable Scatter Plots", showlegend=False)
    fig.show()

In [None]:
# Scatter every numeric column except the target against `selling_price`.
numeric_features = df1.select_dtypes('number').columns.drop('selling_price')
plot_feature_target_scatter(df1, numeric_features, 'selling_price')

In [None]:
# Work on a copy so the raw frame `df1` stays unchanged by the feature engineering that follows.
df = df1.copy()

In [None]:
def extract_until_number(name):
    """Return `name` with its first whitespace-separated token removed.

    Despite the function's name, tokens containing digits are NOT filtered
    out: every token after the first is kept and re-joined with single spaces.

    Args:
        name: Raw car name. Anything that is not a `str` (e.g. NaN) is
            treated as having no tokens.

    Returns:
        str: The remaining tokens joined by a single space, or '' when there
        is at most one token or `name` is not a string.
    """
    if not isinstance(name, str):
        return ''
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so slicing is safe even for empty strings.
    return ' '.join(name.split()[1:])

In [None]:
# Derive the modelling columns from the raw listing fields.
#
# Changes versus the earlier version of this cell:
#   * `model` used to be assigned twice in a row; only the second assignment
#     (second token of `name`) ever took effect, so the first one is gone.
#   * Columns are added with `.assign` on the filtered frame instead of
#     writing into a slice of `df`, which avoids SettingWithCopyWarning.
#   * `owner` is mapped explicitly instead of relying on the implicit
#     object -> int downcast of a dict-based `.replace` (flagged as deprecated
#     by recent pandas releases). Labels missing from the mapping pass through
#     unchanged, so re-running the cell is still safe.
OWNER_RANK = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
}

# 'Test Drive Car' rows are excluded; that label has no place in the ordinal owner scale.
listings = df.loc[df['owner'] != 'Test Drive Car']
name_tokens = listings['name'].str.split()

df = listings.assign(
    manufacturer=name_tokens.str[0],             # first token of the name
    model=name_tokens.str[1],                    # second token (NaN for one-word names)
    year=listings['year'].astype('str'),         # treated as a categorical feature later
    owner=listings['owner'].map(lambda label: OWNER_RANK.get(label, label)),
)

In [None]:
# Columns routed to the numeric branch of the preprocessor.
num_col = ['km_driven', 'owner']
# Columns routed to the one-hot encoder; `year` is in this list, so it is handled as a category rather than a number.
cat_col = ['year', 'fuel', 'seller_type', 'transmission', 'manufacturer', 'model']

In [None]:
# Numeric branch: log1p -> degree-2 polynomial expansion -> standardisation.
# NOTE(review): the step is labelled 'km', but it receives every column in
# `num_col`, so `owner` goes through the same log/poly/scale chain.
log_poly_scale = Pipeline(steps=[
    ('transform', FunctionTransformer(np.log1p, validate=True)),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scale', StandardScaler()),
])

num_transformer = ColumnTransformer(transformers=[('km', log_poly_scale, num_col)])

In [None]:
# Show the transformer's representation to inspect its configured steps.
num_transformer

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): with drop='first' + handle_unknown='ignore', a category unseen
# during fit is encoded as all zeros (the notebook's transform output warns
# about this), which is the same encoding as the dropped first level.
categorical_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# NOTE(review): `num_transformer` is itself a ColumnTransformer over `num_col`,
# so the numeric columns are selected twice (here and inside it).
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_col),
    ('cat', categorical_encoder, cat_col),
])

# Keep the k best features ranked by the univariate F-test.
# NOTE(review): the fit output reports "k=250 is greater than n_features=240",
# so with the current data this step keeps every feature.
feature_selector = SelectKBest(score_func=f_regression, k=250)

In [None]:
# End-to-end model: preprocessing -> feature selection -> linear regression.
model_steps = [
    ('preprocess', preprocessor),
    ('feature_selection', feature_selector),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [None]:
# Features: every column except the raw name string and the target.
X = df.drop(columns=['name', 'selling_price'])
# The target is modelled on a log scale: log(1 + selling_price).
y = np.log1p(df['selling_price'])

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Fit every pipeline step on the training split only; the test split is held out for evaluation.
pipeline.fit(X_train, y_train)


k=250 is greater than n_features=240. All the features will be returned.



In [None]:
# Model evaluation
#
# The target was built with np.log1p, so both predictions and true values are
# mapped back to the price scale with its exact inverse, np.expm1.
# (np.exp would return price + 1, which slightly skews MAPE.)

### TEST

y_test_price = np.expm1(y_test)               # true prices, original scale
y_pred = np.expm1(pipeline.predict(X_test))   # reverse the log1p transformation

mse = metrics.mean_squared_error(y_test_price, y_pred)  # Compare in original scale
r2 = metrics.r2_score(y_test_price, y_pred)
mape = metrics.mean_absolute_percentage_error(y_test_price, y_pred)

### TRAIN

y_train_price = np.expm1(y_train)
y_pred_train = np.expm1(pipeline.predict(X_train))  # reverse the log1p transformation

mse_train = metrics.mean_squared_error(y_train_price, y_pred_train)  # Compare in original scale
r2_train = metrics.r2_score(y_train_price, y_pred_train)
mape_train = metrics.mean_absolute_percentage_error(y_train_price, y_pred_train)

# Row order of each list must match the `index` labels passed to the DataFrame below.
metrics_dict = {
    'Test': [mse, mape, r2],
    'Train': [mse_train, mape_train, r2_train]
}

pd.DataFrame(metrics_dict, index=['MSE', 'MAPE', 'R2'])


Found unknown categories in columns [4, 5] during transform. These unknown categories will be encoded as all zeros



Unnamed: 0,Test,Train
MSE,19343780000.0,16268970000.0
MAPE,0.2033864,0.1787854
R2,0.9380894,0.9522244
