In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sackx-used-cars/sample_submission.csv
/kaggle/input/sackx-used-cars/train.csv
/kaggle/input/sackx-used-cars/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/sackx-used-cars/train.csv')
test  = pd.read_csv('/kaggle/input/sackx-used-cars/test.csv')

In [3]:
# Count the occurrences
print(train.shape)
counts = train['brand'].value_counts()

# Find categories with fewer than 5 occurrences
small_categories = counts[counts < 5].index

# Replicate these categories
replicated_rows = []
for category in small_categories:
    rows = train[train['brand'] == category]
    num_repeats = 20 // len(rows)  # Number of times to replicate to reach around 20
    replicated_rows.append(pd.concat([rows]*num_repeats, ignore_index=True))

# If there are any replicated rows, concatenate them with the original DataFrame
if replicated_rows:
    replicated_df = pd.concat(replicated_rows, ignore_index=True)
    train = pd.concat([train, replicated_df], ignore_index=True)
print(train.shape)

(54273, 13)
(54371, 13)


In [4]:
len(train.loc[train['price']<100000]) / len(train)
train=train.loc[train['price']<100000]

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
class EngineTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.cc = 'engine'
        hp_pattern = r'(\d+(\.\d+)?)\s*[hH][pP]'
        liter_pattern = r'(\d+(\.\d+)?)\s*[lL](iter)?'
        cylinder_pattern = r'(?:(?:Straight\s*)?(\d+)|V(\d+))\s*Cylinder'
        
        hp = X[self.cc].str.extract(hp_pattern, expand=False)[0].astype(np.float64)
        liter = X[self.cc].str.extract(liter_pattern, expand=False)[0].astype(np.float64)
        cylinders = X[self.cc].str.extract(cylinder_pattern, expand=False).bfill(axis=1).iloc[:, 0].astype(np.float64)
        
        self.h=hp.mode()[0]
        self.l=liter.mode()[0]
        self.c=cylinders.mode()[0]
        
        print(f"fitted Engine: {self.h} {self.l} {self.c}")
        return self
    
    def transform(self, X):
        hp_pattern = r'(\d+(\.\d+)?)\s*[hH][pP]'
        liter_pattern = r'(\d+(\.\d+)?)\s*[lL](iter)?'
        cylinder_pattern = r'(?:(?:Straight\s*)?(\d+)|V(\d+))\s*Cylinder'
        
        hp = X[self.cc].str.extract(hp_pattern, expand=False)[0].astype(np.float64)
        hp = hp.fillna(self.h).astype(np.float64)
        
        liter = X[self.cc].str.extract(liter_pattern, expand=False)[0].astype(np.float64)
        liter = liter.fillna(self.l).astype(np.float64)
        
        cylinders = X[self.cc].str.extract(cylinder_pattern, expand=False).bfill(axis=1).iloc[:, 0].astype(np.float64)
        cylinders = cylinders.fillna(self.c).astype(np.float64)

        # Create a DataFrame
        df = pd.DataFrame({
            'hp': hp,
            'liter': liter,
            'cylinders': cylinders
        })
        print(f"transformed Engine: {self.h} {self.l} {self.c}")
        return df
    
    def get_feature_names_out(self, names=None):
        return ['hp', 'liter', 'cylinders']
    
ps,pa=None,None
class TransmissionTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column_name='transmission'):
        self.column_name = column_name  # Allows flexibility on column name
#         self.speed_mode = None
#         self.automatic_mode = None
        
    def fit(self, X, y=None):
        if self.column_name not in X.columns:
            raise ValueError(f"{self.column_name} is not in the DataFrame")
        
        # Regular expressions to extract features
        speed_pattern = r'(\d+)-?[Ss]peed'
        automatic_pattern = r'A/T|Automatic|CVT|a/t|A / T|a / t'

        # Extracting speed and handling missing values
        speed = X[self.column_name].str.extract(speed_pattern, expand=False)
        self.speed_mode = speed.mode()[0]
#         ps = self.speed_mode

        # Detecting automatic transmissions and handling missing values
        is_automatic = X[self.column_name].str.contains(automatic_pattern, case=False, na=False)
        is_automatic = is_automatic.astype(float)  # Direct conversion to float
        self.automatic_mode = is_automatic.mode()[0]
#         pa = self.automatic_mode
        print(f"fitted Transmission: {self.speed_mode} {self.automatic_mode}")
        return self
    
    def transform(self, X):
        if self.column_name not in X.columns:
            raise ValueError(f"{self.column_name} is not in the DataFrame")

        # Extracting speed
        speed_pattern = r'(\d+)-?[Ss]peed'
        speed = X[self.column_name].str.extract(speed_pattern, expand=False)
        print(ps)
        speed = speed.fillna(self.speed_mode).astype(np.float64)

        # Detecting automatic transmissions
        automatic_pattern = r'A/T|Automatic|CVT|a/t|A / T|a / t'
        is_automatic = X[self.column_name].str.contains(automatic_pattern, case=False, na=False)
        is_automatic = is_automatic.fillna(self.automatic_mode).astype(float)

        # Creating a DataFrame from extracted features
        df = pd.DataFrame({
            'speed': speed,
            'automatic': is_automatic
        })
        print(f"transformed Transmission: {self.speed_mode} {self.automatic_mode}")
        return df
    
    def get_feature_names_out(self, names=None):
        return ['speed', 'automatic']
    
    
    
# class ArrayToDataFrameTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, column_names):
#         self.column_names = column_names
    
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return pd.DataFrame(X, columns=self.column_names)

# # Example usage within the pipeline
# column_names = ['brand_encoded', 'fuel_type_encoded', 'hp', 'liter', 'cylinders', 
#                 'speed', 'automatic', 'age', 'clean_title', 'accident', 'is_automatic',
#                 'milage', 'model', 'id'] 

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x,X, y,Y = train_test_split(train.drop(columns=['price']), train[['price']], train_size=0.8)

In [8]:
x.shape,x.columns

((41696, 12),
 Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
        'transmission', 'ext_col', 'int_col', 'accident', 'clean_title'],
       dtype='object'))

In [9]:
y.shape,y.columns

((41696, 1), Index(['price'], dtype='object'))

In [10]:
et,tt =EngineTransformer(), TransmissionTransformer()

In [11]:
print(x.columns)

x=x.drop(columns=['id'])
x['age'] = 2024-x['model_year']

x=pd.concat([x,et.fit_transform(x),tt.fit_transform(x)], axis=1)
x['feat1']=x['speed']*x['hp']
x.columns
x.shape

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title'],
      dtype='object')
fitted Engine: 300.0 3.0 6.0
transformed Engine: 300.0 3.0 6.0
fitted Transmission: 6 1.0
None
transformed Transmission: 6 1.0


(41696, 18)

In [12]:
ly = np.log(y)
xn = x.select_dtypes(include=[np.number])
xc = x[[col for col in x.columns if col not in xn.columns]]

In [13]:
bm=xn.copy()
bm['logprice']=ly
ly[:5]

Unnamed: 0,price
20845,10.558414
31503,10.477288
38709,10.34171
32718,10.736397
12261,10.1849


In [14]:
bm.corr()['logprice'].sort_values()

age          -0.687385
milage       -0.686776
automatic    -0.027235
liter         0.183506
cylinders     0.211985
speed         0.395991
hp            0.520956
feat1         0.573728
model_year    0.687385
logprice      1.000000
Name: logprice, dtype: float64

In [15]:
imp=['milage', 'age','feat1','hp']
impp=['milage', 'age','feat1','hp','logprice']
bm[impp].corr()['logprice'].sort_values()

age        -0.687385
milage     -0.686776
hp          0.520956
feat1       0.573728
logprice    1.000000
Name: logprice, dtype: float64

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
lr=LinearRegression()
lr.fit(xn[imp],ly)

In [18]:
print(X.columns)
# et,tt =EngineTransformer(), TransmissionTransformer()
id= X[['id']]
X=X.drop(columns=['id'])
X['age'] = 2024-X['model_year']
X=pd.concat([X,et.transform(X),tt.transform(X)], axis=1)
X['feat1']=X['speed']*X['hp']
X.columns
X.shape

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title'],
      dtype='object')
transformed Engine: 300.0 3.0 6.0
None
transformed Transmission: 6 1.0


(10424, 18)

In [19]:
p=lr.predict(X[imp])

In [20]:
from sklearn.metrics import mean_squared_error

rms = mean_squared_error(Y[['price']], np.exp(p).reshape(-1,1), squared=False)
print(rms)

14045.499142788094


In [21]:
print(test.columns)
id= test[['id']]
test=test.drop(columns=['id'])
test['age'] = 2024-test['model_year']
test=pd.concat([test,et.fit_transform(test),tt.fit_transform(test)], axis=1)
test['feat1']=test['speed']*test['hp']
test.columns
test.shape

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title'],
      dtype='object')
fitted Engine: 300.0 3.0 6.0
transformed Engine: 300.0 3.0 6.0
fitted Transmission: 6 1.0
None
transformed Transmission: 6 1.0


(36183, 18)

In [22]:
for c in test[imp].columns:
    print(f"{c} {sum(test[c].isna())}")

milage 0
age 0
feat1 0
hp 0


In [23]:
p=lr.predict(test[imp])

In [24]:
p.shape

(36183, 1)

In [25]:
res = pd.DataFrame()
res['id'] = id
res['price']=np.exp(p)
res.to_csv('simple-lr.csv',index=False)