In [2]:
# Loading dependencies
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error, mean_squared_error

from typing import Optional, Any, List

import pandas as pd

In [4]:
# Loading data
# Read the raw dataset from the CSV file under ./data into `df`, then print
# the frame summary (row count, column names, non-null counts, dtypes) so that
# columns containing missing values are visible before any cleaning.
df = pd.read_csv("./data/data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


In [None]:
# Cleaning data
# Drop every row that has at least one missing value (the summary printed
# after loading shows nulls in mileage, engine, max_power, torque and seats).
# Reassign instead of using `inplace=True`: the result is the same, but the
# statement no longer mutates the object that the loading cell displayed.
df = df.dropna()

### Defining data transformers

In [None]:
class CarModelExtractionTransformer(BaseEstimator, TransformerMixin):
    """Add a ``model`` column derived from the second word of ``name``.

    ``fit`` counts how often each second word occurs and remembers the most
    frequent ones; ``transform`` writes that word into a new ``model`` column
    and replaces every value that was not remembered with ``"Other"``.

    Parameters
    ----------
    max_models_count : int, default=30
        Maximum number of distinct model values to keep. Stored as a
        constructor argument so that ``get_params`` / ``set_params`` / ``clone``
        can see it; it can still be overridden per call through ``fit``.
    """

    def __init__(self, max_models_count: int = 30) -> None:
        self.max_models_count = max_models_count
        # Despite the attribute name, this holds the retained *model* tokens
        # (second word of `name`). Empty until `fit` has been called.
        self._popular_brands: List[str] = []

    @staticmethod
    def _extract_model(name: Any) -> Optional[str]:
        """Return the second whitespace-separated token of ``name``.

        Returns ``None`` when ``name`` is not a string or has fewer than two
        tokens, so that such rows do not raise and end up as ``"Other"``.
        """
        if not isinstance(name, str):
            return None
        tokens = name.split()
        return tokens[1] if len(tokens) > 1 else None

    def fit(
        self,
        X: pd.DataFrame,
        y: Optional[Any] = None,
        max_models_count: Optional[int] = None,
    ):
        """Learn the most frequent model tokens found in ``X["name"]``.

        Parameters
        ----------
        X : pd.DataFrame
            Frame containing a ``name`` column.
        y : ignored
            Present for scikit-learn API compatibility.
        max_models_count : int, optional
            Per-call override of the constructor value. When omitted, the
            constructor's ``max_models_count`` (default 30) is used, which
            matches the previous default of this argument.

        Returns
        -------
        self
        """
        limit = self.max_models_count if max_models_count is None else max_models_count

        # Rows without a usable model token are excluded from the counts.
        models = X["name"].map(self._extract_model).dropna()

        # groupby sorts the labels; a stable descending sort then keeps that
        # label order among equal counts, so the cut-off is deterministic.
        counts = models.groupby(models).size().sort_values(ascending=False, kind="mergesort")

        self._popular_brands = counts.index[:limit].tolist()
        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """Return a copy of ``X`` with an added ``model`` column.

        Values not retained by ``fit`` (including rows without a usable model
        token) are set to ``"Other"``. If ``fit`` has not been called, every
        row is ``"Other"`` because the retained list is empty.
        """
        data = X.copy()
        models = data["name"].map(self._extract_model)
        data["model"] = models.where(models.isin(self._popular_brands), "Other")
        return data

In [None]:
# NOTE(review): a class with this same name is defined again in a later cell;
# when the cells run in order, that later definition replaces this one.
# Keep only one copy.
class CarTechnicalInfoTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer turning textual technical columns into numbers.

    ``transform`` returns a copy of the input frame in which

    * ``max_power`` holds the leading number of the original string,
    * ``mileage`` holds the leading number of the original string, multiplied
      by 1.4 when the unit token is ``"km/kg"``,
    * ``engine`` holds the leading integer of the original string,
    * a new ``age`` column holds ``reference_year - year``.

    Non-string cells (e.g. NaN or already numeric values) are passed through
    unchanged. Strings that cannot be parsed become NaN instead of raising.

    Parameters
    ----------
    reference_year : int, default=2022
        Year from which ``year`` is subtracted to obtain ``age``.
    """

    # Multiplier applied to mileage values whose unit token is "km/kg".
    # NOTE(review): presumably an approximate km/kg -> km/l conversion — confirm.
    _KM_PER_KG_FACTOR = 1.4

    def __init__(self, reference_year: int = 2022) -> None:
        self.reference_year = reference_year

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def _extract_engine_info(self, engine_info: Any) -> float:
        """Return the leading integer of ``engine_info``.

        Non-strings are returned unchanged; blank strings or strings whose
        first token is not an integer yield NaN.
        """
        if not isinstance(engine_info, str):
            return engine_info
        try:
            return int(engine_info.split()[0])
        except (IndexError, ValueError):
            # IndexError: blank string; ValueError: first token is not an int.
            return float("nan")

    def _parse_mileage(self, mileage: Any) -> float:
        """Return the numeric part of ``mileage``, scaled when given in km/kg.

        Non-strings are returned unchanged; blank strings or strings whose
        first token is not a number yield NaN. A missing unit token is treated
        like any unit other than ``"km/kg"`` (no scaling).
        """
        if not isinstance(mileage, str):
            return mileage
        tokens = mileage.split()
        try:
            value = float(tokens[0])
        except (IndexError, ValueError):
            return float("nan")
        unit = tokens[1] if len(tokens) > 1 else ""
        if unit == "km/kg":
            return value * self._KM_PER_KG_FACTOR
        return value

    def _extract_max_power(self, power: Any) -> float:
        """Return the leading number of ``power``.

        Non-strings are returned unchanged; blank strings or strings whose
        first token is not a number yield NaN.
        """
        if not isinstance(power, str):
            return power
        try:
            return float(power.split()[0])
        except (IndexError, ValueError):
            return float("nan")

    def _parse_year_to_age(self, year: int) -> int:
        """Return ``reference_year - year``."""
        return self.reference_year - year

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """Return a copy of ``X`` with parsed numeric columns and ``age`` added.

        Requires the columns ``max_power``, ``mileage``, ``year`` and
        ``engine``; the input frame itself is not modified.
        """
        data = X.copy()
        # Bracket assignment: attribute-style assignment would silently create
        # a plain attribute instead of a column if the column were missing.
        data["max_power"] = data["max_power"].apply(self._extract_max_power)
        data["mileage"] = data["mileage"].apply(self._parse_mileage)
        data["age"] = data["year"].apply(self._parse_year_to_age)
        data["engine"] = data["engine"].apply(self._extract_engine_info)

        return data

In [None]:
# NOTE(review): this cell re-defines CarTechnicalInfoTransformer, which is
# already defined in the preceding cell. When the cells run in order, this
# definition is the one in effect. Keep only one copy.
class CarTechnicalInfoTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer turning textual technical columns into numbers.

    ``transform`` returns a copy of the input frame in which

    * ``max_power`` holds the leading number of the original string,
    * ``mileage`` holds the leading number of the original string, multiplied
      by 1.4 when the unit token is ``"km/kg"``,
    * ``engine`` holds the leading integer of the original string,
    * a new ``age`` column holds ``reference_year - year``.

    Non-string cells (e.g. NaN or already numeric values) are passed through
    unchanged. Strings that cannot be parsed become NaN instead of raising.

    Parameters
    ----------
    reference_year : int, default=2022
        Year from which ``year`` is subtracted to obtain ``age``.
    """

    # Multiplier applied to mileage values whose unit token is "km/kg".
    # NOTE(review): presumably an approximate km/kg -> km/l conversion — confirm.
    _KM_PER_KG_FACTOR = 1.4

    def __init__(self, reference_year: int = 2022) -> None:
        self.reference_year = reference_year

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def _extract_engine_info(self, engine_info: Any) -> float:
        """Return the leading integer of ``engine_info``.

        Non-strings are returned unchanged; blank strings or strings whose
        first token is not an integer yield NaN.
        """
        if not isinstance(engine_info, str):
            return engine_info
        try:
            return int(engine_info.split()[0])
        except (IndexError, ValueError):
            # IndexError: blank string; ValueError: first token is not an int.
            return float("nan")

    def _parse_mileage(self, mileage: Any) -> float:
        """Return the numeric part of ``mileage``, scaled when given in km/kg.

        Non-strings are returned unchanged; blank strings or strings whose
        first token is not a number yield NaN. A missing unit token is treated
        like any unit other than ``"km/kg"`` (no scaling).
        """
        if not isinstance(mileage, str):
            return mileage
        tokens = mileage.split()
        try:
            value = float(tokens[0])
        except (IndexError, ValueError):
            return float("nan")
        unit = tokens[1] if len(tokens) > 1 else ""
        if unit == "km/kg":
            return value * self._KM_PER_KG_FACTOR
        return value

    def _extract_max_power(self, power: Any) -> float:
        """Return the leading number of ``power``.

        Non-strings are returned unchanged; blank strings or strings whose
        first token is not a number yield NaN.
        """
        if not isinstance(power, str):
            return power
        try:
            return float(power.split()[0])
        except (IndexError, ValueError):
            return float("nan")

    def _parse_year_to_age(self, year: int) -> int:
        """Return ``reference_year - year``."""
        return self.reference_year - year

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """Return a copy of ``X`` with parsed numeric columns and ``age`` added.

        Requires the columns ``max_power``, ``mileage``, ``year`` and
        ``engine``; the input frame itself is not modified.
        """
        data = X.copy()
        # Bracket assignment: attribute-style assignment would silently create
        # a plain attribute instead of a column if the column were missing.
        data["max_power"] = data["max_power"].apply(self._extract_max_power)
        data["mileage"] = data["mileage"].apply(self._parse_mileage)
        data["age"] = data["year"].apply(self._parse_year_to_age)
        data["engine"] = data["engine"].apply(self._extract_engine_info)

        return data