# Sklearn Pipelines and Transformers
## 2022-11-30

In [2]:
import pandas as pd

In [24]:
# Load the car-details CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv("./data/Car details V3.csv")

In [31]:
# Drop every row that has at least one missing value. Rebinding `data`
# (instead of `inplace=True`) avoids mutating a frame in place, which makes
# the cell safer to re-run and keeps the step chainable.
data = data.dropna()

In [25]:
# Show the current contents of `data` as the cell output.
data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,113.7Nm@ 4000rpm,5.0
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,19.3 kmpl,1248 CC,73.9 bhp,190Nm@ 2000rpm,5.0
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396 CC,70 bhp,140Nm@ 1800-3000rpm,5.0


In [26]:
# Engineer car brand and model
def extract_car_brand(name: str) -> str:
    """Return the first whitespace-separated token of ``name`` (the brand).

    Raises:
        IndexError: If ``name`` is empty or contains only whitespace.
    """
    tokens = name.split(maxsplit=1)
    return tokens[0]


def extract_car_model(name: str) -> str:
    """Return the second whitespace-separated token of ``name`` (the model).

    Raises:
        IndexError: If ``name`` has fewer than two tokens.
    """
    tokens = name.split(maxsplit=2)
    return tokens[1]


# Derive the brand and model columns from the free-text `name` column.
data["brand"] = data["name"].map(extract_car_brand)
data["model"] = data["name"].map(extract_car_model)

In [27]:
# Show `data` with the newly added `brand` and `model` columns.
data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,brand,model
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,113.7Nm@ 4000rpm,5.0,Hyundai,i20
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0,Hyundai,Verna
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,19.3 kmpl,1248 CC,73.9 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396 CC,70 bhp,140Nm@ 1800-3000rpm,5.0,Tata,Indigo


In [32]:
# extract engine info
def extract_engine_info(engine_info: str) -> int:
    """Return the leading number of a string such as ``"1248 CC"`` as an int.

    Raises:
        IndexError: If ``engine_info`` contains no tokens.
        ValueError: If the first token is not an integer literal.
    """
    leading_token = engine_info.split()[0]
    return int(leading_token)


# Replace strings such as "1248 CC" with their integer part.
data["engine"] = data["engine"].map(extract_engine_info)

In [33]:
# Show `data` after the `engine` column has been converted to numbers.
data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,brand,model
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197,82.85 bhp,113.7Nm@ 4000rpm,5.0,Hyundai,i20
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0,Hyundai,Verna
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,19.3 kmpl,1248,73.9 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396,70 bhp,140Nm@ 1800-3000rpm,5.0,Tata,Indigo


In [37]:
# parse mileage to standardised number


def parse_mileage(mileage: str, km_per_kg_factor: float = 1.4) -> float:
    """Convert a mileage string such as ``"23.4 kmpl"`` to a plain number.

    Args:
        mileage: A ``"<number> <unit>"`` string with exactly two
            whitespace-separated tokens.
        km_per_kg_factor: Multiplier applied to readings whose unit is
            ``"km/kg"`` so they sit on the same scale as the other readings.
            Defaults to 1.4, the value previously hard-coded here.
            NOTE(review): the origin of 1.4 is not documented in this
            notebook -- confirm it before relying on the converted values.

    Returns:
        The numeric reading, multiplied by ``km_per_kg_factor`` when the unit
        is ``"km/kg"`` and returned unchanged for any other unit.

    Raises:
        ValueError: If ``mileage`` does not split into exactly two tokens or
            the first token is not a valid float.
    """
    measurement, measure_type = mileage.split()
    value = float(measurement)
    if measure_type == "km/kg":
        return value * km_per_kg_factor
    return value


# Turn the "<number> <unit>" strings into floats on a single scale.
data["mileage"] = data["mileage"].map(parse_mileage)

data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,brand,model,tmp
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.40,1248,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid,kmpl
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.70,1497,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City,kmpl
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.00,1396,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20,kmpl
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.10,1298,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift,kmpl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.50,1197,82.85 bhp,113.7Nm@ 4000rpm,5.0,Hyundai,i20,kmpl
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.80,1493,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0,Hyundai,Verna,kmpl
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,19.30,1248,73.9 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57,1396,70 bhp,140Nm@ 1800-3000rpm,5.0,Tata,Indigo,kmpl


In [39]:
def parse_owner(owner: str) -> int:
    """Translate an ownership label into its ordinal rank.

    The rank is the label's position in ``ranked_labels`` (0 for
    "Test Drive Car" up to 4 for "Fourth & Above Owner").

    Raises:
        KeyError: If ``owner`` is not one of the known labels.
    """
    ranked_labels = (
        "Test Drive Car",
        "First Owner",
        "Second Owner",
        "Third Owner",
        "Fourth & Above Owner",
    )
    rank_by_label = {label: rank for rank, label in enumerate(ranked_labels)}
    return rank_by_label[owner]


# Encode the textual ownership labels as integers.
data["owner"] = data["owner"].map(parse_owner)

data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,brand,model,tmp
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,1,23.40,1248,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid,kmpl
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,3,17.70,1497,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City,kmpl
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,1,23.00,1396,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20,kmpl
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,1,16.10,1298,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift,kmpl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,1,18.50,1197,82.85 bhp,113.7Nm@ 4000rpm,5.0,Hyundai,i20,kmpl
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,4,16.80,1493,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0,Hyundai,Verna,kmpl
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,1,19.30,1248,73.9 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,1,23.57,1396,70 bhp,140Nm@ 1800-3000rpm,5.0,Tata,Indigo,kmpl


In [40]:
# parse year to age


def parse_year_to_age(year: int, reference_year: int = 2022) -> int:
    """Return the age of a car in years.

    Args:
        year: The value from the ``year`` column.
        reference_year: The year the age is measured against. Defaults to
            2022, the value previously hard-coded here; pass another year to
            recompute ages without editing the function.

    Returns:
        ``reference_year - year``.
    """
    return reference_year - year


# Add the car's age, derived from the `year` column.
data["age"] = data["year"].map(parse_year_to_age)

data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,brand,model,tmp,age
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,1,23.40,1248,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl,8
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid,kmpl,8
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,3,17.70,1497,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City,kmpl,16
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,1,23.00,1396,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20,kmpl,12
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,1,16.10,1298,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift,kmpl,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,1,18.50,1197,82.85 bhp,113.7Nm@ 4000rpm,5.0,Hyundai,i20,kmpl,9
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,4,16.80,1493,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0,Hyundai,Verna,kmpl,15
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,1,19.30,1248,73.9 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl,13
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,1,23.57,1396,70 bhp,140Nm@ 1800-3000rpm,5.0,Tata,Indigo,kmpl,9


In [43]:
# Keep only the numeric part of strings such as "74 bhp".
data["max_power"] = data["max_power"].map(
    lambda raw_power: float(raw_power.split()[0])
)
data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,brand,model,tmp,age
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,1,23.40,1248,74.00,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl,8
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498,103.52,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid,kmpl,8
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,3,17.70,1497,78.00,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City,kmpl,16
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,1,23.00,1396,90.00,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20,kmpl,12
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,1,16.10,1298,88.20,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift,kmpl,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,1,18.50,1197,82.85,113.7Nm@ 4000rpm,5.0,Hyundai,i20,kmpl,9
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,4,16.80,1493,110.00,"24@ 1,900-2,750(kgm@ rpm)",5.0,Hyundai,Verna,kmpl,15
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,1,19.30,1248,73.90,190Nm@ 2000rpm,5.0,Maruti,Swift,kmpl,13
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,1,23.57,1396,70.00,140Nm@ 1800-3000rpm,5.0,Tata,Indigo,kmpl,9


In [5]:
from typing import Any, List, Optional

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [6]:
# Reload the CSV so the transformer classes below operate on the original,
# unparsed columns rather than on the frame modified by the earlier cells.
data = pd.read_csv("./data/Car details V3.csv")

In [7]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Minimal transformer skeleton: learns nothing and returns its input as is."""

    def __init__(self) -> None:
        """Create the transformer; there are no parameters to store."""

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` exactly as it was received."""
        return X

In [8]:
class CarModelExtractionTransformer(BaseEstimator, TransformerMixin):
    """Add a ``model`` column, keeping only the most frequent car models.

    The model is the second whitespace-separated token of the ``name`` column.
    ``fit`` records the most frequent models seen in the data it is given;
    ``transform`` writes the ``model`` column and replaces every model that was
    not recorded with ``"Other"``.
    """

    def __init__(self) -> None:
        # Model names retained by ``transform``. The attribute says "brands"
        # but the values are car models; the name is kept unchanged so code
        # that already reads it keeps working.
        self._popular_brands: List[str] = []

    @staticmethod
    def _extract_model(name: Any) -> Optional[str]:
        """Return the second token of ``name``, or ``None`` if there is none.

        Non-string values (e.g. NaN for a missing name) and names with fewer
        than two tokens yield ``None`` instead of raising, so a single odd row
        cannot abort ``fit`` or ``transform``.
        """
        if not isinstance(name, str):
            return None
        tokens = name.split()
        return tokens[1] if len(tokens) > 1 else None

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None, max_models_count: int = 30):
        """Record the ``max_models_count`` most frequent models in ``X``.

        Args:
            X: Frame with a ``name`` column.
            y: Ignored; accepted for API compatibility.
            max_models_count: How many of the most frequent models to keep.

        Returns:
            ``self``.
        """
        data = X.copy()
        data["model"] = data["name"].apply(self._extract_model)

        # groupby skips rows whose model is None, so unparseable names never
        # occupy one of the "popular" slots.
        model_counts = data.groupby("model").size().sort_values(ascending=False)
        self._popular_brands = model_counts.index[:max_models_count].tolist()
        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """Return a copy of ``X`` with a ``model`` column added.

        Models not recorded by ``fit`` -- including names from which no model
        could be extracted -- are set to ``"Other"``.
        """
        data = X.copy()
        data["model"] = data["name"].apply(self._extract_model)
        data.loc[~data["model"].isin(self._popular_brands), "model"] = "Other"
        return data

In [9]:
class CarTechnicalInfoTransformer(BaseEstimator, TransformerMixin):
    """Turn the textual technical columns into numbers and add an ``age`` column.

    ``transform`` parses ``max_power``, ``mileage`` and ``engine`` (strings made
    of a number followed by a unit) and derives ``age`` from ``year``. Values
    that are not strings, such as NaN, pass through unchanged. Strings that
    cannot be parsed become NaN in all three columns, so one malformed row
    does not abort a whole ``fit`` or ``predict`` call.

    Args:
        reference_year: Year that ``age`` is measured against. Defaults to
            2022, the value previously hard-coded in this class.
    """

    # Multiplier applied to mileage readings whose unit is "km/kg".
    # NOTE(review): the origin of 1.4 is not documented in this notebook.
    KM_PER_KG_FACTOR = 1.4

    def __init__(self, reference_year: int = 2022) -> None:
        self.reference_year = reference_year

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def _extract_engine_info(self, engine_info: Any) -> Any:
        """Return the leading integer of a string such as ``"1248 CC"``."""
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses.
        if not isinstance(engine_info, str):
            return engine_info
        try:
            return int(engine_info.split()[0])
        except (IndexError, ValueError):
            # Empty string or non-numeric first token.
            return float("nan")

    def _parse_mileage(self, mileage: Any) -> Any:
        """Return the numeric mileage, scaling ``"km/kg"`` readings."""
        if not isinstance(mileage, str):
            return mileage
        try:
            # Unpacking raises ValueError unless there are exactly two tokens.
            measurement, measure_type = mileage.split()
            value = float(measurement)
        except ValueError:
            return float("nan")
        if measure_type == "km/kg":
            return value * self.KM_PER_KG_FACTOR
        return value

    def _extract_max_power(self, power: Any) -> Any:
        """Return the leading float of a string such as ``"74 bhp"``."""
        if not isinstance(power, str):
            return power
        try:
            return float(power.split()[0])
        except (IndexError, ValueError):
            # Empty string, or a unit with no number in front of it.
            return float("nan")

    def _parse_year_to_age(self, year: int) -> int:
        """Return ``reference_year - year``."""
        return self.reference_year - year

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """Return a copy of ``X`` with parsed columns and a new ``age`` column."""
        data = X.copy()
        data["max_power"] = data["max_power"].apply(self._extract_max_power)
        data["mileage"] = data["mileage"].apply(self._parse_mileage)
        data["age"] = data["year"].apply(self._parse_year_to_age)
        data["engine"] = data["engine"].apply(self._extract_engine_info)

        return data

In [10]:
class CarOwnerParsingTransformer(BaseEstimator, TransformerMixin):
    """Replace the textual ``owner`` labels with ordinal integer codes."""

    # Label -> code. Defined once on the class instead of being rebuilt on
    # every ``transform`` call.
    OWNER_CODES = {
        "Test Drive Car": 0,
        "First Owner": 1,
        "Second Owner": 2,
        "Third Owner": 3,
        "Fourth & Above Owner": 4,
    }

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def _encode_owner(self, owner: Any) -> Any:
        """Return the code for ``owner``; missing values pass through.

        Raises:
            KeyError: If ``owner`` is a non-missing value that is not a key of
                ``OWNER_CODES``.
        """
        # A missing owner is left as-is so that a later imputation step can
        # fill it, instead of failing with ``KeyError: nan``.
        if pd.isna(owner):
            return owner
        return self.OWNER_CODES[owner]

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """Return a copy of ``X`` whose ``owner`` column holds the codes."""
        data = X.copy()
        data["owner"] = data["owner"].apply(self._encode_owner)
        return data

In [11]:
# Fit on the whole frame and add the `model` column; models outside the most
# frequent ones become "Other".
tr = CarModelExtractionTransformer()
data = tr.fit_transform(data)

In [12]:
# Parse `max_power`, `mileage` and `engine` into numbers and add the `age` column.
tr = CarTechnicalInfoTransformer()
data = tr.fit_transform(data)

In [13]:
# Replace the textual `owner` labels with their integer codes.
tr = CarOwnerParsingTransformer()
data = tr.fit_transform(data)

In [14]:
# Show `data` after the three transformers have been applied in turn.
data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,model,age
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,1,23.40,1248.0,74.00,190Nm@ 2000rpm,5.0,Swift,8
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498.0,103.52,250Nm@ 1500-2500rpm,5.0,Other,8
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,3,17.70,1497.0,78.00,"12.7@ 2,700(kgm@ rpm)",5.0,City,16
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,1,23.00,1396.0,90.00,22.4 kgm at 1750-2750rpm,5.0,i20,12
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,1,16.10,1298.0,88.20,"11.5@ 4,500(kgm@ rpm)",5.0,Swift,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,1,18.50,1197.0,82.85,113.7Nm@ 4000rpm,5.0,i20,9
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,4,16.80,1493.0,110.00,"24@ 1,900-2,750(kgm@ rpm)",5.0,Verna,15
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,1,19.30,1248.0,73.90,190Nm@ 2000rpm,5.0,Swift,13
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,1,23.57,1396.0,70.00,140Nm@ 1800-3000rpm,5.0,Indigo,9


### Pipeline definition

In [15]:
# Start again from the raw CSV: the pipeline below performs all parsing itself.
data = pd.read_csv("./data/Car details V3.csv")

# Fix the seed so the split, and therefore the metrics printed further down,
# are the same every time the notebook is re-run.
train_data, test_data = train_test_split(data, random_state=42)

In [16]:
# Continuous features: fill gaps with the column median, then standardise.
cont_pipeline = Pipeline(
    [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" encodes a category that was absent from the
# training split as all zeros instead of raising at predict time.
cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encode", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [17]:
# Columns that are one-hot encoded.
cat_features = [
    "fuel",
    "seller_type",
    "transmission",
    "model",
]

# Columns that are imputed and scaled as numbers.
cont_features = [
    "km_driven",
    "owner",
    "mileage",
    "engine",
    "max_power",
    "seats",
    "age",
]

# Regression target.
label = "selling_price"

In [18]:
# Column-wise preprocessing: each sub-pipeline only sees its own feature list.
feature_preprocessor = ColumnTransformer(
    transformers=[
        ("cont_transformer", cont_pipeline, cont_features),
        ("cat_transformer", cat_pipeline, cat_features),
    ]
)

# The three custom transformers run first because they create or parse the
# columns (`model`, `age`, `owner`, ...) that the ColumnTransformer selects.
pipeline = Pipeline(
    steps=[
        ("model_extraction", CarModelExtractionTransformer()),
        ("technical_info_extraction", CarTechnicalInfoTransformer()),
        ("owner_extraction", CarOwnerParsingTransformer()),
        ("ColumnTransformer", feature_preprocessor),
        ("model", LinearRegression()),
    ]
)

In [19]:
# Fit every pipeline step on the training split. `train_data` still contains
# the `selling_price` column, but the ColumnTransformer step is only handed
# `cont_features` and `cat_features`, neither of which lists it.
pipeline.fit(train_data, train_data[label].values)

In [20]:
# Score the fitted pipeline on the held-out split.
predictions = pipeline.predict(test_data)
for metric_name, metric_fn in (("MAE", mean_absolute_error), ("MSE", mean_squared_error)):
    print(metric_name, metric_fn(test_data[label].values, predictions))

MAE 240054.33055278272
MSE 172439858118.75684


In [21]:
import cloudpickle

# Serialise the fitted pipeline. The `with` block closes (and flushes) the
# file handle deterministically instead of leaving that to garbage collection.
with open("model.pkl", "wb") as model_file:
    cloudpickle.dump(pipeline, model_file)

In [1]:
import cloudpickle

# NOTE: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
# The `with` block makes sure the file handle is closed after loading.
with open("./model.pkl", "rb") as model_file:
    loaded_pipeline = cloudpickle.load(model_file)

loaded_pipeline