In [1]:
# Importing important libraries
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd

In [2]:
# Load the raw dataset and drop the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export - verify).
df = pd.read_excel('data/dataset_2023.xlsx').drop(columns='Unnamed: 0')

# Small five-row sample used further down to try out the pipeline.
df_test = df.head(5)

In [3]:
def replaceComma(text):
    """Convert a number written with "," as decimal separator to dot notation.

    Every "." in the text is treated as a thousands separator and removed,
    then every "," is turned into ".", e.g. "1.234,56" -> "1234.56".
    Any other characters (units, currency symbols, ...) are left untouched.

    Parameters
    ----------
    text : object
        Value to convert; anything that is not a real number is converted
        with ``str`` first.

    Returns
    -------
    str
        The converted text. Real numeric inputs (Python or NumPy ints and
        floats) are returned as their plain string form, because their "."
        already is a decimal point and must not be stripped.
    """
    # A genuine number such as 5.4 must not be turned into "54".
    if isinstance(text, (int, float, np.number)):
        return str(text)
    # Removing "." is a no-op when there is none, so one expression covers
    # both the "has thousands separator" and the "comma only" case.
    return str(text).replace(".", "").replace(",", ".")

def removeLetters(text):
    """Strip everything except decimal digits and "." from ``text``.

    Parameters
    ----------
    text : object
        Value to clean; converted with ``str`` first so that non-string
        inputs are accepted as well.

    Returns
    -------
    str
        The remaining characters in their original order (possibly empty).
    """
    # str.isdecimal() is used instead of str.isdigit(): isdigit() also accepts
    # superscript/subscript digits (e.g. the "2" in "CO₂" or "m²"), which
    # float() cannot parse afterwards.
    return "".join(c for c in str(text) if c.isdecimal() or c == ".")

def removeThousandPoint(text):
    """Return ``str(text)`` with every "." character removed."""
    pieces = str(text).split(".")
    return "".join(pieces)

from datetime import datetime
from dateutil.relativedelta import relativedelta

def calculate_age(registration_date, today=None):
    """Return the age in whole months for a first-registration entry.

    Parameters
    ----------
    registration_date : str
        Either the literal "Neuwagen" or a month/year string in "%m/%Y"
        format (e.g. "03/2021"). Surrounding whitespace is ignored.
    today : datetime.datetime or datetime.date, optional
        Reference date the age is measured against. Defaults to
        ``datetime.now()``; pass a fixed value for reproducible results.

    Returns
    -------
    int
        0 for "Neuwagen", otherwise the number of calendar months between the
        registration month and ``today``. A registration month that lies in
        the future is reported as 0 instead of a negative age.

    Raises
    ------
    ValueError
        If a string value does not match the "%m/%Y" format.
    """
    if isinstance(registration_date, str):
        registration_date = registration_date.strip()
    if registration_date == "Neuwagen":
        return 0

    if today is None:
        today = datetime.now()

    registered = datetime.strptime(registration_date, "%m/%Y")
    # "%m/%Y" always parses to the first day of the month at midnight, so the
    # number of completed months is simply the calendar-month difference.
    months = (today.year - registered.year) * 12 + (today.month - registered.month)
    return max(months, 0)
    
def getConsumption(consumption):
    """Return the first whitespace-separated token of ``str(consumption)``.

    Raises ``IndexError`` when the string form contains no token at all.

    NOTE: a transformer class with the same name is defined later in this
    notebook; once that cell has run, ``getConsumption`` refers to the class.
    """
    tokens = str(consumption).split()
    return tokens[0]

In [4]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.base import BaseEstimator, TransformerMixin

class AgeTransformer(BaseEstimator, TransformerMixin):
    """Replace the "first_registration" column by the age in months.

    Parameters
    ----------
    reference_date : datetime.datetime, optional
        Date the age is measured against. When ``None`` (default) the current
        time is used, resolved once per ``transform`` call so that every row
        is measured against the same instant. Pass a fixed date to get
        reproducible output.
    """

    def __init__(self, reference_date=None):
        self.reference_date = reference_date

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with "first_registration" as float months.

        Entries equal to "Neuwagen" become 0.0; all other entries are parsed
        with the "%m/%Y" format and converted to the number of whole months
        between that date and the reference date.
        """
        # Resolve the reference date once instead of once per row.
        today = self.reference_date if self.reference_date is not None else datetime.now()

        def calculate_age(registration_date):
            if registration_date == "Neuwagen":
                return 0
            date = datetime.strptime(registration_date, "%m/%Y")
            delta = relativedelta(today, date)
            return (delta.years * 12) + delta.months

        X_copy = X.copy()
        X_copy["first_registration"] = (
            X_copy["first_registration"].apply(calculate_age).apply(float)
        )
        return X_copy
    
class getConsumption(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the "consumption" column into floats."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose "consumption" column holds floats.

        For every entry the first whitespace-separated token is kept, passed
        through ``replaceComma`` (defined elsewhere in this notebook) and then
        converted with ``float``.
        """
        def first_token(raw):
            # Keep only the leading token of the entry's string form.
            return str(raw).split()[0]

        parsed = (
            X["consumption"]
            .apply(first_token)
            .apply(replaceComma)
            .apply(float)
        )
        result = X.copy()
        result["consumption"] = parsed
        return result

class transformMilage(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the "milage" column into floats."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose "milage" column holds floats.

        Each entry goes through ``replaceComma`` and ``removeLetters`` (both
        defined elsewhere in this notebook) before ``float`` is applied.
        """
        parsed = (
            X["milage"]
            .apply(replaceComma)
            .apply(removeLetters)
            .apply(float)
        )
        result = X.copy()
        result["milage"] = parsed
        return result
    
class transformDuration(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the "duration" column into floats."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose "duration" column holds floats.

        Each entry goes through ``replaceComma`` and ``removeLetters`` (both
        defined elsewhere in this notebook) before ``float`` is applied.
        """
        parsed = (
            X["duration"]
            .apply(replaceComma)
            .apply(removeLetters)
            .apply(float)
        )
        result = X.copy()
        result["duration"] = parsed
        return result
    
class transformFee(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the "monthly_fee" column into floats."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose "monthly_fee" column holds floats.

        Each entry goes through ``replaceComma`` and ``removeLetters`` (both
        defined elsewhere in this notebook) before ``float`` is applied.
        """
        parsed = (
            X["monthly_fee"]
            .apply(replaceComma)
            .apply(removeLetters)
            .apply(float)
        )
        result = X.copy()
        result["monthly_fee"] = parsed
        return result
    
class transformEmission(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the "emission_value" column into floats."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose "emission_value" column holds floats.

        Each entry goes through ``replaceComma`` and ``removeLetters`` (both
        defined elsewhere in this notebook) before ``float`` is applied.
        """
        parsed = (
            X["emission_value"]
            .apply(replaceComma)
            .apply(removeLetters)
            .apply(float)
        )
        result = X.copy()
        result["emission_value"] = parsed
        return result
    
class transformHorsepower(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the "horsepower" column into floats."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose "horsepower" column holds floats.

        Each entry goes through ``replaceComma`` and ``removeLetters`` (both
        defined elsewhere in this notebook) before ``float`` is applied.
        """
        parsed = (
            X["horsepower"]
            .apply(replaceComma)
            .apply(removeLetters)
            .apply(float)
        )
        result = X.copy()
        result["horsepower"] = parsed
        return result
    
class transformKilowatts(BaseEstimator, TransformerMixin):
    """Stateless transformer that parses the "kilowatts" column into floats."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose "kilowatts" column holds floats.

        Each entry goes through ``replaceComma`` and ``removeLetters`` (both
        defined elsewhere in this notebook) before ``float`` is applied.
        """
        parsed = (
            X["kilowatts"]
            .apply(replaceComma)
            .apply(removeLetters)
            .apply(float)
        )
        result = X.copy()
        result["kilowatts"] = parsed
        return result
    
    
class passThrough(BaseEstimator, TransformerMixin):
    """Identity transformer: hands its input back without touching it."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` itself (the same object, not a copy)."""
        return X




In [5]:


class PandasTransformer(TransformerMixin, BaseEstimator):
    """Wrap a transformer so that ``transform`` returns a pandas DataFrame.

    Parameters
    ----------
    transformer : object
        Object exposing ``fit(X, y)`` and ``transform(X)``.

    Notes
    -----
    The output is labelled with ``X.columns``.
    NOTE(review): this assumes the wrapped transformer emits exactly the
    columns of ``X`` in the same order - confirm for the transformer in use.
    """

    def __init__(self, transformer):
        self.transformer = transformer

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` (and ``y``) and return ``self``."""
        self.transformer.fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` with the wrapped transformer and return a DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; its column labels (and, for array results, its index)
            are reused for the output.

        Returns
        -------
        pandas.DataFrame
        """
        transformed = self.transformer.transform(X)
        if isinstance(transformed, pd.DataFrame):
            # Result already carries its own index; keep the original handling.
            return pd.DataFrame(transformed, columns=X.columns)
        # Reuse X's index so the output rows stay aligned with X (and y)
        # instead of silently being renumbered from 0.
        return pd.DataFrame(transformed, columns=X.columns, index=X.index)


In [6]:
# One instance of every column cleaner used in the ColumnTransformer below.
age_transformer = AgeTransformer()
get_consumption = getConsumption()
pass_through = passThrough()
transform_milage = transformMilage()
transform_duration = transformDuration()
transform_fee = transformFee()
# NOTE(review): "transfrom" is misspelled; the name is kept unchanged here.
transfrom_emission = transformEmission()
transform_horsepower = transformHorsepower()
transform_kilowatts = transformKilowatts()

# Each entry is (step name, transformer, columns handed to that transformer).
# PandasTransformer labels the result with the input frame's columns, so the
# entries are presumably meant to follow df_test's column order - verify when
# columns are added or reordered.
column_transformer = ColumnTransformer(
    transformers=[
        ("brand_passthrough", pass_through, ["brand_name"]),
        ("model_name", pass_through, ["model_name"]),
        ("milage", transform_milage, ["milage"]),
        ("age_transformer", age_transformer, ["first_registration"]),
        ("duration", transform_duration, ["duration"]),
        ("gear", pass_through, ["gear"]),
        ("monthly_fee", transform_fee, ["monthly_fee"]),
        ("emission_value", transfrom_emission, ["emission_value"]),
        ("getConsumption", get_consumption, ["consumption"]),
        ("horsepower", transform_horsepower, ["horsepower"]),
        ("kilowatts", transform_kilowatts, ["kilowatts"]),
        ("fuel_type", pass_through, ["fuel_type"]),
    ]
)

# Wrap the column transformer so the pipeline returns a DataFrame.
pipeline = Pipeline(
    steps=[
        ("column_transformer", PandasTransformer(column_transformer)),
    ]
)

# Fit on the sample rows and transform them in one go.
transformed_df = pipeline.fit_transform(df_test)


In [7]:
# Show the transformed sample; as the cell's last expression it is rendered as the cell output.
transformed_df

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type
0,Skoda,Octavia ŠKODA Combi Style TDI DSG,201.0,2.0,48.0,Automatik,574.01,119.0,5.0,150.0,110.0,Diesel
1,Volkswagen,T-Cross VW Life TSI,201.0,2.0,48.0,Manuelle Schaltung,382.58,131.0,6.0,95.0,70.0,Benzin
2,Seat,Ibiza Austria Edition,15000.0,7.0,48.0,Manuelle Schaltung,239.62,120.0,5.0,80.0,59.0,Benzin
3,Volkswagen,Polo VW,1.0,4.0,48.0,Manuelle Schaltung,309.11,127.0,6.0,80.0,59.0,Benzin
4,Audi,A4 Avant 40 TDI quattro S line,105301.0,41.0,48.0,Automatik,587.75,138.0,5.0,190.0,140.0,Diesel
