# ZEB Project 10.05.2023
Referenzen:
- https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f
- https://towardsdatascience.com/from-ml-model-to-ml-pipeline-9f95c32c6512


In [1]:
# Modules and packages
import numpy as np
import pandas as pd


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import re
from datetime import datetime

In [6]:
df = pd.read_excel('data/dataset_2023.xlsx')
df = df.drop('Unnamed: 0', axis=1)

df.head()

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type
0,Skoda,Octavia ŠKODA Combi Style TDI DSG,201 km,03/2023,48 Monat (anpassbar),Automatik,"574,01 €",119 g/km,"5,0 l/100 km",150 PS,110 kW,Diesel
1,Volkswagen,T-Cross VW Life TSI,201 km,03/2023,48 Monat (anpassbar),Manuelle Schaltung,"382,58 €",131 g/km,"6,0 l/100 km",95 PS,70 kW,Benzin
2,Seat,Ibiza Austria Edition,15.000 km,10/2022,48 Monat (anpassbar),Manuelle Schaltung,"239,62 €",120 g/km,"5,0 l/100 km",80 PS,59 kW,Benzin
3,Volkswagen,Polo VW,1 km,01/2023,48 Monat (anpassbar),Manuelle Schaltung,"309,11 €",127 g/km,"6,0 l/100 km",80 PS,59 kW,Benzin
4,Audi,A4 Avant 40 TDI quattro S line,105.301 km,12/2019,48 Monat (anpassbar),Automatik,"587,75 €",138 g/km,"5,0 l/100 km",190 PS,140 kW,Diesel


In [7]:
def replaceComma(text):
    if "." in str(text): 
        return str("".join(str(text).split("."))).replace(",",".")
    else:
        return str(text).replace(",",".")

def removeLetters(text):
    return "".join(c for c in text if c.isdigit() or c == ".")

def removeThousandPoint(text):
    return str(text).replace(".","")

In [15]:
# Custom transformer to remove units from values
class UnitRemover(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        for col in X_.columns:
            X_[col] = X_[col].apply(lambda x: re.findall(r'\b\d+\.?\d*\b', str(x))[0] if re.findall(r'\b\d+\.?\d*\b', str(x)) else '0')
        return X_

# Custom transformer to calculate months from first registration to now
class DateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        for col in X_.columns:
            X_[col] = X_[col].apply(lambda x: (datetime.now() - datetime.strptime(x, "%m/%Y")).days // 30 if "/" in x else np.nan)
        return X_

# Define preprocessing for numeric columns (remove units and scale values)
numeric_features = ['milage', 'monthly_fee', 'emission_value', 'consumption', 'horsepower', 'kilowatts']
numeric_transformer = Pipeline(steps=[
    ('unit_remover', UnitRemover()),
    ('imputer', SimpleImputer(strategy='mean'))
])

# Define preprocessing for date columns
date_features = ['first_registration']
date_transformer = Pipeline(steps=[
    ('date_transformer', DateTransformer()),
    ('imputer', SimpleImputer(strategy='mean'))
])

# Define preprocessing for categorical features
categorical_features = ['brand_name', 'model_name', 'duration', 'gear', 'fuel_type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('date', date_transformer, date_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor)
                      ])

In [16]:
df_new=clf.fit_transform(df)

In [23]:
df_new_df = pd.DataFrame(df_new)

In [24]:
df_new_df

Unnamed: 0,0,1,2,3,4,5,6
0,201.000,574.0,119.0,5.0,150.0,110.0,2.0
1,201.000,382.0,131.0,6.0,95.0,70.0,2.0
2,15.000,239.0,120.0,5.0,80.0,59.0,7.0
3,1.000,309.0,127.0,6.0,80.0,59.0,4.0
4,105.301,587.0,138.0,5.0,190.0,140.0,41.0
...,...,...,...,...,...,...,...
19053,201.000,692.0,146.0,6.0,150.0,110.0,4.0
19054,201.000,574.0,187.0,8.0,150.0,110.0,2.0
19055,105.301,587.0,143.0,6.0,190.0,140.0,41.0
19056,18.903,256.0,40.0,2.0,80.0,59.0,35.0


In [None]:

from dateutil.relativedelta import relativedelta

class AgeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        def calculate_age(registration_date):
            if registration_date == "Neuwagen":
                return 0
            else:
                today = datetime.now()
                date = datetime.strptime(registration_date, "%m/%Y")
                delta = relativedelta(today, date)
                return (delta.years * 12) + delta.months
        
        X_copy = X.copy()
        X_copy["first_registration"] = X_copy["first_registration"].apply(calculate_age)
        X_copy["first_registration"] = X_copy["first_registration"].apply(float) 
        return X_copy
    
class getConsumption(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        def getConsumption(consumption):
            return str(consumption).split()[0]
        
        X_copy = X.copy()
        X_copy['consumption'] = X_copy['consumption'].apply(getConsumption)
        X_copy["consumption"] = X_copy["consumption"].apply(replaceComma)
        X_copy["consumption"] = X_copy["consumption"].apply(float)  
        return X_copy

class transformMilage(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        X_copy = X.copy()
        X_copy["milage"] = X_copy["milage"].apply(replaceComma)
        X_copy["milage"] = X_copy["milage"].apply(removeLetters)   
        X_copy["milage"] = X_copy["milage"].apply(float) 
        return X_copy
    
class transformDuration (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        X_copy = X.copy()
        X_copy["duration"] = X_copy["duration"].apply(replaceComma)
        X_copy["duration"] = X_copy["duration"].apply(removeLetters)   
        X_copy["duration"] = X_copy["duration"].apply(float) 
        return X_copy    
    
class transformFee (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        X_copy = X.copy()
        X_copy["monthly_fee"] = X_copy["monthly_fee"].apply(replaceComma)
        X_copy["monthly_fee"] = X_copy["monthly_fee"].apply(removeLetters)   
        X_copy["monthly_fee"] = X_copy["monthly_fee"].apply(float)  
        return X_copy 
    
class transformEmission (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        X_copy = X.copy()
        X_copy["emission_value"] = X_copy["emission_value"].apply(replaceComma)
        X_copy["emission_value"] = X_copy["emission_value"].apply(removeLetters)   
        X_copy["emission_value"] = X_copy["emission_value"].apply(float) 
        return X_copy 
    
class transformHorsepower (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        X_copy = X.copy()
        X_copy["horsepower"] = X_copy["horsepower"].apply(replaceComma)
        X_copy["horsepower"] = X_copy["horsepower"].apply(removeLetters)
        X_copy["horsepower"] = X_copy["horsepower"].apply(float)    
        return X_copy 
    
class transformKilowatts (BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        X_copy = X.copy()
        X_copy["kilowatts"] = X_copy["kilowatts"].apply(replaceComma)
        X_copy["kilowatts"] = X_copy["kilowatts"].apply(removeLetters) 
        X_copy["kilowatts"] = X_copy["kilowatts"].apply(float)    
        return X_copy 
    
    
class passThrough(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X


