# ZEB Ansatz Tobi 10.05.2023
Referenzen:
- https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f
- https://towardsdatascience.com/from-ml-model-to-ml-pipeline-9f95c32c6512


In [6]:
# Modules and packages
import numpy as np
import pandas as pd


In [15]:
# Machine learning pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import re
from datetime import datetime

set_config(display="diagram")

In [8]:
# Load the workbook and discard the 'Unnamed: 0' column in a single expression.
df = pd.read_excel('data/dataset_2023.xlsx').drop(columns='Unnamed: 0')

In [9]:
df.shape

(19058, 12)

In [10]:
df.head()

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type
0,Skoda,Octavia ŠKODA Combi Style TDI DSG,201 km,03/2023,48 Monat (anpassbar),Automatik,"574,01 €",119 g/km,"5,0 l/100 km",150 PS,110 kW,Diesel
1,Volkswagen,T-Cross VW Life TSI,201 km,03/2023,48 Monat (anpassbar),Manuelle Schaltung,"382,58 €",131 g/km,"6,0 l/100 km",95 PS,70 kW,Benzin
2,Seat,Ibiza Austria Edition,15.000 km,10/2022,48 Monat (anpassbar),Manuelle Schaltung,"239,62 €",120 g/km,"5,0 l/100 km",80 PS,59 kW,Benzin
3,Volkswagen,Polo VW,1 km,01/2023,48 Monat (anpassbar),Manuelle Schaltung,"309,11 €",127 g/km,"6,0 l/100 km",80 PS,59 kW,Benzin
4,Audi,A4 Avant 40 TDI quattro S line,105.301 km,12/2019,48 Monat (anpassbar),Automatik,"587,75 €",138 g/km,"5,0 l/100 km",190 PS,140 kW,Diesel


In [13]:
# 1. Partition data: 80 % of the rows for training, 20 % held out for testing.
seed = 123
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('monthly_fee', axis=1),  # features: every column except the target
    df['monthly_fee'],               # target column
    random_state=seed,               # fixed seed keeps the split reproducible
    test_size=0.2,
)

class UnitRemover(BaseEstimator, TransformerMixin):
    """Turn ``"<number> <unit>"`` cells into plain floats.

    The first numeric token of every cell is extracted and interpreted using
    German number formatting, which is what the dataset uses:

    * ``"15.000 km"``   -> ``15000.0``  (dot = thousands separator)
    * ``"574,01 €"``    -> ``574.01``   (comma = decimal separator)
    * ``"5,0 l/100 km"`` -> ``5.0``     (only the first number is used)

    Cells that already hold a number are passed through unchanged. Cells
    without any parseable number become ``NaN`` so that a downstream imputer
    can fill them (previously they were silently turned into ``'0'``).

    Note: a dot followed by exactly three digits (``"1.500"``) is always read
    as a thousands separator; any other dot (``"5.5"``) is read as a decimal
    point.
    """

    # First run of digits, optionally continued by '.'/',' separated digits.
    _NUMBER = re.compile(r'\b\d+(?:[.,]\d+)*')
    # Dot-grouped thousands such as "15.000" or "1.234.567".
    _DOT_THOUSANDS = re.compile(r'\d{1,3}(?:\.\d{3})+')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    @classmethod
    def _to_number(cls, value):
        """Return the first number found in *value* as a float, else NaN."""
        if isinstance(value, (int, float, np.integer, np.floating)):
            return float(value)  # already numeric (NaN stays NaN)
        match = cls._NUMBER.search(str(value))
        if match is None:
            return np.nan
        token = match.group()
        if ',' in token:
            # German style: dots group thousands, comma marks the decimals.
            token = token.replace('.', '').replace(',', '.')
        elif cls._DOT_THOUSANDS.fullmatch(token):
            token = token.replace('.', '')
        try:
            return float(token)
        except ValueError:
            # e.g. "1.2.3" -- not a number in any supported format
            return np.nan

    def transform(self, X, y=None):
        """Parse every cell of *X* and return a 2-D float array.

        Parameters
        ----------
        X : DataFrame or array-like
            Columns whose cells contain a number followed by a unit.
        y : ignored

        Returns
        -------
        numpy.ndarray of float, same shape as *X*; unparseable cells are NaN.
        """
        frame = pd.DataFrame(X).copy()  # never mutate the caller's data
        for col in frame.columns:
            frame[col] = frame[col].map(self._to_number)
        return frame.to_numpy(dtype=float)
        
class DateTransformer(BaseEstimator, TransformerMixin):
    """Convert ``"MM/YYYY"`` strings into an age measured in 30-day months.

    Parameters
    ----------
    reference_date : datetime.datetime or None, default=None
        Point in time the age is measured against. ``None`` keeps the
        original behaviour and uses ``datetime.now()`` at transform time;
        pass a fixed datetime to get reproducible features.
    """

    def __init__(self, reference_date=None):
        self.reference_date = reference_date

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    @staticmethod
    def _age_in_months(value, reference):
        """Return whole 30-day periods between *value* and *reference*.

        Anything that is not a parseable ``"MM/YYYY"`` string (NaN, None,
        numbers, malformed text) yields NaN instead of raising, so that a
        downstream imputer can handle it.
        """
        if not isinstance(value, str) or "/" not in value:
            return np.nan
        try:
            registered = datetime.strptime(value.strip(), "%m/%Y")
        except ValueError:
            return np.nan
        return (reference - registered).days // 30

    def transform(self, X, y=None):
        """Map every cell of *X* to its age in months.

        Parameters
        ----------
        X : DataFrame or array-like
            Columns holding ``"MM/YYYY"`` strings.
        y : ignored

        Returns
        -------
        numpy.ndarray of float, same shape as *X*; unparseable cells are NaN.
        """
        # Resolve the reference once so all rows are measured against the
        # same instant.
        reference = self.reference_date if self.reference_date is not None else datetime.now()
        frame = pd.DataFrame(X).copy()  # never mutate the caller's data
        for col in frame.columns:
            frame[col] = frame[col].map(lambda value: self._age_in_months(value, reference))
        return frame.to_numpy(dtype=float)

class ConsumptionTransformer(BaseEstimator, TransformerMixin):
    """Extract the leading number of cells such as ``"5,0 l/100 km"``.

    The first whitespace-separated token of each cell is taken and converted
    to a float, reading a comma as the decimal separator (``"5,0"`` -> 5.0).
    Empty, missing or non-numeric cells become ``NaN``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    @staticmethod
    def _leading_value(value):
        """Return the first token of *value* as a float, or NaN."""
        tokens = str(value).split()
        if not tokens:
            # empty / whitespace-only cell: there is no first token
            return np.nan
        try:
            return float(tokens[0].replace(',', '.'))
        except ValueError:
            return np.nan

    def transform(self, X, y=None):
        """Parse every cell of *X* and return a 2-D float array.

        Parameters
        ----------
        X : DataFrame or array-like
            Columns whose cells start with a (comma-decimal) number.
        y : ignored

        Returns
        -------
        numpy.ndarray of float, same shape as *X*; unparseable cells are NaN.
        """
        frame = pd.DataFrame(X).copy()  # never mutate the caller's data
        for col in frame.columns:
            frame[col] = frame[col].map(self._leading_value)
        # The original implementation fell off the end here and returned None.
        return frame.to_numpy(dtype=float)


In [None]:
# NOTE(review): unfinished cell -- the bare name `pre` is not defined in the
# visible code and would raise NameError if executed, unless it is defined
# elsewhere. Complete or remove.
pre

In [None]:
# pipe = Pipeline([
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)), 
#     ('model', LinearRegression())
# ])
# pipe.fit(X_train, y_train)

In [None]:
# Columns whose cells are "<number> <unit>" strings.
# The target 'monthly_fee' is intentionally NOT listed: it is removed from the
# feature frame when the data is partitioned, so selecting it as a feature
# would fail (and would leak the target into the model).
numeric_features = ['milage', 'emission_value', 'consumption', 'horsepower', 'kilowatts']

# strip units -> fill gaps with the column median -> standardise
numeric_transformer = Pipeline(steps=[
    ('unit_remover', UnitRemover()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ])

In [14]:
# GPT Antwort

import re
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Custom transformer to remove units from values
class UnitRemover(BaseEstimator, TransformerMixin):
    """Turn ``"<number> <unit>"`` cells into plain floats.

    The first numeric token of every cell is extracted and interpreted using
    German number formatting, which is what the dataset uses:

    * ``"15.000 km"``   -> ``15000.0``  (dot = thousands separator)
    * ``"574,01 €"``    -> ``574.01``   (comma = decimal separator)
    * ``"5,0 l/100 km"`` -> ``5.0``     (only the first number is used)

    Cells that already hold a number are passed through unchanged. Cells
    without any parseable number become ``NaN`` so that a downstream imputer
    can fill them (previously they were silently turned into ``'0'``).

    Note: a dot followed by exactly three digits (``"1.500"``) is always read
    as a thousands separator; any other dot (``"5.5"``) is read as a decimal
    point.
    """

    # First run of digits, optionally continued by '.'/',' separated digits.
    _NUMBER = re.compile(r'\b\d+(?:[.,]\d+)*')
    # Dot-grouped thousands such as "15.000" or "1.234.567".
    _DOT_THOUSANDS = re.compile(r'\d{1,3}(?:\.\d{3})+')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    @classmethod
    def _to_number(cls, value):
        """Return the first number found in *value* as a float, else NaN."""
        if isinstance(value, (int, float, np.integer, np.floating)):
            return float(value)  # already numeric (NaN stays NaN)
        match = cls._NUMBER.search(str(value))
        if match is None:
            return np.nan
        token = match.group()
        if ',' in token:
            # German style: dots group thousands, comma marks the decimals.
            token = token.replace('.', '').replace(',', '.')
        elif cls._DOT_THOUSANDS.fullmatch(token):
            token = token.replace('.', '')
        try:
            return float(token)
        except ValueError:
            # e.g. "1.2.3" -- not a number in any supported format
            return np.nan

    def transform(self, X, y=None):
        """Parse every cell of *X* and return a 2-D float array.

        Parameters
        ----------
        X : DataFrame or array-like
            Columns whose cells contain a number followed by a unit.
        y : ignored

        Returns
        -------
        numpy.ndarray of float, same shape as *X*; unparseable cells are NaN.
        """
        frame = pd.DataFrame(X).copy()  # never mutate the caller's data
        for col in frame.columns:
            frame[col] = frame[col].map(self._to_number)
        return frame.to_numpy(dtype=float)

# Custom transformer to calculate months from first registration to now
class DateTransformer(BaseEstimator, TransformerMixin):
    """Convert ``"MM/YYYY"`` strings into an age measured in 30-day months.

    Parameters
    ----------
    reference_date : datetime.datetime or None, default=None
        Point in time the age is measured against. ``None`` keeps the
        original behaviour and uses ``datetime.now()`` at transform time;
        pass a fixed datetime to get reproducible features.
    """

    def __init__(self, reference_date=None):
        self.reference_date = reference_date

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    @staticmethod
    def _age_in_months(value, reference):
        """Return whole 30-day periods between *value* and *reference*.

        Anything that is not a parseable ``"MM/YYYY"`` string (NaN, None,
        numbers, malformed text) yields NaN instead of raising, so that a
        downstream imputer can handle it.
        """
        if not isinstance(value, str) or "/" not in value:
            return np.nan
        try:
            registered = datetime.strptime(value.strip(), "%m/%Y")
        except ValueError:
            return np.nan
        return (reference - registered).days // 30

    def transform(self, X, y=None):
        """Map every cell of *X* to its age in months.

        Parameters
        ----------
        X : DataFrame or array-like
            Columns holding ``"MM/YYYY"`` strings.
        y : ignored

        Returns
        -------
        numpy.ndarray of float, same shape as *X*; unparseable cells are NaN.
        """
        # Resolve the reference once so all rows are measured against the
        # same instant.
        reference = self.reference_date if self.reference_date is not None else datetime.now()
        frame = pd.DataFrame(X).copy()  # never mutate the caller's data
        for col in frame.columns:
            frame[col] = frame[col].map(lambda value: self._age_in_months(value, reference))
        return frame.to_numpy(dtype=float)

# Define preprocessing for numeric columns (remove units and scale values).
# The target 'monthly_fee' is not a feature and therefore is not listed here.
numeric_features = ['milage', 'emission_value', 'consumption', 'horsepower', 'kilowatts']
numeric_transformer = Pipeline(steps=[
    ('unit_remover', UnitRemover()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for date columns
date_features = ['first_registration']
date_transformer = Pipeline(steps=[
    ('date_transformer', DateTransformer()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical features
categorical_features = ['brand_name', 'model_name', 'duration', 'gear', 'fuel_type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('date', date_transformer, date_features),
        ('cat', categorical_transformer, categorical_features)])

# Append the estimator to the preprocessing pipeline.
# The monthly fee is a continuous amount, so a regressor (not a classifier)
# is required. The variable keeps its name `clf` so later cells still work.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor())])

# Features: every column except the target. (The data has no 'ID' column, so
# the former `df.drop('ID', axis=1)` raised a KeyError.)
X = df.drop(columns=['monthly_fee'])

# Target: parse strings such as "574,01 €" / "1.234,56 €" into floats.
# NOTE(review): assumes 'monthly_fee' holds German-formatted strings, as in
# the sample rows -- confirm before reusing this on pre-parsed numeric data.
y = pd.to_numeric(
    df['monthly_fee'].astype(str)
    .str.extract(r'(\d[\d.]*(?:,\d+)?)', expand=False)
    .str.replace('.', '', regex=False)   # drop thousands separators
    .str.replace(',', '.', regex=False),  # decimal comma -> decimal point
    errors='coerce',
)

# Rows without a usable target cannot be learned from.
has_target = y.notna()
X, y = X[has_target], y[has_target]

clf.fit(X, y)

# Now you can use clf to predict new instances
# y_pred = clf.predict(X_new)