# ZEB Projekt
## Stand 2023_05_05

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer

In [4]:
df = pd.read_excel('data/dataset_2023.xlsx')
df = df.drop('Unnamed: 0', axis=1)

In [7]:
def replaceComma(text):
    if "." in str(text): 
        return str("".join(str(text).split("."))).replace(",",".")
    else:
        return str(text).replace(",",".")

def removeLetters(text):
    return "".join(c for c in text if c.isdigit() or c == ".")

def removeThousandPoint(text):
    return str(text).replace(".","")

from datetime import datetime
from dateutil.relativedelta import relativedelta

def calculate_age(registration_date):
    if registration_date == "Neuwagen":
        return 0
    else:
        today = datetime.now()
        date = datetime.strptime(registration_date, "%m/%Y")
        delta = relativedelta(today, date)
        return (delta.years * 12) + delta.months
    
def getConsumption(consumption):
    return str(consumption).split()[0]    

In [6]:
columnsNumeric = ["milage", "duration", "monthly_fee", "horsepower", "emission_value", "kilowatts", "consumption"]
def preProcess(columnsNumeric, df):
    
    # calculating age
    
    df['first_registration'] = df['first_registration'].apply(calculate_age)
    
    
    # removing unnecesary characters from consumption
    
    df['consumption'] = df['consumption'].apply(getConsumption)
    
    
    for i in range(len(columnsNumeric)):
        
        
        df[columnsNumeric[i]] = df[columnsNumeric[i]].apply(replaceComma)
        df[columnsNumeric[i]] = df[columnsNumeric[i]].apply(removeLetters)
    
    # removing thousand point from milage column
    
    df["milage"] = df["milage"].apply(removeThousandPoint)
    
    
    # replacing "" and "nan" values from the different 

    df = df.replace('', np.nan)
    df = df.replace('nan', np.nan)
    
    

    
    df[df.columns.difference(['brand_name', 'model_name','fuel_type', 'gear' ])] = df[df.columns.difference(['brand_name', 'model_name','fuel_type', 'gear' ])].astype(float)


    return df

In [8]:
def preprocessor(df):
    columnsNumeric = ["milage", "duration", "monthly_fee", "horsepower", "emission_value", "kilowatts", "consumption"]
    
    df = preProcess(columnsNumeric, df)
    
    return df

In [9]:
preprocess = FunctionTransformer(preprocessor)

In [10]:
# Define categorical features
categorical_features = ['brand_name', 'model_name', 'gear', 'fuel_type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [11]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', preprocess, columnsNumeric),
        ('cat', categorical_transformer, categorical_features)])

In [47]:
# Define model
model = DecisionTreeRegressor()


In [46]:
# X = df.drop('monthly_fee', axis=1) # assume we want to predict 'monthly_fee'
# y = pd.DataFrame(df['monthly_fee'])
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

def split(df):
    X = df.drop('monthly_fee', axis=1) # assume we want to predict 'monthly_fee'
    y = pd.DataFrame(df['monthly_fee'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    return(X_train, X_test, y_train, y_test)

In [48]:
# Combine preprocessing and model training steps into a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [49]:
pipeline.fit(df)

KeyError: 'first_registration'

In [19]:
X_train

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,emission_value,consumption,horsepower,kilowatts,fuel_type
13460,Audi,A1 Sportback 25 TFSI intense,3.000 km,11/2022,48 Monat (anpassbar),Manuelle Schaltung,114 g/km,"5,0 l/100 km",95 PS,70 kW,Benzin
15717,Seat,Ibiza FR Austria 1.0 TSI,24.000 km,03/2022,48 Monat (anpassbar),Manuelle Schaltung,131 g/km,"6,0 l/100 km",95 PS,70 kW,Benzin
5620,BMW,X3 M Competition Individual,13.000 km,08/2021,36 Monat (anpassbar),Automatik,130 g/km,"5,0 l/100 km",510 PS,375 kW,Benzin
6901,BMW,520 d Touring G31*M-Paket*Leder*HeadUP*19Zoll*,101.100 km,03/2019,48 Monat (anpassbar),Automatik,189 g/km,"7,0 l/100 km",190 PS,140 kW,Benzin
12505,Seat,Leon SP Kombi Reference 1.0 TSI,5.000 km,01/2023,48 Monat (anpassbar),Manuelle Schaltung,230 g/km,"10,0 l/100 km",90 PS,66 kW,Benzin
...,...,...,...,...,...,...,...,...,...,...,...
9225,Volkswagen,Touran VW Comfortline TDI SCR DSG 5-Sitzer,85.600 km,11/2018,48 Monat (anpassbar),Automatik,129 g/km,"5,0 l/100 km",116 PS,85 kW,Diesel
13123,Volkswagen,T-Cross VW Life TSI,10 km,05/2022,48 Monat (anpassbar),Manuelle Schaltung,,,95 PS,70 kW,Benzin
9845,Opel,"Insignia ST 1.6 CDTI Aut. LED,Navi,AGR,SHZ",44.043 km,04/2020,48 Monat (anpassbar),Automatik,138 g/km,"6,0 l/100 km",136 PS,100 kW,Diesel
10799,Volkswagen,Golf VW Variant Life TDI,5.000 km,01/2023,48 Monat (anpassbar),Manuelle Schaltung,42 g/km,"2,0 l/100 km",116 PS,85 kW,Diesel


In [None]:
print("Model score: %.3f" % pipeline.score(X_test, y_test))