 
# Leasing Price Predictor
**Data Science Lab**
  - Tobias Ponesch
  - Sina Haghgoo
  - Finnian John Dempsey
  - Adrian Lehrner
---

## Libraries

In [2]:
import pandas as pd
import numpy as np

import pickle
#from compress_pickle import dump, load
import warnings

from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator

***

## Import the dataset

In [46]:
df = pd.read_excel('data/dataset_2023.xlsx')


In [47]:
df = df.drop('Unnamed: 0', axis = 1)
df

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type
0,Skoda,Octavia ŠKODA Combi Style TDI DSG,201 km,03/2023,48 Monat (anpassbar),Automatik,"574,01 €",119 g/km,"5,0 l/100 km",150 PS,110 kW,Diesel
1,Volkswagen,T-Cross VW Life TSI,201 km,03/2023,48 Monat (anpassbar),Manuelle Schaltung,"382,58 €",131 g/km,"6,0 l/100 km",95 PS,70 kW,Benzin
2,Seat,Ibiza Austria Edition,15.000 km,10/2022,48 Monat (anpassbar),Manuelle Schaltung,"239,62 €",120 g/km,"5,0 l/100 km",80 PS,59 kW,Benzin
3,Volkswagen,Polo VW,1 km,01/2023,48 Monat (anpassbar),Manuelle Schaltung,"309,11 €",127 g/km,"6,0 l/100 km",80 PS,59 kW,Benzin
4,Audi,A4 Avant 40 TDI quattro S line,105.301 km,12/2019,48 Monat (anpassbar),Automatik,"587,75 €",138 g/km,"5,0 l/100 km",190 PS,140 kW,Diesel
...,...,...,...,...,...,...,...,...,...,...,...,...
19053,Seat,Ateca FR 2.0 TDI DSG 4Drive,201 km,01/2023,48 Monat (anpassbar),Automatik,"692,03 €",146 g/km,"6,0 l/100 km",150 PS,110 kW,Diesel
19054,Skoda,Octavia ŠKODA Combi Style TDI DSG,201 km,03/2023,48 Monat (anpassbar),Automatik,"574,01 €",187 g/km,"8,0 l/100 km",150 PS,110 kW,Diesel
19055,Audi,A4 Avant 40 TDI quattro S line,105.301 km,12/2019,48 Monat (anpassbar),Automatik,"587,75 €",143 g/km,"6,0 l/100 km",190 PS,140 kW,Diesel
19056,Volkswagen,Polo VW,18.903 km,06/2020,48 Monat (anpassbar),Manuelle Schaltung,"256,33 €",40 g/km,"2,0 l/100 km",80 PS,59 kW,Benzin


***

## First Preprocessing

In [62]:
class CalculateAge(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        def calculate_age(registration_date):
            if registration_date == "Neuwagen":
                return 0
            else:
                today = datetime.now()
                date = datetime.strptime(registration_date, "%m/%Y")
                delta = relativedelta(today, date)
                return (delta.years * 12) + delta.months

        X['age'] = X['first_registration'].apply(calculate_age)
        return X

class Commas(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        def comma(input):
            print(input)
            return input.apply(lambda x: x.replace(".", "").replace(",", ".") if isinstance(x, str) else x)

        for col in self.columns:
            X[]
        X[self.columns] = comma(X[self.columns])
        return X


class RemoveUnits(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        def replaceStrings(input):
            input = input.str.split(" ")
            return (input[0][0].replace(r'[^,\d]+', ''))
            
        X[self.columns] = X[self.columns].apply(replaceStrings)
        X[self.columns] = X[self.columns].astype(float)
        return X

In [63]:
units =['consumption', 'milage', 'duration', 'monthly_fee', 'emission_value', 'horsepower', 'kilowatts']
ageTransformer = Pipeline([
    ('calculate_age', CalculateAge())
])
commaTransformer = Pipeline([
    ('change_commas', Commas(units))
])


unitTransformer = Pipeline([
    ('remove_units', RemoveUnits(units))
])

basicpreprocessor = ColumnTransformer(
    transformers=[
        ('age', ageTransformer,["first_registration"]),
        ('commas', commaTransformer, units),
        ('units', unitTransformer, units)
    ]
)

basicpreprocessor.fit_transform(df)

        consumption      milage              duration monthly_fee  \
0      5,0 l/100 km      201 km  48 Monat (anpassbar)    574,01 €   
1      6,0 l/100 km      201 km  48 Monat (anpassbar)    382,58 €   
2      5,0 l/100 km   15.000 km  48 Monat (anpassbar)    239,62 €   
3      6,0 l/100 km        1 km  48 Monat (anpassbar)    309,11 €   
4      5,0 l/100 km  105.301 km  48 Monat (anpassbar)    587,75 €   
...             ...         ...                   ...         ...   
19053  6,0 l/100 km      201 km  48 Monat (anpassbar)    692,03 €   
19054  8,0 l/100 km      201 km  48 Monat (anpassbar)    574,01 €   
19055  6,0 l/100 km  105.301 km  48 Monat (anpassbar)    587,75 €   
19056  2,0 l/100 km   18.903 km  48 Monat (anpassbar)    256,33 €   
19057  8,0 l/100 km   48.000 km  48 Monat (anpassbar)    539,72 €   

      emission_value horsepower kilowatts  
0           119 g/km     150 PS    110 kW  
1           131 g/km      95 PS     70 kW  
2           120 g/km      80 PS     59 

ValueError: could not convert string to float: '5,0'

***

## Explanatory Data Analysis

### Traget variable