# ZEB Project

## Loading dataset

In [1]:
import pandas as pd
import numpy as np

df = pd.read_excel('dataset_2023.xlsx')
df = df.drop('Unnamed: 0', axis=1)

## Preprocessing functions

In [2]:
def replaceComma(text):
    if "." in str(text): 
        return str("".join(str(text).split("."))).replace(",",".")
    else:
        return str(text).replace(",",".")

def removeLetters(text):
    return "".join(c for c in text if c.isdigit() or c == ".")

def removeThousandPoint(text):
    return str(text).replace(".","")

from datetime import datetime
from dateutil.relativedelta import relativedelta

def calculate_age(registration_date):
    if registration_date == "Neuwagen":
        return 0
    else:
        today = datetime.now()
        date = datetime.strptime(registration_date, "%m/%Y")
        delta = relativedelta(today, date)
        return (delta.years * 12) + delta.months
    
def getConsumption(consumption):
    return str(consumption).split()[0]    

## Combining the preprocessing steps 

In [3]:
columnsNumeric = ["milage", "duration", "monthly_fee", "horsepower", "emission_value", "kilowatts", "consumption"]
def preProcess(columnsNumeric, df):
    
    # calculating age
    
    df['first_registration'] = df['first_registration'].apply(calculate_age)
    
    
    # removing unnecesary characters from consumption
    
    df['consumption'] = df['consumption'].apply(getConsumption)
    
    
    for i in range(len(columnsNumeric)):
        
        
        df[columnsNumeric[i]] = df[columnsNumeric[i]].apply(replaceComma)
        df[columnsNumeric[i]] = df[columnsNumeric[i]].apply(removeLetters)
    
    # removing thousand point from milage column
    
    df["milage"] = df["milage"].apply(removeThousandPoint)
    
    
    # replacing "" and "nan" values from the different 

    df = df.replace('', np.nan)
    df = df.replace('nan', np.nan)
    
    

    
    df[df.columns.difference(['brand_name', 'model_name','fuel_type', 'gear' ])] = df[df.columns.difference(['brand_name', 'model_name','fuel_type', 'gear' ])].astype(float)


    return df

In [4]:
df = preProcess(columnsNumeric, df)

# Final dataset for implementation

# Before

In [2]:
df.head()

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type
0,Skoda,Octavia ŠKODA Combi Style TDI DSG,201 km,03/2023,48 Monat (anpassbar),Automatik,"574,01 €",119 g/km,"5,0 l/100 km",150 PS,110 kW,Diesel
1,Volkswagen,T-Cross VW Life TSI,201 km,03/2023,48 Monat (anpassbar),Manuelle Schaltung,"382,58 €",131 g/km,"6,0 l/100 km",95 PS,70 kW,Benzin
2,Seat,Ibiza Austria Edition,15.000 km,10/2022,48 Monat (anpassbar),Manuelle Schaltung,"239,62 €",120 g/km,"5,0 l/100 km",80 PS,59 kW,Benzin
3,Volkswagen,Polo VW,1 km,01/2023,48 Monat (anpassbar),Manuelle Schaltung,"309,11 €",127 g/km,"6,0 l/100 km",80 PS,59 kW,Benzin
4,Audi,A4 Avant 40 TDI quattro S line,105.301 km,12/2019,48 Monat (anpassbar),Automatik,"587,75 €",138 g/km,"5,0 l/100 km",190 PS,140 kW,Diesel


# After

In [5]:
df.head()

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type
0,Skoda,Octavia ŠKODA Combi Style TDI DSG,201.0,2.0,48.0,Automatik,574.01,119.0,5.0,150.0,110.0,Diesel
1,Volkswagen,T-Cross VW Life TSI,201.0,2.0,48.0,Manuelle Schaltung,382.58,131.0,6.0,95.0,70.0,Benzin
2,Seat,Ibiza Austria Edition,15000.0,7.0,48.0,Manuelle Schaltung,239.62,120.0,5.0,80.0,59.0,Benzin
3,Volkswagen,Polo VW,1.0,4.0,48.0,Manuelle Schaltung,309.11,127.0,6.0,80.0,59.0,Benzin
4,Audi,A4 Avant 40 TDI quattro S line,105301.0,41.0,48.0,Automatik,587.75,138.0,5.0,190.0,140.0,Diesel


# Still needs to be done: Implement the preprocessing steps from above in to a pipeline so it looks like cell bellow

In [54]:


# Define the columns that require preprocessing
columnsNumeric = ['milage', 'monthly_fee', 'emission_value', 'consumption', 'horsepower', 'kilowatts']

# Create a pipeline for numeric column preprocessing
numeric_transformer = Pipeline(steps=[
    ('replace_comma', FunctionTransformer(replaceComma)),
    ('remove_letters', FunctionTransformer(removeLetters)),
    ('remove_thousand', FunctionTransformer(removeThousandPoint))
])

# Create a pipeline for the 'first_registration' column preprocessing
first_registration_transformer = Pipeline(steps=[
    ('calculate_age', FunctionTransformer(calculate_age))
])

# Create a pipeline for the 'consumption' column preprocessing
consumption_transformer = Pipeline(steps=[
    ('get_consumption', FunctionTransformer(getConsumption))
])



# Define the column transformer to apply the different preprocessing pipelines to the appropriate columns
preprocessor1 = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, columnsNumeric)
    ])
preprocessor2 = ColumnTransformer(transformers=[
    ('first_registration', first_registration_transformer, ['first_registration']),
    ('consumption', consumption_transformer, ['consumption'])
    ])


# Combine the preprocessing step with a machine learning model
from sklearn.linear_model import LinearRegression

preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor_Column_Specific_Processing', preprocessor2),
    ('preprocessor_extracting_numeric_values', preprocessor1),
])
preprocessing_pipeline

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
##feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [7]:
dfNoRate = df.drop('monthly_fee', axis=1)  # input features
dfRate = df['monthly_fee']
X_train, X_test, y_train, y_test = train_test_split(dfNoRate, dfRate, test_size=0.25, shuffle=False)

In [10]:
## numerical processing pipeline
import numpy as np
numeric_transformer=Pipeline(
    steps=[("imputation_mean",SimpleImputer(missing_values=np.nan,strategy="median")),
          ("scaler",StandardScaler())]

)


categorical_transformer = Pipeline(
    steps=[("imputer",SimpleImputer(strategy="constant", fill_value="missing")),
          ("onehot",OneHotEncoder(handle_unknown= "error", sparse_output = False, drop = "first"))]

)


numeric_features = dfNoRate.select_dtypes(include=np.number).columns
categorical_features = dfNoRate.select_dtypes(exclude=["number"]).columns


preprocessor = ColumnTransformer(
    transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)    
    ])


preprocessor.fit(dfNoRate)

#cat_columns = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names(categorical_features)
#columns_pipeline = np.append(cat_columns, numeric_features)

cat_columns = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(categorical_features)
columns_pipeline = np.append(cat_columns, numeric_features)

In [19]:
## numerical processing pipeline
import numpy as np
numeric_transformer=Pipeline(
    steps=[("imputation_mean",SimpleImputer(missing_values=np.nan,strategy="median")),
          ("scaler",StandardScaler())]

)


categorical_transformer = Pipeline(
    steps=[("imputer",SimpleImputer(strategy="constant", fill_value="missing")),
          ("onehot",OneHotEncoder(handle_unknown= "error", sparse_output = False, ))]

)


numeric_features = dfNoRate.select_dtypes(include=np.number).columns
categorical_features = dfNoRate.select_dtypes(exclude=["number"]).columns


preprocessor = ColumnTransformer(
    transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)    
    ])


preprocessor.fit(dfNoRate)

#cat_columns = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names(categorical_features)
#columns_pipeline = np.append(cat_columns, numeric_features)

cat_columns = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(categorical_features)
columns_pipeline = np.append(cat_columns, numeric_features)

In [13]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler


# Linear regression with PCA for dimensionality reduction

In [42]:
preprocessor

In [28]:
# Define the pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('regressor', LinearRegression())
])

# Define the parameter grid
param_grid = {
    'preprocessor__cat__onehot__handle_unknown': ['error', 'ignore'], ##!!!!! when you run it without it you get "Value Error: Found unknown categories ..." for the first column PLEASE DEBUG SO MODEL USES EVERYTHING
    'pca__n_components': [10, 20, 30, 40],
    'regressor__fit_intercept': [True, False]
}

# Define the RandomizedSearchCV object
rs_cv = RandomizedSearchCV(pipe, param_grid, n_iter=10, cv=5, random_state=42)

# Fit the model
rs_cv.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred_lr = rs_cv.predict(X_test)

# Calculate the evaluation metrics
mse_LR = mean_squared_error(y_test, y_test_pred_lr)
mae_LR = mean_absolute_error(y_test, y_test_pred_lr)
r2_LR = r2_score(y_test, y_test_pred_lr)

print("R-squared:", r2_LR)
print("MAE:", mae_LR)
print("MSE:", mse_LR)

R-squared: 0.8523057082308287
MAE: 73.48080525200373
MSE: 13967.27508712233


# Regression Tree

In [34]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(DecisionTreeRegressor())),
    ('regressor', DecisionTreeRegressor())
])

param_grid_dt = {
    'preprocessor__cat__onehot__handle_unknown': ['error', 'ignore'], ##!!!!! when you run it without it you get "Value Error: Found unknown categories ..." for the first column PLEASE DEBUG SO MODEL USES EVERYTHING
    'feature_selection__estimator__max_depth': [3, 5, 7],
    'feature_selection__estimator__min_samples_split': [2, 3, 4],
    'feature_selection__estimator__min_samples_leaf': [1, 2],
    'regressor__max_depth': [5, 8, 10],
    'regressor__min_samples_split': [3, 4, 5],
    'regressor__min_samples_leaf': [1, 2]
}


model_dt = GridSearchCV(dt_pipeline, param_grid_dt, cv=5)
model_dt.fit(X_train, y_train)
y_test_pred_dt = model_dt.predict(X_test)

# Calculate the evaluation metrics
mse_DT = mean_squared_error(y_test, y_test_pred_dt)
mae_DT = mean_absolute_error(y_test, y_test_pred_dt)
r2_DT = r2_score(y_test, y_test_pred_dt)

print("R-squared:", r2_DT)
print("MAE:", mae_DT)
print("MSE:", mse_DT)

R-squared: 0.9281635757006054
MAE: 36.88425122201197
MSE: 6793.485973263038


# Random Forest

In [35]:
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(RandomForestRegressor(n_jobs=-1))),
    ('regressor', RandomForestRegressor())
])

param_grid_rf = {
    'preprocessor__cat__onehot__handle_unknown': ['error', 'ignore'], ##!!!!! when you run it without it you get "Value Error: Found unknown categories ..." for the first column PLEASE DEBUG SO MODEL USES EVERYTHING
    'feature_selection__estimator__n_estimators': [50, 100],
    'feature_selection__estimator__max_depth': [5, 10],
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [5, 7],
    'regressor__min_samples_split': [2, 4],
    'regressor__min_samples_leaf': [1, 3],
    'regressor__max_features': ['auto', 'sqrt']
}

model_rf = RandomizedSearchCV(rf_pipeline, param_distributions=param_grid_rf, n_iter=20, cv=5)
model_rf.fit(X_train, y_train)
y_test_pred_rf = model_rf.predict(X_test)

# Calculate the evaluation metrics
mse_rf = mean_squared_error(y_test, y_test_pred_rf)
mae_rf = mean_absolute_error(y_test, y_test_pred_rf)
r2_rf = r2_score(y_test, y_test_pred_rf)

# Print the evaluation metrics
print("Mean Squared Error:", mse_rf)
print("Mean Absolute Error:", mae_rf)
print("R^2 Score:", r2_rf)


Mean Squared Error: 5325.797914603745
Mean Absolute Error: 54.05381278909612
R^2 Score: 0.9436833637057556


## KNeighborsRegressor

In [37]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Define the pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor())
])

# Define the parameter grid
param_grid = {
    'preprocessor__cat__onehot__handle_unknown': ['error', 'ignore'], ##!!!!! when you run it without it you get "Value Error: Found unknown categories ..." for the first column PLEASE DEBUG SO MODEL USES EVERYTHING
    'regressor__n_neighbors': [3, 5],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__p': [1, 2]
}

# Define the GridSearchCV object
grid_cv = GridSearchCV(pipe, param_grid, cv=2) # turn the number of the grid up and see if your machine can handle it

# Fit the model
grid_cv.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred_knn = grid_cv.predict(X_test)

# Calculate the evaluation metrics
mse = mean_squared_error(y_test, y_test_pred_knn)
mae = mean_absolute_error(y_test, y_test_pred_knn)
r2 = r2_score(y_test, y_test_pred_knn)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)

Mean Squared Error: 1132.4998771186008
Mean Absolute Error: 14.257247500535094
R^2 Score: 0.9880245956182304


In [50]:
pipe_KNN

# The results

In [41]:
from prettytable import PrettyTable

metrics = [
    {'model': 'RegressionTree', 'mse': mse_DT, 'mae': mae_DT, 'r2': r2_DT},
    {'model': 'RandomForest', 'mse': mse_rf, 'mae': mae_rf, 'r2': r2_rf},
    {'model': 'LinearRegression', 'mse': mse_LR, 'mae': mae_LR, 'r2': r2_LR},
    {'model': 'K-nearest neighbor', 'mse': mse, 'mae': mae, 'r2': r2}
]

cols = ['Model', 'Mean Squared Error', 'Mean Absolute Error', 'R^2 Score']

table = PrettyTable(cols)

for metric in metrics:
    row = [metric['model'], metric['mse'], metric['mae'], metric['r2']]
    table.add_row(row)

# Print the table
print(table)


+--------------------+--------------------+---------------------+--------------------+
|       Model        | Mean Squared Error | Mean Absolute Error |     R^2 Score      |
+--------------------+--------------------+---------------------+--------------------+
|   RegressionTree   | 6793.485973263038  |  36.88425122201197  | 0.9281635757006054 |
|    RandomForest    | 5325.797914603745  |  54.05381278909612  | 0.9436833637057556 |
|  LinearRegression  | 13967.27508712233  |  73.48080525200373  | 0.8523057082308287 |
| K-nearest neighbor | 1132.4998771186008 |  14.257247500535094 | 0.9880245956182304 |
+--------------------+--------------------+---------------------+--------------------+


# What still needs to be done

### Techniques still to use to improve model accuracy:

- Treat outliers: !! NOTE: Outlier detection / removal / imputation still needs to be done. Domain knowledge is important here. Boxplots to see if anything does not make sense. For this maybe safe a seperate DF in preProccesing function which is already numeric but not scaled.
- Scale features with other techniques
- Add new features
- Using different models
- Change hyperparameters tuning

### Build a regression tree and let it grow as deep as possible

### Improve the design of the GUI

### Implement the best performing model in a  GUI

### Implement the first preprocessing steps into the pipeline (the preprocessing functions we wrote by hand)

### Plot the 2-3 best performing features of the models

### Create a out of sample dataset and test it on that

### Linear regression with PCA is not good (Hochreiter said that) --> XGBOOST instead

### feature importance analysis of the best performing models

### try imputation and non-impution as well because imputing missing values maybe always leads to a positive results but ruins the dataset