# Data Science Lab

In [105]:
import pandas as pd
import numpy as np

df = pd.read_excel('data/dataset_2023.xlsx')
df = df.drop('Unnamed: 0', axis=1)

In [106]:
df.head()

Unnamed: 0,brand_name,model_name,milage,first_registration,duration,gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type
0,Skoda,Octavia ŠKODA Combi Style TDI DSG,201 km,03/2023,48 Monat (anpassbar),Automatik,"574,01 €",119 g/km,"5,0 l/100 km",150 PS,110 kW,Diesel
1,Volkswagen,T-Cross VW Life TSI,201 km,03/2023,48 Monat (anpassbar),Manuelle Schaltung,"382,58 €",131 g/km,"6,0 l/100 km",95 PS,70 kW,Benzin
2,Seat,Ibiza Austria Edition,15.000 km,10/2022,48 Monat (anpassbar),Manuelle Schaltung,"239,62 €",120 g/km,"5,0 l/100 km",80 PS,59 kW,Benzin
3,Volkswagen,Polo VW,1 km,01/2023,48 Monat (anpassbar),Manuelle Schaltung,"309,11 €",127 g/km,"6,0 l/100 km",80 PS,59 kW,Benzin
4,Audi,A4 Avant 40 TDI quattro S line,105.301 km,12/2019,48 Monat (anpassbar),Automatik,"587,75 €",138 g/km,"5,0 l/100 km",190 PS,140 kW,Diesel


# Preprocessing

NOTE: Outlier detection / removal / imputation still needs to be done. Domain knowledge is important here. Boxplots to see if anything does not make sense. For this maybe safe a seperate DF in preProccesing function which is already numeric but not scaled.

### Date column

In [107]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

def calculate_age(registration_date):
    if registration_date == "Neuwagen":
        return 0
    else:
        today = datetime.now()
        date = datetime.strptime(registration_date, "%m/%Y")
        delta = relativedelta(today, date)
        return (delta.years * 12) + delta.months

### Consumption column

In [108]:
def getConsumption(consumption):
    return str(consumption).split()[0]

In [109]:
def replaceComma(text):
    if "." in str(text): 
        return str("".join(str(text).split("."))).replace(",",".")
    else:
        return str(text).replace(",",".")

def removeLetters(text):
    return "".join(c for c in text if c.isdigit() or c == ".")

def removeThousandPoint(text):
    return str(text).replace(".","")

### For the rest of the column

In [110]:
columnsNumeric = ["milage", "duration", "monthly_fee", "horsepower", "emission_value", "kilowatts", "consumption"]

In [111]:
def preProcess(columnsNumeric, df):
    
    # calculating age
    
    df['first_registration'] = df['first_registration'].apply(calculate_age)
    
    
    # removing unnecesary characters from consumption
    
    df['consumption'] = df['consumption'].apply(getConsumption)
    
    
    for i in range(len(columnsNumeric)):
        
        
        df[columnsNumeric[i]] = df[columnsNumeric[i]].apply(replaceComma)
        df[columnsNumeric[i]] = df[columnsNumeric[i]].apply(removeLetters)
    
    # removing thousand point from milage column
    
    df["milage"] = df["milage"].apply(removeThousandPoint)
    
    
    # replacing "" and "nan" values from the different 

    df = df.replace('', np.nan)
    df = df.replace('nan', np.nan)
    
    
    # create the binary vector based on the "gear" column
    
    df['gear'] = np.where(df['gear'] == 'Automatik', 1.0, 0.0)
    df = df.rename(columns={'gear': 'Automatic_gear'})

    
    # create new column combining name and model
    # ------- might make more sense to keep this seperate, will evaluate once models are complete
    
    #df['model_name'] = df['brand_name'] + ', ' + df['model_name']
    #df = df.drop(columns=['brand_name'])
    
    
    # creating dummy variables for fuel_type and model_name column

    df = pd.get_dummies(df, columns=['fuel_type'])
    df = pd.get_dummies(df, columns=['model_name'])
    
    
    # cast numeric to int
    
    df[df.columns.difference(['brand_name'])] = df[df.columns.difference(['brand_name'])].astype(float)


    return df


In [112]:
df = preProcess(columnsNumeric, df)

In [113]:
df.head(5)
df.to_csv('data/preprocessed_df.csv', index=False)

# Implementation

In [114]:
# importing of libraries

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

import pandas as pd

In [115]:
def implementProc(df):
    
    # for later: not hardcode but make it for every column with missing values
    
    
    # create dictionary of mean values for each brand_name
    
    meansDicCons = df.groupby('brand_name')['consumption'].mean().to_dict()
    meansDicEmis = df.groupby('brand_name')['emission_value'].mean().to_dict()
    
    
    # fill nans with mean of brand_name
    
    df['consumption'] = df.apply(lambda x: meansDicCons[x['brand_name']] if pd.isna(x['consumption']) else x['consumption'], axis=1)
    df['emission_value'] = df.apply(lambda x: meansDicEmis[x['brand_name']] if pd.isna(x['emission_value']) else x['emission_value'], axis=1)

    
    # now turn brand_name to dummy variable
    
    df = pd.get_dummies(df, columns=['brand_name'])
    
    
    # Scale using MinMaxScaler
    
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df)
    scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
    
    return scaled_df

In [116]:
df = implementProc(df)


In [117]:
df.head()

Unnamed: 0,milage,first_registration,duration,Automatic_gear,monthly_fee,emission_value,consumption,horsepower,kilowatts,fuel_type_Benzin,...,brand_name_Mazda,brand_name_Mercedes-Benz,brand_name_Mitsubishi,brand_name_Nissan,brand_name_Opel,brand_name_Peugeot,brand_name_Seat,brand_name_Skoda,brand_name_Toyota,brand_name_Volkswagen
0,0.001064,0.024691,0.666667,1.0,0.150908,0.341954,0.285714,0.2,0.199396,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.001064,0.024691,0.666667,0.0,0.064517,0.376437,0.357143,0.077778,0.07855,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.079407,0.08642,0.666667,0.0,0.0,0.344828,0.285714,0.044444,0.045317,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,5e-06,0.049383,0.666667,0.0,0.03136,0.364943,0.357143,0.044444,0.045317,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.557443,0.506173,0.666667,1.0,0.157109,0.396552,0.285714,0.288889,0.29003,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
df.dtypes

milage                   float64
first_registration       float64
duration                 float64
Automatic_gear           float64
monthly_fee              float64
                          ...   
brand_name_Peugeot       float64
brand_name_Seat          float64
brand_name_Skoda         float64
brand_name_Toyota        float64
brand_name_Volkswagen    float64
Length: 378, dtype: object

## Creating the different models

## Splitting dataset

In [119]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [120]:
dfNoRate = df.drop('monthly_fee', axis=1)  # input features
dfRate = df['monthly_fee']

In [121]:
X_train, X_test, y_train, y_test = train_test_split(dfNoRate, dfRate, test_size=0.25, shuffle=False)

## Random forest 

Not doing a PCA for the random forrest. Include WHY NOT

In [122]:
# Takes to long so doing it with less tress and splits
#rf_pipeline = Pipeline([
    #('feature_selection', SelectFromModel(RandomForestRegressor(n_jobs=-1))),
    #('regressor', RandomForestRegressor())
#])

#param_grid_rf = {
    #'feature_selection__estimator__n_estimators': [100, 200, 300],
    #'feature_selection__estimator__max_depth': [5, 10, 15],
    #'regressor__n_estimators': [100, 200, 300],
    #'regressor__max_depth': [5, 9, 7, 10, 15],
    #'regressor__min_samples_split': [2, 4, 6, 8, 10],
    #'regressor__min_samples_leaf': [1, 3, 5, 7, 10],
    #'regressor__max_features': ['auto', 'sqrt', 'log2']
#}

#model_rf = RandomizedSearchCV(rf_pipeline, param_distributions=param_grid_rf, n_iter=100, cv=5)
#model_rf.fit(X_train, y_train)
#y_test_pred_rf = model_rf.predict(X_test)

In [123]:
rf_pipeline = Pipeline([
    ('feature_selection', SelectFromModel(RandomForestRegressor(n_jobs=-1))),
    ('regressor', RandomForestRegressor())
])

param_grid_rf = {
    'feature_selection__estimator__n_estimators': [50, 100],
    'feature_selection__estimator__max_depth': [5, 10],
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [5, 7],
    'regressor__min_samples_split': [2, 4],
    'regressor__min_samples_leaf': [1, 3],
    'regressor__max_features': [1, 'sqrt']
}

model_rf = RandomizedSearchCV(rf_pipeline, param_distributions=param_grid_rf, n_iter=20, cv=5)
model_rf.fit(X_train, y_train)
y_test_pred_rf = model_rf.predict(X_test)


KeyboardInterrupt: 

In [None]:
# Calculate the evaluation metrics
mse = mean_squared_error(y_test, y_test_pred_rf)
mae = mean_absolute_error(y_test, y_test_pred_rf)
r2 = r2_score(y_test, y_test_pred_rf)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)

Mean Squared Error: 0.0010680231351554248
Mean Absolute Error: 0.023694053381677013
R^2 Score: 0.9445484870564396


### Linear regression

also doing PCA. Explain WHY YES

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [None]:
# Define the pipeline
lr_pipeline = Pipeline([
    ('feature_selection', SelectFromModel(RandomForestRegressor(n_jobs=-1))),
    ('regressor', LinearRegression())
])

# Train the model using the training data
lr_pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred_lr = lr_pipeline.predict(X_test)

In [None]:
# Calculate the evaluation metrics
mse = mean_squared_error(y_test, y_test_pred_lr)
r2 = r2_score(y_test, y_test_pred_lr)

# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 0.002685246740425899
R^2 Score: 0.8605826133516188


### Linear regression with PC

#### We also tried to use a PCA for the linear regression

In [None]:
# Define the pipeline
pipe = Pipeline([
    ('pca', PCA()),
    ('regressor', LinearRegression())
])

# Define the parameter grid
param_grid = {
    'pca__n_components': [10, 20, 30, 40],
    'regressor__fit_intercept': [True, False]
}

# Define the RandomizedSearchCV object
rs_cv = RandomizedSearchCV(pipe, param_grid, n_iter=8, cv=5, random_state=42)

# Fit the model
rs_cv.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred_lr = rs_cv.predict(X_test)

# Calculate the evaluation metrics
mse = mean_squared_error(y_test, y_test_pred_lr)
mae = mean_absolute_error(y_test, y_test_pred_lr)
r2 = r2_score(y_test, y_test_pred_lr)


In [None]:
# Print the evaluation metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)

Mean Squared Error: 0.0028952196127534015
Mean Absolute Error: 0.032948580879392605
R^2 Score: 0.8496808706230115
