# Deep learning course
by IA Expert Academy
## kaggle dataset https://www.kaggle.com/datasets/thedevastator/uncovering-factors-that-affect-used-car-prices

# columns :

*   dateCrawled: Date the car was crawled. (Date)
*   name: Name of the car. (String)
*   seller: Type of seller (private or dealer). (String)
*   offerType: Type of offer (e.g. sale, repair, etc.). (String)
*   price: Price of the car. (Integer)
*   abtest: Test type (A or B). (String)
*   vehicleType: Type of vehicle (e.g. SUV, sedan, etc.). (String)
*   yearOfRegistration: Year the car was registered. (Integer)
*   gearbox: Type of gearbox (manual or automatic). (String)
*   powerPS: Power of the car in PS. (Integer)
*   model: Model of the car. (String)
*   kilometer: Kilometers the car has been driven. (Integer)
*   monthOfRegistration: Month the car was registered. (Integer)
*   fuelType: Type of fuel (e.g. diesel, petrol, etc.). (String)
*   brand: Brand of the car. (String)
*   notRepairedDamage: Whether or not the car has any damage that has not been repaired. (String)
*   dateCreated: Date the car was created. (Date)
*   nrOfPictures: Number of pictures of the car. (Integer)
*   postalCode: Postal code of the car. (Integer)
*   lastSeen: Date the car was last seen. (Date)

In [None]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import keras
print("Keras version:", keras.__version__)

TensorFlow version: 2.17.0
Keras version: 3.4.1


In [15]:
# prompt: Is there a way to get the data from Kaggle directly ?

!pip install kaggle

# Upload kaggle.json from your Kaggle account
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d thedevastator/uncovering-factors-that-affect-used-car-prices

!unzip uncovering-factors-that-affect-used-car-prices.zip




KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [151]:
base = pd.read_csv('autos.csv', encoding='ISO-8859-1')
print(base.shape)

(371528, 21)


In [22]:
base.describe()

Unnamed: 0,index,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration,nrOfPictures,postalCode
count,323941.0,323940.0,323940.0,323940.0,323940.0,323940.0,323940.0,323940.0
mean,161970.0,17836.52,2004.583204,115.691495,125533.771686,5.731166,0.0,50806.601704
std,93513.856113,3830124.0,94.039326,196.727093,40179.903514,3.712653,0.0,25800.849049
min,0.0,0.0,1000.0,0.0,5000.0,0.0,0.0,1067.0
25%,80985.0,1150.0,1999.0,70.0,100000.0,3.0,0.0,30457.0
50%,161970.0,2950.0,2003.0,105.0,150000.0,6.0,0.0,49589.5
75%,242955.0,7200.0,2008.0,150.0,150000.0,9.0,0.0,71549.0
max,323940.0,2147484000.0,9999.0,20000.0,150000.0,12.0,0.0,99998.0


In [23]:
base.head()

Unnamed: 0,index,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,...,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480.0,test,,1993.0,manuell,...,golf,150000.0,0.0,benzin,volkswagen,,2016-03-24 00:00:00,0.0,70435.0,2016-04-07 03:16:57
1,1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300.0,test,coupe,2011.0,manuell,...,,125000.0,5.0,diesel,audi,ja,2016-03-24 00:00:00,0.0,66954.0,2016-04-07 01:46:50
2,2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800.0,test,suv,2004.0,automatik,...,grand,125000.0,8.0,diesel,jeep,,2016-03-14 00:00:00,0.0,90480.0,2016-04-05 12:47:46
3,3,2016-03-17 16:54:04,GOLF_4_1_4__3TÃRER,privat,Angebot,1500.0,test,kleinwagen,2001.0,manuell,...,golf,150000.0,6.0,benzin,volkswagen,nein,2016-03-17 00:00:00,0.0,91074.0,2016-03-17 17:40:17
4,4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600.0,test,kleinwagen,2008.0,manuell,...,fabia,90000.0,7.0,diesel,skoda,nein,2016-03-31 00:00:00,0.0,60437.0,2016-04-06 10:17:21


In [140]:
base.columns

Index(['index', 'dateCrawled', 'name', 'seller', 'offerType', 'price',
       'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS',
       'model', 'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

In [152]:
columns = ['index','dateCrawled','dateCreated',
           'nrOfPictures','postalCode','lastSeen']
base = base.drop(columns, axis=1)
# show unique names
base['name'].value_counts()


Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
Ford_Fiesta,657
BMW_318i,627
Opel_Corsa,622
Volkswagen_Golf_1.4,603
BMW_316i,523
...,...
Audi_A4_Avant_Klima_Gruene_Plakette_TÃV_&AU_NEU_XENON,1
Renault_clio_in_gold_450VB_!!,1
Fiat_Doblo_1.6_Multijet,1
Renault_Laguna_1,1


In [29]:
base['seller'].value_counts()

Unnamed: 0_level_0,count
seller,Unnamed: 1_level_1
privat,371525
gewerblich,3


In [30]:
base['offerType'].value_counts()

Unnamed: 0_level_0,count
offerType,Unnamed: 1_level_1
Angebot,371516
Gesuch,12


In [153]:
#drop another columns
columns = ['name','seller','offerType']
base = base.drop(columns, axis=1)

In [154]:
base.head()

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,
1,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja
2,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,
3,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
4,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein


In [155]:
filter = base.loc[base.price <= 10]
base = base.drop(filter.index, axis=0)
base.shape

(359410, 12)

In [156]:
filter = base.loc[base['price'] >= 350000]
base = base.drop(filter.index, axis=0)
base.shape

(359291, 12)

In [157]:
# all the null values
df_base_null = base.isnull().sum()
df_base_null = df_base_null[df_base_null > 0]
print(df_base_null)
#fill with mode value
columns = ['vehicleType','gearbox','model','fuelType','notRepairedDamage']
base = base.fillna(base[columns].mode().iloc[0])

vehicleType          33546
gearbox              17236
model                17967
fuelType             29391
notRepairedDamage    65986
dtype: int64


In [158]:
base.head()

Unnamed: 0,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,480,test,limousine,1993,manuell,0,golf,150000,0,benzin,volkswagen,nein
1,18300,test,coupe,2011,manuell,190,golf,125000,5,diesel,audi,ja
2,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,nein
3,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein
4,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein


In [159]:
y = base['price']
x = base.drop('price', axis=1)
x.shape, y.shape

((359291, 11), (359291,))

In [160]:
y

Unnamed: 0,price
0,480
1,18300
2,9800
3,1500
4,3600
...,...
371523,2200
371524,1199
371525,9200
371526,3400


In [161]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 359291 entries, 0 to 371527
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   abtest               359291 non-null  object
 1   vehicleType          359291 non-null  object
 2   yearOfRegistration   359291 non-null  int64 
 3   gearbox              359291 non-null  object
 4   powerPS              359291 non-null  int64 
 5   model                359291 non-null  object
 6   kilometer            359291 non-null  int64 
 7   monthOfRegistration  359291 non-null  int64 
 8   fuelType             359291 non-null  object
 9   brand                359291 non-null  object
 10  notRepairedDamage    359291 non-null  object
dtypes: int64(4), object(7)
memory usage: 32.9+ MB


In [162]:
# transforming the columns to one hot encoder
ohe = ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                       ['abtest','vehicleType','gearbox',
                                        'model','fuelType','brand',
                                        'notRepairedDamage'])],
                        remainder='passthrough')

X = ohe.fit_transform(x).toarray()
X


array([[0.00e+00, 1.00e+00, 0.00e+00, ..., 0.00e+00, 1.50e+05, 0.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.90e+02, 1.25e+05, 5.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.63e+02, 1.25e+05, 8.00e+00],
       ...,
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.02e+02, 1.50e+05, 3.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, ..., 1.00e+02, 1.50e+05, 6.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, ..., 3.20e+02, 5.00e+04, 8.00e+00]])

In [163]:
X.shape

(359291, 316)

In [164]:
# define neural
regressor = Sequential()
regressor.add(Dense(units=158, activation='relu',input_shape=(316,)))
regressor.add(Dense(units=158, activation='relu'))
regressor.add(Dense(units=1, activation='linear'))
regressor.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [168]:
regressor.compile(loss='mean_absolute_error',
                  optimizer='adam',
                  metrics=['mean_absolute_error'])
regressor.fit(X_train, y_train, epochs=100, batch_size=300)

Epoch 1/100
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 2449.1694 - mean_absolute_error: 2449.1694
Epoch 2/100
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 2243.9016 - mean_absolute_error: 2243.9016
Epoch 3/100
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 2253.7673 - mean_absolute_error: 2253.7673
Epoch 4/100
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 2263.7876 - mean_absolute_error: 2263.7876
Epoch 5/100
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 2263.2190 - mean_absolute_error: 2263.2190
Epoch 6/100
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 2253.8191 - mean_absolute_error: 2253.8191
Epoch 7/100
[1m899/899[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - loss: 2249.2480 - mean_absolute_error: 2249.2483
Epoch 8/100
[1m899/899[0m [32m━

<keras.src.callbacks.history.History at 0x79e58f3db3a0>

In [166]:
X.shape

(359291, 316)

In [169]:
predict_result = regressor.predict(X_test)

[1m2807/2807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step


In [170]:
# compare Y_test with predict_result

from sklearn.metrics import mean_squared_error, mean_absolute_error
print(mean_absolute_error(y_test, predict_result))
print(mean_squared_error(y_test, predict_result))


2276.3307493106227
27199205.34330258
