In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/car-prices-jordan/car_prices_jordan.csv


In [2]:
df=pd.read_csv('/kaggle/input/car-prices-jordan/car_prices_jordan.csv')

In [3]:
df.shape

(366, 5)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Model,Property,Power,Price
0,0,Byd F0 2018,Manual,1000 CC,6900
1,1,Suzuki Alto 2023,manual,800 CC,8250
2,2,Suzuki Celerio 2019,Automatic,1000 CC,10499
3,3,Changan E Star 2023,Automatic,0 CC,10990
4,4,Hyundai Grand i10 2020,Automatic,1250 CC,11500


In [5]:
df = df.drop(columns=['Unnamed: 0'])

In [6]:
df['Year'] = df['Model'].str.extract(r'(\d{4})')
df['Name'] = df['Model'].str.rsplit(' ', n=1).str[0]
df['Marka'] = df['Name'].str.split().str[0]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Model     366 non-null    object
 1   Property  366 non-null    object
 2   Power     366 non-null    object
 3   Price     366 non-null    object
 4   Year      366 non-null    object
 5   Name      366 non-null    object
 6   Marka     366 non-null    object
dtypes: object(7)
memory usage: 20.1+ KB


In [8]:
#convert price from object to float 
df['Price'] = df['Price'].str.replace(',', '').astype(float)

In [9]:
#getting price in usd

exchange_rate_jod_to_usd = 1.41
df['Price_USD'] = df['Price'] * exchange_rate_jod_to_usd

In [10]:
df.head()

Unnamed: 0,Model,Property,Power,Price,Year,Name,Marka,Price_USD
0,Byd F0 2018,Manual,1000 CC,6900.0,2018,Byd F0,Byd,9729.0
1,Suzuki Alto 2023,manual,800 CC,8250.0,2023,Suzuki Alto,Suzuki,11632.5
2,Suzuki Celerio 2019,Automatic,1000 CC,10499.0,2019,Suzuki Celerio,Suzuki,14803.59
3,Changan E Star 2023,Automatic,0 CC,10990.0,2023,Changan E Star,Changan,15495.9
4,Hyundai Grand i10 2020,Automatic,1250 CC,11500.0,2020,Hyundai Grand i10,Hyundai,16215.0


In [11]:
#top 10 expensive cars 
df_sorted = df.sort_values(by='Price', ascending=False)

top_10_expensive_cars = df_sorted.head(10)
top_10_cheapest_cars = df_sorted.tail(10)

print(top_10_expensive_cars['Model'])
print('===========================')
print(top_10_cheapest_cars['Model'])

365              Cadillac Ecalade V 2023
364                    Genesis GV80 2022
363                Maserati Levante 2022
362           Maserati Quattroporte 2022
361          Land Rover Range Rover 2023
360                     Genesis G80 2022
359                    Genesis GV70 2022
358          Land Rover Range Rover 2023
357                Maserati Levante 2022
356    Land Rover Range Rover Sport 2022
Name: Model, dtype: object
9            Kia Pegas 2021
8         Hyundai Atos 2021
7          Kia Picanto 2021
6         Nissan Micra 2020
5             Baic X35 2023
4    Hyundai Grand i10 2020
3       Changan E Star 2023
2       Suzuki Celerio 2019
1          Suzuki Alto 2023
0               Byd F0 2018
Name: Model, dtype: object


DATA VISUALISING

In [12]:
import plotly.express as px

In [13]:
fig = px.bar(
    top_10_expensive_cars,
    x='Model',
    y='Price',
    title='Top 10 Most Expensive Cars',
    labels={'Model': 'Car Model', 'Price': 'Price in JOD'}
)

fig.update_xaxes(categoryorder='total descending')
fig.update_traces(marker_color='skyblue')

fig.show()

fig = px.bar(
    top_10_cheapest_cars,
    x='Model',
    y='Price',
    title='Top 10 Most Cheapest Cars',
    labels={'Model': 'Car Model', 'Price': 'Price in JOD'}
)

fig.update_xaxes(categoryorder='total descending')
fig.update_traces(marker_color='skyblue')

fig.show()

In [14]:
top_10_expensive_cars = top_10_expensive_cars.sort_values(by='Price', ascending=False)

fig = px.bar(top_10_expensive_cars, x="Model", y=["Price", "Price_USD"], title='Top 10 Most Expensive Cars')
fig.update_xaxes(title='Car Model')
fig.update_yaxes(title='Price')
fig.show()

top_10_cheapest_cars = top_10_cheapest_cars.sort_values(by='Price', ascending=False)

fig = px.bar(top_10_cheapest_cars, x="Model", y=["Price", "Price_USD"], title='Top 10 Most Expensive Cars')
fig.update_xaxes(title='Car Model')
fig.update_yaxes(title='Price')
fig.show()

In [15]:
marka_counts = df['Marka'].value_counts()

pie_data = pd.DataFrame({'Marka': marka_counts.index, 'Count': marka_counts.values})

fig = px.pie(pie_data, names='Marka', values='Count', title='Most Frequent Markas (Top 20)')
fig.show()

In [16]:
fig_histogram = px.histogram(df, x='Price', title='Distribution of Car Prices')
fig_histogram.update_xaxes(title='Price')
fig_histogram.update_yaxes(title='Frequency')

fig_boxplot = px.box(df, y='Price', title='Box Plot of Car Prices')
fig_boxplot.update_xaxes(title='Price')
fig_boxplot.update_yaxes(title='Price')


from plotly.subplots import make_subplots

fig_combined = make_subplots(rows=1, cols=2, subplot_titles=('Histogram', 'Box Plot'))
fig_combined.add_trace(fig_histogram['data'][0], row=1, col=1)
fig_combined.add_trace(fig_boxplot['data'][0], row=1, col=2)

fig_combined.update_layout(showlegend=False)
fig_combined.show()

In [17]:
marka_power_counts = df.groupby(['Marka', 'Power']).size().reset_index(name='Count')

fig = px.bar(marka_power_counts, x='Marka', y='Count', color='Power', title='Marka and Power Occurrences')
fig.update_xaxes(title='Marka')
fig.update_yaxes(title='Count')
fig.show()

In [18]:
df.pop('Model')
# df.pop('Price')

0                      Byd F0 2018
1                 Suzuki Alto 2023
2              Suzuki Celerio 2019
3              Changan E Star 2023
4           Hyundai Grand i10 2020
                  ...             
361    Land Rover Range Rover 2023
362     Maserati Quattroporte 2022
363          Maserati Levante 2022
364              Genesis GV80 2022
365        Cadillac Ecalade V 2023
Name: Model, Length: 366, dtype: object

In [19]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
columns=['Year','Power','Property','Marka','Name']
for i in columns :
    df[i] = label_encoder.fit_transform(df[i])

In [20]:
x=df.copy()
y=x.pop('Price_USD')

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Shape of x_train:", x_train.shape)
print("Shape of y_train:", y_train.shape)

Shape of x_train: (292, 6)
Shape of y_train: (292,)


REGRESSION MODELS

Linear Regression 

In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²) Score:", r2)

Mean Absolute Error (MAE): 2.494965195404114e-11
Mean Squared Error (MSE): 7.594412272547662e-22
Root Mean Squared Error (RMSE): 2.7557961231824936e-11
R-squared (R²) Score: 1.0


In [23]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(x_train, y_train)
y_pred = rf_regressor.predict(x_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²) Score:", r2)

Mean Absolute Error (MAE): 1660.9054986486437
Mean Squared Error (MSE): 27219875.25284317
Root Mean Squared Error (RMSE): 5217.267029091301
R-squared (R²) Score: 0.9944136602648211


In [24]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gradient_boosting_model.fit(x_train, y_train)
y_pred = gradient_boosting_model.predict(x_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R²) Score:", r2)

Mean Absolute Error (MAE): 1320.4066407161233
Mean Squared Error (MSE): 19001568.98453961
Root Mean Squared Error (RMSE): 4359.078914695123
R-squared (R²) Score: 0.9961003046904857
