In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
df_car = pd.read_csv('car_clean.csv')
df_car.head()

Unnamed: 0,Unnamed: 1,Ngày đăng bài,Tên,Giá,Năm SX,Nhiên liệu,Kiểu dáng,Tình trạng,Km đã đi,Hộp số,Xuất xứ,Tỉnh thành
0,0,12/09/2025,Lexus,4290,2024,Máy xăng,SUV,Xe cũ,3000,Số tự động,Nhập khẩu,Tp.HCM
1,1,05/10/2025,Hyundai,590,2019,Máy xăng,SUV,Xe cũ,59000,Số tự động,Trong nước,Lâm Đồng
2,2,29/09/2025,Honda,535,2019,Máy xăng,SUV,Xe cũ,69000,Số tự động,Nhập khẩu,Khánh Hòa
3,3,01/10/2025,Mitsubishi,443,2020,Máy xăng,MPV,Xe cũ,10000,Số sàn,Nhập khẩu,Hà Nội
4,4,27/09/2025,Suzuki,450,2022,Máy xăng,SUV,Xe cũ,52000,Số tự động,Nhập khẩu,Đồng Nai


In [3]:
df_car_have_date = df_car.copy()
day,month,year = [],[],[]
for date in df_car['Ngày đăng bài']:
    day.append(date.split('/')[0])
    month.append(date.split('/')[1])
    year.append(date.split('/')[2])

In [4]:
num_cols = ['Giá','Năm SX','Km đã đi']
categorical_cols = ['Nhiên liệu', 'Kiểu dáng', 'Tình trạng', 'Hộp số', 'Xuất xứ','Tên','Tỉnh thành']

encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(df_car[categorical_cols])
encoder_df = np.concatenate((df_car[num_cols], encoded_features), axis=1)
encoded_df = pd.DataFrame(encoder_df, columns=np.concatenate((num_cols, encoder.get_feature_names_out(categorical_cols))))
encoded_df

Unnamed: 0,Giá,Năm SX,Km đã đi,Nhiên liệu_Máy dầu,Nhiên liệu_Máy xăng,Nhiên liệu_Điện,Kiểu dáng_Convertible,Kiểu dáng_Coupe,Kiểu dáng_Crossover,Kiểu dáng_Hatchback,...,Tỉnh thành_Tp.HCM,Tỉnh thành_Trà Vinh,Tỉnh thành_Tây Ninh,Tỉnh thành_Vĩnh Long,Tỉnh thành_Vĩnh Phúc,Tỉnh thành_Đà Nẵng,Tỉnh thành_Đắk Lắk,Tỉnh thành_Đắk Nông,Tỉnh thành_Đồng Nai,Tỉnh thành_Đồng Tháp
0,4290.0,2024.0,3000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,590.0,2019.0,59000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,535.0,2019.0,69000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,443.0,2020.0,10000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,450.0,2022.0,52000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080,310.0,2013.0,64000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2081,425.0,2015.0,78000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2082,2850.0,2022.0,40000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2083,500.0,2023.0,18000.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
encoded_df_have_date = encoded_df.copy()
encoded_df_have_date['Ngày đăng'] = day
encoded_df_have_date['Tháng đăng'] = month
encoded_df_have_date.head()

Unnamed: 0,Giá,Năm SX,Km đã đi,Nhiên liệu_Máy dầu,Nhiên liệu_Máy xăng,Nhiên liệu_Điện,Kiểu dáng_Convertible,Kiểu dáng_Coupe,Kiểu dáng_Crossover,Kiểu dáng_Hatchback,...,Tỉnh thành_Tây Ninh,Tỉnh thành_Vĩnh Long,Tỉnh thành_Vĩnh Phúc,Tỉnh thành_Đà Nẵng,Tỉnh thành_Đắk Lắk,Tỉnh thành_Đắk Nông,Tỉnh thành_Đồng Nai,Tỉnh thành_Đồng Tháp,Ngày đăng,Tháng đăng
0,4290.0,2024.0,3000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,9
1,590.0,2019.0,59000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,10
2,535.0,2019.0,69000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29,9
3,443.0,2020.0,10000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10
4,450.0,2022.0,52000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,27,9


In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(encoded_df)
df_scaled = pd.DataFrame(X_scaled, columns=encoded_df.columns)

pca = PCA(n_components=0.65)
X_pca = pca.fit_transform(X_scaled)
# price = np.array(encoded_df['Giá']).reshape(-1,1)
X_pca = pd.DataFrame(X_pca)
df_pca = pd.concat([X_pca, df_scaled['Giá']], axis=1)
df_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,Giá
0,-3.094314,0.770776,1.440742,1.835258,0.363242,0.814916,-0.423697,-0.602307,0.048463,0.078501,...,-0.088821,-0.253444,-0.235739,0.108301,-0.191576,-0.140471,0.254372,0.066631,-0.081621,2.536680
1,-0.505319,0.076120,-0.223668,-0.613519,-0.917855,-0.145758,-0.019042,0.130224,0.062166,0.551759,...,-5.751410,6.047856,8.099430,-2.862013,5.601257,-3.360968,3.347328,-3.758432,6.415903,-0.222673
2,1.476442,-3.581598,15.551405,-6.065042,-1.875028,-0.524501,-0.192860,-0.215348,-0.883429,-0.752725,...,-0.018080,0.002702,0.006960,0.011524,-0.002944,-0.000840,0.014803,-0.010476,-0.009071,-0.263691
3,2.079794,0.223583,-0.031092,0.661423,1.596318,-0.346438,3.891540,0.592880,-0.819631,2.578589,...,0.004097,0.048216,0.032939,0.001289,0.016362,-0.020297,0.012089,0.013296,-0.031905,-0.332302
4,-0.365727,0.347059,0.170807,0.450102,-1.011345,0.238080,0.855752,-0.846081,-2.056503,1.203385,...,-0.091981,-0.370175,-0.286205,-0.095522,-0.039013,0.126056,-0.026791,0.079138,0.161425,-0.327081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080,-0.256390,2.068054,0.694144,0.313739,0.936718,-0.959352,-1.060665,0.113810,-0.246932,-1.325313,...,-0.019259,0.002871,0.003399,-0.007120,-0.025122,-0.013879,-0.014673,-0.009364,-0.008235,-0.431489
2081,1.084728,1.341828,-0.230356,-0.711860,-0.620938,-0.633271,-0.735775,0.352357,-1.085405,-0.761458,...,-1.370382,0.495446,-0.780398,-0.619044,0.183208,0.348991,-0.269731,0.073824,-0.169825,-0.345726
2082,-2.227164,0.085726,0.639906,0.696801,-1.004721,0.750112,-0.402554,-0.138192,1.425100,0.128160,...,-0.090928,-0.239902,-0.227327,0.103249,-0.209150,-0.137748,0.260174,0.077378,-0.083820,1.462770
2083,-1.404436,0.362691,-0.483749,-2.114375,0.555422,0.240508,5.269981,0.044961,-0.286441,-1.730972,...,-0.735206,-0.764393,-0.225153,0.194225,1.201885,0.314930,1.704315,0.731113,-0.233410,-0.289793


In [9]:
Data_train, Data_test = train_test_split(df_pca, test_size=0.2, random_state=42)
X_train = Data_train.drop('Giá', axis=1)
y_train = Data_train['Giá']
X_test = Data_test.drop('Giá', axis=1)
y_test = Data_test['Giá']

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error", mae)
print("R-squared:", r2)

Mean Squared Error: 0.26745079017823326
Mean Absolute Error 0.27071370066881284
R-squared: 0.7523367547741249
