In [1]:
# import 

import pandas as pd
import numpy as np

import warnings
# Suppress warnings from here on: the "ignore" action is installed without any
# category or module filter.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score


In [2]:
# Load the train/test feature and target tables from CSV.
X_train, y_train, X_test, y_test = map(
    pd.read_csv,
    (
        './datasets/car/X_train.csv',
        './datasets/car/y_train.csv',
        './datasets/car/X_test.csv',
        './datasets/car/y_test.csv',
    ),
)

In [3]:
# Inspect the loaded tables: print column dtypes and non-null counts for each,
# in the order train features, test features, train target, test target.
for frame in (X_train, X_test, y_train, y_test):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2672 entries, 0 to 2671
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         2672 non-null   int64  
 1   brand         2672 non-null   object 
 2   model         2672 non-null  

In [4]:
# Check the values column by column.
# select_dtypes(include='object') keeps only the object-dtype columns, so this
# loop covers the categorical (string) features.
for name, values in X_train.select_dtypes(include='object').items():
    print(
        '-------------------',
        name,
        values.nunique(),           # number of distinct values
        values.value_counts()[:2],  # the two most frequent values
        sep='\n',
    )

-------------------
brand
9
merc    790
vw      768
Name: brand, dtype: int64
-------------------
model
90
 Arteon         167
 Grand C-MAX    164
Name: model, dtype: int64
-------------------
transmission
4
Manual       1948
Automatic    1660
Name: transmission, dtype: int64
-------------------
fuelType
5
Diesel    2989
Petrol    1754
Name: fuelType, dtype: int64


In [5]:
# select_dtypes(exclude='object') keeps every non-object column, i.e. the
# numeric features; print a short profile of each one.
for name, values in X_train.select_dtypes(exclude='object').items():
    print('-------------------')
    print(name)
    print(values.nunique())     # number of distinct values
    print(values.unique()[:2])  # first two distinct values
    print(values.describe())    # count / mean / std / quartiles

-------------------
carID
4960
[13207 17314]
count     4960.000000
mean     15832.446169
std       2206.717006
min      12002.000000
25%      13929.250000
50%      15840.000000
75%      17765.750000
max      19629.000000
Name: carID, dtype: float64
-------------------
year
24
[2019 2015]
count    4960.000000
mean     2016.737903
std         2.884035
min      1997.000000
25%      2016.000000
50%      2017.000000
75%      2019.000000
max      2020.000000
Name: year, dtype: float64
-------------------
mileage
3900
[ 4223 47870]
count      4960.000000
mean      24956.286895
std       24443.333662
min           1.000000
25%        5641.250000
50%       19000.000000
75%       36702.000000
max      259000.000000
Name: mileage, dtype: float64
-------------------
tax
41
[145. 125.]
count    4960.000000
mean      152.332661
std        82.403844
min         0.000000
25%       145.000000
50%       145.000000
75%       150.000000
max       580.000000
Name: tax, dtype: float64
-------------------
mp

In [6]:
# Stack the train and test features into one frame and report null counts per column.
alldata = pd.concat([X_train, X_test], axis=0)
print(alldata.isna().sum())

# Replace every object-dtype column with integer codes. The encoder is re-fit
# per column on the combined train+test values, so after the loop `le` only
# holds the mapping of the last encoded column.
le = LabelEncoder()
c = alldata.select_dtypes(include='object').columns
for col_name in c:
    alldata[col_name] = le.fit_transform(alldata[col_name])
alldata.head()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64


Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,13207,3,69,2019,3,4223,0,145.0,39.8,2.2
1,17314,7,30,2015,1,47870,0,125.0,60.1,2.0
2,12342,0,58,2019,0,5151,4,145.0,29.1,2.9
3,13426,8,71,2016,0,20423,0,30.0,57.6,2.0
4,16004,5,70,2020,3,3569,4,145.0,47.1,1.0


In [7]:
# Remove the ID column from the combined frame and split it back into the
# train/test parts by row position.
# NOTE(review): `id` shadows the Python builtin; the name is kept because the
# submission cell reads it. This cell also rebinds X_train/X_test/y_train, so
# it is not safe to run twice without reloading the data.

id = X_test['carID']  # test-set IDs, saved before the column is dropped
alldata = alldata.drop(columns=['carID'])
n_train_rows = len(X_train)
X_train = alldata.iloc[:n_train_rows]
X_test = alldata.iloc[n_train_rows:]
y_train = y_train['price']

In [8]:
# Split the labelled data: 70% for fitting, 30% held out for validation
# (fixed random_state so the split is repeatable).
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
)

In [9]:
# RandomForestRegressor baseline.
# cross_val_score: cross-validation on the fitting split, using the function's
# default fold count and the estimator's default scorer.

rf = RandomForestRegressor(random_state=0)
score = cross_val_score(estimator=rf, X=X_train_m, y=y_train_m)
print('Average :', score.mean())

Average : 0.9214889243808766


In [10]:
# r2_score: fit on the fitting split, then measure R^2 on the held-out split.

rf.fit(X_train_m, y_train_m)
pred = rf.predict(X_test_m)
r2 = r2_score(y_true=y_test_m, y_pred=pred)
print('r2_score : ', r2)

r2_score :  0.9491003927254356


In [11]:
# Predict prices for the real test features with the fitted forest.
# NOTE(review): `rf` was last fit on X_train_m / y_train_m (the 70% split), not
# on the full X_train / y_train; consider refitting on all labelled rows
# before producing the final predictions.
result = rf.predict(X_test)

In [12]:
# Assemble the submission table: test-set carID next to its predicted price.
save = pd.DataFrame(
    {
        'carID': id,
        'price': result,
    }
)
# Export is currently disabled; uncomment to write the submission file.
# save.to_csv('car.csv', index=False)

In [13]:
# Display the submission frame as the cell's output.
save

Unnamed: 0,carID,price
0,12000,41842.94
1,12001,23848.56
2,12004,53892.52
3,12013,15120.75
4,12017,50231.27
...,...,...
2667,19618,37587.30
2668,19620,19027.04
2669,19626,20943.63
2670,19630,24867.80
