In [1]:
# import 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


In [2]:
# 데이터 불러오기
X_train = pd.read_csv('./datasets/car/X_train.csv')
y_train = pd.read_csv('./datasets/car/y_train.csv')
X_test = pd.read_csv('./datasets/car/X_test.csv')
y_test = pd.read_csv('./datasets/car/y_test.csv')

In [4]:
# 데이터 확인
X_train.info()
X_test.info()
y_train.info()
y_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2672 entries, 0 to 2671
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         2672 non-null   int64  
 1   brand         2672 non-null   object 
 2   model         2672 non-null  

In [8]:
X_train

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,13207,hyundi,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2
1,17314,vauxhall,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0
2,12342,audi,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9
3,13426,vw,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0
4,16004,skoda,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0
...,...,...,...,...,...,...,...,...,...,...
4955,16898,merc,GL Class,2015,Automatic,24314,Diesel,125.0,56.6,2.1
4956,14416,bmw,6 Series,2017,Automatic,18000,Diesel,145.0,51.4,3.0
4957,15453,vw,CC,2015,Manual,84932,Diesel,30.0,60.1,2.0
4958,14666,audi,A7,2017,Semi-Auto,30150,Diesel,145.0,62.8,3.0


In [7]:
# 널 값 확인
X_train.isna().sum().sum()
X_test.isna().sum().sum()
y_train.isna().sum().sum()
y_test.isna().sum().sum()

0

In [17]:
# 컬럼별로 값 확인
# select_dtypes() : object형 데이터와 비object형(숫자형) 데이터를 구분해서 호출해주는 함수
# object형 데이터값만 호출 : include = 'object'
for col in X_train.select_dtypes(include = 'object'):
    target = X_train[col]
    print('-------------------')
    print(col)
    print(target.nunique()) # 값의 갯수
    print(target.value_counts()[:2])

-------------------
brand
9
merc    790
vw      768
Name: brand, dtype: int64
-------------------
model
90
 Arteon         167
 Grand C-MAX    164
Name: model, dtype: int64
-------------------
transmission
4
Manual       1948
Automatic    1660
Name: transmission, dtype: int64
-------------------
fuelType
5
Diesel    2989
Petrol    1754
Name: fuelType, dtype: int64


In [19]:
# exclude = 'object' : 숫자 데이터 유형만 가져오는 방법을 보여주는 것
for col in X_train.select_dtypes(exclude = 'object'):
    target = X_train[col]
    print('-------------------')
    print(col)
    print(target.nunique()) # 값의 갯수
    print(target.unique()[:2]) # 고유 값 이름들
    print(target.describe())

-------------------
carID
4960
[13207 17314]
count     4960.000000
mean     15832.446169
std       2206.717006
min      12002.000000
25%      13929.250000
50%      15840.000000
75%      17765.750000
max      19629.000000
Name: carID, dtype: float64
-------------------
year
24
[2019 2015]
count    4960.000000
mean     2016.737903
std         2.884035
min      1997.000000
25%      2016.000000
50%      2017.000000
75%      2019.000000
max      2020.000000
Name: year, dtype: float64
-------------------
mileage
3900
[ 4223 47870]
count      4960.000000
mean      24956.286895
std       24443.333662
min           1.000000
25%        5641.250000
50%       19000.000000
75%       36702.000000
max      259000.000000
Name: mileage, dtype: float64
-------------------
tax
41
[145. 125.]
count    4960.000000
mean      152.332661
std        82.403844
min         0.000000
25%       145.000000
50%       145.000000
75%       150.000000
max       580.000000
Name: tax, dtype: float64
-------------------
mp

In [22]:
# 데이터 전처리(삭제)
id = X_test['carID']
X_train = X_train.drop(['carID'], axis=1)
X_test = X_test.drop(['carID'], axis=1)
y_train = y_train['price']

In [9]:
# tax, mpg, engineSize 스케일링하기


