In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [39]:
import os
for dirname, _, filenames in os.walk('./kaggle/') :
    for filename in filenames :
        print(os.path.join(dirname, filename))

./kaggle/y_train.csv
./kaggle/X_train.csv
./kaggle/y_test.csv
./kaggle/X_test.csv


In [40]:
xtrain = pd.read_csv('./kaggle/X_train.csv')
xtest = pd.read_csv('./kaggle/X_test.csv')
ytrain = pd.read_csv('./kaggle/y_train.csv')
ytest = pd.read_csv('./kaggle/y_test.csv')

In [41]:
xtrain.sample(5)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
3161,18717,merc,V Class,2018,Manual,16146,Diesel,145.0,46.3,2.1
1966,19428,hyundi,I40,2016,Manual,32309,Diesel,20.0,67.3,1.7
2720,16192,bmw,M6,2016,Semi-Auto,19000,Petrol,555.0,27.4,4.4
949,15177,bmw,M4,2019,Semi-Auto,19,Petrol,145.0,34.0,3.0
3913,14381,merc,X-CLASS,2017,Automatic,24800,Diesel,260.0,35.8,2.3


In [42]:
# null cehck
xtrain.isnull().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [43]:
xtest.isnull().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [44]:
xtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB


In [55]:
# unique 컬럼 확인
cols = ['brand', 'model', 'transmission', 'fuelType']
for col in cols :
    print(xtrain[col].unique())
for col in cols :
    print(xtest[col].unique())

['hyundi' 'vauxhall' 'audi' 'vw' 'skoda' 'merc' 'toyota' 'bmw' 'ford']
['Santa Fe' 'GTC' 'RS4' 'Scirocco' 'Scala' 'V Class' 'Prius' 'M4' 'Camry'
 'KA' 'Vivaro' 'CLS Class' 'Caravelle' 'Arteon' 'Shuttle' 'I40' 'IX20'
 '6 Series' 'GL Class' 'S Class' 'S3' 'Yeti' 'Galaxy' 'Puma' 'Edge' 'A8'
 'SLK' 'Kamiq' 'RS6' 'CLA Class' 'Land Cruiser' 'M Class' 'Q8' 'i3'
 'Verso' 'Mustang' 'IX35' 'Amarok' 'Avensis' 'Grand Tourneo Connect'
 'Antara' 'Tourneo Connect' 'Beetle' 'X4' 'CC' 'GT86' 'X-CLASS' 'I800'
 'i8' 'Caddy Maxi Life' 'Combo Life' 'Rapid' 'SQ7' 'Grand C-MAX'
 'Tourneo Custom' 'California' 'Agila' 'A7' 'Zafira Tourer' 'G Class'
 'Tiguan Allspace' 'X6' 'M2' 'X7' '7 Series' 'Z4' 'RS5' 'Hilux'
 'GLS Class' 'GLB Class' 'M5' 'RS3' 'Caddy Life' 'SQ5' 'Supra' '8 Series'
 'Fusion' 'M6' 'M3' 'Jetta' 'S4' 'R8' 'PROACE VERSO' 'Caddy' 'Getz' 'Eos'
 'CLK' 'IQ' 'Z3' 'Roomster']
['Semi-Auto' 'Manual' 'Automatic' 'Other']
['Diesel' 'Petrol' 'Hybrid' 'Other' 'Electric']
['merc' 'vw' 'skoda' 'audi' 'vauxhal

In [46]:
df = pd.concat([xtrain, ytrain], axis=1)
df.drop(df.columns[-2],axis=1,inplace=True)
df

Unnamed: 0,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,hyundi,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2,31995
1,vauxhall,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0,7700
2,audi,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9,58990
3,vw,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0,12999
4,skoda,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0,16990
...,...,...,...,...,...,...,...,...,...,...
4955,merc,GL Class,2015,Automatic,24314,Diesel,125.0,56.6,2.1,17999
4956,bmw,6 Series,2017,Automatic,18000,Diesel,145.0,51.4,3.0,28900
4957,vw,CC,2015,Manual,84932,Diesel,30.0,60.1,2.0,8998
4958,audi,A7,2017,Semi-Auto,30150,Diesel,145.0,62.8,3.0,23198


In [47]:
xtest=pd.concat([xtest, ytest], axis=1)
xtest.drop(xtest.columns[-2],axis=1,inplace=True)
xtest

Unnamed: 0,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,merc,GLS Class,2017,Automatic,12046,Diesel,150.0,37.2,3.0,38000
1,vw,Amarok,2017,Automatic,37683,Diesel,260.0,36.2,3.0,23495
2,merc,GLS Class,2019,Automatic,10000,Diesel,145.0,34.0,3.0,59999
3,skoda,Scala,2019,Manual,3257,Petrol,145.0,49.6,1.0,16713
4,audi,RS6,2015,Semi-Auto,20982,Petrol,325.0,29.4,4.0,46000
...,...,...,...,...,...,...,...,...,...,...
2667,audi,A7,2015,Semi-Auto,21100,Petrol,325.0,29.7,4.0,28990
2668,merc,CLS Class,2015,Automatic,60972,Diesel,160.0,52.3,3.0,17450
2669,ford,Puma,2020,Manual,4111,Petrol,145.0,50.4,1.0,21995
2670,merc,CLA Class,2016,Automatic,25726,Petrol,200.0,41.5,2.0,18700


In [48]:
df=pd.concat([df, xtest], axis=0)
df.reset_index(inplace = True)

In [49]:
# brand label 
df.replace({'brand' : { 'audi' : 1, 'bmw' : 2, 'merc' : 3, 'vw': 4, 'toyota': 5, 'ford': 6, 'hyundi': 7, 'skoda': 8, 'vauxhall':9 }},inplace=True)
df

Unnamed: 0,index,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price
0,0,7,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2,31995
1,1,9,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0,7700
2,2,1,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9,58990
3,3,4,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0,12999
4,4,8,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0,16990
...,...,...,...,...,...,...,...,...,...,...,...
7627,2667,1,A7,2015,Semi-Auto,21100,Petrol,325.0,29.7,4.0,28990
7628,2668,3,CLS Class,2015,Automatic,60972,Diesel,160.0,52.3,3.0,17450
7629,2669,6,Puma,2020,Manual,4111,Petrol,145.0,50.4,1.0,21995
7630,2670,3,CLA Class,2016,Automatic,25726,Petrol,200.0,41.5,2.0,18700


In [54]:
# 표본이 적은것들 삭제 및 outlier 제거
df=df[df.transmission!='Other']
df=df[df.fuelType!='Electric']
df=df[df.mpg<300]

In [61]:
# one-hot encoding
def OHE(dfcolumn):
    global df
    finallencol = (dfcolumn.nunique() - 1) + (len(df.columns)-1)    
    dummies = pd.get_dummies(dfcolumn, prefix=dfcolumn.name)    
    df=pd.concat([df,dummies],axis='columns')
    df.drop(columns=dfcolumn.name,axis=1,inplace=True) # We have to drop columns to aviod multi-collinearity
    if(finallencol==len(df.columns)):
        print('One Hot Encoding was successful!') 
        print('')
    else:
        print('Error in OHE XXXX')
    return df

In [62]:
OHE(df['transmission'])
OHE(df['fuelType'])

Error in OHE XXXX
Error in OHE XXXX


Unnamed: 0,index,brand,model,year,mileage,tax,mpg,engineSize,price,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,0,7,Santa Fe,2019,4223,145.0,39.8,2.2,31995,0,0,1,1,0,0,0
1,1,9,GTC,2015,47870,125.0,60.1,2.0,7700,0,1,0,1,0,0,0
2,2,1,RS4,2019,5151,145.0,29.1,2.9,58990,1,0,0,0,0,0,1
3,3,4,Scirocco,2016,20423,30.0,57.6,2.0,12999,1,0,0,1,0,0,0
4,4,8,Scala,2020,3569,145.0,47.1,1.0,16990,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7627,2667,1,A7,2015,21100,325.0,29.7,4.0,28990,0,0,1,0,0,0,1
7628,2668,3,CLS Class,2015,60972,160.0,52.3,3.0,17450,1,0,0,1,0,0,0
7629,2669,6,Puma,2020,4111,145.0,50.4,1.0,21995,0,1,0,0,0,0,1
7630,2670,3,CLA Class,2016,25726,200.0,41.5,2.0,18700,1,0,0,0,0,0,1


In [63]:
df.drop(['index'],axis=1,inplace=True)
df

Unnamed: 0,brand,model,year,mileage,tax,mpg,engineSize,price,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,7,Santa Fe,2019,4223,145.0,39.8,2.2,31995,0,0,1,1,0,0,0
1,9,GTC,2015,47870,125.0,60.1,2.0,7700,0,1,0,1,0,0,0
2,1,RS4,2019,5151,145.0,29.1,2.9,58990,1,0,0,0,0,0,1
3,4,Scirocco,2016,20423,30.0,57.6,2.0,12999,1,0,0,1,0,0,0
4,8,Scala,2020,3569,145.0,47.1,1.0,16990,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7627,1,A7,2015,21100,325.0,29.7,4.0,28990,0,0,1,0,0,0,1
7628,3,CLS Class,2015,60972,160.0,52.3,3.0,17450,1,0,0,1,0,0,0
7629,6,Puma,2020,4111,145.0,50.4,1.0,21995,0,1,0,0,0,0,1
7630,3,CLA Class,2016,25726,200.0,41.5,2.0,18700,1,0,0,0,0,0,1


In [75]:
df.drop(['model'], axis=1, inplace=True)
df

Unnamed: 0,brand,year,mileage,tax,mpg,engineSize,price,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,7,2019,4223,145.0,39.8,2.2,31995,0,0,1,1,0,0,0
1,9,2015,47870,125.0,60.1,2.0,7700,0,1,0,1,0,0,0
2,1,2019,5151,145.0,29.1,2.9,58990,1,0,0,0,0,0,1
3,4,2016,20423,30.0,57.6,2.0,12999,1,0,0,1,0,0,0
4,8,2020,3569,145.0,47.1,1.0,16990,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7627,1,2015,21100,325.0,29.7,4.0,28990,0,0,1,0,0,0,1
7628,3,2015,60972,160.0,52.3,3.0,17450,1,0,0,1,0,0,0
7629,6,2020,4111,145.0,50.4,1.0,21995,0,1,0,0,0,0,1
7630,3,2016,25726,200.0,41.5,2.0,18700,1,0,0,0,0,0,1


In [76]:
xdata = df[df.columns.difference(['price'])]
xdata

Unnamed: 0,brand,engineSize,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,mileage,mpg,tax,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,year
0,7,2.2,1,0,0,0,4223,39.8,145.0,0,0,1,2019
1,9,2.0,1,0,0,0,47870,60.1,125.0,0,1,0,2015
2,1,2.9,0,0,0,1,5151,29.1,145.0,1,0,0,2019
3,4,2.0,1,0,0,0,20423,57.6,30.0,1,0,0,2016
4,8,1.0,0,0,0,1,3569,47.1,145.0,0,0,1,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7627,1,4.0,0,0,0,1,21100,29.7,325.0,0,0,1,2015
7628,3,3.0,1,0,0,0,60972,52.3,160.0,1,0,0,2015
7629,6,1.0,0,0,0,1,4111,50.4,145.0,0,1,0,2020
7630,3,2.0,0,0,0,1,25726,41.5,200.0,1,0,0,2016


In [77]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
x_train, x_test, y_train, y_test = train_test_split(xdata, df['price'], test_size=0.2, random_state=123)

In [78]:
# 최적 파라미터 찾기
from sklearn.model_selection import RandomizedSearchCV
model = RandomForestRegressor()
# Number of trees in random forest
n_estimators = [50, 100, 150]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [6, 8, 10, 12]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [80]:
model_random =  RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=123, n_jobs = -1)
model_random.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True 


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=50, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True, total=   1.3s
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True 


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  n_estimators=50, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True, total=   1.0s
[CV] n_estimators=50, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True 
[CV]  n_estimators=50, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True, total=   1.2s
[CV] n_estimators=150, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=False, total=  10.7s
[CV] n_estimators=150, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=False 
[CV]  n_estimators=150, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=False, total=  10.8s
[CV] n_estimators=150, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=False 
[CV]  n_es

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [6, 8, 10, 12, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 150]},
                   random_state=123, verbose=2)

In [81]:
# best params, score 확인
print('최적 파라미터:', model_random.best_params_)
print(f'최고 점수: {model_random.best_score_:.4f}')

최적 파라미터: {'n_estimators': 150, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 12, 'bootstrap': True}
최고 점수: 0.9346


In [82]:
# scaler, pca 이런거 다 안하고 
# model 만 제외
model = RandomForestRegressor(n_estimators = 150, min_samples_split= 10, min_samples_leaf= 1, max_features= 'auto', max_depth= 12, bootstrap= True)
model.fit(x_train, y_train)
print('Train 정확도 :', round(100 * model.score(x_train, y_train), 2), '%')
print('Test 정확도 :', round(100 * model.score(x_test, y_test), 2), '%')

Train 정확도 : 97.39 %
Test 정확도 : 94.81 %


In [84]:
pred = model.predict(x_test)
round(r2_score(y_test, pred), 4)

0.9481

In [85]:
pred

array([ 5716.53400666, 18485.77571956, 26840.99032431, ...,
        3840.1176379 , 24505.92858583, 15885.03353019])

In [86]:
# model 저장하기
import joblib
joblib.dump(model, './simplemodel.pkl')

['./simplemodel.pkl']