In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Load the pre-split feature and target tables from CSV files in the working directory.
X_train, X_test = pd.read_csv("X_train.csv"), pd.read_csv("X_test.csv")
y_train, y_test = pd.read_csv("y_train.csv"), pd.read_csv("y_test.csv")

In [3]:
# Peek at five randomly chosen rows of the training features.
X_train.sample(n=5)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
3561,17355,toyota,Prius,2015,Automatic,32314,Hybrid,0.0,72.4,1.8
906,19482,merc,V Class,2019,Semi-Auto,3999,Diesel,145.0,36.7,2.0
2762,13669,vw,Scirocco,2016,Manual,34995,Petrol,125.0,52.3,1.4
3061,15058,merc,S Class,2017,Automatic,35980,Diesel,160.0,52.3,3.0
2183,15397,toyota,Supra,2019,Semi-Auto,1902,Petrol,145.0,34.5,3.0


In [4]:
# Peek at five randomly chosen rows of the training targets.
y_train.sample(n=5)

Unnamed: 0,carID,price
1334,17840,7298
1133,18856,45000
689,19246,23780
294,16249,14972
3732,17575,18295


In [5]:
# Number of missing values per column in the training features.
X_train.isna().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [6]:
# Number of missing values per column in the test features.
X_test.isna().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [7]:
# Number of missing values per column in the test targets.
y_test.isna().sum()

carID    0
price    0
dtype: int64

In [8]:
# Number of missing values per column in the training targets.
y_train.isna().sum()

carID    0
price    0
dtype: int64

In [9]:
# Print column names, non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB


In [10]:
# Object-dtype columns of the feature tables; `cols` is reused by the encoding cell.
cols = ['brand', 'model', 'transmission', 'fuelType']

# Show the distinct values of every categorical column:
# all columns of the training split first, then all columns of the test split.
for frame in (X_train, X_test):
    for col in cols:
        print(frame[col].unique())

['hyundi' 'vauxhall' 'audi' 'vw' 'skoda' 'merc' 'toyota' 'bmw' 'ford']
[' Santa Fe' ' GTC' ' RS4' ' Scirocco' ' Scala' ' V Class' ' Prius' ' M4'
 ' Camry' ' KA' ' Vivaro' ' CLS Class' ' Caravelle' ' Arteon' ' Shuttle'
 ' I40' ' IX20' ' 6 Series' ' GL Class' ' S Class' ' S3' ' Yeti' ' Galaxy'
 ' Puma' ' Edge' ' A8' ' SLK' ' Kamiq' ' RS6' ' CLA Class' ' Land Cruiser'
 ' M Class' ' Q8' ' i3' ' Verso' ' Mustang' ' IX35' ' Amarok' ' Avensis'
 ' Grand Tourneo Connect' ' Antara' ' Tourneo Connect' ' Beetle' ' X4'
 ' CC' ' GT86' ' X-CLASS' ' I800' ' i8' ' Caddy Maxi Life' ' Combo Life'
 ' Rapid' ' SQ7' ' Grand C-MAX' ' Tourneo Custom' ' California' ' Agila'
 ' A7' ' Zafira Tourer' ' G Class' ' Tiguan Allspace' ' X6' ' M2' ' X7'
 ' 7 Series' ' Z4' ' RS5' ' Hilux' ' GLS Class' ' GLB Class' ' M5' ' RS3'
 ' Caddy Life' ' SQ5' ' Supra' ' 8 Series' ' Fusion' ' M6' ' M3' ' Jetta'
 ' S4' ' R8' ' PROACE VERSO' ' Caddy' ' Getz' ' Eos' ' CLK' ' IQ' ' Z3'
 ' Roomster']
['Semi-Auto' 'Manual' 'Automatic' 'O

In [11]:
# Integer-encode the categorical columns in place.
# Each encoder is fitted on the training split only and kept in `encoders`
# (column name -> fitted LabelEncoder), so the same label -> code mapping can be
# reused later, e.g. to encode a raw row before calling predict().
encoders = {}
for col in cols:
    le = LabelEncoder()
    le.fit(X_train[col])
    encoders[col] = le
    X_train[col] = le.transform(X_train[col])
    # classes_ is sorted, so a label's position is the code transform() assigns to it.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # le.transform() raises ValueError for labels that never occurred in the
    # training split; map those to -1 instead so a single unseen category in the
    # test data cannot abort the notebook.
    X_test[col] = X_test[col].map(code_of).fillna(-1).astype('int64')
X_train.sample(5)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
4618,16940,6,78,2017,1,36756,4,200.0,41.5,1.6
1242,16818,3,40,2015,1,64925,0,165.0,51.4,2.0
2940,18612,8,71,2015,1,54045,0,30.0,53.3,2.0
704,18430,3,39,2016,1,14600,4,125.0,50.4,1.4
101,17633,8,6,2017,0,22000,0,260.0,36.2,3.0


In [12]:
# Remove carID from both feature tables so it is not fed to the model.
# errors='ignore' keeps the cell idempotent: re-running it after the column has
# already been dropped is a no-op instead of a KeyError.
X_train = X_train.drop(columns='carID', errors='ignore')
X_test = X_test.drop(columns='carID', errors='ignore')
X_train.sample(5)

Unnamed: 0,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
1907,0,56,2019,0,6954,4,145.0,21.6,5.2
2868,5,61,2016,1,42379,4,20.0,60.1,1.2
3677,8,18,2019,0,10288,0,260.0,33.2,2.0
949,1,48,2019,3,19,4,145.0,34.0,3.0
392,4,63,2015,0,77719,0,160.0,51.4,3.0


In [32]:
# Exhaustive grid search over random-forest hyperparameters, scored with 3-fold
# cross-validation on the training split.
# random_state is fixed so the selected parameters and the best score are the
# same on every run; an unseeded forest can pick a different combination each time.
model = RandomForestRegressor(random_state=42)

# NOTE(review): a node can only be split if both children keep at least
# min_samples_leaf samples, so min_samples_split values below 2 * min_samples_leaf
# should not change the fitted trees. The grid may therefore contain redundant
# combinations -- confirm before pruning it.
params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}
grid_cv = GridSearchCV(model, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train['price'])

# Best parameter combination and its mean cross-validated score.
print('Parametreler: ', grid_cv.best_params_)
print(f'Best Score : {grid_cv.best_score_:.4f}')

Parametreler:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 16, 'n_estimators': 100}
Best Score : 0.9062


In [29]:
# Fit the final forest on the whole training split with the parameters chosen by
# the grid search. Taking them from grid_cv.best_params_ (rather than retyping
# them by hand) keeps this cell in agreement with the search result.
# random_state is fixed so the reported scores are reproducible.
model = RandomForestRegressor(**grid_cv.best_params_, random_state=42)
model.fit(X_train, y_train['price'])

# Score on the same data the model was fitted on -- an optimistic figure,
# not an estimate of performance on unseen cars.
print(model.score(X_train, y_train['price']))

0.9477462464711961


In [30]:
# Predict prices for the held-out split and report R^2 to four decimals.
y_pred = model.predict(X_test)
round(r2_score(y_true=y_test['price'], y_pred=y_pred), 4)

0.9209

In [31]:
# Predict the price of one already-encoded car.
# The values are passed as a one-row DataFrame keyed by feature name, so each
# number is tied to its column instead of to its position in a bare list, and the
# input has the same column layout the model was fitted on.
sample_car = pd.DataFrame(
    [{
        'brand': 0, 'model': 56, 'year': 2019, 'transmission': 0, 'mileage': 6954,
        'fuelType': 4, 'tax': 145.0, 'mpg': 21.6, 'engineSize': 5.2,
    }],
    columns=X_train.columns,
)
print(model.predict(sample_car))

[114232.73440189]
