In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [6]:
X_train=pd.read_csv("X_train.csv")
X_test=pd.read_csv("X_test.csv")
y_train=pd.read_csv("y_train.csv")
y_test=pd.read_csv("y_test.csv")

In [7]:
X_train.sample(5)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
216,16900,ford,Edge,2017,Manual,22922,Diesel,160.0,48.7,2.0
4277,15983,hyundi,IX35,2015,Manual,47456,Diesel,160.0,51.4,2.0
3594,15572,vw,Tiguan Allspace,2020,Semi-Auto,2500,Petrol,145.0,30.7,2.0
2690,13332,toyota,Prius,2019,Automatic,3754,Hybrid,135.0,217.3,1.8
1702,14086,bmw,M4,2019,Semi-Auto,5644,Petrol,145.0,34.0,3.0


In [8]:
y_train.sample(5)

Unnamed: 0,carID,price
1570,14380,50388
3925,15542,18490
130,13514,16781
3619,19433,4689
1230,12929,15601


In [9]:
X_train.isnull().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [10]:
X_test.isnull().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [11]:
y_test.isnull().sum()

carID    0
price    0
dtype: int64

In [12]:
y_train.isnull().sum()

carID    0
price    0
dtype: int64

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB


In [14]:
cols=['brand','model','transmission','fuelType']
for col in cols:
    print(X_train[col].unique())
for col in cols:
    print(X_test[col].unique())

['hyundi' 'vauxhall' 'audi' 'vw' 'skoda' 'merc' 'toyota' 'bmw' 'ford']
[' Santa Fe' ' GTC' ' RS4' ' Scirocco' ' Scala' ' V Class' ' Prius' ' M4'
 ' Camry' ' KA' ' Vivaro' ' CLS Class' ' Caravelle' ' Arteon' ' Shuttle'
 ' I40' ' IX20' ' 6 Series' ' GL Class' ' S Class' ' S3' ' Yeti' ' Galaxy'
 ' Puma' ' Edge' ' A8' ' SLK' ' Kamiq' ' RS6' ' CLA Class' ' Land Cruiser'
 ' M Class' ' Q8' ' i3' ' Verso' ' Mustang' ' IX35' ' Amarok' ' Avensis'
 ' Grand Tourneo Connect' ' Antara' ' Tourneo Connect' ' Beetle' ' X4'
 ' CC' ' GT86' ' X-CLASS' ' I800' ' i8' ' Caddy Maxi Life' ' Combo Life'
 ' Rapid' ' SQ7' ' Grand C-MAX' ' Tourneo Custom' ' California' ' Agila'
 ' A7' ' Zafira Tourer' ' G Class' ' Tiguan Allspace' ' X6' ' M2' ' X7'
 ' 7 Series' ' Z4' ' RS5' ' Hilux' ' GLS Class' ' GLB Class' ' M5' ' RS3'
 ' Caddy Life' ' SQ5' ' Supra' ' 8 Series' ' Fusion' ' M6' ' M3' ' Jetta'
 ' S4' ' R8' ' PROACE VERSO' ' Caddy' ' Getz' ' Eos' ' CLK' ' IQ' ' Z3'
 ' Roomster']
['Semi-Auto' 'Manual' 'Automatic' 'O

In [15]:
for col in cols:
    le=LabelEncoder()
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col]= le.transform(X_test[col])
X_train.sample(5)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
4861,15833,7,21,2019,1,1,4,145.0,42.2,1.2
871,19608,0,58,2006,1,66000,4,535.0,20.3,4.2
3179,17532,4,12,2017,0,18550,0,145.0,70.6,2.1
1960,14554,8,71,2016,1,22000,4,150.0,47.1,2.0
2280,12658,2,33,2015,1,64000,0,30.0,64.2,1.5


In [16]:
X_train = X_train.drop('carID',axis=1)
X_test = X_test.drop('carID',axis=1)
X_train.sample(5)

Unnamed: 0,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
2404,7,87,2017,0,15411,4,145.0,40.9,1.4
4599,2,42,2010,1,61575,4,30.0,55.4,1.2
4207,4,66,2012,0,47800,0,145.0,56.5,2.1
1477,5,84,2014,1,72500,0,150.0,55.4,2.0
3838,3,69,2016,1,23828,0,200.0,47.1,2.2


In [17]:
model = RandomForestRegressor()

params={
    'n_estimators': [10,50,100],
    'max_depth' : [6,8,10,12],
    'min_samples_leaf' : [8,12,18],
    'min_samples_split' : [8,16,20]
}
grid_cv=GridSearchCV(model,param_grid=params,cv=3,n_jobs=-1)
grid_cv.fit(X_train,y_train['price'])
print('Parametreler: ',grid_cv.best_params_)
print(f'Best Score : {grid_cv.best_score_:.4f}')

Parametreler:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 50}
Best Score : 0.9062


In [18]:
model = RandomForestRegressor(n_estimators = 100,
                             max_depth = 12,
                             min_samples_leaf = 8,
                             min_samples_split = 8)
model.fit(X_train,y_train['price'])
print(model.score(X_train, y_train['price']))

0.9462137882465467


In [19]:
y_pred=model.predict(X_test)
round(r2_score(y_test['price'],y_pred),4)

0.9213

In [20]:
print(model.predict([[0,56,2019,0,6954,4,145.0,21.6,5.2]]))

[114018.34332019]
