In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Read the pre-split feature and target tables from the working directory.
X_train, X_test = pd.read_csv("X_train.csv"), pd.read_csv("X_test.csv")
y_train, y_test = pd.read_csv("y_train.csv"), pd.read_csv("y_test.csv")

In [3]:
# Peek at five randomly chosen feature rows.
X_train.sample(n=5)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
1967,19173,audi,A8,2017,Automatic,3338,Diesel,145.0,50.4,3.0
1147,14249,merc,GL Class,2016,Manual,34132,Petrol,145.0,48.7,1.6
2721,18024,merc,CLS Class,2019,Semi-Auto,7444,Diesel,145.0,48.7,2.9
3112,12676,bmw,X6,2016,Semi-Auto,31585,Diesel,200.0,47.1,3.0
631,19109,skoda,Kamiq,2019,Semi-Auto,1250,Petrol,145.0,44.1,1.5


In [4]:
# Peek at five randomly chosen target rows.
y_train.sample(n=5)

Unnamed: 0,carID,price
1646,16634,25498
2105,19461,77880
629,15937,8000
1203,19040,9791
232,18259,9499


In [5]:
# Missing-value count per training feature column.
X_train.isna().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [6]:
# Missing-value count per test feature column.
X_test.isna().sum()

carID           0
brand           0
model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [7]:
# Missing-value count per test target column.
y_test.isna().sum()

carID    0
price    0
dtype: int64

In [8]:
# Missing-value count per training target column.
y_train.isna().sum()

carID    0
price    0
dtype: int64

In [9]:
# Summarise column dtypes and non-null counts; the object-typed columns are the
# ones that get integer-encoded before modelling.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB


In [10]:
# Categorical columns that are integer-encoded later in the notebook.
cols = ['brand', 'model', 'transmission', 'fuelType']

# List the distinct values of every categorical column: all training columns
# first, then all test columns, so the two splits can be compared by eye.
for frame in (X_train, X_test):
    for col in cols:
        print(frame[col].unique())

['hyundi' 'vauxhall' 'audi' 'vw' 'skoda' 'merc' 'toyota' 'bmw' 'ford']
[' Santa Fe' ' GTC' ' RS4' ' Scirocco' ' Scala' ' V Class' ' Prius' ' M4'
 ' Camry' ' KA' ' Vivaro' ' CLS Class' ' Caravelle' ' Arteon' ' Shuttle'
 ' I40' ' IX20' ' 6 Series' ' GL Class' ' S Class' ' S3' ' Yeti' ' Galaxy'
 ' Puma' ' Edge' ' A8' ' SLK' ' Kamiq' ' RS6' ' CLA Class' ' Land Cruiser'
 ' M Class' ' Q8' ' i3' ' Verso' ' Mustang' ' IX35' ' Amarok' ' Avensis'
 ' Grand Tourneo Connect' ' Antara' ' Tourneo Connect' ' Beetle' ' X4'
 ' CC' ' GT86' ' X-CLASS' ' I800' ' i8' ' Caddy Maxi Life' ' Combo Life'
 ' Rapid' ' SQ7' ' Grand C-MAX' ' Tourneo Custom' ' California' ' Agila'
 ' A7' ' Zafira Tourer' ' G Class' ' Tiguan Allspace' ' X6' ' M2' ' X7'
 ' 7 Series' ' Z4' ' RS5' ' Hilux' ' GLS Class' ' GLB Class' ' M5' ' RS3'
 ' Caddy Life' ' SQ5' ' Supra' ' 8 Series' ' Fusion' ' M6' ' M3' ' Jetta'
 ' S4' ' R8' ' PROACE VERSO' ' Caddy' ' Getz' ' Eos' ' CLK' ' IQ' ' Z3'
 ' Roomster']
['Semi-Auto' 'Manual' 'Automatic' 'O

In [11]:
# Replace each categorical column with integer codes learned from the training
# split only, so the test split never influences the encoding.
for col in cols:
    le=LabelEncoder()
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    # LabelEncoder.transform raises ValueError on a label it did not see during
    # fit, which would abort the notebook as soon as the test split contains a
    # category absent from training. Look the codes up explicitly instead and
    # send unseen labels to the sentinel -1; labels that were seen get exactly
    # the code LabelEncoder would have assigned (position in le.classes_).
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].map(code_by_label).fillna(-1).astype('int64')
X_train.sample(5)

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
3499,12105,4,80,2019,0,8974,0,265.0,35.8,2.3
820,14342,6,78,2015,1,16232,0,30.0,62.8,1.6
1984,19520,8,20,2019,3,452,0,150.0,33.6,2.0
2472,14470,6,9,2014,1,51353,0,30.0,62.8,2.0
1846,16205,6,9,2016,1,59244,0,20.0,67.3,1.6


In [12]:
# carID is a row identifier, not a predictive feature, so remove it from both
# feature tables. errors='ignore' keeps the cell idempotent: because the result
# is assigned back to the same names, re-running the cell would otherwise raise
# KeyError once the column is already gone.
X_train = X_train.drop(columns='carID', errors='ignore')
X_test = X_test.drop(columns='carID', errors='ignore')
X_train.sample(5)

Unnamed: 0,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
4871,2,42,2015,1,55000,4,30.0,57.7,1.2
2620,4,26,2016,1,30140,0,30.0,64.2,2.1
4769,1,1,2017,3,22879,4,205.0,40.4,3.0
1522,0,57,2016,3,26930,4,300.0,34.9,2.5
1503,4,14,2011,0,88559,0,200.0,46.3,3.0


In [23]:
# Tune the random forest with an exhaustive grid search under 3-fold
# cross-validation. A fixed random_state makes the bootstrap samples and feature
# sub-sampling repeatable, so the selected parameters and the reported score do
# not change from one run of the notebook to the next.
model = RandomForestRegressor(random_state=42)

# 3 * 4 * 3 * 3 = 108 candidate settings, each fitted on 3 folds.
# NOTE(review): a split must leave at least min_samples_leaf rows in each child,
# so a node needs >= 2 * min_samples_leaf rows to be split at all. With
# min_samples_leaf >= 8 most of the min_samples_split values below can never be
# the binding constraint and only repeat fits; consider trimming that list.
params={
    'n_estimators': [10,50,100],
    'max_depth' : [6,8,10,12],
    'min_samples_leaf' : [8,12,18],
    'min_samples_split' : [8,16,20]
}
grid_cv=GridSearchCV(model,param_grid=params,cv=3,n_jobs=-1)
grid_cv.fit(X_train,y_train['price'])
print('Parametreler: ',grid_cv.best_params_)
print(f'Best Score : {grid_cv.best_score_:.4f}')

Parametreler:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
Best Score : 0.9048


In [24]:
# Refit the final model with the hyper-parameters selected by the grid search.
# Reading them from grid_cv.best_params_ (instead of retyping the printed
# values) keeps this cell in sync whenever the grid or the data changes, and the
# fixed random_state makes the fitted forest reproducible.
model = RandomForestRegressor(**grid_cv.best_params_, random_state=42)
model.fit(X_train,y_train['price'])
# Score on the data the model was fitted on: an optimistic figure, useful only
# for comparison against the held-out score computed later.
print(model.score(X_train, y_train['price']))

0.9462487642494075


In [25]:
# Predict prices for the held-out cars and report R^2 against the true prices,
# rounded to four decimals.
y_pred = model.predict(X_test)
round(r2_score(y_true=y_test['price'], y_pred=y_pred), 4)

0.9214

In [26]:
# Price one hand-built car. The model was fitted on a DataFrame, so pass a
# one-row DataFrame carrying the same column labels rather than a bare nested
# list: this avoids scikit-learn's "X does not have valid feature names" warning
# and fails loudly if the number of values stops matching the training columns.
# NOTE(review): the values are already-encoded codes given in training column
# order (brand, model, year, transmission, mileage, fuelType, tax, mpg,
# engineSize, per the preview shown after carID was dropped) - confirm.
sample_car = pd.DataFrame(
    [[0, 56, 2019, 0, 6954, 4, 145.0, 21.6, 5.2]],
    columns=X_train.columns,
)
print(model.predict(sample_car))

[115662.66172695]
