# EASY

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

import pandas as pd

In [2]:
# NOTE(review): the variable is named df_bmw, but the file loaded here is the Audi listing.
df_bmw = pd.read_csv("../data/audi.csv")
# Display each column's dtype to see which features are numeric.
df_bmw.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [3]:
# Numeric columns used as features for the single-brand model.
num_val = ["year", "mileage", "tax", "mpg", "engineSize"]
df_bmw_clear = df_bmw[num_val]
# Fix the seed so the 80/20 split (and every score computed from it) is
# identical after Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df_bmw_clear, df_bmw["price"], train_size=0.8, random_state=42
)

In [4]:
# Seed the tree: feature order is permuted at each split, so ties between
# equally good splits are otherwise broken differently on every run.
model = DecisionTreeRegressor(random_state=42)
model.fit(x_train, y_train)

In [5]:
# Score the held-out split first, then the training split, and report both.
r2_test = r2_score(y_test, model.predict(x_test))
r2_train = r2_score(y_train, model.predict(x_train))
print(f"R2 score на тестовой выборке: {r2_test}\nR2 score на обучающей выборке: {r2_train}")

R2 score на тестовой выборке: 0.8900636169105752
R2 score на обучающей выборке: 0.996602899770547


In [6]:
# Pair every feature name with its importance from the fitted tree.
pd.Series(dict(zip(num_val, model.feature_importances_)))

year          0.200324
mileage       0.083958
tax           0.048534
mpg           0.457521
engineSize    0.209663
dtype: float64

Как можно заметить, самый важный признак для Audi — mpg (важность ≈ 0.46).

Однако, подставляя другие марки, можно выяснить, что для большинства из них самый важный признак — mpg, для некоторых важнее год выпуска (bmw, ford), а для некоторых — engineSize (toyota). А для мерседесов, например, одинаково важны mileage и engineSize.

# MEDIUM

In [7]:
# File stems of the per-brand CSVs; a brand's position in this list is its numeric id.
brands = ["audi", "bmw", "cclass", "focus", "ford", "hyundi", "merc", "skoda", "toyota", "vauxhall", "vw"]
df_all = []
# enumerate supplies the id directly, so there is no hand-maintained counter
# that could drift out of step with the list.
for index, brand in enumerate(brands):
    df = pd.read_csv(f"../data/{brand}.csv")
    # Store the brand as an integer right away so no string-to-number
    # conversion is needed later.
    df["brand"] = index
    df_all.append(df)

В датасете hyundi поле `tax` называется `tax(£)`. Поэтому переименуем это поле, чтобы при слиянии оно не доставляло проблем.

Кроме того, бренд сразу задан уникальным числом (столбец `brand`), чтобы потом не переводить строки в числа.

In [8]:
# Locate the hyundi frame by brand name rather than by a hard-coded position,
# so reordering `brands` cannot silently rename a column in the wrong frame.
hyundi_pos = brands.index("hyundi")
# Reassign instead of mutating in place; the result is the same frame with
# 'tax(£)' renamed to 'tax' so it lines up with the other brands on concat.
df_all[hyundi_pos] = df_all[hyundi_pos].rename(columns={'tax(£)': 'tax'})

In [9]:
merged = pd.concat(df_all)
merged

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,A1,2017,12500,Manual,15735,Petrol,150.0,55.4,1.4,0
1,A6,2016,16500,Automatic,36203,Diesel,20.0,64.2,2.0,0
2,A1,2016,11000,Manual,29946,Petrol,30.0,55.4,1.4,0
3,A4,2017,16800,Automatic,25952,Diesel,145.0,67.3,2.0,0
4,A3,2019,17300,Manual,1998,Petrol,145.0,49.6,1.0,0
...,...,...,...,...,...,...,...,...,...,...
15152,Eos,2012,5990,Manual,74000,Diesel,125.0,58.9,2.0,10
15153,Fox,2008,1799,Manual,88102,Petrol,145.0,46.3,1.2,10
15154,Fox,2009,1590,Manual,70000,Petrol,200.0,42.0,1.4,10
15155,Fox,2006,1250,Manual,82704,Petrol,150.0,46.3,1.2,10


Преобразуем категориальные признаки

In [10]:
# Inspect the merged frame's column dtypes to see which columns are still non-numeric.
merged.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax             float64
mpg             float64
engineSize      float64
brand             int64
dtype: object

In [11]:
# Drop incomplete rows first so the encoders are fitted only on rows that are
# actually kept. The explicit copy makes `merged` an independent frame, so the
# column assignments below cannot trigger chained-assignment warnings.
merged = merged.dropna().copy()

# One encoder per categorical column. Refitting a single shared encoder would
# discard the earlier mappings and make inverse_transform impossible for them.
encoders = {}
for column in ["model", "transmission", "fuelType"]:
    encoders[column] = LabelEncoder()
    merged[column] = encoders[column].fit_transform(merged[column])

merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99187 entries, 0 to 15156
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         99187 non-null  int64  
 1   year          99187 non-null  int64  
 2   price         99187 non-null  int64  
 3   transmission  99187 non-null  int64  
 4   mileage       99187 non-null  int64  
 5   fuelType      99187 non-null  int64  
 6   tax           99187 non-null  float64
 7   mpg           99187 non-null  float64
 8   engineSize    99187 non-null  float64
 9   brand         99187 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 8.3 MB


In [12]:
# Hyper-parameter grid explored by the grid search.
params = {
    'min_samples_leaf': [1, 5, 15, 30],
    'splitter': ['best', 'random'],
    'max_depth': [5, 25, 50, 100, 200, 300, 400, 500],
    'criterion': ['squared_error', 'friedman_mse']
}
# Feature matrix: every column except the target `price`.
train = merged[['model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize', 'brand']]
# Seed the 80/20 split so the search below sees the same rows on every run.
x_train, x_test, y_train, y_test = train_test_split(
    train, merged['price'], train_size=0.8, random_state=42
)


In [13]:
%%time

search = GridSearchCV(DecisionTreeRegressor(), params)
search.fit(x_train, y_train)
search.best_estimator_

CPU times: user 1min 27s, sys: 1.18 s, total: 1min 28s
Wall time: 1min 28s


In [14]:
# Tabulate every grid-search candidate and rank by mean cross-validated score, best first.
df = pd.DataFrame(search.cv_results_)
df.sort_values('mean_test_score', ascending=False)[['params', 'mean_test_score']]

Unnamed: 0,params,mean_test_score
34,"{'criterion': 'squared_error', 'max_depth': 20...",0.933010
90,"{'criterion': 'friedman_mse', 'max_depth': 100...",0.933002
82,"{'criterion': 'friedman_mse', 'max_depth': 50,...",0.932955
122,"{'criterion': 'friedman_mse', 'max_depth': 500...",0.932955
106,"{'criterion': 'friedman_mse', 'max_depth': 300...",0.932890
...,...,...
71,"{'criterion': 'friedman_mse', 'max_depth': 5, ...",0.618918
65,"{'criterion': 'friedman_mse', 'max_depth': 5, ...",0.618437
67,"{'criterion': 'friedman_mse', 'max_depth': 5, ...",0.616081
69,"{'criterion': 'friedman_mse', 'max_depth': 5, ...",0.605133


Посмотрев на таблицу, можно сказать, что лучшие модели почти не отличаются друг от друга по качеству (R² ≈ 0.933), а заведомо слабые — с `max_depth = 5` — как и ожидалось, предсказывают заметно хуже (R² ≈ 0.61–0.62).