## Auto.ru price prediction with XGBRegressor

In [34]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

### Reading the dataset
#### Dropping the `title` and `model` columns. There are too many distinct models, and I think the brand name is more informative when estimating the price of a car.

In [35]:
# Load the semicolon-separated listings file and discard the two
# free-text / high-cardinality columns in a single expression.
df = pd.read_csv('autoru_cleaned.csv', sep=';').drop(columns=['title', 'model'])

### Removing the most expensive cars (prices at or above the 95th percentile)

In [36]:
# Keep only listings priced strictly below the 95th percentile of `price`.
price_cap = df['price'].quantile(0.95)
below_cap = df['price'] < price_cap
df = df[below_cap]
df.shape

(60734, 10)

### One-hot encoding the categorical columns into features

In [37]:
# Every object-dtype column is treated as a categorical feature.
cat_columns = df.columns[df.dtypes == object].tolist()

# The fitted encoder is reused later for the hand-built validation row;
# handle_unknown='ignore' lets transform() accept categories unseen here.
enc = OneHotEncoder(handle_unknown='ignore')

# Give the encoded frame the SAME index as `df`. `df` was row-filtered
# earlier, so its labels are no longer 0..n-1, while a bare DataFrame(...)
# would get a fresh 0..n-1 index. `join` aligns on index labels, so without
# this the one-hot rows would be attached to the wrong cars and rows whose
# label exceeds n-1 would be filled with NaN.
enc_df = pd.DataFrame(
    enc.fit_transform(df[cat_columns]).toarray(),
    index=df.index,
)

# Numeric columns (including the `price` target) plus the one-hot columns.
df_2 = df.drop(cat_columns, axis=1)
df_2 = df_2.join(enc_df)

In [38]:
# Target vector first, then the feature matrix (everything except the target).
y = df_2['price']
X = df_2.drop(columns='price')

In [39]:
# Hold out 30% of the rows for the final evaluation. The seed is fixed so the
# split (and therefore every score reported below) is reproducible after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Instantiate the model and parameters for grid-search

In [40]:
# Base estimator; all hyperparameters not listed in the grid keep the
# library defaults.
model = xgboost.XGBRegressor()

# Grid that is actually searched: 4 tree depths x 4 ensemble sizes.
# Further candidates, currently disabled (add them back to widen the search):
#   colsample_bytree: [0.4, 0.6, 0.8]
#   gamma:            [0, 0.03, 0.1, 0.3]
#   min_child_weight: [1.5, 6, 10]
#   learning_rate:    [0.1, 0.07]
#   reg_alpha:        [1e-5, 1e-2, 0.75]
#   reg_lambda:       [1e-5, 1e-2, 0.45]
#   subsample:        [0.6, 0.95]
parameters_for_testing = dict(
    max_depth=[7, 8, 9, 10],
    n_estimators=range(20, 100, 20),  # 20, 40, 60, 80
)

In [41]:
# Exhaustive search over `parameters_for_testing`.
# No `scoring` is passed, so candidates are ranked with the default metric;
# pass scoring='neg_mean_squared_error' to rank by MSE instead.
gsearch = GridSearchCV(
    model,
    parameters_for_testing,
    n_jobs=-1,   # -1: run fits in parallel on all available workers
    verbose=10,  # print per-task progress while fitting
)

### Fitting the model. This will take a while (depending on the size of the parameter grid)

In [None]:
# Run the grid search on the training split only; the test split stays
# untouched until the final evaluation. The fitted search object is the
# cell's displayed value.
gsearch.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.7min


In [None]:
# Report the winning hyperparameter combination and its cross-validated score.
search_summary = (
    ('Best params: {}', gsearch.best_params_),
    ('Best score: {}', gsearch.best_score_),
)
for line_template, value in search_summary:
    print(line_template.format(value))

In [None]:
# Score the best model found by the search on the held-out test split (R^2).
best_model = gsearch.best_estimator_
y_pred = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print(test_r2)

In [None]:
# Predicted vs. actual prices on the test split. Explicit labels are needed
# because the predictions are an unnamed array, so the x axis would otherwise
# be blank and the figure could not be read on its own.
ax = sns.scatterplot(x=y_pred, y=y_test)
ax.set(
    xlabel='Predicted price',
    ylabel='Actual price',
    title='Predicted vs. actual price (test set)',
);  # trailing ';' hides the repr of the value returned by ax.set

### Check on real-life car price

In [None]:
# A single hand-entered listing, described as one record and wrapped into a
# one-row frame. Column order follows the key order of the record.
sample_listing = {
    'brand': 'mitsubishi',
    'gear': 'автомат',
    'drive': 'передний',
    'color': 'красный',
    'year': 2006,
    'mileage': 230000,
    'price': 250000,
    'eng_vol': 1.6,
    'hps': 60,
    'gas_type': 'Бензин',
}
df_valid = pd.DataFrame([sample_listing])

In [None]:
# Encode the validation row with the encoder fitted on the training data,
# then swap the raw categorical columns for their one-hot counterparts.
valid_onehot = enc.transform(df_valid[cat_columns]).toarray()
enc_df_valid = pd.DataFrame(valid_onehot)
df_valid = df_valid.drop(columns=cat_columns).join(enc_df_valid)

In [None]:
# Select the features in exactly the column order of the training matrix `X`.
# The validation row was typed in by hand, so its column order is not
# guaranteed to match the CSV-derived training frame. Indexing by `X.columns`
# fixes the order and raises a KeyError if any training feature is missing.
X_valid = df_valid.drop('price', axis=1)[X.columns]
y_valid = df_valid['price']

In [None]:
# Compare the model's estimate for the hand-entered listing with its asking price.
final_model = gsearch.best_estimator_
y_predicted = final_model.predict(X_valid)
predicted_price, actual_price = y_predicted[0], y_valid[0]
print('Predicted car price: {}'.format(predicted_price))
print('Actual car price: {}'.format(actual_price));