In [75]:
import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.feature_selection import SequentialFeatureSelector, mutual_info_regression
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-processed dataset; the path is relative to the notebook's
# working directory.
ucd = pd.read_parquet(path='processed_ucd.parquet')

In [7]:
# Features: every column except the first one.
# NOTE(review): this assumes column 0 is the `price` target -- confirm, otherwise
# the target leaks into X.
X = ucd.iloc[:, 1:]
# Target: listing price.
y = ucd['price']

In [34]:
# Column-type groups expressed as boolean masks aligned with X.columns.
# Index.isin does the membership test in one vectorized call and returns a plain
# boolean NumPy array, so each mask can be negated with `~` and used directly
# to select columns.
categ = X.columns.isin([
    'bed', 'body_type', 'cabin', 'city', 'engine_cylinders', 'franchise_make',
    'fuel_type', 'listing_color', 'transmission', 'wheel_system',
])
boolean = X.columns.isin([
    'fleet', 'frame_damaged', 'franchise_dealer', 'has_accidents', 'isCab',
    'is_cpo', 'is_new', 'salvage',
])
integer = X.columns.isin(['year_listed', 'month_listed', 'owner_count', 'maximum_seating'])

In [57]:
# Sequential feature selection wrapped around a default gradient-boosting
# regressor. With n_features_to_select='auto' and tol=None, scikit-learn keeps
# half of the candidate features; n_jobs=-1 uses every available core.
sfs = SequentialFeatureSelector(
    estimator=HistGradientBoostingRegressor(),
    n_features_to_select='auto',
    tol=None,
    n_jobs=-1,
)

In [71]:
# Draw a 5% subsample for the (expensive) sequential feature search.
# random_state pins the shuffle so the selected subset is reproducible.
# The remaining 95% is not needed here, so it is bound to throwaway names and
# released right away; `__` is avoided because IPython uses that name for its
# output history.
X_sfs, _X_rest, y_sfs, _y_rest = train_test_split(X, y, train_size=0.05, random_state=42)
del _X_rest, _y_rest

# Preview the non-categorical columns of the subsample.
X_sfs[X_sfs.columns[~categ]]

Unnamed: 0,fleet,frame_damaged,franchise_dealer,has_accidents,isCab,is_cpo,is_new,salvage,back_legroom,bed_length,...,length,maximum_seating,mileage,owner_count,seller_rating,width,wheelbase,year_listed,month_listed,model_age
1898903,False,False,True,False,False,False,False,False,38.6,,...,201.2,7.0,29738.0,1.0,3.666667,85.5,119.8,2020,9,1
1222743,False,False,False,False,False,False,False,False,32.9,,...,189.9,7.0,180000.0,1.0,4.000000,75.8,109.8,2020,8,7
1579978,False,False,True,False,False,True,False,False,38.2,,...,176.4,5.0,32686.0,1.0,4.263158,73.0,105.1,2020,8,2
462791,False,False,True,False,False,True,False,False,40.3,67.4,...,229.0,6.0,24981.0,1.0,3.333333,79.4,140.5,2020,7,3
150589,False,False,True,False,False,False,True,False,40.7,,...,180.5,5.0,5.0,1.0,4.500000,85.6,106.7,2020,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1847556,False,False,True,False,False,True,False,False,39.5,,...,189.5,5.0,39971.0,1.0,4.000000,75.0,112.5,2020,8,3
299187,False,False,True,False,False,False,True,False,,,...,,,10000.0,1.0,4.692308,,,2020,8,-1
48870,False,False,False,False,False,False,False,False,40.9,69.3,...,230.0,6.0,101995.0,2.0,4.857143,80.0,143.5,2020,6,6
1367312,False,False,False,True,False,False,False,False,36.3,,...,178.7,5.0,197250.0,2.0,5.000000,69.3,102.4,2019,7,10


In [72]:
# Run the selector on the non-categorical columns of the 5% subsample.
# fit() returns the selector itself, so `sfs_non_cat` refers to the same object as `sfs`.
sfs_non_cat = sfs.fit(X=X_sfs.loc[:, X_sfs.columns[~categ]], y=y_sfs)

In [73]:
# Names of the non-categorical features kept by the sequential selector.
sfs_non_cat.get_feature_names_out()

array(['fleet', 'back_legroom', 'bed_length', 'city_fuel_economy',
       'engine_displacement', 'fuel_tank_volume', 'height', 'horsepower',
       'maximum_seating', 'mileage', 'width', 'wheelbase', 'year_listed',
       'model_age'], dtype=object)

In [74]:
# `fleet` was also selected, but its meaning is unclear, so we drop it and keep:
# 'back_legroom', 'bed_length', 'city_fuel_economy', 'engine_displacement', 
# 'fuel_tank_volume', 'height', 'horsepower', 'maximum_seating', 'mileage', 
# 'width', 'wheelbase', 'year_listed','model_age'

In [76]:
# Preview the categorical columns of the subsample.
X_sfs.loc[:, X_sfs.columns[categ]]

Unnamed: 0,bed,body_type,cabin,city,engine_cylinders,franchise_make,fuel_type,listing_color,transmission,wheel_system
1898903,3,5,4,57,10,10,5,4,0,2
1222743,3,5,4,30,9,48,5,13,0,1
1579978,3,5,4,28,4,22,5,13,0,3
462791,3,4,4,51,10,37,5,13,0,0
150589,3,5,4,51,4,13,5,10,0,2
...,...,...,...,...,...,...,...,...,...,...
1847556,3,5,4,50,9,7,5,0,0,0
299187,3,4,4,51,4,37,5,13,0,3
48870,3,4,4,51,9,48,4,13,0,0
1367312,3,6,4,83,4,48,5,12,0,3


In [82]:
# Mutual information between each categorical feature and the price.
# discrete_features=True marks every supplied column as discrete (same meaning
# as an all-True mask). random_state fixes the small random noise the estimator
# adds to the continuous target, so the scores are reproducible.
mi = mutual_info_regression(X[X.columns[categ]], y, discrete_features=True, random_state=42)
mi

array([0.00532119, 0.25691869, 0.01328851, 0.33934268, 0.24108075,
       0.61537409, 0.06943575, 0.05704944, 0.0962967 , 0.22728871])

In [83]:
# Label each score with its feature name and rank from least to most informative.
pd.Series(data=mi, index=X.columns[categ]).sort_values(ascending=True)

bed                 0.005321
cabin               0.013289
listing_color       0.057049
fuel_type           0.069436
transmission        0.096297
wheel_system        0.227289
engine_cylinders    0.241081
body_type           0.256919
city                0.339343
franchise_make      0.615374
dtype: float64

In [85]:
# Keep the five categorical features with the highest mutual information:
# 'franchise_make', 'city', 'body_type', 'engine_cylinders', 'wheel_system'

In [88]:
# Ok, so in total we have:
# 'franchise_make', 'city', 'body_type', 'engine_cylinders', 'wheel_system',
# 'back_legroom', 'bed_length', 'city_fuel_economy', 'engine_displacement', 
# 'fuel_tank_volume', 'height', 'horsepower', 'maximum_seating', 'mileage', 
# 'width', 'wheelbase', 'year_listed', 'model_age'
# 
# That may be a few more features than necessary, but we won't prune the list
# any further for now and will move on to modelling.

In [89]:
# Final feature set: the chosen categorical features first, then the
# sequentially-selected non-categorical features (without `fleet`).
X_best = X[['franchise_make', 'city', 'body_type', 'engine_cylinders', 'wheel_system',
            'back_legroom', 'bed_length', 'city_fuel_economy', 'engine_displacement', 
            'fuel_tank_volume', 'height', 'horsepower', 'maximum_seating', 'mileage', 
            'width', 'wheelbase', 'year_listed','model_age']]

# Boolean mask over X_best.columns flagging the categorical features; it is
# later passed to the model as `categorical_features`. Index.isin returns a
# plain boolean NumPy array in a single vectorized call.
categ_best = X_best.columns.isin([
    'bed', 'body_type', 'cabin', 'city', 'engine_cylinders', 'franchise_make',
    'fuel_type', 'listing_color', 'transmission', 'wheel_system',
])

In [156]:
# Hold-out split for the final model. Note train_size=0.3: only 30% of the rows
# are used for fitting and the remaining 70% for evaluation.
# random_state pins the shuffle so the reported score can be reproduced.
Xb_train, Xb_test, y_train, y_test = train_test_split(X_best, y, train_size=0.3, random_state=42)
Xb_train

Unnamed: 0,franchise_make,city,body_type,engine_cylinders,wheel_system,back_legroom,bed_length,city_fuel_economy,engine_displacement,fuel_tank_volume,height,horsepower,maximum_seating,mileage,width,wheelbase,year_listed,model_age
675527,48,9,6,9,3,36.2,,19.0,3600.0,16.9,58.4,283.0,5.0,132632.0,81.3,108.9,2020,7
1568816,28,28,5,4,3,37.9,,26.0,2500.0,14.5,68.1,170.0,5.0,27630.0,72.4,106.5,2020,3
1841345,7,39,6,4,2,37.9,,21.0,2000.0,17.4,57.2,237.0,5.0,979.0,80.3,116.0,2020,0
1541479,48,27,6,4,3,39.1,,23.0,1800.0,18.5,58.5,170.0,5.0,27784.0,72.2,110.4,2020,3
1522418,13,55,4,9,0,33.5,78.9,18.0,3300.0,23.0,77.2,290.0,6.0,5.0,96.8,145.0,2019,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2150163,9,53,3,9,3,39.0,,19.0,3600.0,19.0,69.9,287.0,8.0,0.0,90.4,121.6,2020,0
1000107,34,20,5,9,2,38.7,,20.0,3500.0,19.0,67.8,260.0,5.0,3.0,75.4,111.2,2020,0
2437480,48,2,6,4,3,35.4,,,1800.0,15.6,58.1,136.0,5.0,112109.0,70.7,105.7,2020,9
2531407,44,2,4,10,0,34.7,78.7,13.0,5700.0,26.4,76.4,381.0,6.0,5.0,79.9,145.7,2020,0


In [187]:
# Final model. Early stopping is disabled, so exactly max_iter boosting
# iterations are run. random_state fixes the random subsample used when
# binning large inputs, making the fit reproducible.
hgbr = HistGradientBoostingRegressor(
    categorical_features=categ_best,
    early_stopping=False,
    max_iter=250,
    max_leaf_nodes=400,
    learning_rate=0.07,
    random_state=42,
)

In [188]:
# Train on the training split, then report R^2 on the held-out split
# (fit() returns the estimator, so the calls can be chained).
hgbr.fit(Xb_train, y_train).score(Xb_test, y_test)

0.8501212565255537

In [209]:
# One hypothetical listing, written as feature -> value so each number is
# visibly tied to its column.
example_values = {
    'franchise_make': 23,
    'city': 48,
    'body_type': 0,
    'engine_cylinders': 10,
    'wheel_system': 2,
    'back_legroom': 40,
    'bed_length': float('nan'),
    'city_fuel_economy': 20,
    'engine_displacement': 2000,
    'fuel_tank_volume': 30,
    'height': 60,
    'horsepower': 400,
    'maximum_seating': 6,
    'mileage': 0,
    'width': 80,
    'wheelbase': 100,
    'year_listed': 2022,
    'model_age': 0,
}
# Selecting by X_best.columns puts the columns in the order the model was trained on.
example = pd.DataFrame([example_values])[X_best.columns]
example

Unnamed: 0,franchise_make,city,body_type,engine_cylinders,wheel_system,back_legroom,bed_length,city_fuel_economy,engine_displacement,fuel_tank_volume,height,horsepower,maximum_seating,mileage,width,wheelbase,year_listed,model_age
0,23,48,0,10,2,40,,20,2000,30,60,400,6,0,80,100,2022,0


In [208]:
# Predicted price for the single hypothetical listing in `example`.
predictions = hgbr.predict(example)
# Echo the value so the cell actually displays the prediction.
predictions

array([128509.45186946])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# `price` is a continuous target, so classification metrics (confusion matrix,
# classification report) do not apply. Evaluate with regression metrics on
# predictions for the whole held-out set, not the single `example` row.
y_pred = hgbr.predict(Xb_test)
pd.Series(
    {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
    },
    name='test_metrics',
)