In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import re

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 72 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# I converted everything to parquet because I had tried it for the preprocessed data and it was really fast to load.
# It's still fairly slow for reading the raw, unprocessed data though :(
# You can change back to csv if you like, but I think the parquet is at least slightly faster.
used_car_data = pd.read_parquet('used_cars_data.parquet')

In [3]:
used_car_data.columns

Index(['vin', 'back_legroom', 'bed', 'bed_height', 'bed_length', 'body_type',
       'cabin', 'city', 'city_fuel_economy', 'combine_fuel_economy',
       'daysonmarket', 'dealer_zip', 'description', 'engine_cylinders',
       'engine_displacement', 'engine_type', 'exterior_color', 'fleet',
       'frame_damaged', 'franchise_dealer', 'franchise_make', 'front_legroom',
       'fuel_tank_volume', 'fuel_type', 'has_accidents', 'height',
       'highway_fuel_economy', 'horsepower', 'interior_color', 'isCab',
       'is_certified', 'is_cpo', 'is_new', 'is_oemcpo', 'latitude', 'length',
       'listed_date', 'listing_color', 'listing_id', 'longitude',
       'main_picture_url', 'major_options', 'make_name', 'maximum_seating',
       'mileage', 'model_name', 'owner_count', 'power', 'price', 'salvage',
       'savings_amount', 'seller_rating', 'sp_id', 'sp_name', 'theft_title',
       'torque', 'transmission', 'transmission_display', 'trimId', 'trim_name',
       'vehicle_damage_category', 'whe

In [4]:
used_car_data.loc[used_car_data.back_legroom == '--', 'back_legroom'] = float('nan')

In [5]:
used_car_data.back_legroom.dropna().str.endswith(' in').all()

True

In [6]:
used_car_data.back_legroom = used_car_data.back_legroom.str.replace(' in', '').astype('float')

In [7]:
used_car_data.loc[used_car_data.bed_height == '--', 'bed_height'] = float('nan')

In [8]:
used_car_data.bed_height.dropna()

Series([], Name: bed_height, dtype: object)

In [9]:
used_car_data.loc[used_car_data.bed_length == '--', 'bed_length'] = float('nan')

In [10]:
used_car_data.bed_length.dropna().str.endswith(' in').all()

True

In [11]:
used_car_data.bed_length = used_car_data.bed_length.str.replace(' in', '').astype('float')

In [12]:
used_car_data.body_type.unique()

array(['SUV / Crossover', 'Sedan', 'Coupe', 'Hatchback', 'Pickup Truck',
       'Wagon', 'Minivan', 'Van', 'Convertible', None], dtype=object)

In [13]:
used_car_data.cabin.unique()

array([None, 'Crew Cab', 'Extended Cab', 'Regular Cab', 'Large Crew Cab'],
      dtype=object)

In [14]:
city_vc = used_car_data.city.value_counts()
# top_cities = city_vc[city_vc>5000].index
# DC->Alexandria
top_cities = np.array(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose', 'Austin', 'Jacksonville', 'Fort Worth', 'Columbus', 'Indianapolis', 'Charlotte', 'San Francisco', 'Seattle', 'Denver', 'Oklahoma City', 'Nashville', 'El Paso', 'Alexandria', 'Boston', 'Las Vegas', 'Portland', 'Detroit', 'Louisville', 'Memphis', 'Baltimore', 'Milwaukee', 'Albuquerque', 'Fresno', 'Tucson', 'Sacramento', 'Mesa', 'Kansas City', 'Atlanta', 'Omaha', 'Colorado Springs', 'Raleigh', 'Virginia Beach', 'Long Beach', 'Miami', 'Oakland', 'Minneapolis', 'Tulsa', 'Bakersfield', 'Wichita', 'Arlington'])

In [15]:
city_data = []
for city in top_cities:
    entries = used_car_data[used_car_data.city==city]
    #print(len(entries))
    city_data += [[entries.latitude.median(), entries.longitude.median()]]
city_data = pd.DataFrame(city_data, columns=['latitude', 'longitude'], index=top_cities)

In [16]:
((city_data.latitude-32)**2+(city_data.longitude+96)**2).idxmin()

'Dallas'

In [17]:
top_city_set = set(top_cities)

In [18]:
mapped_cities = used_car_data.parallel_apply(lambda x: x.city if x.city in top_city_set else ((city_data.latitude-x.latitude)**2+(city_data.longitude-x.longitude)**2).idxmin(), axis=1)

In [19]:
mapped_cities

0            Miami
1            Miami
2            Miami
3            Miami
4            Miami
            ...   
3000035    Oakland
3000036    Oakland
3000037    Oakland
3000038    Oakland
3000039    Oakland
Length: 3000040, dtype: object

In [20]:
mapped_cities.value_counts()

New York            156744
Chicago             140151
Atlanta             136539
Jacksonville        136091
Detroit             123342
Houston             122767
Columbus            121324
Memphis             120543
Miami               108595
Boston              104694
Charlotte           102595
Alexandria          102269
Minneapolis          91848
Philadelphia         85833
Long Beach           76519
Dallas               75857
Milwaukee            69574
Denver               69007
Baltimore            67635
Raleigh              62167
Las Vegas            61473
Los Angeles          57089
San Antonio          56557
Indianapolis         53984
Nashville            51978
Kansas City          51047
Louisville           47790
Omaha                47701
Phoenix              45474
Sacramento           45421
San Diego            37269
San Jose             34701
Oklahoma City        31615
Austin               31571
Tulsa                26020
Fort Worth           24845
Portland             21943
A

In [21]:
used_car_data.city[mapped_cities=='Los Angeles'].value_counts().head(20)

Los Angeles        5790
Van Nuys           5412
Valencia           3085
Glendale           2946
Thousand Oaks      2639
Santa Monica       2626
Oxnard             2513
Victorville        2288
Alhambra           2082
Ventura            2052
North Hollywood    1680
Pasadena           1607
Mission Hills      1553
El Monte           1336
Beverly Hills      1166
Woodland Hills     1124
Culver City        1068
Hawthorne           955
Inglewood           900
Duarte              794
Name: city, dtype: int64

In [22]:
used_car_data.city = mapped_cities

In [23]:
used_car_data.city_fuel_economy

0           NaN
1           NaN
2          17.0
3           NaN
4           NaN
           ... 
3000035    26.0
3000036    18.0
3000037     NaN
3000038    30.0
3000039    26.0
Name: city_fuel_economy, Length: 3000040, dtype: float64

In [24]:
used_car_data.combine_fuel_economy.dropna()

Series([], Name: combine_fuel_economy, dtype: float64)

In [25]:
used_car_data.daysonmarket

0           522
1           207
2          1233
3           196
4           137
           ... 
3000035      16
3000036     171
3000037      91
3000038      11
3000039      17
Name: daysonmarket, Length: 3000040, dtype: int64

In [26]:
used_car_data.engine_cylinders = used_car_data.engine_cylinders.str[:2]

In [27]:
used_car_data.engine_cylinders.value_counts()

I4    1507448
V6     818732
V8     386953
H4      65955
I3      54304
I6      51776
I5       5446
H6       4686
V1       2611
I2        897
W1        583
R2         65
W8          3
Name: engine_cylinders, dtype: int64

In [28]:
used_car_data[used_car_data.body_type.isna()]

Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,transmission,transmission_display,trimId,trim_name,vehicle_damage_category,wheel_system,wheel_system_display,wheelbase,width,year
851,5XXG14J28MG001288,,,,,,,New York,,,...,A,8-Speed Automatic,,,,,,,,2021
857,5XXG14J2XMG006458,,,,,,,New York,,,...,A,8-Speed Automatic,,,,,,,,2021
865,5XXG14J28MG006443,,,,,,,New York,,,...,A,8-Speed Automatic,,,,,,,,2021
896,5XXG14J22MG003974,,,,,,,New York,,,...,A,8-Speed Automatic,,,,,,,,2021
1079,5XXG64J29MG017274,,,,,,,New York,,,...,A,8-Speed Automatic,,,,,,,,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2998327,3MVDMABL4MM201291,,,,,,,Sacramento,,,...,,,,,,,,,,2021
2998625,3MVDMBCL5MM202079,,,,,,,Sacramento,,,...,,,,,,,,,,2021
2999182,JA4AT4AA0KZ050139,,,,,,,San Francisco,,,...,CVT,Continuously Variable Transmission,,,,,,,,2019
2999350,1FDRU6ZG5LKA46426,,,,,,,Sacramento,,,...,,,,,,,,,,2020


In [29]:
mode_cyl_by_body = used_car_data.groupby('body_type').engine_cylinders.agg(pd.Series.mode)
na_cyl = used_car_data.engine_cylinders.isna() & ~used_car_data.body_type.isna()
used_car_data.loc[na_cyl, 'engine_cylinders'] = mode_cyl_by_body[used_car_data[na_cyl].body_type].values

In [30]:
mode_cyl_by_mk = used_car_data.groupby('make_name').engine_cylinders.agg(pd.Series.mode)
na_cyl = used_car_data.engine_cylinders.isna()
used_car_data.loc[na_cyl, 'engine_cylinders'] = mode_cyl_by_mk[used_car_data[na_cyl].make_name].values

In [31]:
used_car_data.engine_cylinders = used_car_data.engine_cylinders.apply(lambda x: 'I4' if not len(x) else x)

In [32]:
cylinder_config = used_car_data.engine_cylinders.str[0]

In [33]:
num_cylinders = used_car_data.engine_cylinders.str[1:].astype(int)

In [34]:
used_car_data['cylinder_config'] = cylinder_config

In [35]:
used_car_data['num_cylinders'] = num_cylinders

In [36]:
mode_body_by_mk = used_car_data.groupby('make_name').body_type.agg(pd.Series.mode)
na_body = used_car_data.body_type.isna()
used_car_data.loc[na_body, 'body_type'] = mode_body_by_mk[used_car_data[na_body].make_name].values

In [37]:
old_car_body_map = {'Kaiser': 'Sedan', 'Hudson': 'Coupe', 'Edsel': 'Wagon', 'DeSoto': 'Coupe', 'Nash': 'Convertible', 'Jensen': 'Coupe'}

In [38]:
bad_old_cars = used_car_data.body_type.apply(lambda x: not len(x))

In [39]:
used_car_data.loc[bad_old_cars, 'body_type'] = used_car_data[bad_old_cars].make_name.apply(old_car_body_map.get)

In [40]:
used_car_data.body_type.value_counts()

SUV / Crossover    1427711
Sedan               743990
Pickup Truck        474789
Hatchback            88387
Minivan              79806
Coupe                71643
Van                  47166
Wagon                40506
Convertible          26042
Name: body_type, dtype: int64

In [41]:
used_car_data.wheel_system.value_counts()

FWD    1261367
AWD     695731
4WD     584524
RWD     190757
4X2     120929
Name: wheel_system, dtype: int64

In [42]:
used_car_data.groupby('wheel_system').body_type.value_counts()

wheel_system  body_type      
4WD           Pickup Truck       335895
              SUV / Crossover    247670
              Van                   954
              Convertible             2
              Wagon                   2
              Coupe                   1
4X2           Pickup Truck        84930
              SUV / Crossover     33078
              Van                  2921
AWD           SUV / Crossover    562402
              Sedan               93373
              Wagon               15935
              Coupe                8610
              Hatchback            7002
              Pickup Truck         3598
              Convertible          2622
              Minivan              1538
              Van                   651
FWD           Sedan              557130
              SUV / Crossover    497891
              Minivan             76887
              Hatchback           76872
              Wagon               21443
              Van                 17347
          

In [43]:
used_car_data[used_car_data.body_type == 'SUV / Crossover'].groupby('wheel_system').price.mean()

wheel_system
4WD    35695.731996
4X2    42099.295170
AWD    32270.872053
FWD    24759.062040
RWD    36806.219749
Name: price, dtype: float64

In [44]:
used_car_data[used_car_data.body_type == 'Convertible'].groupby('wheel_system').price.median()

wheel_system
4WD    46987.5
AWD    65167.5
FWD     9995.0
RWD    29891.0
Name: price, dtype: float64

In [45]:
used_car_data.wheel_system.value_counts()

FWD    1261367
AWD     695731
4WD     584524
RWD     190757
4X2     120929
Name: wheel_system, dtype: int64

In [46]:
mode_ws_by_body = used_car_data.groupby('body_type').wheel_system.agg(pd.Series.mode)
na_ws = used_car_data.wheel_system.isna()
used_car_data.loc[na_ws, 'wheel_system'] = mode_ws_by_body[used_car_data[na_ws].body_type].values

In [47]:
front_ctrl = used_car_data.wheel_system.isin({'FWD', '4WD', 'AWD'})
rear_ctrl = used_car_data.wheel_system.isin({'RWD', '4X2', '4WD', 'AWD'}) # Assume 4X2 means RWD

In [48]:
used_car_data['front_ctrl'] = front_ctrl
used_car_data['rear_ctrl'] = rear_ctrl

In [49]:
used_car_data[used_car_data.franchise_make.isna()].make_name

220                  BMW
221                  BMW
223           Land Rover
225              Porsche
228        Mercedes-Benz
               ...      
3000021              BMW
3000030             Ford
3000033           Toyota
3000035        Chevrolet
3000038           Jaguar
Name: make_name, Length: 572635, dtype: object

In [50]:
used_car_data.make_name.value_counts()

Ford         476336
Chevrolet    376892
Toyota       239128
Nissan       217896
Honda        214459
              ...  
Austin            1
Hillman           1
Edsel             1
Pagani            1
Rover             1
Name: make_name, Length: 100, dtype: int64

In [51]:
all_fms = set(used_car_data.franchise_make.values)

In [52]:
can_use_make = used_car_data.franchise_make.isna() & used_car_data.make_name.isin(all_fms)
used_car_data.loc[can_use_make, 'franchise_make'] = used_car_data[can_use_make].make_name

In [53]:
top50_makes = set(used_car_data.make_name.value_counts().head(50).index)

In [54]:
used_car_data.loc[used_car_data.make_name.isin(top50_makes), 'common_make'] = used_car_data[used_car_data.make_name.isin(top50_makes)].make_name

In [55]:
used_car_data[used_car_data.common_make.isna()]

Unnamed: 0,vin,back_legroom,bed,bed_height,bed_length,body_type,cabin,city,city_fuel_economy,combine_fuel_economy,...,wheel_system,wheel_system_display,wheelbase,width,year,cylinder_config,num_cylinders,front_ctrl,rear_ctrl,common_make
3031,YH4K14AA1CA001652,31.4,,,,Sedan,,New York,20.0,,...,RWD,Rear-Wheel Drive,124.4 in,84 in,2012,I,4,False,True,
11304,SCCPC11105HL30473,,,,,Convertible,,New York,20.0,,...,RWD,Rear-Wheel Drive,90.5 in,72.8 in,2005,I,4,False,True,
17094,XL9AABAG3AZ363244,,,,,Convertible,,New York,,,...,RWD,,,,2010,I,4,False,True,
24250,K1177021,,,,,Sedan,,Detroit,,,...,FWD,,,,1953,I,4,True,False,
37549,WDBVF79J67A001813,40.3,,,,Sedan,,New York,10.0,,...,RWD,Rear-Wheel Drive,133.5 in,78 in,2007,V,1,False,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2972191,SCCLMDDC7LHA10495,,,,,Coupe,,Oakland,17.0,,...,RWD,Rear-Wheel Drive,101.4 in,77.6 in,2020,V,6,False,True,
2985683,SCCPC11115HL34273,,,,,Convertible,,San Francisco,20.0,,...,RWD,Rear-Wheel Drive,90.5 in,72.8 in,2005,I,4,False,True,
2986446,000000000PXEB1284,,,,,Coupe,,San Francisco,,,...,RWD,,,,1960,V,8,False,True,
2990382,WDYPE8DC4E5888494,,,,,Van,,San Jose,,,...,RWD,,,,2014,V,6,False,True,


In [56]:
used_car_data.engine_displacement

0          1300.0
1          2000.0
2          2500.0
3          3000.0
4          2000.0
            ...  
3000035    1500.0
3000036    3600.0
3000037    2000.0
3000038    2000.0
3000039    2500.0
Name: engine_displacement, Length: 3000040, dtype: float64

In [57]:
used_car_data.frame_damaged = used_car_data.frame_damaged.fillna(False)

In [58]:
used_car_data.franchise_dealer

0           True
1           True
2           True
3           True
4           True
           ...  
3000035    False
3000036     True
3000037     True
3000038    False
3000039     True
Name: franchise_dealer, Length: 3000040, dtype: bool

In [59]:
used_car_data.franchise_make.value_counts()

Ford             477084
Chevrolet        407525
Toyota           233284
Honda            217375
Jeep             203978
Nissan           200064
Hyundai          130170
Kia              112805
RAM               88184
Buick             82680
GMC               78880
Volkswagen        77308
Dodge             74054
Subaru            73697
Mercedes-Benz     64268
BMW               62557
Mazda             54230
Cadillac          43513
Lexus             38716
Audi              36048
Acura             32096
Chrysler          30077
Lincoln           29912
INFINITI          26496
Mitsubishi        24620
Volvo             23150
Land Rover        16706
Porsche           14027
MINI               8043
Jaguar             7177
Scion              5544
FIAT               5471
Maserati           3775
Alfa Romeo         2173
Genesis            1504
Bentley            1054
Ferrari             965
Rolls-Royce         735
Aston Martin        653
Lotus               506
McLaren             407
Lamborghini     

In [60]:
used_car_data.front_legroom = used_car_data.front_legroom.replace('--', float('nan'))

In [61]:
used_car_data.front_legroom.dropna().str.endswith(' in').all()

True

In [62]:
used_car_data.front_legroom = used_car_data.front_legroom.str.replace(' in', '').astype('float')

In [63]:
used_car_data.fuel_tank_volume = used_car_data.fuel_tank_volume.replace('--', float('nan'))

In [64]:
used_car_data.fuel_tank_volume.dropna().str.endswith(' gal').all()

True

In [65]:
used_car_data.fuel_tank_volume = used_car_data.fuel_tank_volume.str.replace(' gal', '').astype('float')

In [66]:
used_car_data.fuel_type

0          Gasoline
1          Gasoline
2          Gasoline
3          Gasoline
4          Gasoline
             ...   
3000035    Gasoline
3000036    Gasoline
3000037        None
3000038      Diesel
3000039    Gasoline
Name: fuel_type, Length: 3000040, dtype: object

In [67]:
used_car_data.has_accidents = used_car_data.has_accidents.fillna(False)

In [68]:
used_car_data.height = used_car_data.height.replace('--', float('nan'))
used_car_data.height.str.endswith(' in').all()

True

In [69]:
used_car_data.height = used_car_data.height.str.replace(' in', '').astype('float')

In [70]:
used_car_data.fleet = used_car_data.fleet.fillna(False)

In [71]:
used_car_data.highway_fuel_economy

0           NaN
1           NaN
2          23.0
3           NaN
4           NaN
           ... 
3000035    32.0
3000036    27.0
3000037     NaN
3000038    40.0
3000039    33.0
Name: highway_fuel_economy, Length: 3000040, dtype: float64

In [72]:
used_car_data.horsepower

0          177.0
1          246.0
2          305.0
3          340.0
4          246.0
           ...  
3000035    170.0
3000036    310.0
3000037    240.0
3000038    180.0
3000039    170.0
Name: horsepower, Length: 3000040, dtype: float64

In [73]:
used_car_data.isCab = used_car_data.isCab.fillna(False)

In [74]:
used_car_data.is_cpo = used_car_data.is_cpo.fillna(False)

In [75]:
used_car_data.is_new.hasnans

False

In [76]:
((used_car_data.is_oemcpo.fillna(False) | used_car_data.is_cpo) == used_car_data.is_cpo).all()

True

In [77]:
used_car_data.length = used_car_data.length.replace('--', float('nan'))
used_car_data.length.str.endswith(' in').all()

True

In [78]:
used_car_data.length = used_car_data.length.str.replace(' in', '').astype('float')

In [79]:
month_listed = used_car_data.listed_date.astype('datetime64').dt.month
year_listed = used_car_data.listed_date.astype('datetime64').dt.year

In [80]:
month_listed.value_counts()

8     1106388
9      565170
7      536309
6      225923
3      115727
2       93972
5       88574
1       69368
4       63353
12      57037
11      44123
10      34096
Name: listed_date, dtype: int64

In [81]:
used_car_data.maximum_seating = used_car_data.maximum_seating.replace('--', float('nan'))
used_car_data.maximum_seating.str.endswith(' seats').all()

True

In [82]:
used_car_data.maximum_seating = used_car_data.maximum_seating.str.replace(' seats', '').astype('float')

In [83]:
used_car_data.mileage

0              7.0
1              8.0
2              NaN
3             11.0
4              7.0
            ...   
3000035    41897.0
3000036        5.0
3000037    57992.0
3000038    27857.0
3000039    22600.0
Name: mileage, Length: 3000040, dtype: float64

In [84]:
used_car_data.owner_count = used_car_data.owner_count.fillna(1.)

In [85]:
used_car_data.price

0          23141.0
1          46500.0
2          46995.0
3          67430.0
4          48880.0
            ...   
3000035    17998.0
3000036    36490.0
3000037    12990.0
3000038    26998.0
3000039    19900.0
Name: price, Length: 3000040, dtype: float64

In [86]:
used_car_data.seller_rating = used_car_data.seller_rating.fillna(4.)

In [87]:
used_car_data.sp_name

0                                          Flagship Chrysler
1                                        Land Rover San Juan
2                                           FIAT de San Juan
3                                        Land Rover San Juan
4                                        Land Rover San Juan
                                 ...                        
3000035    CarMax Fairfield - Now offering Curbside Picku...
3000036                                       Team Chevrolet
3000037              Hanlees Chrysler Dodge Jeep Ram of Napa
3000038    CarMax Fairfield - Now offering Curbside Picku...
3000039                                      Napa Nissan Inc
Name: sp_name, Length: 3000040, dtype: object

In [88]:
used_car_data.vehicle_damage_category.dropna()

Series([], Name: vehicle_damage_category, dtype: float64)

In [89]:
used_car_data.wheel_system.value_counts()

FWD    1294209
AWD     753762
4WD     628790
RWD     202350
4X2     120929
Name: wheel_system, dtype: int64

In [90]:
used_car_data.width = used_car_data.width.replace('--', float('nan'))
used_car_data.width.str.endswith(' in').all()

True

In [91]:
used_car_data.width = used_car_data.width.str.replace(' in', '').astype('float')

In [92]:
used_car_data.year

0          2019
1          2020
2          2016
3          2020
4          2020
           ... 
3000035    2018
3000036    2020
3000037    2016
3000038    2017
3000039    2017
Name: year, Length: 3000040, dtype: int64

In [93]:
model_age = year_listed - used_car_data.year

In [94]:
used_car_data.wheelbase = used_car_data.wheelbase.replace('--', float('nan'))
used_car_data.wheelbase.str.endswith(' in').all()

True

In [95]:
used_car_data.wheelbase = used_car_data.wheelbase.str.replace(' in', '').astype('float')

In [96]:
used_car_data.body_type.hasnans

False

In [97]:
#le_bed = preprocessing.LabelEncoder()
#le_bed.fit(used_car_data.bed)
#le_bed.classes_

In [98]:
#used_car_data.bed = le_bed.transform(used_car_data.bed)

In [99]:
enc_map = {}
def trf_cat(feat):
    le = preprocessing.LabelEncoder()
    le.fit(used_car_data[feat])
    print(le.classes_)
    #print('Good?')
    if True: #input() == 'y':
        used_car_data[feat] = le.transform(used_car_data[feat])
        enc_map[feat] = le
        # print('Done')

        

In [100]:
trf_cat('bed')

['Long' 'Regular' 'Short' None]


In [101]:
trf_cat('body_type')

['Convertible' 'Coupe' 'Hatchback' 'Minivan' 'Pickup Truck'
 'SUV / Crossover' 'Sedan' 'Van' 'Wagon']


In [102]:
trf_cat('cabin')

['Crew Cab' 'Extended Cab' 'Large Crew Cab' 'Regular Cab' None]


In [103]:
trf_cat('city')

['Albuquerque' 'Alexandria' 'Arlington' 'Atlanta' 'Austin' 'Bakersfield'
 'Baltimore' 'Boston' 'Charlotte' 'Chicago' 'Colorado Springs' 'Columbus'
 'Dallas' 'Denver' 'Detroit' 'El Paso' 'Fort Worth' 'Fresno' 'Houston'
 'Indianapolis' 'Jacksonville' 'Kansas City' 'Las Vegas' 'Long Beach'
 'Los Angeles' 'Louisville' 'Memphis' 'Mesa' 'Miami' 'Milwaukee'
 'Minneapolis' 'Nashville' 'New York' 'Oakland' 'Oklahoma City' 'Omaha'
 'Philadelphia' 'Phoenix' 'Portland' 'Raleigh' 'Sacramento' 'San Antonio'
 'San Diego' 'San Francisco' 'San Jose' 'Seattle' 'Tucson' 'Tulsa'
 'Virginia Beach' 'Wichita']


In [104]:
trf_cat('cylinder_config')

['H' 'I' 'R' 'V' 'W']


In [105]:
trf_cat('common_make')

['Acura' 'Alfa Romeo' 'Aston Martin' 'Audi' 'BMW' 'Bentley' 'Buick'
 'Cadillac' 'Chevrolet' 'Chrysler' 'Dodge' 'FIAT' 'Ferrari' 'Ford' 'GMC'
 'Genesis' 'Honda' 'Hummer' 'Hyundai' 'INFINITI' 'Jaguar' 'Jeep' 'Kia'
 'Lamborghini' 'Land Rover' 'Lexus' 'Lincoln' 'MINI' 'Maserati' 'Mazda'
 'McLaren' 'Mercedes-Benz' 'Mercury' 'Mitsubishi' 'Nissan' 'Oldsmobile'
 'Pontiac' 'Porsche' 'RAM' 'Rolls-Royce' 'Saab' 'Saturn' 'Scion' 'Subaru'
 'Suzuki' 'Tesla' 'Toyota' 'Volkswagen' 'Volvo' 'smart' nan]


In [106]:
used_car_data.fuel_type = used_car_data.fuel_type.fillna('Gasoline')

In [107]:
trf_cat('fuel_type')

['Biodiesel' 'Compressed Natural Gas' 'Diesel' 'Electric'
 'Flex Fuel Vehicle' 'Gasoline' 'Hybrid' 'Propane']


In [108]:
trf_cat('listing_color')

['BLACK' 'BLUE' 'BROWN' 'GOLD' 'GRAY' 'GREEN' 'ORANGE' 'PINK' 'PURPLE'
 'RED' 'SILVER' 'TEAL' 'UNKNOWN' 'WHITE' 'YELLOW']


In [109]:
used_car_data.transmission = used_car_data.transmission.fillna('A')

In [110]:
trf_cat('transmission')

['A' 'CVT' 'Dual Clutch' 'M']


In [111]:
used_car_data.wheel_system = used_car_data.wheel_system.fillna('FWD')

In [112]:
used_car_data.cabin.value_counts()

4    2936507
0      51083
1       7960
3       2966
2       1524
Name: cabin, dtype: int64

In [113]:
sum(used_car_data.mileage.isna())

144387

In [114]:
used_car_data.mileage.median()

8267.0

In [115]:
used_car_data.mileage = used_car_data.mileage.fillna(10000)

In [116]:
processed_data = pd.DataFrame(used_car_data[
    ['price',
     'bed', 'body_type', 'cabin', 'city', 'common_make', 'fuel_type', 'listing_color', 'transmission',
     'cylinder_config', 'num_cylinders', 'rear_ctrl', 'front_ctrl',
     'fleet', 'frame_damaged', 'franchise_dealer', 'has_accidents', 'isCab', 'is_cpo', 'is_new', 'salvage',
     'back_legroom', 'bed_length', 'city_fuel_economy', 'daysonmarket', 'engine_displacement', 'front_legroom', 'fuel_tank_volume', 'height', 'highway_fuel_economy', 'horsepower', 'length', 'maximum_seating', 'mileage', 'owner_count', 'seller_rating', 'width', 'wheelbase']
])
processed_data['year_listed'] = year_listed
processed_data['month_listed'] = month_listed
processed_data['model_age'] = model_age
processed_data

Unnamed: 0,price,bed,body_type,cabin,city,common_make,fuel_type,listing_color,transmission,cylinder_config,...,length,maximum_seating,mileage,owner_count,seller_rating,width,wheelbase,year_listed,month_listed,model_age
0,23141.0,3,5,4,28,21,5,14,0,1,...,166.6,5.0,7.0,1.0,2.800000,79.6,101.2,2019,4,0
1,46500.0,3,5,4,28,24,5,0,0,1,...,181.0,7.0,8.0,1.0,3.000000,85.6,107.9,2020,2,0
2,46995.0,3,6,4,28,43,5,12,3,0,...,180.9,5.0,10000.0,3.0,4.000000,78.9,104.3,2017,4,1
3,67430.0,3,5,4,28,24,5,4,0,3,...,195.1,7.0,11.0,1.0,3.000000,87.4,115.0,2020,2,0
4,48880.0,3,5,4,28,24,5,0,0,1,...,181.0,7.0,7.0,1.0,3.000000,85.6,107.9,2020,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000035,17998.0,3,5,4,33,8,5,10,0,1,...,183.1,5.0,41897.0,1.0,4.272727,72.6,107.3,2020,8,2
3000036,36490.0,3,5,4,33,8,5,0,0,3,...,204.3,8.0,5.0,1.0,4.533333,78.6,120.9,2020,3,0
3000037,12990.0,3,6,4,33,13,5,4,0,1,...,191.7,5.0,57992.0,2.0,4.142857,83.5,112.2,2020,6,4
3000038,26998.0,3,6,4,33,20,2,5,0,1,...,183.9,5.0,27857.0,1.0,4.272727,81.7,111.6,2020,9,3


In [117]:
enc_map['cabin'].transform(['Crew Cab'])

array([0])

In [118]:
processed_data.salvage = processed_data.salvage.fillna(False)

In [119]:
med_bleg_by_body = processed_data.groupby('body_type').back_legroom.median()
na_bleg = processed_data.back_legroom.isna()
processed_data.loc[na_bleg, 'back_legroom'] = med_bleg_by_body[processed_data[na_bleg].body_type].values

In [120]:
# nan means no bed
processed_data.bed_length = processed_data.bed_length.fillna(0.)

In [121]:
med_cfe_by_body = processed_data.groupby('body_type').city_fuel_economy.median()
na_cfe = processed_data.city_fuel_economy.isna()
processed_data.loc[na_cfe, 'city_fuel_economy'] = med_cfe_by_body[processed_data[na_cfe].body_type].values

In [122]:
med_ed_by_body = processed_data.groupby('body_type').engine_displacement.median()
na_ed = processed_data.engine_displacement.isna()
processed_data.loc[na_ed, 'engine_displacement'] = med_ed_by_body[processed_data[na_ed].body_type].values

In [123]:
med_fl_by_body = processed_data.groupby('body_type').front_legroom.median()
na_fl = processed_data.front_legroom.isna()
processed_data.loc[na_fl, 'front_legroom'] = med_fl_by_body[processed_data[na_fl].body_type].values

In [124]:
med_fv_by_body = processed_data.groupby('body_type').fuel_tank_volume.median()
na_fv = processed_data.fuel_tank_volume.isna()
processed_data.loc[na_fv, 'fuel_tank_volume'] = med_fv_by_body[processed_data[na_fv].body_type].values

In [125]:
med_height_by_body = processed_data.groupby('body_type').height.median()
na_height = processed_data.height.isna()
processed_data.loc[na_height, 'height'] = med_height_by_body[processed_data[na_height].body_type].values

In [126]:
med_hfe_by_body = processed_data.groupby('body_type').highway_fuel_economy.median()
na_hfe = processed_data.highway_fuel_economy.isna()
processed_data.loc[na_hfe, 'highway_fuel_economy'] = med_hfe_by_body[processed_data[na_hfe].body_type].values

In [127]:
med_hp_by_body = processed_data.groupby('body_type').horsepower.median()
na_hp = processed_data.horsepower.isna()
processed_data.loc[na_hp, 'horsepower'] = med_hp_by_body[processed_data[na_hp].body_type].values

In [128]:
med_length_by_body = processed_data.groupby('body_type').length.median()
na_length = processed_data.length.isna()
processed_data.loc[na_length, 'length'] = med_length_by_body[processed_data[na_length].body_type].values

In [129]:
med_ms_by_body = processed_data.groupby('body_type').maximum_seating.median()
na_ms = processed_data.maximum_seating.isna()
processed_data.loc[na_ms, 'maximum_seating'] = med_ms_by_body[processed_data[na_ms].body_type].values

In [130]:
med_width_by_body = processed_data.groupby('body_type').width.median()
na_width = processed_data.width.isna()
processed_data.loc[na_width, 'width'] = med_width_by_body[processed_data[na_width].body_type].values

In [131]:
processed_data.groupby('body_type').wheelbase.std()

body_type
0     6.685326
1     5.310521
2     4.712866
3     1.757404
4    10.313188
5     6.506079
6     4.406548
7    15.091283
8     4.403115
Name: wheelbase, dtype: float64

In [132]:
med_wb_by_body = processed_data.groupby('body_type').wheelbase.median()
na_wb = processed_data.wheelbase.isna()
processed_data.loc[na_wb, 'wheelbase'] = med_wb_by_body[processed_data[na_wb].body_type].values

In [133]:
[k for k in processed_data if processed_data[k].hasnans]

[]

In [134]:
processed_data.to_parquet("processed_ucd_2.parquet", compression=None)

In [135]:
def ask_data(col):
    if col in enc_map:
        print(f'Input {col}')
        print(f'    Options: {enc_map[col].classes_}')
        return enc_map[col].transform([input('    Take a pick:')])[0]
    else:
        return float(input(f'Input {col}:'))

def ask_column(cols):
    return list(map(ask_data, cols))

In [136]:
processed_data.groupby('city').price.median().sort_values()

city
46    22294.00
25    23966.00
48    24000.00
17    24250.00
19    24825.00
37    25135.50
49    25253.00
21    25415.00
20    25589.00
31    25615.50
28    25638.00
39    25695.00
23    25781.00
11    25800.00
9     25839.00
22    25900.00
47    25977.00
3     25995.00
1     26048.00
27    26076.00
29    26082.00
26    26161.00
8     26500.00
30    26610.50
36    26720.00
40    26737.00
34    26774.00
6     26788.00
4     26800.00
42    26888.00
10    26900.00
0     26964.00
2     26974.00
38    26990.00
35    26995.00
12    27064.00
7     27190.00
45    27356.50
24    27449.00
14    27474.33
41    27535.00
13    27670.00
5     27785.00
44    27850.00
32    27877.50
18    28195.00
15    28320.00
33    28486.00
43    29445.00
16    29721.00
Name: price, dtype: float64

In [137]:
enc_map['city'].inverse_transform([17])

array(['Fresno'], dtype=object)

In [138]:
ppd_corr = processed_data.corr()

In [139]:
ppd_corr.style.background_gradient(cmap='coolwarm').format(precision=2).hide(axis="columns")

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
price,1.0,0.03,-0.18,-0.04,0.01,-0.05,-0.15,0.02,-0.13,0.29,0.33,0.38,-0.1,-0.17,-0.06,0.28,-0.2,-0.14,-0.01,0.38,-0.05,0.22,0.23,-0.19,0.06,0.35,0.08,0.38,0.29,-0.26,0.57,0.36,0.12,-0.27,-0.28,0.07,0.37,0.35,-0.05,-0.02,-0.36
bed,0.03,1.0,0.04,0.31,-0.01,0.02,0.1,-0.02,0.02,-0.07,-0.1,-0.06,0.05,-0.0,-0.01,0.12,-0.03,0.01,0.02,0.06,-0.01,0.02,-0.16,0.06,-0.01,-0.12,-0.0,-0.14,-0.1,0.07,-0.07,-0.14,0.01,-0.08,-0.07,-0.02,-0.07,-0.15,0.01,-0.0,-0.1
body_type,-0.18,0.04,1.0,0.1,0.0,0.09,0.09,-0.0,-0.01,-0.26,-0.3,-0.21,0.21,0.04,0.01,0.01,0.01,0.05,0.03,-0.01,-0.0,0.12,-0.28,0.15,0.0,-0.29,-0.04,-0.18,-0.12,0.22,-0.32,-0.14,-0.0,-0.0,-0.05,-0.02,-0.18,-0.2,0.0,-0.01,-0.08
cabin,-0.04,0.31,0.1,1.0,-0.01,-0.02,0.14,-0.01,0.05,-0.14,-0.21,-0.13,0.03,0.02,-0.0,0.09,-0.02,0.03,0.03,0.03,-0.01,-0.09,-0.32,0.11,-0.0,-0.24,-0.03,-0.23,-0.2,0.15,-0.18,-0.27,-0.04,-0.06,-0.05,-0.02,-0.13,-0.29,0.01,-0.01,-0.06
city,0.01,-0.01,0.0,-0.01,1.0,0.01,0.0,0.01,0.01,-0.01,-0.01,-0.02,-0.01,0.01,0.0,-0.0,-0.01,0.01,0.01,-0.01,0.01,-0.01,-0.01,0.03,-0.0,-0.01,-0.0,-0.01,-0.02,0.03,-0.01,-0.01,-0.0,0.0,0.01,0.01,-0.02,-0.01,0.0,-0.0,0.01
common_make,-0.05,0.02,0.09,-0.02,0.01,1.0,0.06,-0.02,0.2,-0.18,-0.14,-0.07,0.12,-0.0,0.0,-0.01,0.01,-0.0,0.05,-0.01,0.01,-0.06,-0.06,0.15,-0.05,-0.08,-0.06,-0.13,-0.09,0.17,-0.18,-0.09,-0.07,-0.0,-0.02,-0.03,-0.25,-0.11,0.03,0.03,-0.01
fuel_type,-0.15,0.1,0.09,0.14,0.0,0.06,1.0,-0.03,0.12,-0.16,-0.26,-0.14,0.04,0.01,-0.0,0.03,-0.0,0.01,0.03,0.01,0.0,-0.15,-0.34,0.09,-0.0,-0.33,-0.05,-0.43,-0.31,0.1,-0.23,-0.39,-0.06,-0.04,-0.03,-0.01,-0.26,-0.42,0.0,-0.0,-0.03
listing_color,0.02,-0.02,-0.0,-0.01,0.01,-0.02,-0.03,1.0,-0.03,0.04,0.04,-0.01,-0.04,0.01,-0.01,0.04,-0.02,0.01,-0.01,0.04,-0.01,0.02,0.04,-0.02,0.01,0.05,-0.01,0.06,0.07,-0.03,0.03,0.06,0.01,-0.02,-0.03,0.01,0.04,0.07,-0.01,-0.01,-0.03
transmission,-0.13,0.02,-0.01,0.05,0.01,0.2,0.12,-0.03,1.0,-0.26,-0.21,-0.16,0.04,-0.02,0.0,0.01,-0.0,-0.03,0.01,0.01,0.01,-0.16,-0.15,0.25,-0.02,-0.21,0.05,-0.28,-0.27,0.26,-0.26,-0.23,-0.16,-0.01,0.01,-0.03,-0.27,-0.24,0.02,-0.01,0.03
cylinder_config,0.29,-0.07,-0.26,-0.14,-0.01,-0.18,-0.16,0.04,-0.26,1.0,0.87,0.36,-0.23,0.02,-0.01,-0.05,0.02,0.01,-0.03,-0.07,-0.0,0.25,0.39,-0.5,-0.02,0.75,0.08,0.66,0.53,-0.56,0.75,0.62,0.39,0.08,0.09,0.03,0.47,0.59,-0.01,0.04,0.12


In [140]:
ppd_corr.price.abs().sort_values(ascending=False)

price                   1.000000
horsepower              0.567266
is_new                  0.378275
rear_ctrl               0.376462
fuel_tank_volume        0.375563
width                   0.369239
model_age               0.358916
length                  0.356081
wheelbase               0.354062
engine_displacement     0.352521
num_cylinders           0.334364
height                  0.294759
cylinder_config         0.292794
franchise_dealer        0.282265
owner_count             0.281391
mileage                 0.271031
highway_fuel_economy    0.257575
bed_length              0.225028
back_legroom            0.224200
has_accidents           0.197793
city_fuel_economy       0.186369
body_type               0.176794
fleet                   0.168321
fuel_type               0.146617
isCab                   0.138605
transmission            0.125201
maximum_seating         0.122047
front_ctrl              0.104542
front_legroom           0.075267
seller_rating           0.068455
daysonmark

In [142]:
import pickle
with open('enc_map.pickle', 'wb') as f:
    pickle.dump(enc_map, f)

In [143]:
[c for c in processed_data.columns if processed_data[c].hasnans]

[]

In [None]:
len(used_car_data.columns)

