# Clean Used Car Data

In [None]:
import pandas as pd 
import numpy as np 
import statistics as stat 
import sys
sys.path.append(".")
import logging as logger

In [2]:
#load data into dataframe for cleaning 

raw_data = pd.read_csv("../data/vehicles.csv")
  

In [4]:
raw_data.shape # checking the expected data has arrived

(426880, 26)

In [27]:
raw_data.head(10)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,
5,7222379453,https://hudsonvalley.craigslist.org/cto/d/west...,hudson valley,https://hudsonvalley.craigslist.org,1600,,,,,,...,,,,,,,ny,,,
6,7221952215,https://hudsonvalley.craigslist.org/cto/d/west...,hudson valley,https://hudsonvalley.craigslist.org,1000,,,,,,...,,,,,,,ny,,,
7,7220195662,https://hudsonvalley.craigslist.org/cto/d/poug...,hudson valley,https://hudsonvalley.craigslist.org,15995,,,,,,...,,,,,,,ny,,,
8,7209064557,https://medford.craigslist.org/cto/d/grants-pa...,medford-ashland,https://medford.craigslist.org,5000,,,,,,...,,,,,,,or,,,
9,7219485069,https://erie.craigslist.org/cto/d/erie-2012-su...,erie,https://erie.craigslist.org,3000,,,,,,...,,,,,,,pa,,,


In [7]:
%%time
raw_data.info() # get the information breakdown of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 26 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            426880 non-null  int64  
 1   url           426880 non-null  object 
 2   region        426880 non-null  object 
 3   region_url    426880 non-null  object 
 4   price         426880 non-null  int64  
 5   year          425675 non-null  float64
 6   manufacturer  409234 non-null  object 
 7   model         421603 non-null  object 
 8   condition     252776 non-null  object 
 9   cylinders     249202 non-null  object 
 10  fuel          423867 non-null  object 
 11  odometer      422480 non-null  float64
 12  title_status  418638 non-null  object 
 13  transmission  424324 non-null  object 
 14  VIN           265838 non-null  object 
 15  drive         296313 non-null  object 
 16  size          120519 non-null  object 
 17  type          334022 non-null  object 
 18  pain

## Initial Notes
1. The data shows a number of features that contain null values.
2. There are a number of features that are not required for modelling

In [29]:
#drop all columns not required for modelling 
data_1 = raw_data.drop(columns=['id', 'url','region_url','VIN', 'county', 'lat','long', 'posting_date', 'image_url', 'description'])

In [31]:
#count up the number of nulls 

null_check = data_1.isnull().sum()
null_check = pd.DataFrame(null_check, columns=["Null"])
total = len(raw_data)
null_check['%'] = pd.DataFrame(null_check['Null']/total)
null_check.sort_values('%',ascending=False)


Unnamed: 0,Null,%
size,306361,0.717675
cylinders,177678,0.416225
condition,174104,0.407852
drive,130567,0.305863
paint_color,130203,0.305011
type,92858,0.217527
manufacturer,17646,0.041337
title_status,8242,0.019308
model,5277,0.012362
odometer,4400,0.010307


In [36]:
#removing the rows that are missing <= 10% of the over data 
data_2 = data_1.dropna(subset=['year','transmission', 'fuel', 'odometer'])
null_check_2 = data_2.isnull().sum()
null_check_2 = pd.DataFrame(null_check_2, columns=["Null"])
total = len(raw_data)
null_check_2['%'] = pd.DataFrame(null_check_2['Null']/total)
null_check_2.sort_values('%',ascending=False)

Unnamed: 0,Null,%
size,298050,0.698206
cylinders,172351,0.403746
condition,166738,0.390597
drive,127627,0.298976
paint_color,127026,0.297568
type,91314,0.21391
manufacturer,16039,0.037573
title_status,6809,0.015951
model,5179,0.012132
region,0,0.0


In [39]:
# check the catigorical type of data
data_2['size'].value_counts()

full-size      63100
mid-size       34332
compact        19018
sub-compact     3051
Name: size, dtype: int64

In [40]:
data_2['condition'].value_counts()

good         121235
excellent     99731
like new      21178
fair           6769
new            1299
salvage         601
Name: condition, dtype: int64

In [41]:
data_2['drive'].value_counts()

4wd    129307
fwd    102848
rwd     57769
Name: drive, dtype: int64

In [42]:
data_2['paint_color'].value_counts()

white     77475
black     61485
silver    42101
blue      30667
red       29919
grey      23903
green      7267
brown      6510
custom     6508
yellow     2068
orange     1945
purple      677
Name: paint_color, dtype: int64

In [43]:
data_2['type'].value_counts()

sedan          84924
SUV            75190
pickup         42112
truck          35148
other          21255
coupe          18885
hatchback      16402
wagon          10534
van             8315
convertible     7629
mini-van        4717
offroad          609
bus              517
Name: type, dtype: int64

In [44]:
data_2['manufacturer'].value_counts()

ford               69601
chevrolet          53910
toyota             33383
honda              20874
jeep               18651
nissan             18521
ram                18100
gmc                16417
bmw                14562
dodge              13456
mercedes-benz      11520
hyundai            10125
subaru              9346
volkswagen          9212
kia                 8296
lexus               8070
audi                7481
cadillac            6833
acura               5914
chrysler            5887
buick               5415
mazda               5352
infiniti            4751
lincoln             4150
volvo               3352
mitsubishi          3202
mini                2356
pontiac             2270
rover               2082
jaguar              1935
porsche             1352
mercury             1183
saturn              1077
alfa-romeo           888
tesla                865
fiat                 784
harley-davidson      135
ferrari               93
datsun                63
aston-martin          24


In [46]:
data_2['title_status'].value_counts()

clean         397362
rebuilt         7083
salvage         3863
lien            1422
missing          814
parts only       198
Name: title_status, dtype: int64

In [47]:
data_2['model'].value_counts()

f-150                        7814
silverado 1500               5009
1500                         4156
camry                        3064
silverado                    2948
                             ... 
f-350sd limited                 1
gl-class 5.5l                   1
series lll                      1
echosport                       1
e150 super duty passenger       1
Name: model, Length: 28539, dtype: int64

In [48]:
data_2['region'].value_counts()

columbus                   3596
jacksonville               3504
spokane / coeur d'alene    2975
tulsa                      2965
baltimore                  2962
                           ... 
meridian                     27
southwest MS                 14
kansas city                  11
fort smith, AR                9
west virginia (old)           8
Name: region, Length: 404, dtype: int64

In [49]:
data_2['fuel'].value_counts()

gas         350528
other        30526
diesel       29699
hybrid        5127
electric      1671
Name: fuel, dtype: int64

In [50]:
data_2['transmission'].value_counts()

automatic    330152
other         62551
manual        24848
Name: transmission, dtype: int64

In [51]:
data_2['state'].value_counts()

ca    49200
fl    27830
tx    22539
ny    19196
oh    17460
mi    16826
or    16542
nc    14711
pa    13429
wa    13162
wi    11236
tn    10973
co    10656
va    10466
il    10209
nj     9361
id     8916
az     8460
ia     8111
ma     8054
mn     7657
ga     6898
ok     6577
mt     6213
sc     6152
ks     6134
in     5701
ct     5161
al     4806
md     4760
nm     4359
mo     4252
ky     4119
ar     4027
ak     3414
la     3161
nv     3104
nh     2961
me     2934
dc     2883
hi     2732
vt     2475
ri     2314
sd     1295
ut     1139
wv     1049
ms      998
ne      974
de      948
wy      607
nd      410
Name: state, dtype: int64

## Notes 
1. fuel has 30k records  classed as other. Unable to know or predict the fuel type so these row will be removed
2. Transmission as ~62k records classed as other. Unable to know or predict the transamission type so these rows will be removed. 