The purpose of this project is to take data from a craiglist car sales database and use it to predict prices for other used cars.  This first section focuses on cleaning the data and simplifying it to focus on only certain qualities of the car that can be used.

In [1]:
#Import anything that I might be using durinng this project
import pandas as pd
import seaborn as sns
import os
import pandas_profiling as profile
from sqlalchemy import create_engine
from sklearn.preprocessing import LabelEncoder

In [2]:
#This file is too large for memory, but I want to look and see the initial data
cars_small = pd.read_csv('vehicles.csv', nrows=5)
cars_small

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,drive,size,type,paint_color,image_url,description,county,state,lat,long
0,7184791621,https://duluth.craigslist.org/ctd/d/duluth-200...,duluth / superior,https://duluth.craigslist.org,6995,2000,gmc,new sierra 1500,excellent,8 cylinders,...,4wd,,,red,https://images.craigslist.org/00n0n_f06ykBMcdh...,2000 *** GMC New Sierra 1500 Ext Cab 157.5 WB...,,mn,46.8433,-92.255
1,7184773187,https://duluth.craigslist.org/cto/d/saginaw-20...,duluth / superior,https://duluth.craigslist.org,8750,2013,hyundai,sonata,excellent,4 cylinders,...,fwd,,,grey,https://images.craigslist.org/00d0d_kgZ6xoeRw2...,For Sale: 2013 Hyundai Sonata GLS - $8750. O...,,mn,46.9074,-92.4638
2,7193375964,https://newhaven.craigslist.org/cto/d/stratfor...,new haven,https://newhaven.craigslist.org,10900,2013,toyota,prius,good,4 cylinders,...,fwd,,,blue,https://images.craigslist.org/00d0d_3sHGxPbY2O...,2013 Prius V Model Two. One owner—must sell my...,,ct,41.177,-73.1336
3,7195108810,https://albuquerque.craigslist.org/cto/d/albuq...,albuquerque,https://albuquerque.craigslist.org,12500,2003,mitsubishi,lancer,good,4 cylinders,...,4wd,mid-size,sedan,grey,https://images.craigslist.org/00m0m_4a8Pb6JbMG...,"2003 Mitsubishi Lancer Evolution, silver. Abo...",,nm,35.1868,-106.665
4,7184712241,https://duluth.craigslist.org/ctd/d/rush-city-...,duluth / superior,https://duluth.craigslist.org,16995,2007,gmc,sierra classic 2500hd,good,8 cylinders,...,4wd,full-size,truck,white,https://images.craigslist.org/01414_g093aPtSMW...,"**Bad Credit, No Credit... No Problem!**2007 G...",,mn,45.6836,-92.9648


In [3]:
#Since it is large, I am already deleting several columns I know that I am not using, including URL, region, condition, fuel, title status, vin, size, type, image_url, description, county, lat, and long 
#Also I will load only the first part of the data
col_list= ['id', 'price', 'year', 'manufacturer', 'model', 'cylinders', 'odometer', 'transmission', 'drive', 'paint_color', 'state']
cars = pd.read_csv('vehicles.csv', usecols=col_list ,nrows=32000)
cars.head()
#32000 rows was the maximum the memory could hold

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783.0,automatic,4wd,red,mn
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821.0,automatic,fwd,grey,mn
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800.0,automatic,fwd,blue,ct
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,,manual,4wd,grey,nm
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217.0,automatic,4wd,white,mn


In [4]:
#Take an initial profile of everything
profile.ProfileReport(cars)

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))






In [5]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            32000 non-null  int64  
 1   price         32000 non-null  int64  
 2   year          20667 non-null  float64
 3   manufacturer  19791 non-null  object 
 4   model         20489 non-null  object 
 5   cylinders     12236 non-null  object 
 6   odometer      16992 non-null  float64
 7   transmission  20670 non-null  object 
 8   drive         14428 non-null  object 
 9   paint_color   13959 non-null  object 
 10  state         32000 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 1.8+ MB


In [6]:
#Take a closer look at null data
missing = pd.concat([cars.isnull().sum(), 100 * cars.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count',ascending=True)

Unnamed: 0,count,%
id,0,0.0
price,0,0.0
state,0,0.0
transmission,11330,35.40625
year,11333,35.415625
model,11511,35.971875
manufacturer,12209,38.153125
odometer,15008,46.9
drive,17572,54.9125
paint_color,18041,56.378125


In [7]:
#First I am going to get rid of the cars with a price that cannot be used
cars['price'].describe()

count    3.200000e+04
mean     1.523954e+04
std      1.189579e+05
min      0.000000e+00
25%      4.495000e+03
50%      9.995000e+03
75%      1.939625e+04
max      1.850000e+07
Name: price, dtype: float64

In [8]:
cars = cars[cars['price'] !=0]
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29601 entries, 0 to 31999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            29601 non-null  int64  
 1   price         29601 non-null  int64  
 2   year          19135 non-null  float64
 3   manufacturer  18319 non-null  object 
 4   model         18974 non-null  object 
 5   cylinders     11686 non-null  object 
 6   odometer      15822 non-null  float64
 7   transmission  19148 non-null  object 
 8   drive         13527 non-null  object 
 9   paint_color   13080 non-null  object 
 10  state         29601 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 1.9+ MB


In [9]:
#There are some high values listed for price, what are realistic?
cars.loc[cars.price > 200000]

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state
2735,7192116434,1495000,2014.0,lexus,is 250,6 cylinders,79577.0,automatic,rwd,grey,tx
2846,7191933878,9999999,2008.0,ram,,6 cylinders,,automatic,rwd,white,tx
3274,7189861093,239000,2019.0,,"Lamborghini Urus 2,500 MILES",8 cylinders,2500.0,automatic,4wd,,ca
3727,7190798333,239999,1933.0,chrysler,,,,automatic,,,ca
3778,7190627325,239999,1933.0,ford,victoria,8 cylinders,5.0,automatic,,black,ca
6636,7193378609,500000,1970.0,,1970,,,manual,,,fl
9581,7193932258,250000,1941.0,lincoln,continental,12 cylinders,38.0,manual,rwd,black,fl
12458,7196760755,219995,,,,,,,,,ca
12475,7196759502,249995,,,,,,,,,ca
12476,7196759175,299995,,,,,,,,,ca


In [10]:
#After looking at some of these vehicles, the lexus is not worth $18,500,000, and the values over 500000 are useless.  We will remove them
cars = cars.loc[cars.price< 500000]
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783.0,automatic,4wd,red,mn
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821.0,automatic,fwd,grey,mn
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800.0,automatic,fwd,blue,ct
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,,manual,4wd,grey,nm
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217.0,automatic,4wd,white,mn


In [11]:
cars.loc[cars.price <100]
#These low values also seem useless, so they will be removed

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state
166,7191488664,19,1986.0,,Grand National,,,automatic,,,tx
243,7195024987,1,2005.0,cadillac,escalade,,,automatic,,,nm
579,7193371115,1,2010.0,gmc,sierra 1500,8 cylinders,186049.0,automatic,rwd,silver,tx
1282,7182377328,1,2012.0,,All,,,other,,,fl
2400,7185992984,1,2010.0,,autos,,111.0,automatic,,,wa
...,...,...,...,...,...,...,...,...,...,...,...
31036,7187994230,1,2020.0,kia,,,,other,,,or
31127,7196091724,12,,,,,,,,,ca
31315,7187993793,1,2008.0,ford,f-150,8 cylinders,,automatic,4wd,,or
31393,7191019419,1,2007.0,nissan,maxima,6 cylinders,,automatic,fwd,grey,nc


In [12]:
cars = cars.loc[cars.price >100]

In [13]:
#I saw a lot of NaN values for the make, year, and the model of the cars.  They would be useless for comparison since that is the core of this project, so we are going to take them out
cars = cars.dropna(subset= ['manufacturer','model','year'])
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17994 entries, 0 to 31999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            17994 non-null  int64  
 1   price         17994 non-null  int64  
 2   year          17994 non-null  float64
 3   manufacturer  17994 non-null  object 
 4   model         17994 non-null  object 
 5   cylinders     11039 non-null  object 
 6   odometer      15073 non-null  float64
 7   transmission  17935 non-null  object 
 8   drive         12858 non-null  object 
 9   paint_color   12400 non-null  object 
 10  state         17994 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 1.2+ MB


In [14]:
#The rest of the Null values are acceptable but should be replaced.  They will be given the value unknown, which can be used if needed by anyone that will be using the final model if they do not know everything about their car.
cars = cars.fillna('Unknown')
cars.head()


Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,mn
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,mn
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,ct
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,nm
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,mn


In [15]:
#Check to make sure there are only real states and capitalize them
cars['state'].unique()

array(['mn', 'ct', 'nm', 'tx', 'ny', 'dc', 'nc', 'va', 'wa', 'fl', 'ga',
       'ca', 'de', 'tn', 'wi', 'al', 'or', 'oh', 'ar', 'ok', 'ia', 'ma',
       'nv', 'mo', 'pa', 'ms', 'ut', 'ky', 'la', 'ks', 'vt', 'ak', 'mt'],
      dtype=object)

In [16]:
cars['state']=cars['state'].str.upper()
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN


In [17]:
#Give a numerical value to each object in order to use later.  Also check to make sure there are no unusual values in each.
cars['manufacturer'].unique()

array(['gmc', 'hyundai', 'toyota', 'mitsubishi', 'ford', 'chevrolet',
       'ram', 'buick', 'jeep', 'subaru', 'nissan', 'dodge', 'audi',
       'rover', 'lexus', 'honda', 'chrysler', 'mini', 'pontiac',
       'mercedes-benz', 'cadillac', 'bmw', 'kia', 'volvo', 'volkswagen',
       'jaguar', 'acura', 'saturn', 'mazda', 'mercury', 'lincoln',
       'infiniti', 'ferrari', 'fiat', 'tesla', 'land rover',
       'harley-davidson', 'datsun', 'alfa-romeo', 'morgan',
       'aston-martin'], dtype=object)

In [18]:
encoder= LabelEncoder()
encoder.fit(cars.manufacturer)
cars["manufacturer id"] = encoder.transform(cars.manufacturer)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state,manufacturer id
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN,14
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN,17
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT,38
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM,29
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN,14


In [19]:
encoder.fit(cars.model)
cars['model id']= encoder.transform(cars.model)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state,manufacturer id,model id
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN,14,2677
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN,17,3488
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT,38,2873
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM,29,2348
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN,14,3351


In [20]:
cars['cylinders'].unique()

array(['8 cylinders', '4 cylinders', '6 cylinders', 'Unknown',
       '10 cylinders', '5 cylinders', '12 cylinders', '3 cylinders',
       'other'], dtype=object)

In [21]:
cars['cylinders']=cars['cylinders'].replace(['other'],'Unknown')

encoder.fit(cars.cylinders)
cars['cylinders id']= encoder.transform(cars.cylinders)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state,manufacturer id,model id,cylinders id
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN,14,2677,6
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN,17,3488,3
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT,38,2873,3
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM,29,2348,3
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN,14,3351,6


In [22]:
cars['transmission'].unique()

array(['automatic', 'manual', 'other', 'Unknown'], dtype=object)

In [25]:
cars['transmission']= cars['transmission'].replace('other','Unknown')
cars['transmission'].unique()

array(['automatic', 'manual', 'Unknown'], dtype=object)

In [26]:
encoder.fit(cars.transmission)
cars['transmission id']= encoder.transform(cars.transmission)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state,manufacturer id,model id,cylinders id,transmission id
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN,14,2677,6,1
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN,17,3488,3,1
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT,38,2873,3,1
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM,29,2348,3,2
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN,14,3351,6,1


In [27]:
cars['drive'].unique()

array(['4wd', 'fwd', 'rwd', 'Unknown'], dtype=object)

In [28]:
encoder.fit(cars.drive)
cars['drive id']= encoder.transform(cars.drive)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state,manufacturer id,model id,cylinders id,transmission id,drive id
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN,14,2677,6,1,0
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN,17,3488,3,1,2
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT,38,2873,3,1,2
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM,29,2348,3,2,0
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN,14,3351,6,1,0


In [29]:
cars['paint_color'].unique()

array(['red', 'grey', 'blue', 'white', 'custom', 'silver', 'brown',
       'black', 'Unknown', 'purple', 'green', 'orange', 'yellow'],
      dtype=object)

In [30]:
encoder.fit(cars.paint_color)
cars['color id']= encoder.transform(cars.paint_color)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state,manufacturer id,model id,cylinders id,transmission id,drive id,color id
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN,14,2677,6,1,0,9
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN,17,3488,3,1,2,6
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT,38,2873,3,1,2,2
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM,29,2348,3,2,0,6
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN,14,3351,6,1,0,11


In [31]:
encoder.fit(cars.state)
cars['state id']= encoder.transform(cars.state)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state,manufacturer id,model id,cylinders id,transmission id,drive id,color id,state id
0,7184791621,6995,2000.0,gmc,new sierra 1500,8 cylinders,167783,automatic,4wd,red,MN,14,2677,6,1,0,9,14
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821,automatic,fwd,grey,MN,17,3488,3,1,2,6,14
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800,automatic,fwd,blue,CT,38,2873,3,1,2,2,4
3,7195108810,12500,2003.0,mitsubishi,lancer,4 cylinders,Unknown,manual,4wd,grey,NM,29,2348,3,2,0,6,19
4,7184712241,16995,2007.0,gmc,sierra classic 2500hd,8 cylinders,254217,automatic,4wd,white,MN,14,3351,6,1,0,11,14


In [32]:
#Take a final look at where we are
profile.ProfileReport(cars)

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=33.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




