In [1]:
import pandas as pd
import plotly_express as px
import altair as alt
import streamlit as st

# Load the dataset
df = pd.read_csv('../vehicles_us.csv')

# Display the basic information about the dataset
# - information about columns
# - first few rows of the dataset
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,1.0,2018-06-20,15
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,1.0,2019-01-07,68
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17


In [2]:
# Clean the dataset:
#  1. Convert 'date_posted' column to datetime format
df['date_posted'] = pd.to_datetime(df['date_posted'])

#   2. # Removing the 'is_4wd' column from the dataset
#   - It contains only values '1.0' and 'NaN. 
#   - This means that we cannot compare 4WD with non-4WD cars, as we don't know if any car is non-4WD for certain. 
#   - So we cannot use this column for any analysis.
df = df.drop(columns=['is_4wd'])

#   3. Replacing 'NaN' with 'not_listed' for for categorical columns
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna('not_listed')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  int64         
 1   model_year    47906 non-null  float64       
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     46265 non-null  float64       
 5   fuel          51525 non-null  object        
 6   odometer      43633 non-null  float64       
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   51525 non-null  object        
 10  date_posted   51525 non-null  datetime64[ns]
 11  days_listed   51525 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(2), object(6)
memory usage: 4.7+ MB
None


In [3]:
# Viewing unique values in every column with categorical data

# List of categorical columns
categorical_columns = ['condition', 'cylinders', 'fuel', 'transmission', 'type', 'paint_color']

# Display unique values for each categorical column and their counts
for column in categorical_columns:
    print(f'Column: {column}\n')
    for value in df[column].unique():
        print(f'{value:<15}', end='')
        print(df[df[column] == value][column].count())
    print('\n\n')


Column: condition

good           20145
like new       4742
fair           1607
excellent      24773
salvage        115
new            143



Column: cylinders

6.0            15700
4.0            13864
8.0            15844
nan            0
5.0            272
10.0           549
3.0            34
12.0           2



Column: fuel

gas            47288
diesel         3714
other          108
hybrid         409
electric       6



Column: transmission

automatic      46902
manual         2829
other          1794



Column: type

SUV            12405
pickup         6988
sedan          12154
truck          12353
coupe          2303
van            633
convertible    446
hatchback      1047
wagon          1541
mini-van       1161
other          256
offroad        214
bus            24



Column: paint_color

not_listed     9267
white          10029
red            4421
black          7692
blue           4475
grey           5037
silver         6244
custom         1153
orange         231
yellow   

In [4]:
# Checking basic info about columns with numerical data
numerical_columns = ['price', 'model_year', 'odometer', 'days_listed', 'date_posted'] 
for column in numerical_columns:
    print(f'Column: {column}')
    print('Max:', df[column].max())
    print('Min:', df[column].min())
    print('Mean:', df[column].mean())
    print('Median:', df[column].median())
    print('\n')


Column: price
Max: 375000
Min: 1
Mean: 12132.464919941776
Median: 9000.0


Column: model_year
Max: 2019.0
Min: 1908.0
Mean: 2009.75046966977
Median: 2011.0


Column: odometer
Max: 990000.0
Min: 0.0
Mean: 115553.4617376756
Median: 113000.0


Column: days_listed
Max: 271
Min: 0
Mean: 39.55475982532751
Median: 33.0


Column: date_posted
Max: 2019-04-19 00:00:00
Min: 2018-05-01 00:00:00
Mean: 2018-10-25 01:57:46.270742528
Median: 2018-10-25 00:00:00




In [None]:
# Checking if the database contains information only about cars that are currently listed
# - If 'days_listed' is equal to the difference between maximum of 'date_posted' and 'date_posted' for this row, then the car is still listed
# - If 'days_listed' is not equal to this difference, then the car is not listed anymore (for project purposes, we assume that the car is sold)

# For every row - check the difference between maximum of 'date_posted' and value of 'date_posted' for this row
# Append the result to a new column 'days_since_posted'

df['days_since_posted'] = (df['date_posted'].max() - df['date_posted']).dt.days

# For every row - check if 'days_since_posted' equals 'days_listed'
# Print count for True and False values
print(df['days_since_posted'].eq(df['days_listed']).value_counts())


False    51367
True       158
Name: count, dtype: int64
