In [14]:
# Loading libraries
import os

import pandas as pd
import plotly.express as px
import plotly.io as pio
import streamlit as st

In [15]:
# Load the data file into a dataframe.
# The CSV location can be overridden with the VEHICLES_CSV environment variable
# so the notebook is not tied to one machine; when the variable is unset, the
# original path is used.
df = pd.read_csv(os.environ.get('VEHICLES_CSV', '/Users/eric/eric4/eric4/vehicles_us.csv'))

In [16]:
# Summarise the dataframe: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


The following columns have no missing values: price, model, condition, fuel, transmission, type, date_posted, days_listed.

In [17]:
# Preview the first rows of the dataframe to see what typical values look like
df.head()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [18]:
# Count rows that are exact duplicates of an earlier row, then report the total.
duplicate_count = df.duplicated().sum()
print('There are', duplicate_count, 'duplicate entries in the vehicles_us data file.')

There are 0 duplicate entries in the vehicles_us data file.


In [19]:
# Plot price vs odometer
# Drop rows where odometer is NaN so every plotted point has an x value
odo_df = df.dropna(subset=['odometer'])

# Remove outliers: keep odometer readings below 400,000
odo_df = odo_df[odo_df['odometer']<400000]

# Remove outliers: keep prices below $60k
odo_df = odo_df[odo_df['price']<60000]

odo_df.info()

# The keys of `labels` must be column names of the dataframe; keys such as
# 'x' / 'y' match no column and leave the raw column names as axis titles.
odo_plot = px.scatter(
    odo_df,
    x='odometer',
    y='price',
    title='Odometer vs price of vehicle',
    labels={'odometer': 'Odometer', 'price': 'Price $'},
    opacity=0.7,
)
# NOTE: the recorded run of this cell failed in .show() with
# "Mime type rendering requires nbformat>=4.2.0"; nbformat must be installed
# in the kernel environment for the figure to render.
odo_plot.show()

<class 'pandas.core.frame.DataFrame'>
Index: 43523 entries, 0 to 51523
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         43523 non-null  int64  
 1   model_year    40457 non-null  float64
 2   model         43523 non-null  object 
 3   condition     43523 non-null  object 
 4   cylinders     39086 non-null  float64
 5   fuel          43523 non-null  object 
 6   odometer      43523 non-null  float64
 7   transmission  43523 non-null  object 
 8   type          43523 non-null  object 
 9   paint_color   35728 non-null  object 
 10  is_4wd        21636 non-null  float64
 11  date_posted   43523 non-null  object 
 12  days_listed   43523 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 4.6+ MB


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Plot scatter plot for days listed vs price
# Remove outliers: keep prices below $100k and listings shorter than 200 days
days_price_df = df[df['price']<100000]
days_price_df = days_price_df[days_price_df['days_listed']<200]

# The keys of `labels` must be column names of the dataframe; keys such as
# 'x' / 'y' match no column and leave the raw column names as axis titles.
days_price = px.scatter(
    days_price_df,
    x='days_listed',
    y='price',
    title='Days listed vs price of vehicle',
    labels={'days_listed': 'days listed', 'price': 'Price $'},
    opacity=0.7,
)
days_price.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Average number of days a listing stayed up, per vehicle condition,
# rounded to one decimal and returned as a two-column dataframe.
days_to_sell_condition = (
    df.groupby('condition')['days_listed']
    .mean()
    .round(1)
    .reset_index()
)
print(days_to_sell_condition)

# Bar chart of the averages. The y-axis is restricted to 30-40 so the small
# differences between conditions are visible.
dc_bar = px.bar(
    days_to_sell_condition,
    x='condition',
    y='days_listed',
    title='How many days to sell based on car condition',
)
dc_bar.update_layout(yaxis={'range': [30, 40]})
dc_bar.show()


   condition  days_listed
0  excellent         39.6
1       fair         39.1
2       good         39.6
3   like new         39.2
4        new         37.1
5    salvage         39.0


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Cars in "new" condition sold roughly 2 days faster on average (37.1 days listed, versus 39.0–39.6 days for every other condition).

In [None]:
# Mean price per vehicle type, sorted from cheapest to most expensive and
# rounded to whole dollars, as a dataframe with columns type / average_price.
price_type = (
    df.groupby('type')['price']
    .mean()
    .sort_values()
    .round(0)
    .rename('average_price')
    .reset_index()
)
print(price_type)

# Bar chart of the averages; the y-axis is restricted to 5,000-20,000.
pt_bar = px.bar(
    price_type,
    x='type',
    y='average_price',
    title='The average price based on type of vehicle',
)
pt_bar.update_layout(yaxis={'range': [5000, 20000]})
pt_bar.show()

           type  average_price
0     hatchback         6869.0
1         sedan         6965.0
2      mini-van         8193.0
3         wagon         9088.0
4           van        10547.0
5         other        10990.0
6           SUV        11149.0
7       offroad        14292.0
8         coupe        14353.0
9   convertible        14576.0
10       pickup        16057.0
11        truck        16735.0
12          bus        17136.0


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Price distribution split by vehicle type
# Keep only listings priced below $50,000 so outliers do not stretch the x-axis
price_typeh = df.loc[df['price'] < 50000]

# Histogram of price, one colour per vehicle type
pt_hist = px.histogram(
    price_typeh,
    x='price',
    color='type',
    title='Price of vehicle',
    nbins=50,
    opacity=0.7,
)
pt_hist.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# Price distribution split by vehicle condition
# Keep only listings priced below $50,000 so outliers do not stretch the x-axis
pc = df.loc[df['price'] < 50000]

# Histogram of price, one colour per vehicle condition
pc_hist = px.histogram(
    pc,
    x='price',
    color='condition',
    title='Price of vehicle based on condition',
    nbins=50,
    opacity=0.7,
)
pc_hist.show()

NameError: name 'df' is not defined