In [1]:
import pandas as pd
import numpy as np
from scipy import stats as stt
import streamlit as st
import matplotlib.pyplot as plt
import plotly.express as px
import altair as alt

In [2]:
cars = pd.read_csv(r"C:\Personal Files\Coding\GitHub\sprint-4-project\vehicles_us.csv", sep=',', decimal='.')
cars.sample(10)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
34172,5300,,toyota camry le,excellent,4.0,gas,63912.0,automatic,sedan,green,,2018-10-24,74
33675,2600,2002.0,ford expedition,excellent,8.0,gas,,automatic,SUV,,1.0,2018-07-24,66
39754,2000,,ford focus se,fair,4.0,gas,219738.0,manual,sedan,blue,,2018-12-22,63
20138,5300,2007.0,ford expedition,good,,gas,170000.0,automatic,SUV,white,1.0,2018-05-24,18
50535,38000,2016.0,ram 2500,like new,6.0,diesel,,automatic,pickup,grey,1.0,2018-11-26,35
21137,30500,2015.0,gmc sierra 1500,excellent,8.0,gas,46987.0,automatic,truck,white,1.0,2018-11-22,51
36474,34900,2018.0,ram 2500,excellent,8.0,gas,10450.0,automatic,truck,white,1.0,2019-02-27,22
14798,13995,2015.0,nissan altima,excellent,4.0,gas,,automatic,sedan,,,2018-06-01,56
46348,7990,,ford econoline,good,,gas,105429.0,automatic,van,white,,2018-08-10,102
45105,4500,2010.0,volkswagen jetta,excellent,5.0,gas,142000.0,manual,sedan,silver,,2018-06-25,22


In [3]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [4]:
st.header('Car Data')
st.dataframe(cars)

2024-05-06 21:01:37.116 
  command:

    streamlit run c:\Personal Files\Coding\Anaconda3\envs\standard\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [5]:
#cars['model_year'] = cars['model_year'].astype(int)
#cars['cylinders'] = cars['cylinders'].astype(int)
cars['is_4wd'] = cars['is_4wd'].astype(str).replace('1.0', 'Yes').fillna('No')

This is a pretty interesting dataset, and there are a number of possibilities for the data analysis that can be done. I would like to compare the prices cars are sold for based on a few different sets of data to get an idea as to how those markers affect the price of a car. In particular, I'd like to explore the relationships the car's price has with its mileage (odometer), the type of car sold, and the length that each car was listed. This could gleam some information as to the conditions under which cars are sold for the highest prices. 

Lots of the ploty.express graphs are very suitable for this with the right data. I will try to do some comparing and contrasting of the infromation conveyed between plots and charts from that library and the ones used for other TripleTen work. As such, some additional grouping will be needed.

In [6]:
mileage = cars.groupby('price')['odometer'].mean().reset_index() #These methods are used to drop the NA values
mileage.sample(5)

Unnamed: 0,price,odometer
3343,52776,34813.0
1862,16999,101018.514706
2421,23995,90318.591837
3438,145000,140000.0
2058,18990,108346.283019


In [7]:
mileage_price = mileage.groupby('price')
mileage_price
mileage_price.sample()

Unnamed: 0,price,odometer
0,1,40767.195556
1,3,
2,5,
3,6,173500.000000
4,9,119.000000
...,...,...
3438,145000,140000.000000
3439,175000,149000.000000
3440,189000,151248.000000
3441,300000,


In [9]:
car_type = cars.groupby('type')['price'].mean().reset_index()
car_type.sample(5)

Unnamed: 0,type,price
5,mini-van,8193.177433
2,convertible,14575.881166
8,pickup,16057.410418
7,other,10989.714844
1,bus,17135.666667


In [10]:
car_type_price = car_type.groupby('price')
car_type_price.sample()

Unnamed: 0,type,price
4,hatchback,6868.513849
9,sedan,6965.358647
5,mini-van,8193.177433
12,wagon,9088.134328
11,van,10546.941548
7,other,10989.714844
0,SUV,11149.4
6,offroad,14292.294393
3,coupe,14353.442901
2,convertible,14575.881166


In [16]:
list_length = cars.groupby('days_listed')['price'].mean().reset_index()
list_length.sample(5)

Unnamed: 0,days_listed,price
179,179,15995.0
139,139,11081.266667
15,15,12710.092841
155,155,24635.5
95,95,11633.586957


In [17]:
list_length_price = list_length.groupby('price')
list_length_price.sample()

Unnamed: 0,days_listed,price
212,223,1500.0
195,195,1600.5
209,213,1975.5
188,188,2995.0
223,261,3800.0
...,...,...
202,204,25988.0
197,199,27500.0
218,240,31995.0
194,194,34000.0


I initally planned to also compare the differences in plots between pyplot and plotly, but pyplot gave me some issues. So, we'll be focusing on the plotly.express charts, which should still give plenty of information.

In [18]:
st.header('price by mileage')
mp_hist = px.histogram(mileage_price, x=0, y=1)
st.write(mp_hist)

TypeError: Object of type DataFrame is not JSON serializable

In [19]:
st.header('price by mileage')
#cars_json = cars.to_json()
mp_hist = px.histogram(cars, x='price', y='odometer')
st.write(mp_hist)


In [20]:
st.header('price by type')
type_hist = px.histogram(cars, x='price', y='type')
st.write(type_hist)

In [21]:
st.header('price by list time')
time_hist = px.histogram(cars, x='price', y='days_listed')
st.write(time_hist)

In [22]:
st.header('price by year')
year_scatt = px.scatter(cars, x='price', y='days_listed', color='model_year')
st.write(year_scatt)

I've gotten some code that works after a healthy amount of fiddling. There's a lot of code I didn't end up using, and my plans for this project had to change on the fly a bit. Maybe it'll be good practice for the real thing. I will keep the code in for review as a means of showing my prior work.

This space will have observations on the data once I have loaded up a page so that I coul take a look at them. If a reveiwer is seeing this, then it's likely because I haven't seen the actual charts for some reason.