# Web Application Development: US Car Sales

##### Installing neccesary packages

In [1]:
#Loading libraries

#For data manipulation
import pandas as pd
import numpy as np
import yfinance as yf

#for daa visualization
import plotly_express as px

#for web application
import streamlit as st

In [2]:
df_vehicles = pd.read_csv('vehicles_us.csv')

display(df_vehicles)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71


## Initial Review

In [3]:
# Displaying general summary information about the plan's dataframe
df_vehicles.describe()

df_vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


##### Observations
1. **Column 0 (price)** - Dtype should be converted to float64
2. **Column 1 (model_year)** - Dtype should be converted to int64
3. **Column 10 (is_4wd)** - Dtype needs to be converted to boolean to eliminate false Null values
4. **Column 11 (date_posted)** - Dtype should be converted to datetime format

#### Checking for Missing Values

In [4]:
display(df_vehicles.isnull().sum)

<bound method DataFrame.sum of        price  model_year  model  condition  cylinders   fuel  odometer  \
0      False       False  False      False      False  False     False   
1      False        True  False      False      False  False     False   
2      False       False  False      False      False  False     False   
3      False       False  False      False      False  False      True   
4      False       False  False      False      False  False     False   
...      ...         ...    ...        ...        ...    ...       ...   
51520  False       False  False      False      False  False     False   
51521  False       False  False      False      False  False     False   
51522  False       False  False      False      False  False     False   
51523  False       False  False      False      False  False     False   
51524  False       False  False      False      False  False      True   

       transmission   type  paint_color  is_4wd  date_posted  days_listed  
0   

In [5]:
#Fill missing values
df_vehicles = df_vehicles.fillna(0)

#Verifying changes
display(df_vehicles)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,0,1.0,2018-06-23,19
1,25500,0.0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,0.0,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,0.0,automatic,pickup,0,0.0,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,0.0,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,0.0,2018-10-03,37
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,0.0,2018-11-14,22
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,0.0,2018-11-15,32
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,0.0,2018-07-02,71


#### Checking for Duplicaed Values

In [6]:
#Creating a new variable for duplicated values
duplicate_rows = df_vehicles[df_vehicles.duplicated()]
display(duplicate_rows)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed


**Great!** There appears to be no duplicated entries in this dataframe.

In [7]:
#Displaying a sample of listed vehicles
display(df_vehicles.head())

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,0,1.0,2018-06-23,19
1,25500,0.0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,0.0,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,0.0,automatic,pickup,0,0.0,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,0.0,2019-04-02,28


#### Observed Issues
1. Model year Dtype is a float64 and should be changed to int64 type.
2. There are missing values in multiple columns
3. Column 10 ("is_4wd") has a Dtype of float64 and should be converted to integer of either '0' (No) or '1' (Yes)
4. Column 2 ("model") needs to be split into 2 seperate columns that seperately identify the vehicles' manufacturer and the model type.


## Fixing Existing Data

### Identifying Missing Values

#### Dtype Repairs

In [8]:
#Column 0 (price) - Dtype should be converted to float64
df_vehicles['price'] = df_vehicles['price'].astype(float)
#Column 1 (model_year) - Dtype should be converted to int64
df_vehicles['model_year'] = df_vehicles['model_year'].astype(int)
#Column 2 (model) - Dtype should be converted to integer
df_vehicles['model'] = df_vehicles['model'].astype(object)
#Column 10 (is_4wd) - Dtype needs to be converted to boolean to eliminate false Null values
df_vehicles['is_4wd'] = df_vehicles['is_4wd'].astype(bool)
#mapping boolean values to "yes"/"no"
df_vehicles['is_4wd'] = df_vehicles['is_4wd'].map({True: 'Yes', False: 'No'})

#Column 11 (date_posted) - Dtype should be converted to datetime format



df_vehicles.info()

display(df_vehicles)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  float64
 1   model_year    51525 non-null  int64  
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51525 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51525 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  object 
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(3), int64(2), object(8)
memory usage: 5.1+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400.0,2011,bmw x5,good,6.0,gas,145000.0,automatic,SUV,0,Yes,2018-06-23,19
1,25500.0,0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,Yes,2018-10-19,50
2,5500.0,2013,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,No,2019-02-07,79
3,1500.0,2003,ford f-150,fair,8.0,gas,0.0,automatic,pickup,0,No,2019-03-22,9
4,14900.0,2017,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,No,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249.0,2013,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,No,2018-10-03,37
51521,2700.0,2002,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,No,2018-11-14,22
51522,3950.0,2009,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,No,2018-11-15,32
51523,7455.0,2013,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,No,2018-07-02,71


In [9]:
#Checking for missing val

In [10]:
#splitting 'model' to give a seperate column called 'manufacturer'
df_vehicles['manufacturer'] = df_vehicles['model'].apply(lambda x:x.split()[0])
display(df_vehicles)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,manufacturer
0,9400.0,2011,bmw x5,good,6.0,gas,145000.0,automatic,SUV,0,Yes,2018-06-23,19,bmw
1,25500.0,0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,Yes,2018-10-19,50,ford
2,5500.0,2013,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,No,2019-02-07,79,hyundai
3,1500.0,2003,ford f-150,fair,8.0,gas,0.0,automatic,pickup,0,No,2019-03-22,9,ford
4,14900.0,2017,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,No,2019-04-02,28,chrysler
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249.0,2013,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,No,2018-10-03,37,nissan
51521,2700.0,2002,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,No,2018-11-14,22,honda
51522,3950.0,2009,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,No,2018-11-15,32,hyundai
51523,7455.0,2013,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,No,2018-07-02,71,toyota


In [11]:
manufacturer_column = df_vehicles.pop('manufacturer')
df_vehicles.insert(1, 'manufacturer', manufacturer_column)
display(df_vehicles)
                            

Unnamed: 0,price,manufacturer,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400.0,bmw,2011,bmw x5,good,6.0,gas,145000.0,automatic,SUV,0,Yes,2018-06-23,19
1,25500.0,ford,0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,Yes,2018-10-19,50
2,5500.0,hyundai,2013,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,No,2019-02-07,79
3,1500.0,ford,2003,ford f-150,fair,8.0,gas,0.0,automatic,pickup,0,No,2019-03-22,9
4,14900.0,chrysler,2017,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,No,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249.0,nissan,2013,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,No,2018-10-03,37
51521,2700.0,honda,2002,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,No,2018-11-14,22
51522,3950.0,hyundai,2009,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,No,2018-11-15,32
51523,7455.0,toyota,2013,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,No,2018-07-02,71


2024-12-07 16:03:47.416 
  command:

    streamlit run c:\Users\gheet\OneDrive\My Documents\sprint projects\Dashboard-Car-Sales\env\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


NameError: name 'df' is not defined

In [None]:
#Creating a text header from the above data
st.header('Data Viewer')
#Displaying Dataframe with Streamlit
st.dataframe(df)