# US Car Sales Data Viewer

##### Installing neccesary packages

In [1]:
#Loading libraries

#For data manipulation
import pandas as pd
import numpy as np

#for daa visualization
import plotly_express as px
from dash import Dash, dcc, html, Input, Output

#for web application
import streamlit as st

##### Installing our Dataset

In [2]:
df_vehicles = pd.read_csv('vehicles_us.csv')
df_vehicles.head(10)



Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,1.0,2018-06-20,15
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,1.0,2019-01-07,68
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17


In [3]:
#Reviewing the shape of the dataframe
df_vehicles.shape

(51525, 13)

## Initial Review

In [4]:
# Displaying general summary information about the plan's dataframe
df_vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


##### Observations
1. **Column 0 (price)** - Dtype should be converted to float64
2. **Column 1 (model_year)** - Dtype should be converted to int64
3. **Column 10 (is_4wd)** - Dtype needs to be converted to boolean to eliminate false Null values
4. **Column 11 (date_posted)** - Dtype should be converted to datetime format

In [5]:
df_vehicles.describe()

Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,days_listed
count,51525.0,47906.0,46265.0,43633.0,25572.0,51525.0
mean,12132.46492,2009.75047,6.125235,115553.461738,1.0,39.55476
std,10040.803015,6.282065,1.66036,65094.611341,0.0,28.20427
min,1.0,1908.0,3.0,0.0,1.0,0.0
25%,5000.0,2006.0,4.0,70000.0,1.0,19.0
50%,9000.0,2011.0,6.0,113000.0,1.0,33.0
75%,16839.0,2014.0,8.0,155000.0,1.0,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,271.0


#### Checking for Duplicated Values

In [6]:
#Creating a new variable for duplicated values
duplicate_rows = df_vehicles[df_vehicles.duplicated()]
display(duplicate_rows)
df_vehicles.shape

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed


(51525, 13)

**Great!** We called for a display of duplicated values and recieved an empty list. There appears to be no duplicated entries in this dataframe. We can double check this by comparing our data shape from before dropping the values

In [7]:
#Displaying a sample of listed vehicles
display(df_vehicles.head())

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [8]:
display(df_vehicles.isnull().sum)

<bound method DataFrame.sum of        price  model_year  model  condition  cylinders   fuel  odometer  \
0      False       False  False      False      False  False     False   
1      False        True  False      False      False  False     False   
2      False       False  False      False      False  False     False   
3      False       False  False      False      False  False      True   
4      False       False  False      False      False  False     False   
...      ...         ...    ...        ...        ...    ...       ...   
51520  False       False  False      False      False  False     False   
51521  False       False  False      False      False  False     False   
51522  False       False  False      False      False  False     False   
51523  False       False  False      False      False  False     False   
51524  False       False  False      False      False  False      True   

       transmission   type  paint_color  is_4wd  date_posted  days_listed  
0   

#### Observed Issues
1. Model year Dtype is a float64 and should be changed to int64 type.
2. There are missing values in multiple columns
3. Column 10 ("is_4wd") has a Dtype of float64 and should be converted to integer of either '0' (No) or '1' (Yes)
4. Column 2 ("model") needs to be split into 2 seperate columns that seperately identify the vehicles' manufacturer and the model type.


## Preprossing/Cleaning Data

#### Dtype Repairs

##### Reformatting Dtype to their appropriate settings and optimizing column organization

In [9]:
#splitting 'model' to give a seperate column called 'manufacturer'
df_vehicles['manufacturer'] = df_vehicles['model'].apply(lambda x:x.split()[0])
# Remove the 'manufacturer' column and store it
manufacturer_column = df_vehicles.pop('manufacturer')
# Insert the 'manufacturer' column at the second position (index 1)
df_vehicles.insert(2, 'manufacturer', manufacturer_column)
# Renaming 'type' column to 'body type' for improved user friendliness
df_vehicles.rename(columns={'type': 'body_type'}, inplace=True)

df_vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   manufacturer  51525 non-null  object 
 3   model         51525 non-null  object 
 4   condition     51525 non-null  object 
 5   cylinders     46265 non-null  float64
 6   fuel          51525 non-null  object 
 7   odometer      43633 non-null  float64
 8   transmission  51525 non-null  object 
 9   body_type     51525 non-null  object 
 10  paint_color   42258 non-null  object 
 11  is_4wd        25572 non-null  float64
 12  date_posted   51525 non-null  object 
 13  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(8)
memory usage: 5.5+ MB


In [10]:
# Remove the manufacturer's name from the 'model' column
df_vehicles['model'] = df_vehicles['model'].apply(lambda x: x.split(' ', 1)[1] if ' ' in x else x)

In [11]:
#display(df_vehicles.isnull().sum)

In [12]:
# Count rows with missing values
num_rows_with_missing = df_vehicles .isna().any(axis=1).sum()

# Print the result
display(f'Number of rows with missing values: {num_rows_with_missing}')
display(df_vehicles)


'Number of rows with missing values: 36673'

Unnamed: 0,price,model_year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,body_type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw,x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford,f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai,sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford,f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler,200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan,maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37
51521,2700,2002.0,honda,civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22
51522,3950,2009.0,hyundai,sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32
51523,7455,2013.0,toyota,corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71


**Observation** - We can see that the majority of the rows in our dataframe have missing values. To drop all rows with missing values would certainly hinder the accuracy of our analysis. To prevent data lossage whist being able to adequitely use the usable data, we will fill the missing values with placeholders that will blantantly indicate as null.

In [13]:
display(df_vehicles)
# Fill missing values for columns with numerical value
numerical_to_fill = {
    'price': 0,
    'model_year': 0,
    'cylinders': 0,
    'odometer': 9000000,
    'is_4wd': 0      
}
df_vehicles.fillna(value=numerical_to_fill, inplace=True)

text_to_fill = ['manufacturer', 'model', 'condition', 'fuel', 'transmission', 'body_type', 'paint_color']
shared_fill_value = 'Unspecified'

df_vehicles[text_to_fill] = df_vehicles[text_to_fill].fillna(shared_fill_value)

# Verify the change
display(df_vehicles)


Unnamed: 0,price,model_year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,body_type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw,x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford,f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai,sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford,f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler,200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan,maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37
51521,2700,2002.0,honda,civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22
51522,3950,2009.0,hyundai,sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32
51523,7455,2013.0,toyota,corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71


Unnamed: 0,price,model_year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,body_type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw,x5,good,6.0,gas,145000.0,automatic,SUV,Unspecified,1.0,2018-06-23,19
1,25500,0.0,ford,f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai,sonata,like new,4.0,gas,110000.0,automatic,sedan,red,0.0,2019-02-07,79
3,1500,2003.0,ford,f-150,fair,8.0,gas,9000000.0,automatic,pickup,Unspecified,0.0,2019-03-22,9
4,14900,2017.0,chrysler,200,excellent,4.0,gas,80903.0,automatic,sedan,black,0.0,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan,maxima,like new,6.0,gas,88136.0,automatic,sedan,black,0.0,2018-10-03,37
51521,2700,2002.0,honda,civic,salvage,4.0,gas,181500.0,automatic,sedan,white,0.0,2018-11-14,22
51522,3950,2009.0,hyundai,sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,0.0,2018-11-15,32
51523,7455,2013.0,toyota,corolla,good,4.0,gas,139573.0,automatic,sedan,black,0.0,2018-07-02,71


In [14]:
#Column 0 (price) - Dtype should be converted to float64
df_vehicles['price'] = df_vehicles['price'].astype(float)
#Column 1 (model_year) - Dtype should be converted to int64
df_vehicles['model_year'] = df_vehicles['model_year'].astype(int)
#Colmn 2 (manufacturer) - Dtype should be converted to object
df_vehicles['manufacturer'] = df_vehicles['manufacturer'].astype(object)
#Column 3 (model) - Dtype should be converted to object
df_vehicles['model'] = df_vehicles['model'].astype(object)
#Column 10 (is_4wd) - Dtype needs to be converted to boolean to eliminate false Null values
df_vehicles['is_4wd'] = df_vehicles['is_4wd'].astype(bool)
#mapping boolean values to "yes"/"no"
df_vehicles['is_4wd'] = df_vehicles['is_4wd'].map({True: 'Yes', False: 'No'})
#Column 11 (date_posted) - Dtype should be converted to datetime format and read in US sandards (MM:DD:YYYY)
df_vehicles['date_posted'] = pd.to_datetime(df_vehicles['date_posted'])

# Format 'date_posted' to "Month Day Year" format and assign it back
df_vehicles['date_posted'] = df_vehicles['date_posted'].dt.strftime('%B %d %Y')

display(df_vehicles)


Unnamed: 0,price,model_year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,body_type,paint_color,is_4wd,date_posted,days_listed
0,9400.0,2011,bmw,x5,good,6.0,gas,145000.0,automatic,SUV,Unspecified,Yes,June 23 2018,19
1,25500.0,0,ford,f-150,good,6.0,gas,88705.0,automatic,pickup,white,Yes,October 19 2018,50
2,5500.0,2013,hyundai,sonata,like new,4.0,gas,110000.0,automatic,sedan,red,No,February 07 2019,79
3,1500.0,2003,ford,f-150,fair,8.0,gas,9000000.0,automatic,pickup,Unspecified,No,March 22 2019,9
4,14900.0,2017,chrysler,200,excellent,4.0,gas,80903.0,automatic,sedan,black,No,April 02 2019,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249.0,2013,nissan,maxima,like new,6.0,gas,88136.0,automatic,sedan,black,No,October 03 2018,37
51521,2700.0,2002,honda,civic,salvage,4.0,gas,181500.0,automatic,sedan,white,No,November 14 2018,22
51522,3950.0,2009,hyundai,sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,No,November 15 2018,32
51523,7455.0,2013,toyota,corolla,good,4.0,gas,139573.0,automatic,sedan,black,No,July 02 2018,71


In [15]:
df_vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  float64
 1   model_year    51525 non-null  int64  
 2   manufacturer  51525 non-null  object 
 3   model         51525 non-null  object 
 4   condition     51525 non-null  object 
 5   cylinders     51525 non-null  float64
 6   fuel          51525 non-null  object 
 7   odometer      51525 non-null  float64
 8   transmission  51525 non-null  object 
 9   body_type     51525 non-null  object 
 10  paint_color   51525 non-null  object 
 11  is_4wd        51525 non-null  object 
 12  date_posted   51525 non-null  object 
 13  days_listed   51525 non-null  int64  
dtypes: float64(3), int64(2), object(9)
memory usage: 5.5+ MB


# Creating the App Parameters

In [16]:
# Filter the DataFrame to include only rows where the odometer value is between 1,000 and 500,000
filtered_df = df_vehicles[(df_vehicles['odometer'] >= 1000) & (df_vehicles['odometer'] <= 500000)]

# Additional data calculations for example purposes
model_avg_days = df_vehicles.groupby(['manufacturer', 'model'])['days_listed'].mean().reset_index()
body_type_counts = df_vehicles.groupby(['body_type', 'manufacturer']).size().reset_index(name='count')
condition_counts = df_vehicles.groupby(['condition', 'manufacturer']).size().reset_index(name='count')

# Initialize the Dash app
app = Dash(__name__)

# Layout of the app
app.layout = html.Div([
    html.Div([
        html.Label('Select a Manufacturer:'),
        dcc.Dropdown(
            id='manufacturer-dropdown',
            options=[{'label': 'All Manufacturers', 'value': 'all'}] +
                    [{'label': manufacturer, 'value': manufacturer} for manufacturer in df_vehicles['manufacturer'].unique()],
            value='all',  # Default value
            placeholder="Select a Manufacturer"
        )
    ], style={'width': '30%', 'display': 'inline-block'}),
    
    html.Div([
        html.Label('From Year:'),
        dcc.Dropdown(
            id='from-year-dropdown',
            options=[{'label': str(year), 'value': year} for year in range(1920, df_vehicles['model_year'].max() + 1)],
            value=1920,  # Default value
            placeholder="From Year"
        )
    ], style={'width': '30%', 'display': 'inline-block', 'margin-right': '2%'}),
    
    html.Div([
        html.Label('To Year:'),
        dcc.Dropdown(
            id='to-year-dropdown',
            options=[{'label': str(year), 'value': year} for year in range(1920, df_vehicles['model_year'].max() + 1)],
            value=df_vehicles['model_year'].max(),  # Default value
            placeholder="To Year"
        )
    ], style={'width': '30%', 'display': 'inline-block', 'margin-right': '2%'}),
    
    

    html.Div([
        html.Label('Select a Body Type:'),
        dcc.Dropdown(
            id='body-type-dropdown',
            options=[{'label': body_type, 'value': body_type} for body_type in body_type_counts['body_type'].unique()],
            value=body_type_counts['body_type'].unique()[0],  # Default value
            placeholder="Select a Body Type"
        ),

        
        html.Label('Sort Order:'),
        dcc.RadioItems(
            id='view-toggle',
            options=[
                {'label': 'Alphabetical Order', 'value': 'original'},
                {'label': 'Ascending by Vehicle Count', 'value': 'ascending'}
            ],
            value='alphabetical',  # Default value
            labelStyle={'display': 'inline-block'}
        )
    ], style={'width': '48%', 'display': 'inline-block'}),

    
        html.Div("Steeper correlation lines indicate a greater rate of financial depreciation.", style={'text-align': 'center'}),
    
    html.Div([
        html.Label('Select a Model:'),
        dcc.RadioItems(
            id='model-toggle',
            options=[{'label': 'All Models', 'value': 'all'}] + [{'label': model, 'value': model} for model in filtered_df['model'].unique()],
            value='all',  # Default value
            labelStyle={'display': 'inline-block'}
        )
    ], style={'width': '30%', 'float': 'right', 'display': 'inline-block'}),

    dcc.Graph(id='histogram-days-listed'),
    dcc.Graph(id='scatter-plot'),
    dcc.Graph(id='correlation-bar-chart'),
    dcc.Graph(id='histogram-body-type')

])

# Callback to update the histogram based on the selected manufacturer and view
@app.callback(
    Output('histogram-days-listed', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('view-toggle', 'value')]
)
def update_histogram(selected_manufacturer, selected_view):
    if selected_manufacturer == 'all':
        filtered_df = model_avg_days
    else:
        filtered_df = model_avg_days[model_avg_days['manufacturer'] == selected_manufacturer]
    
    if selected_view == 'ascending':
        filtered_df = filtered_df.sort_values(by='days_listed', ascending=True)
    
    fig_days = px.bar(filtered_df, x='model', y='days_listed', color='model',
                 title=f'Average Listed Days by Model for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Average Listed Days by Model for All Manufacturers',
                 labels={'model': 'Model', 'days_listed': 'Average Listed Days'},
                 color_discrete_sequence=px.colors.qualitative.Dark24)
    return fig_days


# Callback to update the scatter plot based on the selected manufacturer, model, and year range
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('model-toggle', 'value'),
     Input('from-year-dropdown', 'value'),
     Input('to-year-dropdown', 'value')]
)
def update_scatter_plot(selected_manufacturer, selected_model, from_year, to_year):
    filtered_data = filtered_df[
        (filtered_df['model_year'] >= from_year) &
        (filtered_df['model_year'] <= to_year)
    ]
    
    if selected_manufacturer != 'all':
        filtered_data = filtered_data[filtered_data['manufacturer'] == selected_manufacturer]
    
    if selected_model != 'all':
        filtered_data = filtered_data[filtered_data['model'] == selected_model]
    
    fig = px.scatter(filtered_data, x='odometer', y='price', color='model',
                     title=f'Depreciation Rates of Price vs Mileage for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Depreciation Rates of Price vs Mileage for All Manufacturers',
                     labels={'odometer': 'Odometer Reading (miles)', 'price': 'Price (USD)'},
                     hover_data=['model_year', 'condition'],
                     color_discrete_sequence=px.colors.qualitative.Dark24,
                     trendline="ols")
    return fig

# Callback to update the correlation bar chart based on the selected manufacturer and year range
@app.callback(
    Output('correlation-bar-chart', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('from-year-dropdown', 'value'),
     Input('to-year-dropdown', 'value')]
)
def update_correlation_bar_chart(selected_manufacturer, from_year, to_year):
    filtered_data = filtered_df[
        (filtered_df['model_year'] >= from_year) &
        (filtered_df['model_year'] <= to_year)
    ]
    
    if selected_manufacturer != 'all':
        filtered_data = filtered_data[filtered_data['manufacturer'] == selected_manufacturer]
    
    correlation_results = filtered_data.groupby('model').apply(lambda x: x['price'].corr(x['odometer'])).reset_index()
    correlation_results.columns = ['model', 'correlation_coefficient']
    fig_depreciation = px.bar(correlation_results, x='model', y='correlation_coefficient',
                 title=f'Correlation Coefficient of Price vs Odometer for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Correlation Coefficient of Price vs Odometer for All Manufacturers',
                 labels={'model': 'Model', 'correlation_coefficient': 'Correlation Coefficient'},
                 color='correlation_coefficient',
                 color_continuous_scale=px.colors.sequential.Viridis)
    return fig_depreciation

# Run the app
if __name__ == '__main__':
    app.run(debug=True)








## Average Days Listed per Manufacturer

In [None]:
# Callback to update the histogram based on the selected manufacturer and view
@app.callback(
    Output('histogram-days-listed', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('view-toggle', 'value')]
)
def update_histogram(selected_manufacturer, selected_view):
    if selected_manufacturer == 'all':
        filtered_df = model_avg_days
    else:
        filtered_df = model_avg_days[model_avg_days['manufacturer'] == selected_manufacturer]
    
    if selected_view == 'ascending':
        filtered_df = filtered_df.sort_values(by='days_listed', ascending=True)
    
    fig_days = px.bar(filtered_df, x='model', y='days_listed', color='model',
                 title=f'Average Listed Days by Model for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Average Listed Days by Model for All Manufacturers',
                 labels={'model': 'Model', 'days_listed': 'Average Listed Days'},
                 color_discrete_sequence=px.colors.qualitative.Dark24)
    return fig_days







## Price vs Milage per Manufacturer
##### Here we can take a look at the depreciation values caused by wear and tear

In [None]:
# Callback to update the scatter plot based on the selected manufacturer, model, and year range
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('model-toggle', 'value'),
     Input('from-year-dropdown', 'value'),
     Input('to-year-dropdown', 'value')]
)
def update_scatter_plot(selected_manufacturer, selected_model, from_year, to_year):
    filtered_data = filtered_df[
        (filtered_df['model_year'] >= from_year) &
        (filtered_df['model_year'] <= to_year)
    ]
    
    if selected_manufacturer != 'all':
        filtered_data = filtered_data[filtered_data['manufacturer'] == selected_manufacturer]
    
    if selected_model != 'all':
        filtered_data = filtered_data[filtered_data['model'] == selected_model]
    
    fig = px.scatter(filtered_data, x='odometer', y='price', color='model',
                     title=f'Depreciation Rates of Price vs Mileage for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Depreciation Rates of Price vs Mileage for All Manufacturers',
                     labels={'odometer': 'Odometer Reading (miles)', 'price': 'Price (USD)'},
                     hover_data=['model_year', 'condition'],
                     color_discrete_sequence=px.colors.qualitative.Dark24,
                     trendline="ols")
    return fig





In [None]:
# Callback to update the correlation bar chart based on the selected manufacturer and year range
@app.callback(
    Output('correlation-bar-chart', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('from-year-dropdown', 'value'),
     Input('to-year-dropdown', 'value')]
)
def update_correlation_bar_chart(selected_manufacturer, from_year, to_year):
    filtered_data = filtered_df[
        (filtered_df['model_year'] >= from_year) &
        (filtered_df['model_year'] <= to_year)
    ]
    
    if selected_manufacturer != 'all':
        filtered_data = filtered_data[filtered_data['manufacturer'] == selected_manufacturer]
    
    correlation_results = filtered_data.groupby('model').apply(lambda x: x['price'].corr(x['odometer'])).reset_index()
    correlation_results.columns = ['model', 'correlation_coefficient']
    fig_depreciation = px.bar(correlation_results, x='model', y='correlation_coefficient',
                 title=f'Correlation Coefficient of Price vs Odometer for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Correlation Coefficient of Price vs Odometer for All Manufacturers',
                 labels={'model': 'Model', 'correlation_coefficient': 'Correlation Coefficient'},
                 color='correlation_coefficient',
                 color_continuous_scale=px.colors.sequential.Viridis)
    return fig_depreciation

# Run the app
if __name__ == '__main__':
    app.run(debug=True)



## Average Days Listed per Manufacturer

In [None]:
# Calculate the average 'days_listed' for each 'model' within each 'manufacturer'
model_avg_days = df_vehicles.groupby(['manufacturer', 'model'])['days_listed'].mean().reset_index()

# Initialize the Dash app
app = Dash(__name__)

# Layout of the app
app.layout = html.Div([
    dcc.Dropdown(
        id='manufacturer-dropdown',
        options=[{'label': 'All Manufacturers', 'value': 'all'}] + 
                [{'label': manufacturer, 'value': manufacturer} for manufacturer in model_avg_days['manufacturer'].unique()],
        value='all',  # Default value
        placeholder="Select a Manufacturer"
    ),
    dcc.RadioItems(
        id='view-toggle',
        options=[
            {'label': 'Original Order', 'value': 'original'},
            {'label': 'Ascending Order', 'value': 'ascending'}
        ],
        value='original',  # Default value
        labelStyle={'display': 'inline-block'}
    ),
    dcc.Graph(id='histogram')
])

# Callback to update the histogram based on the selected manufacturer and view
@app.callback(
    Output('histogram', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('view-toggle', 'value')]
)
def update_histogram(selected_manufacturer, selected_view):
    if selected_manufacturer == 'all':
        filtered_df = model_avg_days
    else:
        filtered_df = model_avg_days[model_avg_days['manufacturer'] == selected_manufacturer]
    
    if selected_view == 'ascending':
        filtered_df = filtered_df.sort_values(by='days_listed', ascending=True)
    
    fig_days = px.bar(filtered_df, x='model', y='days_listed', color='model',
                 title=f'Average Listed Days by Model for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Average Listed Days by Model for All Manufacturers',
                 labels={'model': 'Model', 'days_listed': 'Average Listed Days'},
                 color_discrete_sequence=px.colors.qualitative.Dark24)
    return fig_days



# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


## Vehicle Body Type per Manufacturer
##### Here, we can explore who has the most of each type of body type.

In [None]:
# Callback to update the histogram based on the selected body type and sort order
@app.callback(
    Output('histogram', 'figure'),
    [Input('body-type-dropdown', 'value'),
     Input('sort-order-radio', 'value')]
)
def update_histogram(selected_body_type, sort_order):
    filtered_df = body_type_counts[body_type_counts['body_type'] == selected_body_type]
    
    if sort_order == 'alphabetical':
        filtered_df = filtered_df.sort_values(by='manufacturer')
    else:
        filtered_df = filtered_df.sort_values(by='count', ascending=True)

    fig_body_type = px.bar(filtered_df, x='body_type', y='count', color='manufacturer',
                 title=f'Vehicle Count per Manufacturer for Body Type: {selected_body_type}',
                 labels={'body_type': 'Body Type', 'count': 'Vehicle Count'},
                 color_discrete_sequence=px.colors.qualitative.Dark24)
    return fig_body_type

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)



## Vehicle Count per Manufacturer for Condition

In [None]:
# Count the number of vehicles for each 'condition' and 'manufacturer'
condition_counts = df_vehicles.groupby(['condition', 'manufacturer']).size().reset_index(name='count')

# Initialize the Dash app
app = Dash(__name__)

# Layout of the app
app.layout = html.Div([
    html.Div([
        dcc.Dropdown(
            id='manufacturer-dropdown',
            options=[{'label': 'All Manufacturers', 'value': 'all'}] + 
                    [{'label': manufacturer, 'value': manufacturer} for manufacturer in condition_counts['manufacturer'].unique()],
            value='all',  # Default value
            placeholder="Select a Manufacturer"
        ),
    ], style={'width': '48%', 'display': 'inline-block'}),
    
    html.Div([
        dcc.Dropdown(
            id='condition-dropdown',
            options=[{'label': condition, 'value': condition} for condition in condition_counts['condition'].unique()],
            value=condition_counts['condition'].unique()[0],  # Default value
            placeholder="Select a Condition"
        ),
    ], style={'width': '48%', 'float': 'right', 'display': 'inline-block'}),
    
    dcc.RadioItems(
        id='sort-order-radio',
        options=[
            {'label': 'Alphabetical', 'value': 'alphabetical'},
            {'label': 'Ascending by Vehicle Count', 'value': 'ascending'}
        ],
        value='alphabetical',  # Default value
        labelStyle={'display': 'inline-block'}
    ),
    dcc.Graph(id='histogram')
])

# Callback to update the histogram based on the selected condition and sort order
@app.callback(
    Output('histogram', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('condition-dropdown', 'value'),
     Input('sort-order-radio', 'value')]
)
def update_histogram(selected_manufacturer, selected_condition, sort_order):
    filtered_df = condition_counts[condition_counts['condition'] == selected_condition]
    
    if selected_manufacturer != 'all':
        filtered_df = filtered_df[filtered_df['manufacturer'] == selected_manufacturer]
    
    if sort_order == 'alphabetical':
        filtered_df = filtered_df.sort_values(by='manufacturer')
    else:
        filtered_df = filtered_df.sort_values(by='count', ascending=True)

    fig_condition = px.bar(filtered_df, x='manufacturer', y='count', color='manufacturer',
                 title=f'Vehicle Count per Manufacturer for Condition: {selected_condition}',
                 labels={'manufacturer': 'Manufacturer', 'count': 'Vehicle Count'},
                 color_discrete_sequence=px.colors.qualitative.Dark24)
    
    # Change font color to blue
    fig_condition.update_layout(font=dict(color='blue'))
    return fig_condition

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


## Vehicle Conditions per Model Year per Manufacturer

In [None]:
# Initialize the Dash app
app = Dash(__name__)

# Layout of the app
app.layout = html.Div([
    dcc.Dropdown(
        id='manufacturer-dropdown',
        options=[{'label': 'All Manufacturers', 'value': 'all'}] + 
                [{'label': manufacturer, 'value': manufacturer} for manufacturer in df_vehicles['manufacturer'].unique()],
        value='all',  # Default value
        placeholder="Select a Manufacturer"
    ),
    dcc.Graph(id='histogram')
])

# Callback to update the histogram based on the selected manufacturer
@app.callback(
    Output('histogram', 'figure'),
    [Input('manufacturer-dropdown', 'value')]
)
def update_histogram(selected_manufacturer):
    if selected_manufacturer == 'all':
        filtered_df = df_vehicles
    else:
        filtered_df = df_vehicles[df_vehicles['manufacturer'] == selected_manufacturer]
    
    fig_year_condition = px.histogram(filtered_df, x='model_year', color='condition',
                       nbins=50,  # Number of bins
                       title=f'Vehicle Count per Model Year for Manufacturer: {selected_manufacturer}' if selected_manufacturer != 'all' else 'Vehicle Count per Model Year for All Manufacturers',
                       labels={'model_year': 'Model Year', 'count': 'Vehicle Count'},
                       color_discrete_sequence=px.colors.qualitative.Dark24)
    
    fig_year_condition.update_layout(xaxis=dict(range=[1920, filtered_df['model_year'].max()]))  # Setting x-axis range
    return fig_year_condition

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


In [None]:
# Filter the DataFrame to include only rows where the odometer value is between 1,000 and 500,000
filtered_df = df_vehicles[(df_vehicles['odometer'] >= 1000) & (df_vehicles['odometer'] <= 500000)]

# Initialize the Dash app
app = Dash(__name__)

# Layout of the app
app.layout = html.Div([
    html.Div([
        html.Label('Select a Manufacturer:'),
        dcc.Dropdown(
            id='manufacturer-dropdown',
            options=[{'label': 'All Manufacturers', 'value': 'all'}] +
                    [{'label': manufacturer, 'value': manufacturer} for manufacturer in filtered_df['manufacturer'].unique()],
            value='all',  # Default value
            placeholder="Select a Manufacturer"
        )
    ], style={'width': '30%', 'display': 'inline-block'}),
    
    html.Div([
        html.Label('From Year:'),
        dcc.Dropdown(
            id='from-year-dropdown',
            options=[{'label': str(year), 'value': year} for year in range(1920, filtered_df['model_year'].max() + 1)],
            value=1920,  # Default value
            placeholder="From Year"
        )
    ], style={'width': '30%', 'display': 'inline-block', 'margin-left': '2%'}),
    
    html.Div([
        html.Label('To Year:'),
        dcc.Dropdown(
            id='to-year-dropdown',
            options=[{'label': str(year), 'value': year} for year in range(1920, filtered_df['model_year'].max() + 1)],
            value=filtered_df['model_year'].max(),  # Default value
            placeholder="To Year"
        )
    ], style={'width': '30%', 'display': 'inline-block', 'margin-left': '2%'}),
    
    dcc.Graph(id='scatter-plot'),
    html.Div("Steeper correlation lines indicate a greater rate of financial depreciation.", style={'text-align': 'center'}),
    
    html.Div([
        html.Label('Select a Model:'),
        dcc.RadioItems(
            id='model-toggle',
            options=[{'label': 'All Models', 'value': 'all'}] + [{'label': model, 'value': model} for model in filtered_df['model'].unique()],
            value='all',  # Default value
            labelStyle={'display': 'inline-block'}
        )
    ], style={'width': '30%', 'float': 'right', 'display': 'inline-block'}),
    
    dcc.Graph(id='correlation-bar-chart')
])

# Callback to update the scatter plot based on the selected manufacturer, model, and year range
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('model-toggle', 'value'),
     Input('from-year-dropdown', 'value'),
     Input('to-year-dropdown', 'value')]
)
def update_scatter_plot(selected_manufacturer, selected_model, from_year, to_year):
    filtered_data = filtered_df[
        (filtered_df['model_year'] >= from_year) &
        (filtered_df['model_year'] <= to_year)
    ]
    
    if selected_manufacturer != 'all':
        filtered_data = filtered_data[filtered_data['manufacturer'] == selected_manufacturer]
    
    if selected_model != 'all':
        filtered_data = filtered_data[filtered_data['model'] == selected_model]
    
    fig = px.scatter(filtered_data, x='odometer', y='price', color='model',
                     title=f'Depreciation Rates of Price vs Mileage for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Depreciation Rates of Price vs Mileage for All Manufacturers',
                     labels={'odometer': 'Odometer Reading (miles)', 'price': 'Price (USD)'},
                     hover_data=['model_year', 'condition'],
                     color_discrete_sequence=px.colors.qualitative.Dark24,
                     trendline="ols")
    return fig

# Callback to update the correlation bar chart based on the selected manufacturer and year range
@app.callback(
    Output('correlation-bar-chart', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('from-year-dropdown', 'value'),
     Input('to-year-dropdown', 'value')]
)
def update_correlation_bar_chart(selected_manufacturer, from_year, to_year):
    filtered_data = filtered_df[
        (filtered_df['model_year'] >= from_year) &
        (filtered_df['model_year'] <= to_year)
    ]
    
    if selected_manufacturer != 'all':
        filtered_data = filtered_data[filtered_data['manufacturer'] == selected_manufacturer]
    
    correlation_results = filtered_data.groupby('model').apply(lambda x: x['price'].corr(x['odometer'])).reset_index()
    correlation_results.columns = ['model', 'correlation_coefficient']
    fig_depreciation = px.bar(correlation_results, x='model', y='correlation_coefficient',
                 title=f'Correlation Coefficient of Price vs Odometer for {selected_manufacturer}' if selected_manufacturer != 'all' else 'Correlation Coefficient of Price vs Odometer for All Manufacturers',
                 labels={'model': 'Model', 'correlation_coefficient': 'Correlation Coefficient'},
                 color='correlation_coefficient',
                 color_continuous_scale=px.colors.sequential.Viridis)
    return fig_depreciation

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


## Scatter Plot for Days Listed vs Milage

In [None]:
# Filter the DataFrame to include only rows where the odometer value is between 1,000 and 500,000
filtered_df = df_vehicles[(df_vehicles['odometer'] >= 1000) & (df_vehicles['odometer'] <= 500000)]

# Define a custom color sequence
color_sequence = px.colors.qualitative.Dark24

# Initialize the Dash app
app = Dash(__name__)

# Layout of the app
app.layout = html.Div([
    html.Div([
        html.Label('Select Manufacturer:'),
        dcc.Dropdown(
            id='manufacturer-dropdown',
            options=[{'label': 'All Manufacturers', 'value': 'all'}] + 
                    [{'label': manufacturer, 'value': manufacturer} for manufacturer in filtered_df['manufacturer'].unique()],
            value='all',  # Default value
            placeholder="Select a Manufacturer"
        )
    ], style={'width': '30%', 'display': 'inline-block', 'margin-right': '2%'}),
    
    html.Div([
        html.Label('From Year:'),
        dcc.Dropdown(
            id='from-year-dropdown',
            options=[{'label': str(year), 'value': year} for year in range(1920, filtered_df['model_year'].max() + 1)],
            value=1920,  # Default value
            placeholder="From Year"
        )
    ], style={'width': '30%', 'display': 'inline-block', 'margin-right': '2%'}),
    
    html.Div([
        html.Label('To Year:'),
        dcc.Dropdown(
            id='to-year-dropdown',
            options=[{'label': str(year), 'value': year} for year in range(1920, filtered_df['model_year'].max() + 1)],
            value=filtered_df['model_year'].max(),  # Default value
            placeholder="To Year"
        )
    ], style={'width': '30%', 'display': 'inline-block'}),
    
    dcc.Graph(id='scatter-plot')
])

# Callback to update the scatter plot based on the selected manufacturer and year range
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('manufacturer-dropdown', 'value'),
     Input('from-year-dropdown', 'value'),
     Input('to-year-dropdown', 'value')]
)
def update_scatter_plot(selected_manufacturer, from_year, to_year):
    filtered_data = filtered_df[
        (filtered_df['model_year'] >= from_year) &
        (filtered_df['model_year'] <= to_year)
    ]
    
    if selected_manufacturer != 'all':
        filtered_data = filtered_data[filtered_data['manufacturer'] == selected_manufacturer]
    
    fig = px.scatter(filtered_data, x='odometer', y='days_listed', color='manufacturer',
                     title='Scatter Plot of Days Listed vs Mileage',
                     labels={'odometer': 'Odometer Reading (miles)', 'days_listed': 'Days Listed'},
                     hover_data=['model_year', 'model', 'condition'],
                     color_discrete_sequence=color_sequence)
    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


In [None]:
import streamlit as st
from streamlit.runtime.scriptrunner import add_script_run_ctx, get_script_run_ctx
from subprocess import Popen

# Main script
def main():
    st.header('Data Viewer')
    st.dataframe(df_vehicles)

if __name__ == "__main__":
    ctx = get_script_run_ctx()
    process = Popen(['python', 'app.py'])
    add_script_run_ctx(process, ctx)
    main()


In [None]:
#Creating a text header from the above data
st.header('Data Viewer')
#Displaying Dataframe with Streamlit
st.dataframe(df_vehicles)

In [None]:
streamlit run 