In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

In [3]:
# Load the raw vehicle listings from a CSV located one directory above the
# current working directory.
csv_path = '../vehicles_us.csv'
df = pd.read_csv(csv_path)

In [4]:
# Print column dtypes and non-null counts, then preview the first five rows.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [5]:
# Split the combined "<make> <model>" string on its first space into separate
# `make` and `model` columns.
# Guarded on the presence of `make` so that re-running this cell does not split
# the already-shortened `model` a second time (which would overwrite `make`
# with the model's first token).
if 'make' not in df.columns:
    df[['make', 'model']] = df['model'].str.split(' ', n=1, expand=True)

In [6]:
# Numeric columns with gaps: replace NaN with 0, then switch the column to an
# integer dtype only when every value is a whole number (i.e. the cast loses
# nothing).
# NOTE(review): 0 is used as a stand-in for "missing", so after this step a
# filled value cannot be told apart from a genuine zero; a later cell drops the
# rows whose model_year is 0.
for column in ('model_year', 'cylinders', 'odometer', 'is_4wd'):
    filled = df[column].fillna(0)
    as_int = filled.astype('int')
    df[column] = as_int if np.array_equal(filled, as_int) else filled

# Listings without a colour get an explicit 'unknown' label.
df['paint_color'] = df['paint_color'].fillna('unknown')

# Parse the ISO-style date strings into datetime values.
df['date_posted'] = pd.to_datetime(df['date_posted'], format='%Y-%m-%d')

In [7]:
# Confirm the dtypes and null counts after cleaning, then preview the first
# five rows.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  int64         
 1   model_year    51525 non-null  int32         
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     51525 non-null  int32         
 5   fuel          51525 non-null  object        
 6   odometer      51525 non-null  int32         
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   51525 non-null  object        
 10  is_4wd        51525 non-null  int32         
 11  date_posted   51525 non-null  datetime64[ns]
 12  days_listed   51525 non-null  int64         
 13  make          51525 non-null  object        
dtypes: datetime64[ns](1), int32(4), int64(2), object(7)
memory usage: 4.7+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make
0,9400,2011,x5,good,6,gas,145000,automatic,SUV,unknown,1,2018-06-23,19,bmw
1,25500,0,f-150,good,6,gas,88705,automatic,pickup,white,1,2018-10-19,50,ford
2,5500,2013,sonata,like new,4,gas,110000,automatic,sedan,red,0,2019-02-07,79,hyundai
3,1500,2003,f-150,fair,8,gas,0,automatic,pickup,unknown,0,2019-03-22,9,ford
4,14900,2017,200,excellent,4,gas,80903,automatic,sedan,black,0,2019-04-02,28,chrysler


### **Summary Conclusion**:

The dataset has been cleaned: missing values in `model_year`, `cylinders`, `odometer`, and `is_4wd` were filled with `0`, and missing `paint_color` values were replaced with `'unknown'`. Converting the four numeric columns from `float64` to an integer type also reduced the reported memory usage from about 5.1 MB to about 4.7 MB.

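After cleaning, the `odometer` and `cylinders` columns contain no null values, so every row can be included when examining their distributions. Note, however, that a `0` in these columns (and in `model_year`) stands in for a value that was missing in the source data and cannot be told apart from a genuine zero, so such rows should be excluded or treated as unknown where it matters. The `date_posted` column has been converted to a datetime type, allowing for time-based trend analysis. The dataset is now ready for deeper analysis of price trends based on vehicle features.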

In [8]:
# Histogram of listing prices over the whole dataset.
fig_price = px.histogram(
    data_frame=df,
    x='price',
    title='Distribution of Car Prices',
)
fig_price.show()

The histogram shows the distribution of car prices in the dataset. Most cars are priced below $30,000, with a significant concentration in the $5,000 to $15,000 range. As prices increase beyond $30,000, the number of cars decreases sharply, indicating fewer high-end listings. This suggests that the majority of vehicles in the dataset fall within the affordable price range.

In [15]:
# Average listing price within each odometer bin.
# - histfunc='avg' makes each bar the mean price of the bin; the default
#   aggregation sums prices, which mostly reflects how many listings fall in
#   the bin rather than how price changes with mileage.
# - No `color='price'`: price is continuous, and colouring by it makes plotly
#   build a separate trace for every distinct price value.
# - Rows with odometer == 0 are left out because the cleaning step replaces
#   missing odometer readings with 0; any genuine zero-mileage listings are
#   excluded along with them.
fig_price_odometer = px.histogram(df[df['odometer'] > 0], x='odometer', y='price', histfunc='avg',
                                  title='Price vs Odometer',
                                  labels={'odometer': 'Odometer Reading (miles)', 'price': 'Price (USD)'})
fig_price_odometer.show()

The histogram shows the relationship between car prices and odometer readings. Vehicles with lower odometer readings generally have higher prices, with most of the cars clustered around 50,000 to 150,000 miles. As the odometer readings increase, prices tend to decrease, indicating that higher mileage cars are typically less expensive. This suggests a clear trend where lower mileage correlates with higher vehicle prices.

In [16]:
# Mean listing price per model, most expensive first. `model` here is the
# value left after the make was split off into its own column.
mean_price_by_model1 = (
    df.groupby('model')['price']
    .mean()
    .reset_index()
    .sort_values(by='price', ascending=False)
)

In [11]:
# Plot the ten models with the highest average price, one colour per model.
fig_mean_price_model = px.scatter(
    data_frame=mean_price_by_model1.head(10),
    x='model',
    y='price',
    color='model',
    title='Top 10 Most Expensive Car Models(Average Price)',
    labels={'price': 'Average Price (USD)'},
)
fig_mean_price_model.show()

In [12]:
# Re-check column dtypes and non-null counts before the rows with a
# model_year of 0 are filtered out in the next cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  int64         
 1   model_year    51525 non-null  int32         
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     51525 non-null  int32         
 5   fuel          51525 non-null  object        
 6   odometer      51525 non-null  int32         
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   51525 non-null  object        
 10  is_4wd        51525 non-null  int32         
 11  date_posted   51525 non-null  datetime64[ns]
 12  days_listed   51525 non-null  int64         
 13  make          51525 non-null  object        
dtypes: datetime64[ns](1), int32(4), int64(2), object(7)
memory usage: 4.7+ MB


In [13]:
# Keep only listings with a non-zero model year; 0 is the value the cleaning
# step writes where the model year was missing.
df = df.loc[df['model_year'].ne(0)]

In [14]:
# Price against model year, one colour per model.
# The title and axis labels name the columns that are actually plotted
# (model_year on x, price on y); the previous text referred to days listed,
# which this figure does not show.
fig_scatter = px.scatter(df, x='model_year', y='price', color='model',
                         title='Scatter Plot: Price vs Model Year',
                         labels={'model_year': 'Model Year', 'price': 'Price (USD)', 'model': 'Model'})
fig_scatter.show()

# Conclusion