In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import pandas as pd

## Reading the file

In [4]:
# Load the car listings from a CSV file given by a path relative to the
# notebook's working directory.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# 'data_cars2.csv' — confirm where the file lives before running the cells below.
cars_df=pd.read_csv('data_cars2.csv')
cars_df # first look at the raw data

FileNotFoundError: [Errno 2] No such file or directory: 'data_cars2.csv'

## 1-Data exploration

In [None]:
# Print the frame summary: column names, dtypes and non-null counts.
cars_df.info()

In [None]:
# Count the missing values in every column.
cars_df.isna().sum()

In [None]:
# (number of rows, number of columns) of the raw frame.
cars_df.shape

#### From this step we know the number of rows and columns (4,999 rows and 17 columns), the type of each column, and the number of missing values in each column. The Market Category column contains a large amount of missing data, and there is a column named 0 in which every value is missing.

## 2-data cleaning

In [None]:
# Medians of the columns that contain missing data; they serve as the fill
# values in the next cleaning step. This is a first-pass choice: if the
# analysis turns out to need it, the gaps will be filled manually instead.
median_NumberofDoors = cars_df['Number of Doors'].median()
median_EngineHP = cars_df['Engine HP'].median()
(median_EngineHP, median_NumberofDoors)

In [None]:
# Fill the gaps with the medians computed earlier. The filled column is
# assigned back instead of calling fillna(inplace=True) on the selected
# Series: under pandas copy-on-write that in-place call acts on a temporary
# object and leaves cars_df unchanged (pandas 2.2+ warns about it).
cars_df['Engine HP'] = cars_df['Engine HP'].fillna(median_EngineHP)
cars_df['Number of Doors'] = cars_df['Number of Doors'].fillna(median_NumberofDoors)

In [None]:
# 'Market Category' has a lot of missing data, so it is not filled with a
# median; the missing entries are labelled 'Unknown' instead. The column is
# not needed in the later analysis.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which does not update cars_df under pandas copy-on-write.
cars_df['Market Category'] = cars_df['Market Category'].fillna('Unknown')

In [None]:
# Electric cars have no cylinders: where the fuel type is 'electric' and the
# cylinder count is missing, record 0.
cars_df.loc[
    cars_df['Engine Fuel Type'].eq('electric') & cars_df['Engine Cylinders'].isna(),
    'Engine Cylinders',
] = 0

In [None]:
# Mazda RX-7 rows with a missing cylinder count are recorded as 2.
cars_df.loc[
    cars_df['Model'].eq('RX-7') & cars_df['Engine Cylinders'].isna(),
    'Engine Cylinders',
] = 2

In [None]:
# Mazda RX-8 rows with a missing cylinder count are recorded as 2.
cars_df.loc[
    cars_df['Model'].eq('RX-8') & cars_df['Engine Cylinders'].isna(),
    'Engine Cylinders',
] = 2

In [None]:
# List any rows that still lack a cylinder count; an empty result means the
# column is clean and ready to analyze.
cars_df.loc[cars_df['Engine Cylinders'].isna()]

In [None]:
# Rows with no fuel type are labelled 'regular'.
# NOTE(review): confirm 'regular' matches a label that already exists in this column.
cars_df['Engine Fuel Type'] = cars_df['Engine Fuel Type'].fillna('regular')

In [None]:
# The notes for this section state that the column whose values are all
# missing was deleted, but no cleaning cell removes it. Drop every all-null
# column here so the check below can actually come out clean.
all_null_columns = cars_df.columns[cars_df.isna().all()]
cars_df = cars_df.drop(columns=all_null_columns)

# Confirm that no missing data remains.
cars_df.isna().sum()

In [None]:
# For reference, keep a display-friendly price next to the numeric MSRP:
# dollar sign, thousands separators, no decimals.
cars_df['MSRP_formatted'] = cars_df['MSRP'].map("${:,.0f}".format)
cars_df

#### In this step, all missing values were handled using several methods. The column in which every value was missing was deleted, and the columns with a moderate amount of missing data were filled with their median. One column with a large amount of missing data was handled by labelling its missing values as "Unknown".

## 3- Exploratory Data Analysis

In [None]:
# Summary statistics for the numerical columns.
cars_df.describe()

In [None]:
# Number of distinct values in each categorical (object-dtype) column.
cars_df.select_dtypes(include=['object']).nunique()

In [None]:
# General information: number of cars per transmission type.
print(cars_df.loc[:, 'Transmission Type'].value_counts())

In [None]:
# Average popularity per vehicle style, highest first, to see which kinds of
# cars are more desirable.
avg_popularity_by_style = (
    cars_df
    .groupby('Vehicle Style')['Popularity']
    .mean()
    .sort_values(ascending=False)
)

# Keep the averages and the chart in separate names, and label the chart
# through the Axes that pandas returns so the cell does not depend on
# pyplot's implicit "current figure" state.
ax = avg_popularity_by_style.plot(kind='barh')
ax.set_title("avarage popularity")
ax.set_xlabel("number of search")
ax.set_ylabel("vehicle style");

In [None]:
# Count how many cars of each vehicle style are available in the market.
print(cars_df.loc[:, 'Vehicle Style'].value_counts())

In [None]:
# Number of cars per model year, to see which years of make the data covers.
# Bars are ordered by count (ascending), not chronologically.
# Labels are set on the Axes returned by pandas, so the cell does not rely on
# pyplot's implicit "current figure" state.
ax = cars_df['Year'].value_counts().sort_values().plot(kind='bar')
ax.set_title("available models")
ax.set_xlabel("years")
ax.set_ylabel("number of cars");

#### In this step, we explored the information contained in the data, such as the average, highest value, lowest value, standard deviation, etc., as well as the most frequently occurring items in each column.

## 4-problem solving

#### We are about to start a business: a showroom for used cars. To be specific, we want to specialize in cargo vans, 2dr SUVs and passenger vans, based on the data we have.

In [None]:
# Pairwise correlations between the numeric columns.
# Conclusion recorded by the authors: there was no correlation between the
# category being targeted and the number of cylinders or highway MPG.
# NOTE(review): numeric_only=True leaves non-numeric columns out of this
# matrix — confirm how the vehicle category was compared against these columns.
cars_df.corr(numeric_only=True)

In [None]:
# Rank the vehicle styles by average popularity. The decision is to go with
# the three highest-ranked styles.
avg_popularity_by_style = (
    cars_df
    .groupby('Vehicle Style')['Popularity']
    .mean()
    .sort_values(ascending=False)
)
print(avg_popularity_by_style)

In [None]:
# Supply side: number of cars per vehicle style. The three most popular styles
# are in high demand but have few cars available, so these three types will
# be stocked in the showroom.
print(cars_df.loc[:, 'Vehicle Style'].value_counts())

In [None]:
# Horizontal bar chart of the number of cars per vehicle style, to make the
# availability figures easier to read.
# Labels are set on the Axes returned by pandas, so the cell does not rely on
# pyplot's implicit "current figure" state.
ax = cars_df['Vehicle Style'].value_counts().plot(kind='barh')
ax.set_title("available models")
ax.set_xlabel("number of cars")
ax.set_ylabel("cars style");

In [None]:
# Break the three van styles down by make: one stacked bar per make, one
# segment per vehicle style, followed by the underlying table.
# Authors' note: this cell was AI-generated because the technique had not been
# covered yet and it is necessary for the analysis; the last code cell gives
# the same result using what Mr. Ali taught.
target_styles = ['Cargo Van', 'Cargo Minivan', 'Passenger Van']
filtered_df = cars_df.loc[cars_df['Vehicle Style'].isin(target_styles)]
grouped = (
    filtered_df
    .groupby(['Vehicle Style', 'Make'])
    .size()
    .unstack()
)
grouped.transpose().plot(kind='bar', stacked=True, figsize=(12, 6))
grouped.transpose()

In [None]:
# Restrict the target van styles to cars made after 2015: this is used-car
# data and old cars are not wanted in the showroom.
target_styles = ['Cargo Van', 'Cargo Minivan', 'Passenger Van']

newer_cars = cars_df[
    (cars_df['Year'] > 2015) &
    (cars_df['Vehicle Style'].isin(target_styles))
]

print(f"Number of cars from after 2015 in target styles: {len(newer_cars)}")

# Show up to 73 of the matching rows in shuffled order. Capping the size at
# len(newer_cars) keeps sample() from raising ValueError when fewer rows
# match, and the fixed random_state makes the preview reproducible.
newer_cars.sample(n=min(73, len(newer_cars)), random_state=42)

In [None]:
# Same filter as for the van styles, widened to the four most popular styles.
top_styles = ['Passenger Van', 'Cargo Minivan', 'Cargo Van', '2dr SUV']

# Cars made after 2015 whose style is one of the top four
newer_cars = cars_df[(cars_df['Year'] > 2015) & (cars_df['Vehicle Style'].isin(top_styles))]

print(f"Number of cars from after 2015 in top 4 styles: {len(newer_cars)}")

# Show up to 76 of the matching rows in shuffled order. Capping the size at
# len(newer_cars) keeps sample() from raising ValueError when fewer rows
# match, and the fixed random_state makes the preview reproducible.
newer_cars.sample(n=min(76, len(newer_cars)), random_state=42)

In [None]:
# How many vehicles in the target styles does each make offer?
target_styles = ['Cargo Van', 'Cargo Minivan', 'Passenger Van']

# Keep only the rows whose vehicle style is one of the targets
filtered_vans = cars_df.loc[cars_df['Vehicle Style'].isin(target_styles)]

# Tally the remaining rows per make
van_counts_by_make = filtered_vans.loc[:, 'Make'].value_counts()

print(van_counts_by_make)

In [None]:
# Number of non-null 'Model' entries for every (make, vehicle style) pair.
cars_df.groupby(by=['Make', 'Vehicle Style'])['Model'].count()

In [None]:
# Per-(make, vehicle style) counts as a flat table, limited to the target
# styles defined in an earlier cell.
xx = (
    cars_df
    .groupby(['Make', 'Vehicle Style'])['Model']
    .count()
    .reset_index()
    .loc[lambda counts: counts['Vehicle Style'].isin(target_styles)]
)
xx

In [None]:
import matplotlib.pyplot as plt


In [None]:
# Display the filtered per-make, per-style count table again.
xx

In [None]:
# Stacked bar chart of the target-style counts: one bar per make, one segment
# per vehicle style.
# `stacked` is a boolean flag, so pass True explicitly rather than a string
# that only happens to be truthy.
ax = pd.pivot_table(
    xx, values='Model', columns=['Vehicle Style'], index=['Make']
).plot(kind='bar', stacked=True)
ax.set_title("target vehicle styles by make")
ax.set_ylabel("number of cars");