In [1]:
# general purpose libraries
import os

import numpy as np
import pandas as pd

# visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error


### Loading the Dataset

In [2]:
# Show every column when a DataFrame is displayed
# (None lifts pandas' limit on the number of columns rendered).

pd.options.display.max_columns = None

In [3]:
# to hide any warnings

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Location of the car-price CSV file.
# Set the CAR_PRICE_DATASET environment variable to point at your own copy of
# CarPrice_Assignment.csv; when it is unset, the original hard-coded local path
# is used so existing runs keep working unchanged.
DATASET = os.environ.get(
    "CAR_PRICE_DATASET",
    r"C:\Users\91930\Documents\GITHUB\ArtOfAI\dataset\car-price-prediction\CarPrice_Assignment.csv",
)

In [5]:
# Read the CSV located at DATASET into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATASET)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


### Getting to know the dataset

In [6]:
# Report the dimensions of the dataset (rows, columns).

n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows} \nNumber of columns: {n_cols}")

Number of rows: 205 
Number of columns: 26


In [7]:
# Display the exact column labels of the DataFrame.

df.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [8]:
# Tabulate the dtype of every column as a one-column DataFrame named 'Datatype'.

df.dtypes.to_frame(name='Datatype')

Unnamed: 0,Datatype
car_ID,int64
symboling,int64
CarName,object
fueltype,object
aspiration,object
doornumber,object
carbody,object
drivewheel,object
enginelocation,object
wheelbase,float64


### Checking for missing values

In [9]:
# Count the missing (NaN) entries per column by summing the boolean mask down the rows.

df.isna().sum(axis=0)

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

Observation:

- There are no missing values in our dataset.

### Checking the number of unique values in each column

In [10]:
# Number of distinct values in each column (missing values are not counted).

df.nunique(axis=0, dropna=True)

car_ID              205
symboling             6
CarName             147
fueltype              2
aspiration            2
doornumber            2
carbody               5
drivewheel            3
enginelocation        2
wheelbase            53
carlength            75
carwidth             44
carheight            49
curbweight          171
enginetype            7
cylindernumber        7
enginesize           44
fuelsystem            8
boreratio            38
stroke               37
compressionratio     32
horsepower           59
peakrpm              23
citympg              29
highwaympg           30
price               189
dtype: int64

### Basic EDA

----------

Purpose of Exploratory Analysis

- To get an overview of our dataset
- To get an idea of the unique entries, especially in the categorical columns
- To check for any potential errors in our dataset

----------

Insights from `fueltype`

In [11]:
# List the distinct entries found in the `fueltype` column.

df.fueltype.unique()

array(['gas', 'diesel'], dtype=object)

In [12]:
# Frequency of each distinct `fueltype` entry, most common first.

df.fueltype.value_counts(sort=True, ascending=False)

fueltype
gas       185
diesel     20
Name: count, dtype: int64

**Observation**:

- There are 2 unique entries in the `fueltype` column.
- One is `gas` and the other is `diesel`.
- Our dataset has `185` cars which run on `gas` and `20` cars which run on `diesel`.

Insights from `aspiration`

In [14]:
# List the distinct entries found in the `aspiration` column.

df.aspiration.unique()

array(['std', 'turbo'], dtype=object)

In [15]:
# Frequency of each distinct `aspiration` entry, most common first.

df.aspiration.value_counts(sort=True, ascending=False)

aspiration
std      168
turbo     37
Name: count, dtype: int64