## Importing necessary libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

##  Changing the working directory

In [2]:
# Raw string: the backslashes of the Windows path are kept literally instead
# of being parsed as (invalid) escape sequences such as "\D" and "\P".
os.chdir(r"D:\DM7\Python for Data Science\Data_Files")

## Importing data

In [3]:
# Load the Toyota data set: the first CSV column is used as the row index and
# the placeholder strings '??', '###' and '????' are parsed as missing (NaN).
cars_data = pd.read_csv(
    'Toyota.csv',
    index_col=0,
    na_values=['??', '###', '????'],
)

# Preview the first five rows
cars_data.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986.0,Diesel,90.0,1.0,0,2000,three,1165
1,13750,23.0,72937.0,Diesel,90.0,1.0,0,2000,3,1165
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
3,14950,26.0,48000.0,Diesel,90.0,0.0,0,2000,3,1165
4,13750,30.0,38500.0,Diesel,90.0,0.0,0,2000,3,1170


## Creating copy of original data

### In pandas, there are two ways to create a copy of a DataFrame
1. Shallow copy 
2. Deep copy

In [4]:
# 1. Shallow copy: shares the underlying data with cars_data, so changes made
#    to the values of one frame can show up in the other
#cars_data2 = cars_data.copy(deep=False)

# 2. Deep copy: owns its own data, so cars_data2 stays an untouched snapshot
#    of the raw data while cars_data is being cleaned
cars_data2 = cars_data.copy(deep=True)

## Attributes of data

###  To get the dimensionality of the dataframe

In [5]:
# (number of rows, number of columns) of the dataframe
cars_data.shape

(1436, 10)

### Concise summary of dataframe
info() returns a concise summary of a 
dataframe

In [6]:
# Prints the index range, each column's non-null count and dtype, and memory usage
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1436 non-null   int64  
 1   Age        1336 non-null   float64
 2   KM         1421 non-null   float64
 3   FuelType   1336 non-null   object 
 4   HP         1430 non-null   float64
 5   MetColor   1286 non-null   float64
 6   Automatic  1436 non-null   int64  
 7   CC         1436 non-null   int64  
 8   Doors      1436 non-null   object 
 9   Weight     1436 non-null   int64  
dtypes: float64(4), int64(4), object(2)
memory usage: 123.4+ KB


### By using info(), we can see that
1. ‘KM’ has been read as float64 rather than integer, because its missing values are stored as NaN
2. ‘HP’ has been read as float64 rather than integer, for the same reason
3. ‘MetColor’ and ‘Automatic’ have been read as float64 and int64 respectively since they hold the values 0/1, although they are categorical variables
4. Ideally, ‘Doors’ should’ve been read as int64 since it has values 2, 3, 4, 5. But it has been read as object
5. Missing values are present in a few variables

## Data cleaning

### 1. Converting variables data types

In [7]:
# The 0/1 indicator columns are treated as categorical, so store them as object
for column_name in ('MetColor', 'Automatic'):
    cars_data[column_name] = cars_data[column_name].astype('object')

# The numeric measurements are kept as float64
for column_name in ('KM', 'HP'):
    cars_data[column_name] = cars_data[column_name].astype('float64')

# Confirm the dtypes after conversion
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      1436 non-null   int64  
 1   Age        1336 non-null   float64
 2   KM         1421 non-null   float64
 3   FuelType   1336 non-null   object 
 4   HP         1430 non-null   float64
 5   MetColor   1286 non-null   object 
 6   Automatic  1436 non-null   object 
 7   CC         1436 non-null   int64  
 8   Doors      1436 non-null   object 
 9   Weight     1436 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 123.4+ KB


### 2. Cleaning column 'Doors'

**Checking unique values of variable ‘Doors’:**

In [8]:
# Sorted distinct values found in 'Doors' (reveals the spelled-out entries)
print(np.unique(cars_data['Doors']))

['2' '3' '4' '5' 'five' 'four' 'three']


replace() is used to replace a value with the desired 
value

In [9]:
# Map the spelled-out door counts to their numeric values. The result is
# assigned back to the column: calling replace(..., inplace=True) on the
# selection cars_data['Doors'] is chained assignment, which is deprecated and
# no longer updates cars_data once pandas Copy-on-Write is enabled.
cars_data['Doors'] = cars_data['Doors'].replace(
    {'five': 5, 'four': 4, 'three': 3}
)

In [10]:
# Cast 'Doors' to int64 now that the word values have been replaced by numbers
cars_data['Doors'] = cars_data['Doors'].astype('int64')

# Checking changed data types of columns
cars_data.dtypes

Price          int64
Age          float64
KM           float64
FuelType      object
HP           float64
MetColor      object
Automatic     object
CC             int64
Doors          int64
Weight         int64
dtype: object

## Identifying missing values

1. In Pandas dataframes, missing data is represented
by NaN (an acronym for Not a Number)
2. To check null values in Pandas dataframes,
isnull() and isna() are used

To check the count of missing values present in each column

In [11]:
# Number of missing (NaN) entries in each column
cars_data.isna().sum()
# isnull() is an alias of isna(), so the line below gives the same result
#cars_data.isnull().sum()

Price          0
Age          100
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

Subsetting the rows that have one or more missing values

In [12]:
# Keep every row in which at least one column is NaN
missing = cars_data.loc[cars_data.isna().any(axis='columns')]
missing

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
6,16900,27.0,,Diesel,,,0,2000,3,1245
7,18600,30.0,75889.0,,90.0,1.0,0,2000,3,1245
9,12950,23.0,71138.0,Diesel,,,0,1900,3,1105
15,22000,28.0,18739.0,Petrol,,0.0,0,1800,3,1185
...,...,...,...,...,...,...,...,...,...,...
1428,8450,72.0,,Petrol,86.0,,0,1300,3,1015
1431,7500,,20544.0,Petrol,86.0,1.0,0,1300,3,1025
1432,10845,72.0,,Petrol,86.0,0.0,0,1300,3,1015
1433,8500,,17016.0,Petrol,86.0,0.0,0,1300,3,1015


## Approaches to fill the missing values
1. Fill the missing values with the mean/median, in the case of a numerical variable
2. Fill the missing values with the class that has the maximum count, in the case of a categorical variable

### Imputing missing values
describe() generates descriptive statistics that summarize the
central tendency, dispersion and shape of a dataset’s
distribution, excluding NaN values


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
cars_data.describe()

Unnamed: 0,Price,Age,KM,HP,CC,Doors,Weight
count,1436.0,1336.0,1421.0,1430.0,1436.0,1436.0,1436.0
mean,10730.824513,55.672156,68647.239972,101.478322,1566.827994,4.033426,1072.45961
std,3626.964585,18.589804,37333.023589,14.768255,187.182436,0.952677,52.64112
min,4350.0,1.0,1.0,69.0,1300.0,2.0,1000.0
25%,8450.0,43.0,43210.0,90.0,1400.0,3.0,1040.0
50%,9900.0,60.0,63634.0,110.0,1600.0,4.0,1070.0
75%,11950.0,70.0,87000.0,110.0,1600.0,5.0,1085.0
max,32500.0,80.0,243000.0,192.0,2000.0,5.0,1615.0


## 1. Imputing missing values of ‘Age’

• Calculating the mean value of the Age variable

In [14]:
# Mean of the non-missing 'Age' values; used below as the fill value
cars_data['Age'].mean()

55.67215568862275

• To fill NA/NaN values using the specified value

In [15]:
# Fill the missing ages with the column mean. The filled Series is assigned
# back to the column: fillna(..., inplace=True) on the selection
# cars_data['Age'] is chained assignment, which is deprecated and no longer
# updates cars_data once pandas Copy-on-Write is enabled.
cars_data['Age'] = cars_data['Age'].fillna(cars_data['Age'].mean())

## 2. Imputing missing values of 'KM'

Calculating the median value of the KM variable

In [16]:
# Median of the non-missing 'KM' values; used below as the fill value
cars_data['KM'].median()

63634.0

To fill NA/NaN values using the specified value

In [17]:
# Fill the missing kilometre readings with the column median. The filled
# Series is assigned back to the column: fillna(..., inplace=True) on the
# selection cars_data['KM'] is chained assignment, which is deprecated and no
# longer updates cars_data once pandas Copy-on-Write is enabled.
cars_data['KM'] = cars_data['KM'].fillna(cars_data['KM'].median())

## 3. Imputing missing values of ‘HP’

Calculating the mean value of the HP variable

In [18]:
# Mean of the non-missing 'HP' values; used below as the fill value
cars_data['HP'].mean()

101.47832167832168

To fill NA/NaN values using the specified value

In [19]:
# Fill the missing horsepower values with the column mean. The filled Series
# is assigned back to the column: fillna(..., inplace=True) on the selection
# cars_data['HP'] is chained assignment, which is deprecated and no longer
# updates cars_data once pandas Copy-on-Write is enabled.
cars_data['HP'] = cars_data['HP'].fillna(cars_data['HP'].mean())

Check for missing data after filling values

In [20]:
# Remaining NaN count per column after imputing the numeric variables
cars_data.isnull().sum()

Price          0
Age            0
KM             0
FuelType     100
HP             0
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

## 4. Imputing missing values of ‘FuelType’

value_counts()
1. Returns a Series containing counts of unique values
2. The values will be in descending order so that the 
first element is the most frequently-occurring 
element
3. Excludes NA values by default

To get the mode value of FuelType

In [21]:
# value_counts() sorts by descending frequency, so the first index label is
# the most frequent category
cars_data['FuelType'].value_counts().index[0]

'Petrol'

To fill NA/NaN values using the specified value

In [22]:
# Fill the missing fuel types with the most frequent category. The filled
# Series is assigned back to the column: fillna(..., inplace=True) on the
# selection cars_data['FuelType'] is chained assignment, which is deprecated
# and no longer updates cars_data once pandas Copy-on-Write is enabled.
cars_data['FuelType'] = cars_data['FuelType'].fillna(
    cars_data['FuelType'].value_counts().index[0]
)

## 5. Imputing missing values of ‘MetColor’

To get the mode value of MetColor

In [23]:
# mode() returns a Series of the most frequent value(s); element 0 is used
# as the fill value below
cars_data['MetColor'].mode()

0    1.0
Name: MetColor, dtype: object

To fill NA/NaN values using the specified value


In [24]:
# Fill the missing 'MetColor' flags with the most frequent value. The filled
# Series is assigned back to the column: fillna(..., inplace=True) on the
# selection cars_data['MetColor'] is chained assignment, which is deprecated
# and no longer updates cars_data once pandas Copy-on-Write is enabled.
cars_data['MetColor'] = cars_data['MetColor'].fillna(
    cars_data['MetColor'].mode()[0]
)

### Checking for missing values after filling values

In [25]:
# NaN count per column; every entry should now be 0
cars_data.isnull().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

## Imputing missing values using lambda functions

To fill the NA/NaN values in both numerical and categorical variables in one step

In [26]:
def _fill_column(column):
    """Return `column` with its NaN entries filled.

    Float columns are filled with their mean; every other column is filled
    with its most frequent value (the first label of value_counts()).
    """
    if column.dtype == 'float':
        return column.fillna(column.mean())
    return column.fillna(column.value_counts().index[0])


# apply() passes each column of the frame to the helper in turn
cars_data2 = cars_data2.apply(_fill_column)

Check for missing data after filling values

In [27]:
# NaN count per column of the frame imputed in one pass
cars_data2.isnull().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

## Saving cleaned data as a CSV file for further use

In [28]:
# Write the cleaned frame to a CSV file; the relative path resolves against
# the current working directory
cars_data.to_csv('cleaned_cars_data.csv')