# Exploratory Data Analysis (EDA)

In [55]:
#Importing necessary libraries
# pandas loads and cleans the dataset; datetime is used to derive vehicle age.
# numpy, matplotlib and seaborn are not used in the cleaning cells shown here
# (presumably used by later analysis/plotting cells — confirm).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# NOTE(review): this silences every Python warning for the whole session,
# including deprecation/future warnings from pandas. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [56]:
# Read the MPG dataset from disk and preview its first three rows.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where this exact file exists. Consider a path relative to the project.
mpg_csv_path = r'D:\Tsed\Python\Projects\Mile Per Gallon\Dataset\mpg.csv'
mpg_df = pd.read_csv(mpg_csv_path)
mpg_df.head(n=3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite


## 2. Data Cleaning

### 2.1 Handling missing values:  


In [57]:
# Count missing values per column (all columns are checked, not only horsepower)
mpg_df.isna().sum(axis=0)

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [58]:
# Rule of thumb: rows with missing values may be dropped when they make up
# no more than 5% of the dataset. Compare that limit with the horsepower gaps.
threshold = 0.05 * len(mpg_df)
missing_horsepower = mpg_df['horsepower'].isna().sum()
print('Minimum threshold value:', threshold)
print('Number of missing horsepower values:', missing_horsepower)

Minimum threshold value: 19.900000000000002
Number of missing horsepower values: 6


In [59]:
# Display the rows whose horsepower value is missing
horsepower_missing = mpg_df['horsepower'].isna()
mpg_df.loc[horsepower_missing]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
32,25.0,4,98.0,,2046,19.0,71,usa,ford pinto
126,21.0,6,200.0,,2875,17.0,74,usa,ford maverick
330,40.9,4,85.0,,1835,17.3,80,europe,renault lecar deluxe
336,23.6,4,140.0,,2905,14.3,80,usa,ford mustang cobra
354,34.5,4,100.0,,2320,15.8,81,europe,renault 18i
374,23.0,4,151.0,,3035,20.5,82,usa,amc concord dl


#### INSIGHT:
    : The number of missing values (6) is below the 5% threshold (19.9 rows), so the affected rows can safely be dropped.

In [60]:
# Remove every row that contains at least one missing value, then preview
# the last three rows to see how the index looks afterwards.
mpg_df.dropna(axis='index', how='any', inplace=True)
mpg_df.tail(n=3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


#### INSIGHTS: 
    : The index still runs up to 397 because dropping rows keeps the original labels. Next, cross-check that the missing values are gone.
    : If they have been removed, reset the index so that it is contiguous again.

In [61]:
# Number of columns that still contain at least one missing value (expected: 0)
columns_with_missing = mpg_df.isna().any(axis=0)
columns_with_missing.sum()

0

In [62]:
# Renumber the rows 0..n-1 to close the gaps left by the dropped rows.
# The previous labels are kept as a new column named 'index'.
mpg_df.reset_index(drop=False, inplace=True)

In [63]:
# Drop the 'index' column that holds the pre-reset row labels.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
mpg_df.drop(columns='index', errors='ignore', inplace=True)

In [64]:
# Confirm that the row labels are contiguous again by viewing the last three rows
mpg_df.iloc[-3:]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
389,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
390,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
391,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


#### INSIGHTS: 
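     The index now runs contiguously from 0 to 391 (392 rows), confirming that the six incomplete rows were removed. We are good to go!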

### 2.2. Formatting & standardizing 

In [65]:
# Standardizing & converting data types into the proper format
mpg_df['weight'] = mpg_df['weight'].astype(float)

# model_year stores two-digit years (e.g. 70, 82). pd.to_datetime() would read
# those integers as nanoseconds since the Unix epoch, turning EVERY row into
# 1970, so the century is added arithmetically instead.
# Only values below 100 are shifted, which keeps the cell safe to re-run.
is_two_digit_year = mpg_df['model_year'] < 100
mpg_df.loc[is_two_digit_year, 'model_year'] += 1900

print(mpg_df['weight'].dtype)
mpg_df.head(3)

float64


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,1970,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,1970,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,1970,usa,plymouth satellite


### 2.3 Generating new feature

In [66]:
# Generating new feature from model_year column called age
# (age = current calendar year - model_year, so the values shift when the
# notebook is re-run in a later year).
vehicle_age = datetime.now().year - mpg_df['model_year']
if 'age' in mpg_df.columns:
    # DataFrame.insert raises ValueError when the column already exists,
    # so on a re-run the existing column is refreshed in place instead.
    mpg_df['age'] = vehicle_age
else:
    # Position 6 places 'age' directly before 'model_year'.
    mpg_df.insert(6, 'age', vehicle_age)
mpg_df.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,54,1970,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,54,1970,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,54,1970,usa,plymouth satellite
