# Exploratory Data Analysis on Automobile Data

In [None]:
##Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO

In [None]:
## Import requests, which is used below to download the data file from its URL
import requests

In [None]:
# Fetch the CSV over HTTP with Requests.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error (4xx/5xx) right here instead of
# letting an error page be parsed as CSV in the next cell.
response = requests.get(
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/Data%20files/automobileEDA.csv",
    timeout=30,
)
response.raise_for_status()

In [None]:
# Parse the downloaded response body as CSV through an in-memory text buffer
csv_buffer = StringIO(response.text)
df = pd.read_csv(csv_buffer)

In [None]:
## Column labels of the loaded frame
df.columns

In [None]:
## Preview the first rows (head() shows 5 by default)
df.head()

In [None]:
## Preview the last rows (tail() shows 5 by default)
df.tail()

In [None]:
## Summary statistics; with default arguments describe() reports on the numeric columns only
df.describe()

In [None]:
## Per-column dtype and non-null count, plus memory usage
df.info()

In [None]:
## (number of rows, number of columns)
df.shape

In [None]:
## Data type of every column
df.dtypes

In [None]:
## Total number of rows
len(df.index)

In [None]:
## Total number of columns (shape[1] is the column count, shape[0] the row count)
df.shape[1]

In [None]:
## Total number of cells in the frame (rows x columns)
df.size

In [None]:
## Count the missing values in each column
df.isna().sum()

In [None]:
## Same per-column null count as the previous cell (repeated check)
df.isnull().sum()

In [None]:
## Show rows that are exact duplicates of an earlier row
df.loc[df.duplicated()]

In [None]:
## Keep only the rows that contain at least one NaN value
has_nan = df.isna().any(axis="columns")
df.loc[has_nan]

In [None]:
## For the rows containing a NaN, show just the two columns of interest
df.loc[df.isna().any(axis="columns"), ['horsepower-binned','stroke']]

In [None]:
## Drop every row that contains at least one NaN and renumber the index from 0.
## Rows are removed here, not imputed with the mean.
df = df.dropna().reset_index(drop=True)

In [None]:
## iloc is position-based: after the NaN rows were dropped, position 46 returns
## whichever row now occupies that slot
df.iloc[46]

In [None]:
## reset_index returns a new frame; the result is only displayed here, not assigned back to df
## (df's index was already renumbered by reset_index(drop=True) in the dropna step)
df.reset_index(drop=True)

In [None]:
## Look up position 46 again for comparison with the earlier lookup
df.iloc[46]

In [None]:
## Column labels after the NaN rows were dropped
df.columns

In [None]:
## Number of distinct values in each column (nunique() ignores NaN by default)
df.nunique()

In [None]:
## Print the distinct values found in every column
for column_name, column_values in df.items():
    distinct = column_values.unique()
    print(f"Column: {column_name}")
    print(f"Unique values ({len(distinct)}): {distinct}\n")


In [None]:
## Sub-frame holding only the categorical columns inspected in the next cell
Columns=df[['body-style','engine-type','horsepower-binned']]

In [None]:
## Print the distinct values for the selected columns only
for selected in Columns.columns:
    distinct = df[selected].unique()
    print(f"Column: {selected}")
    print(f"Unique values ({len(distinct)}): {distinct}\n")

In [None]:
# Report the data type of each column, one entry per column
for column_name, datatype in df.dtypes.items():
    print(f"Data Type for {column_name} is : {datatype}\n")

## Correlation Matrix

In [None]:
## Check the dtypes to see which columns are numeric before computing correlations
df.dtypes

In [None]:
## The correlation matrix is computed on the numeric columns only
df_corr = (
    df.select_dtypes(include=['number'])
    .corr()
)

In [None]:
## Annotated heatmap of the full numeric correlation matrix
fig, ax = plt.subplots(figsize=(14, 16))
sns.heatmap(df_corr, annot=True, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
## List the column labels again before picking a target column
df.columns


In [None]:
## Correlation of every numeric column with price, sorted from lowest to highest
df_corr_price = (
    df.select_dtypes(include=['number'])
    .corr()
    .loc[:, 'price']
    .sort_values()
)
type(df_corr_price)

In [None]:
# Convert the Series into a one-column DataFrame so it can be passed to sns.heatmap below
df_corr_price=pd.DataFrame(df_corr_price)

In [None]:
# Confirm the conversion: this should now report a DataFrame
type(df_corr_price)

In [None]:
## Annotated heatmap of each numeric column's correlation with price
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_corr_price, annot=True, ax=ax)
ax.set_title("Correlation Matrix for Price")
plt.show()

In [None]:
## Pairwise correlation among bore, stroke, compression-ratio and horsepower
df_columns_corr = df.loc[:, ['bore', 'stroke', 'compression-ratio', 'horsepower']].corr()

In [None]:
## Display the 4x4 correlation table computed above
df_columns_corr

In [None]:
## Annotated heatmap of the bore / stroke / compression-ratio / horsepower correlations.
## The title names the columns actually plotted; price is not part of this matrix.
plt.figure(figsize=(10,4))
sns.heatmap(df_columns_corr,annot=True)
plt.title("Correlation Matrix for bore, stroke, compression-ratio and horsepower")
plt.show()

## Visualizations


In [None]:
## Pairwise scatter plots of price against three candidate predictors
df_cols = df.loc[:, ['price','peak-rpm','highway-mpg','engine-size']]
sns.pairplot(data=df_cols)

The pair plot shows a strong positive linear relationship between price and engine-size, and a negative linear relationship between price and highway-mpg. However, we don't see any clear relationship between price and peak-rpm. Let's look at their correlation matrix to confirm.

In [None]:
## Correlation matrix for the four columns shown in the pair plot
df_cols.corr()

# Examine the correlation


In [None]:
# Strong positive linear relationship (correlation with price is about 0.89: strong, but not perfect)
sns.regplot(x='engine-size',y='price',data=df_cols)

In [None]:
# Negative linear relationship (not perfect: highway-mpg does not reach the |r| >= 0.75
# threshold against price used at the end of this notebook)
sns.regplot(x='highway-mpg',y='price',data=df_cols)

In [None]:
# Weak linear relationship
sns.regplot(x='peak-rpm',y='price',data=df_cols)

## Box Plot for Categorical Variables

In [None]:
## Distribution of price within each body style
sns.boxplot(data=df, x="body-style", y="price")
plt.show()

## Grouping


In [None]:
## Total price and total engine-size for each body style
df.groupby('body-style')[['price','engine-size']].sum()

In [None]:
## Several aggregations at once: mean, median, count and sum of price and engine-size per body style
df.groupby('body-style')[['price','engine-size']].agg(['mean','median','count','sum'])

In [None]:
## A different aggregation per column: mean of price and count of engine-size per body style
df.groupby('body-style')[['price','engine-size']].agg({'price':'mean','engine-size':'count'})

In [None]:
## Preview the first rows again before grouping on two keys
df.head()

In [None]:
## Total price for each (body-style, engine-location) pair.
## as_index=False keeps the group keys as ordinary columns, so the result is
## already a DataFrame and needs no pd.DataFrame(...) re-wrap.
df_grouped = df.groupby(['body-style','engine-location'], as_index=False)['price'].sum()
print(type(df_grouped))

In [None]:
## Display the grouped totals
df_grouped

In [None]:
## Pivoting: body-style as rows, engine-location as columns;
## each cell holds the mean price (mean is pivot_table's default aggfunc)
df.pivot_table(index='body-style',columns='engine-location',values='price')

In [None]:
## Correlation among price, bore and engine-size, shown as an annotated heatmap
numeric_part = df.select_dtypes(include=['number'])
df_corr1 = numeric_part.loc[:, ['price','bore','engine-size']].corr()
sns.heatmap(data=df_corr1, annot=True)

To find the important features for predicting the car price, we look for features that are highly correlated with price, either positively or negatively. The example below identifies features whose absolute correlation with price is greater than or equal to 0.75.

In [None]:
## Full correlation matrix over the numeric columns
df_correlation=df.select_dtypes(include=['number']).corr()

In [None]:
## Display the correlation matrix
df_correlation

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,diesel,gas
symboling,1.0,0.453423,-0.533112,-0.359883,-0.242821,-0.539407,-0.226121,-0.055329,-0.142125,-0.01352,-0.180275,0.090778,0.229436,0.004688,0.073359,-0.07642,0.021612,-0.192544,0.192544
normalized-losses,0.453423,1.0,-0.046971,0.027234,0.090056,-0.362499,0.107034,0.147862,-0.030152,0.056054,-0.112288,0.226654,0.215903,-0.208695,-0.166772,0.138183,0.223308,-0.096487,0.096487
wheel-base,-0.533112,-0.046971,1.0,0.876945,0.821427,0.607417,0.790205,0.592517,0.493941,0.163725,0.248383,0.381984,-0.351925,-0.496565,-0.566948,0.591956,0.51705,0.305889,-0.305889
length,-0.359883,0.027234,0.876945,1.0,0.857249,0.522367,0.882091,0.698626,0.608496,0.13685,0.156926,0.583073,-0.280397,-0.686142,-0.715124,0.689466,0.690579,0.212484,-0.212484
width,-0.242821,0.090056,0.821427,0.857249,1.0,0.351127,0.862835,0.739543,0.543859,0.210495,0.188331,0.604099,-0.252116,-0.639692,-0.68455,0.744176,0.685663,0.251112,-0.251112
height,-0.539407,-0.362499,0.607417,0.522367,0.351127,1.0,0.354085,0.100518,0.19949,-0.084755,0.269773,-0.031489,-0.27002,-0.128523,-0.18167,0.182407,0.11112,0.279063,-0.279063
curb-weight,-0.226121,0.107034,0.790205,0.882091,0.862835,0.354085,1.0,0.858968,0.646918,0.193575,0.153952,0.750749,-0.281231,-0.766146,-0.808146,0.828483,0.809512,0.228445,-0.228445
engine-size,-0.055329,0.147862,0.592517,0.698626,0.739543,0.100518,0.858968,1.0,0.601384,0.265626,0.017246,0.825377,-0.227577,-0.712693,-0.734062,0.889265,0.814696,0.076032,-0.076032
bore,-0.142125,-0.030152,0.493941,0.608496,0.543859,0.19949,0.646918,0.601384,1.0,-0.049606,-4.4e-05,0.579114,-0.276668,-0.591597,-0.598455,0.544375,0.574286,0.055927,-0.055927
stroke,-0.01352,0.056054,0.163725,0.13685,0.210495,-0.084755,0.193575,0.265626,-0.049606,1.0,0.191776,0.139409,-0.067852,-0.050971,-0.052345,0.107893,0.065893,0.240193,-0.240193


In [None]:
## Look up a single cell of the correlation matrix by row label and column label
df_correlation.loc['gas','symboling']

np.float64(0.1925439658319512)

In [None]:
## Collect the features whose absolute correlation with price is at least 0.75,
## leaving out price itself (its self-correlation is trivially 1)
price_corr = df_correlation['price'].drop(labels='price')
col_corr = price_corr.index[price_corr.abs() >= 0.75].tolist()

print(f"Features which are highly correlated with price are:{col_corr}")


Features which are highly correlated with price are:['curb-weight', 'engine-size', 'horsepower', 'city-L/100km']
