# Load the Automobile dataset


 
  


In [None]:
#import libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 

# Notebook-wide plotting defaults: figure size (width, height) and resolution
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'figure.dpi': 150,
})

In [None]:
# Read the automobile CSV into a pandas DataFrame and preview the first rows.
# NOTE(review): this path is relative to "./datasets", whereas the Titanic CSV
# further down is read from "../datasets" — confirm both resolve from the
# directory the notebook is run in.
automobile_csv = "./datasets/automobile.csv"
df = pd.read_csv(automobile_csv)
df.head(5)

# Data Cleaning

In [None]:
# Clean the "price" column, where missing values are marked with '?'.
#
# Convert to numbers first: anything that is not a number (such as '?')
# becomes NaN. Unlike the `.str` accessor, `pd.to_numeric` also accepts a
# column that is already numeric, so this cell can be re-run safely.
price_numeric = pd.to_numeric(df['price'], errors='coerce')

# Report how many entries were not numeric (only the last expression of a cell
# is displayed automatically, so this intermediate result is printed)
print("Non-numeric price entries:", int(price_numeric.isna().sum()))

# Set the missing values to the mean of the valid prices and convert the
# column to integer (the fractional part of the mean is truncated)
price = price_numeric.dropna()
price_mean = price.mean()
df['price'] = price_numeric.fillna(price_mean).astype(int)
df['price'].head()

In [None]:
# Clean the "horsepower" column, where missing values are marked with '?'.
#
# Convert to numbers first: anything that is not a number (such as '?')
# becomes NaN. Unlike the `.str` accessor, `pd.to_numeric` also accepts a
# column that is already numeric, so this cell can be re-run safely.
horsepower_numeric = pd.to_numeric(df['horsepower'], errors='coerce')

# Report how many entries were not numeric (only the last expression of a cell
# is displayed automatically, so this intermediate result is printed)
print("Non-numeric horsepower entries:", int(horsepower_numeric.isna().sum()))

# Set the missing values to the mean of the valid entries and convert the
# column to integer (the fractional part of the mean is truncated)
horsepower = horsepower_numeric.dropna()
hp_mean = horsepower.mean()
df['horsepower'] = horsepower_numeric.fillna(hp_mean).astype(int)
df['horsepower'].head()

In [None]:
# Clean the "normalized-losses" column, where missing values are marked
# with '?'.
#
# Convert to numbers first: anything that is not a number (such as '?')
# becomes NaN. `pd.to_numeric` also accepts a column that is already numeric,
# so this cell can be re-run safely.
losses_numeric = pd.to_numeric(df['normalized-losses'], errors='coerce')

# Report how many entries were not numeric (only the last expression of a cell
# is displayed automatically, so this intermediate result is printed)
print("Non-numeric normalized-losses entries:", int(losses_numeric.isna().sum()))

# `nl` must hold the valid VALUES, not their count: taking `.count()` here
# would make `nmean` the number of valid rows, and every missing entry would
# be filled with that row count instead of the column mean.
nl = losses_numeric.dropna()
nmean = nl.mean()

# Fill the missing entries with the mean and convert the column to integer
# (the fractional part of the mean is truncated)
df['normalized-losses'] = losses_numeric.fillna(nmean).astype(int)
df['normalized-losses'].head()

# Bivariate Analysis



In [None]:
# Plot the relationship between "horsepower" and "price".
# The data passed to scatter() must match the axis labels below:
# horsepower on the x-axis, price on the y-axis.
plt.scatter(df["horsepower"], df["price"])
plt.title("Scatter Plot for horsepower vs price")
plt.xlabel("horsepower")
plt.ylabel("price")
plt.show()

In [None]:
# Box plot of "price" grouped by "engine-location"
sns.boxplot(data=df, x="engine-location", y="price")

In [None]:
# Box plot showing how "price" is distributed for each "drive-wheels" type
sns.boxplot(data=df, x="drive-wheels", y="price")

The figure above illustrates the range of prices for cars with different drive-wheel types. The box plot shows the median price and the interquartile range for each drive-wheel type, along with some outliers.

# Multivariate Analysis



In [None]:
# Pair plot of three numeric columns, with a regression fit in each panel
sns.pairplot(
    data=df,
    vars=['normalized-losses', 'price', 'horsepower'],
    kind="reg",
    height=2,
)

In [None]:
 # Scatter-plot matrix of a few columns, coloured by "drive-wheels"
sns.set(style="ticks", color_codes=True)
sns.pairplot(
    data=df,
    vars=['symboling', 'normalized-losses', 'wheel-base'],
    hue="drive-wheels",
    height=2,
)

In [None]:
from scipy import stats

# Pearson correlation between price and horsepower; the result unpacks into
# the correlation coefficient followed by the p-value
corr = stats.pearsonr(df["price"], df["horsepower"])
pearson_r, p_value = corr
print("p-value:\t", p_value)
print("cor:\t\t", pearson_r)

Here the correlation of these two variables is 0.80957, which is close to +1; thus price and horsepower are highly positively correlated. Using the pandas `corr()` function, the correlation between all numerical columns can be calculated.

In [None]:
# Pairwise Pearson correlation between the numeric columns of the automobile data
df.corr(numeric_only=True, method='pearson')

Now let's visualize this correlation analysis with a heatmap. A heatmap is one of the best techniques for making the correlation matrix look good and easier to interpret.

In [None]:
# Correlation matrix of the numeric automobile columns, drawn as a heatmap
correlation_matrix = df.corr(method="pearson", numeric_only=True)
sns.heatmap(data=correlation_matrix)
      

A coefficient close to 1 means that there’s a very strong positive correlation between the two variables. The diagonal line is the correlation of the variables to themselves — so they’ll obviously be 1.

# Multivariate Analysis over the Titanic dataset


In [None]:
# Read the Titanic CSV into a pandas DataFrame and preview the first rows.
# NOTE(review): this path is relative to "../datasets", whereas the automobile
# CSV above is read from "./datasets" — confirm both resolve from the
# directory the notebook is run in.
titanic_csv = "../datasets/titanic.csv"
titanic = pd.read_csv(titanic_csv)
titanic.head(5)


In [None]:
# Dimensions of the Titanic DataFrame as a (rows, columns) tuple
titanic.shape


Let’s take a look at the number of missing records in each column of the data set.

In [None]:
# Number of missing values per column, sorted from most to fewest
missing_per_column = titanic.isnull().sum()
total = missing_per_column.sort_values(ascending=False)
total

In [None]:
# Survival rate per gender.
# NOTE: this relies on "Survived" still holding numeric 0/1 values, so it has
# to run before the cell that maps the column to text labels.

# Share of women who survived
is_female = titanic.Gender == 'female'
women = titanic.loc[is_female, "Survived"]
rate_women = sum(women) / len(women)

# Share of men who survived
is_male = titanic.Gender == 'male'
men = titanic.loc[is_male, "Survived"]
rate_men = sum(men) / len(men)

print(str(rate_women * 100) + " % of women who survived.")
print(str(rate_men * 100) + " % of men who survived.")

You can see that the survival rate of females was high, so gender could be an attribute that contributes to analyzing a person's survival. Let's visualize the survival numbers for males and females.

In [None]:
# Exploratory Data Analysis using seaborn

# Replace the 0/1 codes in "Survived" with readable labels. The guard skips
# the mapping when the column holds anything other than 0/1 (for example when
# this cell has already been run once).
survived_labels = {0: "not_survived", 1: "survived"}
if set(titanic['Survived']) <= {0, 1}:
    titanic['Survived'] = titanic['Survived'].map(survived_labels)
    print(titanic['Survived'])

# One figure with two side-by-side panels
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 6))
fig.tight_layout(pad=3.0)
left_ax, right_ax = ax

# Left panel: number of passengers per gender
gender_counts = titanic["Gender"].value_counts()
gender_counts.plot.bar(color="skyblue", ax=left_ax)
left_ax.set_title("Number Of Passengers By Gender")
left_ax.set_ylabel("Population")

# Right panel: survived / not-survived counts per gender
sns.countplot(data=titanic, x="Gender", hue="Survived", ax=right_ax)
right_ax.set_title("Gender: Survived vs Dead")


Let's visualize the number of survivors and deaths in each Pclass.

In [None]:
# One figure with two side-by-side panels
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 6))
fig.tight_layout(pad=3.0)
left_ax, right_ax = ax

# Left panel: number of passengers per Pclass
pclass_counts = titanic["Pclass"].value_counts()
pclass_counts.plot.bar(color="skyblue", ax=left_ax)
left_ax.set_title("Number Of Passengers By Pclass")
left_ax.set_ylabel("Population")

# Right panel: survived / not-survived counts per Pclass
sns.countplot(data=titanic, x="Pclass", hue="Survived", ax=right_ax)
right_ax.set_title("Pclass: Survived vs Dead")
plt.show()

It looks like the number of passengers in Pclass 3 was high and most of them did not survive. Most Pclass 1 passengers survived.

In [None]:
# Replace missing "Embarked" entries with "S"
# NOTE(review): "S" is presumably the most frequent port — confirm against the data
embarked_filled = titanic["Embarked"].fillna("S")
titanic["Embarked"] = embarked_filled

# Show the updated Titanic DataFrame
titanic

In [None]:
# One figure with two side-by-side panels
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 6))
fig.tight_layout(pad=3.0)
left_ax, right_ax = ax

# Left panel: number of passengers per port of embarkation
embarked_counts = titanic["Embarked"].value_counts()
embarked_counts.plot.bar(color="skyblue", ax=left_ax)
left_ax.set_title("Number Of Passengers By Embarked")
left_ax.set_ylabel("Number")

# Right panel: survived / not-survived counts per port of embarkation
sns.countplot(data=titanic, x="Embarked", hue="Survived", ax=right_ax)
right_ax.set_title("Embarked: Survived vs Unsurvived")


In [None]:
# Histogram of passenger ages.
# Missing ages (NaN) are dropped before plotting, and kde=True overlays a
# Kernel Density Estimate curve on the histogram.
known_ages = titanic['Age'].dropna()
ax = sns.histplot(known_ages, kde=True, color="skyblue")

# Recolour the first line on the axes (presumably the KDE curve, since the
# histogram bars are not line objects)
first_line = ax.lines[0]
first_line.set_color('brown')
plt.title("Distribution of Age")

Now let's do the first multivariate analysis on the Titanic data set with the variables Survived, Pclass, Fare and Age.

In [None]:

# Scatter-plot matrix of Fare, Age and Pclass, coloured by survival outcome
sns.set(style="ticks", color_codes=True)
sns.pairplot(
    data=titanic,
    vars=['Fare', 'Age', 'Pclass'],
    hue="Survived",
    height=2,
)

**Correlation Matrix**

First, map the Embarked records to integer values so that we can include Embarked in our correlation analysis.

In [None]:
# Encode the "Embarked" port codes as integers so the column is picked up by
# the numeric correlation below. Every port gets its own code: mapping "C"
# and "Q" to the same number would merge two different ports.
# NOTE(review): the codes are arbitrary labels for a nominal variable, so the
# sign and size of Embarked's correlations depend on this ordering.
embarked_codes = {"S": 1, "C": 2, "Q": 3, "NaN": 0}

# Only map while the column still holds the original codes; this keeps the
# cell safe to re-run after the column has already been converted to integers.
if set(titanic['Embarked']).issubset(embarked_codes):
    titanic['Embarked'] = titanic['Embarked'].map(embarked_codes)

# `corr()` computes the pairwise Pearson correlation of the numeric columns
correlation_matrix = titanic.corr(method='pearson', numeric_only=True)
correlation_matrix

**Heatmap**

In [None]:
# Draw the correlation matrix as a heatmap;
# annot=True writes each cell's value inside the cell
sns.heatmap(data=correlation_matrix, annot=True)
plt.title("Correlation Heatmap")
