# Exploratory Data Analysis




## I. Data Loading and Overview

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the Titanic passenger CSV into the raw working frame.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google Drive mount — confirm.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Data Sci Fundamentals/Slides and Codes/titanic.csv'
)

In [None]:
# Preview the first ten rows of the raw frame
df.head(n=10)

In [None]:
# Show basic summary metrics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# For every object/category column of the raw frame, print a header line,
# the frequency of each distinct value, and a blank separator line.
for column in df.select_dtypes(include=['object', 'category']).columns:
    print(
        f"Counts for {column} category:",
        df[column].value_counts(),
        "",  # empty trailing item yields the blank line between columns
        sep="\n",
    )


In [None]:
# Number of missing entries per column, computed once and reused below
missing_counts = df.isna().sum()

# Dimensions of the raw frame as (rows, columns)
print(df.shape)

# Absolute count of missing values in each column
print("\n", missing_counts)

# Share of rows with a missing value in each column, as a percentage
# rounded to two decimals
missing_percentage = (missing_counts / len(df) * 100).round(2)

# Render the percentages as strings with a trailing percent sign
print('\n', missing_percentage.astype(str) + "%")

Decisions based on the missing-data summary:

- The Cabin column is missing too much information, so we remove the column;
- impute the missing values in the Embarked and Age columns using the methods shown in the Data Cleaning section.

## II. Single variable histograms and scatterplot pairplots

In [None]:
# Numeric variables: histograms on the diagonal, pairwise scatterplots elsewhere
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict the raw frame to its float64/int64 columns; the text columns are
# left out here and plotted separately as count plots.
numeric_dtypes = ['float64', 'int64']
numerical_data = df.select_dtypes(include=numeric_dtypes)

# Draw the pair plot grid and render it
sns.pairplot(data=numerical_data)
plt.show()


In [None]:
# Categorical variables: one count plot per column, laid out on a 2x2 grid
categorical_columns = ['Sex', 'Embarked', 'Pclass', 'Survived']

# The grid has exactly four cells; change nrows/ncols if the list above changes
grid_fig, grid_axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# Pair each column with its own axes, walking the grid row by row
for axis, column in zip(grid_axes.flat, categorical_columns):
    sns.countplot(x=column, data=df, ax=axis)
    axis.set_title(f'Count of {column}')

grid_fig.tight_layout()
plt.show()


## III. Data Cleaning

### 1. Check if your data is missing at random

Using groupby on your variables of interest

In [None]:
# Count the missing values of every column within each Sex group.
# The boolean null-mask is grouped by the Sex values and summed, instead of
# calling groupby(...).apply(...) on the frame: apply operating on the grouping
# column is deprecated in recent pandas, and the mask-based form is vectorised.
missing_counts_by_sex = df.isnull().groupby(df["Sex"]).sum()
missing_counts_by_sex

In [None]:
# Count the missing values of every column within each (Survived, Sex) group.
# The boolean null-mask is grouped by the two key columns and summed, instead of
# calling groupby(...).apply(...) on the frame: apply operating on the grouping
# columns is deprecated in recent pandas, and the mask-based form is vectorised.
# The variable name is kept as in the original cell even though the grouping
# now also includes Survived.
missing_counts_by_sex = df.isnull().groupby([df["Survived"], df["Sex"]]).sum()
missing_counts_by_sex

We can see that the data is not missing at random: most of the missing Age values belong to males who did not survive.

### 2. Missing Data Handling

In [None]:
# Imputation strategy 1: fill each column with a single global statistic.
# Work on a copy so the raw frame `df` stays untouched.
df1 = df.copy()

# Assign the filled Series back to the column. Calling
# `df1[col].fillna(..., inplace=True)` is chained assignment: it emits a
# FutureWarning in pandas 2.x and leaves df1 unchanged under Copy-on-Write.
df1['Age'] = df1['Age'].fillna(df1['Age'].median())                   # numeric -> median
df1['Embarked'] = df1['Embarked'].fillna(df1['Embarked'].mode()[0])   # categorical -> most frequent value

# Drop the columns treated as irrelevant for this analysis
df1 = df1.drop(columns=['Ticket', 'Cabin'])

# Report dtypes and non-null counts to confirm the result
df1.info()

In [None]:
# Numeric variables after median/mode imputation (df1)
# Only the float64/int64 columns are passed to the pair plot.
numeric_dtypes = ['float64', 'int64']
numerical_data = df1.select_dtypes(include=numeric_dtypes)

# Draw the pair plot grid and render it
sns.pairplot(data=numerical_data)
plt.show()


In [None]:
# Categorical variables after median/mode imputation (df1), one count plot each
categorical_columns = ['Sex', 'Embarked', 'Pclass', 'Survived']

# The grid has exactly four cells; change nrows/ncols if the list above changes
grid_fig, grid_axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# Pair each column with its own axes, walking the grid row by row
for axis, column in zip(grid_axes.flat, categorical_columns):
    sns.countplot(x=column, data=df1, ax=axis)
    axis.set_title(f'Count of {column}')

grid_fig.tight_layout()
plt.show()


- Because we filled Age with the median, we see a spike at the median value in the Age histogram
- Because we filled Embarked with the mode (S), we see a slight increase in the S category

In [None]:
# Strategy 2: drop every row with a missing Age or Embarked value,
# then remove the columns treated as irrelevant. Both calls return new frames,
# so the raw frame `df` is left untouched.
df2 = (
    df
    .dropna(subset=['Age', 'Embarked'])
    .drop(columns=['Ticket', 'Cabin'])
)

# Shape after dropping rows and columns
print(df2.shape)

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
df2.info()

Now the data shape dropped from 891 × 12 to 712 × 10

In [None]:
# Imputation strategy 3: fill missing Age with the mean of the passenger's group.
# Work on a copy so the raw frame `df` stays untouched.
df3 = df.copy()

# transform('mean') yields, for every row, the mean Age of its (Survived, Sex)
# group, aligned to df3's index; fillna then uses it only where Age is missing.
df3['Age'] = df3['Age'].fillna(
    df3.groupby(['Survived', 'Sex'])['Age'].transform('mean')
)

# Fill missing Embarked with the mode (the most frequently occurring value).
# Assigned back rather than `df3['Embarked'].fillna(..., inplace=True)`: that
# chained in-place call warns in pandas 2.x and does nothing under Copy-on-Write.
df3['Embarked'] = df3['Embarked'].fillna(df3['Embarked'].mode()[0])

# Drop the columns treated as irrelevant: 'Ticket' and 'Cabin'
df3 = df3.drop(columns=['Ticket', 'Cabin'])

# Display the info to check the new state of df3
df3.info()


In [None]:
# Numeric variables after group-mean Age imputation (df3)
# Only the float64/int64 columns are passed to the pair plot.
numeric_dtypes = ['float64', 'int64']
numerical_data = df3.select_dtypes(include=numeric_dtypes)

# Draw the pair plot grid and render it
sns.pairplot(data=numerical_data)
plt.show()


In [None]:
# Categorical variables after group-mean Age imputation (df3), one count plot each
categorical_columns = ['Sex', 'Embarked', 'Pclass', 'Survived']

# The grid has exactly four cells; change nrows/ncols if the list above changes
grid_fig, grid_axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# Pair each column with its own axes, walking the grid row by row
for axis, column in zip(grid_axes.flat, categorical_columns):
    sns.countplot(x=column, data=df3, ax=axis)
    axis.set_title(f'Count of {column}')

grid_fig.tight_layout()
plt.show()


### 3. Drop Irrelevant Columns

In [None]:
# List the columns currently in df3 before deciding which ones to drop
df3.columns

In [None]:
# Drop the columns treated as irrelevant for the analysis.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
df3 = df3.drop(columns=['PassengerId', 'Name'], errors='ignore')

## IV. Insights Extraction and Analysis

In [None]:
# Use df3 as the cleaned data frame for the analysis below.
# .copy() gives cleaned_df its own data, independent of df3.
cleaned_df = df3.copy()

In [None]:
# Descriptive statistics for the numeric columns of cleaned_df (a copy of df3)
cleaned_df.describe()

In [None]:
# For every object/category column of the cleaned frame, print the frequency
# of each distinct value.
# The counts are read from cleaned_df itself (not the raw `df`), so they
# reflect the imputed values, e.g. the filled-in Embarked entries.
for column in cleaned_df.select_dtypes(include=['object', 'category']).columns:
    print(f"Counts for {column} category:")
    print(cleaned_df[column].value_counts())
    print()  # blank line between columns for readability


In [None]:
# Numeric variables of the cleaned frame
# Only the float64/int64 columns are passed to the pair plot.
numeric_dtypes = ['float64', 'int64']
numerical_data = cleaned_df.select_dtypes(include=numeric_dtypes)

# Draw the pair plot grid and render it
sns.pairplot(data=numerical_data)
plt.show()


In [None]:
# Correlation heatmap of the cleaned data
# Pairwise correlation is only defined for numeric columns. cleaned_df still
# holds text columns (e.g. Sex, Embarked), and DataFrame.corr() raises on them
# in pandas >= 2.0, so restrict the frame to numeric dtypes first.
corr_matrix = cleaned_df.select_dtypes(include='number').corr()

# Set up the matplotlib figure
plt.figure(figsize=(12, 8))

# Draw the annotated heatmap with square cells (no mask is applied, so the
# full symmetric matrix is shown)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, linewidths=.5)

# Add title
plt.title('Correlation Heatmap of the Cleaned Data')

# Show the plot
plt.show()

Insights:...