# **Import Your Arsenal**

In [None]:
# Data Handling
import numpy as np
import pandas as pd

In [None]:
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# IPython magic: render matplotlib figures inline, in the output area of the cell that draws them
%matplotlib inline

In [None]:
# Use seaborn's 'whitegrid' look for the plots drawn after this cell
sns.set_style(style='whitegrid')

# **Load the Data and First Impressions**

In [None]:
# Load the dataset from the CSV file next to the notebook.
# pandas is already imported as `pd` in the import cells at the top of the notebook,
# so it is not imported a second time here.
df = pd.read_csv("titanic.csv")

In [None]:
# Work on an independent (deep) copy so the freshly loaded `df` is never modified
data = df.copy(deep=True)

In [None]:
# Get the "big picture": (rows, columns) of the frame, then its first 5 rows
print("Dataset Dimension:", data.shape)
print("\nFirst Look:")
display(data.head())

In [None]:
# Print a heading, then let DataFrame.info() write its per-column report
print("\nDataset Info:")
# buf=None keeps the default destination (standard output)
data.info(buf=None)


# **Basic statistics**

In [None]:
# Summary statistics as produced by DataFrame.describe() with its default column selection
print("Numerical Description:")
numeric_summary = data.describe()
display(numeric_summary)

In [None]:
# Summary statistics restricted to the object-dtype ('O') columns
print("\nCategorical Description:")
categorical_summary = data.describe(include=['O'])
display(categorical_summary)

# **Data Cleaning and Preprocessing (The "Making it Usable" Phase)**

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows
null_counts = data.isnull().sum()
missing_percent = null_counts.div(len(data)).mul(100)
# Keep only the columns that actually have gaps, largest percentage first
has_gaps = missing_percent > 0
missing_percent = missing_percent.loc[has_gaps].sort_values(ascending=False)
print("Missing Value Percentage:")
print(missing_percent)

In [None]:
# Visualise the missing data: the heatmap is drawn from the boolean isnull() mask
fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Heatmap of Missing Data')
plt.show()

In [None]:
# Build a 'Deck' feature: the first character of 'deck', or 'Unknown' where 'deck' is missing.
# NOTE(review): the original note says this must run before 'deck' is dropped, but no
# drop of 'deck' appears in the cells visible here - confirm whether one is intended.
data['Deck'] = data['deck'].apply(
    lambda deck_code: 'Unknown' if pd.isna(deck_code) else deck_code[0]
)
display(data.head())

In [None]:
# Median age within every (pclass, sex) group
age_by_group = data.groupby(['pclass', 'sex'])['age']
age_medians = age_by_group.median()
print(age_medians)

In [None]:
# Define a function to impute age based on group
def impute_age(row, medians=None):
    """Return the row's age, or its group's median age when the age is missing.

    Parameters
    ----------
    row : mapping-like (for example one DataFrame row)
        Must provide the keys 'age', 'pclass' and 'sex'.
    medians : pandas.Series, optional
        Median ages indexed by (pclass, sex). When omitted, the notebook-level
        `age_medians` Series is used.

    Returns
    -------
    The row's own age when it is not missing, otherwise the median stored
    for the row's (pclass, sex) pair.

    Raises
    ------
    KeyError
        If the (pclass, sex) pair is not present in the lookup.
    """
    age = row['age']
    if pd.notna(age):
        return age
    lookup = age_medians if medians is None else medians
    # One .loc lookup with the full key tuple is always label-based and avoids
    # building the intermediate Series that chained [pclass][sex] indexing creates.
    return lookup.loc[(row['pclass'], row['sex'])]

In [None]:
# Run impute_age row by row; only the three columns it reads are handed over
age_inputs = data[['age', 'pclass', 'sex']]
data['age'] = age_inputs.apply(impute_age, axis=1)

In [None]:
# Show how often each 'embarked' value occurs
print(data['embarked'].value_counts())
# Fill the missing 'embarked' entries with 'S' and assign the result back to the column.
# fillna(..., inplace=True) on the `data['embarked']` selection is chained assignment:
# pandas 2.x warns about it and, under Copy-on-Write, `data` itself is left unchanged.
data['embarked'] = data['embarked'].fillna('S')

# Verify Cleaning Work

In [None]:
# Re-draw the missing-data heatmap to check the result of the cleaning steps
ax = sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis')
ax.set_title('Missing Data After Cleaning')
plt.show()
# Total number of missing cells across the whole frame
# NOTE(review): 'deck' is neither dropped nor filled in the cells visible here,
# so this total may be non-zero - confirm that is expected.
remaining_missing = data.isnull().sum().sum()
print("Remaining missing values:", remaining_missing)

# Exploratory Data Analysis and Visualization

In [None]:
# Count plot of the 'survived' column
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='survived', palette='viridis', ax=ax)
ax.set_title('Overall Survival Count')
plt.show()
# The mean of 'survived' is reported as the survival rate (assumes a 0/1 flag - confirm)
survival_rate = data['survived'].mean()
print(f"Overall Survival Rate: {survival_rate:.2%}")


In [None]:
# Count plot of 'survived' split by 'sex', followed by the mean of 'survived' per sex
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='survived', hue='sex', palette='RdBu_r', ax=ax)
ax.set_title("Survival Count by Sex")
plt.show()
survival_by_sex = data.groupby('sex')['survived'].mean()
print(survival_by_sex)


In [None]:
# Count plot of 'survived' split by 'pclass', followed by the mean of 'survived' per class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='survived', hue='pclass', palette='rainbow', ax=ax)
ax.set_title("Survival Count by Pclass")
plt.show()
survival_by_pclass = data.groupby('pclass')['survived'].mean()
print(survival_by_pclass)

In [None]:
# Histogram of 'age' (30 bins plus a KDE curve), coloured by 'survived'
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='age', hue='survived', bins=30, kde=True, alpha=0.6, palette='viridis', ax=ax)
ax.set_title("Age Distribution by Survivors VS. Non-Survivors")
plt.show()

# Box plot of 'age' for each value of 'survived'
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=data, x='survived', y='age', palette='viridis', ax=ax)
ax.set_title("Age Spread By Survival")
plt.show()



In [None]:
# Histogram of 'fare' coloured by 'survived'; only rows with fare below 200 are plotted
fares_below_200 = data[data['fare'] < 200]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=fares_below_200, x='fare', hue='survived', bins=30, kde=True, alpha=0.6, ax=ax)
ax.set_title("Fare Distribution (fare < 200): Survivors Vs. Non-Survivors")
plt.show()

In [None]:
# FamilySize = sibsp + parch + 1
# (presumably relatives aboard plus the passenger - confirm against the data dictionary)
data['FamilySize'] = data['sibsp'].add(data['parch']).add(1)
# IsAlone is 0 where FamilySize exceeds 1 and 1 everywhere else
has_company = data['FamilySize'] > 1
data['IsAlone'] = (~has_company).astype('int64')

In [None]:
# Mean of 'survived' for each IsAlone value, then the matching count plot
alone_survival = data.groupby('IsAlone')['survived'].mean()
print(alone_survival)
ax = sns.countplot(data=data, x='IsAlone', hue='survived')
ax.set_title('Survival by Being Alone')
plt.show()


In [None]:
# Point plot of 'survived' against 'pclass', with one line per 'sex'
g = sns.catplot(
    data=data,
    kind='point',
    x='pclass',
    y='survived',
    hue='sex',
    palette='magma',
    height=5,
    aspect=1.5,
)
# y=1.05 puts the figure title slightly above the top edge of the figure
g.fig.suptitle('Survival Rate By Pclass and Sex', y=1.05)
plt.show()


In [None]:
# Point plot of 'survived' against 'pclass', with one line per age group.
# 'age' is continuous: used directly as `hue` it produces one line and one legend
# entry per distinct age, which is unreadable. Bin it into a few labelled groups first.
age_group = pd.cut(
    data['age'],
    bins=[0, 12, 18, 35, 60, np.inf],  # right-closed bins: (0, 12], (12, 18], ...
    labels=['0-12', '12-18', '18-35', '35-60', '60+'],
    include_lowest=True,
)
# assign() adds the helper column on a temporary frame only; `data` is not modified
g = sns.catplot(x='pclass', y='survived', hue='AgeGroup', data=data.assign(AgeGroup=age_group), kind='point', palette='magma', height=5, aspect=1.5)
g.fig.suptitle('Survival Rate By Pclass and Age', y=1.05)
plt.show()

In [None]:
# Point plot of 'survived' against 'pclass', with one line per fare band.
# 'fare' is continuous: used directly as `hue` it produces one line and one legend
# entry per distinct fare. Split it into quartile bands first.
# duplicates='drop' merges bands whose quantile edges coincide instead of raising.
fare_band = pd.qcut(data['fare'], q=4, duplicates='drop')
# assign() adds the helper column on a temporary frame only; `data` is not modified
g = sns.catplot(x='pclass', y='survived', hue='FareBand', data=data.assign(FareBand=fare_band), kind='point', palette='magma', height=5, aspect=1.5)
g.fig.suptitle('Survival Rate By Pclass and Fare', y=1.05)
plt.show()

In [None]:
# Point plot of 'survived' against age group, with one line per 'sex'.
# A point plot treats x as categorical, so raw 'age' gives one x position per
# distinct age (hence the original very wide aspect). Bin the ages instead.
age_group = pd.cut(
    data['age'],
    bins=[0, 12, 18, 35, 60, np.inf],  # right-closed bins: (0, 12], (12, 18], ...
    labels=['0-12', '12-18', '18-35', '35-60', '60+'],
    include_lowest=True,
)
# assign() adds the helper column on a temporary frame only; `data` is not modified.
# With only five x categories the same aspect as the other catplots (1.5) is enough.
g = sns.catplot(x='AgeGroup', y='survived', hue='sex', data=data.assign(AgeGroup=age_group), kind='point', palette='magma', height=5, aspect=1.5)
g.fig.suptitle('Survival Rate By Age and Sex', y=1.05)
plt.show()

In [None]:
# Point plot of 'survived' against fare band, with one line per 'sex'.
# A point plot treats x as categorical, so raw 'fare' gives one x position per
# distinct fare (hence the original aspect of 18.5). Use quartile bands instead.
# duplicates='drop' merges bands whose quantile edges coincide instead of raising.
fare_band = pd.qcut(data['fare'], q=4, duplicates='drop')
# assign() adds the helper column on a temporary frame only; `data` is not modified.
# With at most four x categories the same aspect as the other catplots (1.5) is enough.
g = sns.catplot(x='FareBand', y='survived', hue='sex', data=data.assign(FareBand=fare_band), kind='point', palette='magma', height=5, aspect=1.5)
g.fig.suptitle('Survival Rate By Fare and Sex', y=1.05)
plt.show()