# Titanic Survival Prediction Using Decision Tree

In [None]:

# Setup: load the Titanic dataset
import pandas as pd

# Read the passenger CSV into a DataFrame.
file_path = "/mnt/data/titanic.csv"
titanic_data = pd.read_csv(file_path)

# Preview the first five rows (the default for head) to inspect the columns.
titanic_data.head(n=5)


### Step 1: Preprocess Data

In [None]:

# Encode the categorical columns as integers and fill in missing values.
# Values absent from a mapping dict become NaN after .map().
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})
titanic_data['Embarked'] = titanic_data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Assign the filled Series back to the column rather than calling
# `fillna(..., inplace=True)` on `titanic_data[col]`: the in-place form operates
# on an intermediate Series, which pandas flags as chained assignment and which
# does not update the DataFrame when Copy-on-Write is enabled.
# Age gaps get the column median; Embarked gaps get the most frequent code.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])


### Step 2: Selecting Features

In [None]:

# Predictor columns fed to the model; the target is the 'Survived' column.
selected_features = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked',
]
y = titanic_data['Survived']
X = titanic_data[selected_features]


### Step 3: Split the Data

In [None]:

# Split data into training, development, and test sets
from sklearn.model_selection import train_test_split

# First hold out 40% of the rows, then cut that hold-out in half so that
# roughly 60% / 20% / 20% go to training / development / test.
# The fixed random_state keeps both splits reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


### Step 4: Train and Plot a Decision Tree

In [None]:

from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Fit a depth-3 tree on the training split (fit() returns the estimator itself).
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Draw the fitted tree, labelling nodes with feature and class names.
plt.figure(figsize=(12, 8))
plot_tree(
    decision_tree,
    feature_names=selected_features,
    class_names=["Not Survived", "Survived"],
    filled=True,
)
plt.title("Decision Tree with max_depth=3")
plt.show()


### Step 5: Compute Development Accuracy

In [None]:

from sklearn.metrics import accuracy_score

# Score the depth-3 tree on the development split.
dev_predictions = decision_tree.predict(X_dev)
dev_accuracy = accuracy_score(y_true=y_dev, y_pred=dev_predictions)

# Bare expression so the notebook displays the value.
dev_accuracy


### Step 6: Model with Different Max Depths

In [None]:

# Sweep max_depth over 2..10: fit a tree per depth, record its training and
# development accuracy, and draw it.
# NOTE(review): `dev_accuracy` is reassigned on every iteration, so after the
# loop it holds the value for the last depth tried.
training_accuracies = []
development_accuracies = []

for depth in range(2, 11):
    # Fit a fresh tree at this depth (fit() returns the estimator itself).
    model = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)

    # Score on both splits and append to the running lists, in depth order.
    train_accuracy = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))
    dev_accuracy = accuracy_score(y_true=y_dev, y_pred=model.predict(X_dev))
    training_accuracies.append(train_accuracy)
    development_accuracies.append(dev_accuracy)

    # One figure per depth showing the fitted tree.
    plt.figure(figsize=(12, 8))
    plot_tree(
        model,
        feature_names=selected_features,
        class_names=["Not Survived", "Survived"],
        filled=True,
    )
    plt.title(f"Decision Tree with max_depth={depth}")
    plt.show()


### Step 7: Plot Training and Development Accuracies

In [None]:

# Training vs. development accuracy as a function of max_depth.
# The x values must match the depths used when the lists were filled (2..10).
plt.figure(figsize=(10, 6))
plt.plot(
    range(2, 11), training_accuracies,
    marker='o', label='Training Accuracy',
)
plt.plot(
    range(2, 11), development_accuracies,
    marker='s', label='Development Accuracy',
)

# Labels, title and legend.
plt.title('Training and Development Accuracies vs Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()


### Step 8: Analyze Line Shape


The training accuracy rises steadily as the max depth grows, because deeper trees can fit the training set ever more closely (approaching 100% for sufficiently deep trees). The development accuracy, however, peaks at a certain depth and then starts to decline, which indicates overfitting. The optimal depth is the one at which development accuracy is highest.


### Step 9: Report Test Accuracy

In [None]:

# Identify the optimal depth (where development accuracy peaks)
import numpy as np

# argmax gives the position of the best development accuracy; the sweep
# started at depth 2, so shift the index by 2 to recover the depth.
optimal_depth = np.argmax(development_accuracies) + 2

# Refit on the training split at that depth (fit() returns the estimator).
final_model = DecisionTreeClassifier(max_depth=optimal_depth, random_state=42).fit(X_train, y_train)

# Score once on the held-out test split.
test_predictions = final_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)

# Bare tuple so the notebook displays both values.
optimal_depth, test_accuracy
