# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import plot_tree

# Load the passenger table from the CSV file in the working directory.
data = pd.read_csv('titanic.csv')

# Quick structural overview: dimensions, column labels, numeric summary.
dataset_shape = data.shape
print("Dataset shape:", dataset_shape)

dataset_columns = data.columns
print("\nDataset columns:", dataset_columns)

dataset_summary = data.describe()
print("\nDataset description:\n", dataset_summary)

# Per-column count of missing entries (isna is the same check as isnull).
missing_per_column = data.isna().sum()
print("Missing values:\n", missing_per_column)

# Bar chart: number of rows for each value of the 'Survived' column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Survived', ax=ax)
ax.set_xlabel("Survived")
ax.set_ylabel("Count")
ax.set_title("Distribution of Survival")
plt.show()

# Box plot: 'Age' distribution per 'Pclass', split by 'Survived'.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data, x='Pclass', y='Age', hue='Survived', ax=ax)
ax.set_xlabel("Passenger Class")
ax.set_ylabel("Age")
ax.set_title("Relationship between Passenger Class, Age, and Survival")
plt.show()

# Keep only the target and the feature columns used by the model.
# The explicit .copy() makes `data` an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning (or write into a
# temporary object derived from the originally loaded table).
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].copy()

# Encode 'Sex' numerically. Plain column assignment replaces the column, so
# it takes the dtype of the mapped values; `.loc[:, col] = ...` sets values
# in place and can leave the column as object dtype.
# Any value other than 'male'/'female' maps to NaN.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Fill missing ages with the median of the observed ages.
data['Age'] = data['Age'].fillna(data['Age'].median())

# Split into the feature matrix and the target vector.
X = data.drop('Survived', axis=1)
y = data['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a decision tree on the training portion (fit() returns the estimator)
# and predict labels for the held-out rows.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report each metric right after computing it.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", confusion)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Draw the fitted tree, colouring nodes by majority class.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate feature_names
    # as a list and reject a pandas Index with InvalidParameterError.
    feature_names=list(X.columns),
    class_names=['Not Survived', 'Survived'],
)
plt.show()

# Pair every feature column with the importance score the fitted tree
# assigned to it, listed from most to least important.
importances = clf.feature_importances_
feature_importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print("Feature Importances:\n", feature_importances)