In [16]:
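# Run this ONCE in a Jupyter cell: remove the leading "# " from the install
# line below and execute the cell. To install from a terminal instead, run
# the same command without the leading "!" (the "!" prefix only works inside
# Jupyter/IPython).

# !pip install scikit-learn pandas matplotlib kagglehub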

In [18]:
import pandas as pd  # for working with tables (dataframes)
from sklearn.model_selection import train_test_split  # split data
from sklearn.tree import DecisionTreeClassifier, plot_tree  # decision tree
from sklearn.metrics import accuracy_score, classification_report  # evaluation
import matplotlib.pyplot as plt  # to draw the tree

In [20]:
import kagglehub

# Fetch the Titanic dataset from Kaggle. `path` holds the location that
# dataset_download() reports for the downloaded files.
path = kagglehub.dataset_download(
    "sakshisatre/titanic-dataset"
)
print("Path to dataset files:", path)

Path to dataset files: /Users/ryanmcmaster/.cache/kagglehub/datasets/sakshisatre/titanic-dataset/versions/2


In [22]:
# Load the raw Titanic table into `df`.
# Prefer the copy that kagglehub just downloaded (inside `path`) so the
# notebook runs on a fresh machine; if that file is not there, fall back to
# a CSV sitting in the same folder as the notebook.
from pathlib import Path

csv_name = "Titanic Dataset.csv"
csv_file = Path(path) / csv_name
if not csv_file.exists():
    csv_file = Path(csv_name)

df = pd.read_csv(csv_file)

In [24]:
# Peek at the top of the table to sanity-check the load
# (head() with no argument returns the first 5 rows).
print("First 5 rows of the dataset:", df.head(), sep="\n")


First 5 rows of the dataset:
   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

     age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.00      0      0   24160  211.3375       B5        S    2    NaN   
1   0.92      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.00      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0       

In [26]:
# Summarise the dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own line; wrapping it in print() would add a
# stray "None" after the report.
print("Dataset info:")
df.info()

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB
None


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
stats = df.describe()
print("Basic statistics:")
print(stats)


Basic statistics:
            pclass     survived          age        sibsp        parch  \
count  1309.000000  1309.000000  1046.000000  1309.000000  1309.000000   
mean      2.294882     0.381971    29.881138     0.498854     0.385027   
std       0.837836     0.486055    14.413493     1.041658     0.865560   
min       1.000000     0.000000     0.170000     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    39.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     9.000000   

              fare        body  
count  1308.000000  121.000000  
mean     33.295479  160.809917  
std      51.758668   97.696922  
min       0.000000    1.000000  
25%       7.895800   72.000000  
50%      14.454200  155.000000  
75%      31.275000  256.000000  
max     512.329200  328.000000  


In [30]:
# Keep only the columns used for modelling. The inner brackets build a list
# of column names; the outer brackets use that list to pick those columns
# from the DataFrame.
# .copy() makes the result an independent DataFrame, so the cleaning steps
# that follow modify it directly instead of a slice of the original table
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']].copy()

print("Columns after selection:")
print(df.columns.tolist())     # Prints all column names in the DataFrame
print(f"\nShape: {df.shape}")  # Shows the number of rows and columns

Columns after selection:
['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']

Shape: (1309, 8)


In [32]:
# Tally the missing entries column by column
# (isna() is pandas' alias for isnull()).
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

Missing values per column:
pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64


In [34]:
# Impute missing ages with the median age.
median_age = df['age'].median()   # median() ignores missing values by default
print(f"Median age: {median_age}")

# Assign the filled column back through the DataFrame.
# Calling fillna(..., inplace=True) on df['age'] acts on an intermediate
# Series: pandas 2.x emits a FutureWarning for it, and with Copy-on-Write
# enabled it leaves df unchanged. assign() returns a new DataFrame with the
# 'age' column replaced, which behaves the same on every pandas version.
df = df.assign(age=df['age'].fillna(median_age))

# Verify it worked
print(f"Missing ages after filling: {df['age'].isnull().sum()}")

Median age: 28.0
Missing ages after filling: 0


In [50]:
# Impute the missing fare with the median fare.
median_fare = df['fare'].median()   # median() ignores missing values by default
print(f"Median fare: {median_fare}")

# Assign the filled column back through the DataFrame rather than calling
# fillna(..., inplace=True) on df['fare']: the in-place form acts on an
# intermediate Series, is deprecated in pandas 2.x, and has no effect on df
# when Copy-on-Write is enabled.
df = df.assign(fare=df['fare'].fillna(median_fare))

# Verify
print(f"Missing fares: {df['fare'].isnull().sum()}")

Median fare: 14.4542
Missing fares: 0


In [38]:
# Drop the rows that have no 'embarked' value.
# dropna() returns a new DataFrame; rebinding df to it avoids the in-place
# mutation of a frame that was derived from another one (a source of
# SettingWithCopyWarning) and keeps the step easy to chain.
df = df.dropna(subset=['embarked'])

# Verify
print(f"Missing embarked: {df['embarked'].isnull().sum()}")
print(f"Rows remaining: {len(df)}")

Missing embarked: 0
Rows remaining: 1307


In [40]:
# After cleaning, every column should report zero missing values.
print("Missing values after cleaning:", df.isna().sum(), sep="\n")

Missing values after cleaning:
pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64


In [42]:
# Re-apply the column selection. The inner list names the columns to keep;
# indexing the DataFrame with that list returns just those columns.
# (df already holds exactly these columns from the earlier selection, so
# this does not change its contents.)
df = df[[
    'pclass', 'survived', 'sex', 'age',
    'sibsp', 'parch', 'fare', 'embarked',
]]

print("Columns after selection:", df.columns.tolist(), sep="\n")


# Two ways to report the size of the updated DataFrame ("df"):

# Option 1: rows and columns on separate lines
print(
    f"Rows: {df.shape[0]}",      # shape[0] -> number of rows
    f"Columns: {df.shape[1]}",   # shape[1] -> number of columns
    sep="\n",
)

# Option 2: both at once, as the (rows, columns) tuple
print(f"\nShape: {df.shape}")

Columns after selection:
['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
Rows: 1307
Columns: 8

Shape: (1307, 8)


In [52]:
# Persist the cleaned table to disk.
# index=False keeps the row labels out of the file, so no extra
# row-number column is written.
df.to_csv(path_or_buf="Titanic_Cleaned.csv", index=False)

# Tell the user where the file went.
print("✓ Cleaned data saved to 'Titanic_Cleaned.csv'")

✓ Cleaned data saved to 'Titanic_Cleaned.csv'


In [54]:
import pandas as pd

# Start the modelling half of the notebook from the cleaned CSV saved
# earlier (alternatively, load the raw data and clean it again).
df = pd.read_csv("Titanic_Cleaned.csv")

print(
    "✓ Cleaned data loaded",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

✓ Cleaned data loaded
Shape: (1307, 8)
Columns: ['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']


In [56]:
# List each column's dtype; 'object' columns hold text and need encoding
# before they can be fed to the model.
print("Data types:", df.dtypes, sep="\n")

Data types:
pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object


In [58]:
# Inspect the categories of the two text columns before encoding them.

# 'sex': distinct values and how many there are
print("Unique values in 'sex':", df['sex'].unique(), sep="\n")
print(f"Count: {df['sex'].nunique()}")

# 'embarked': distinct values and how many there are
print("\nUnique values in 'embarked':", df['embarked'].unique(), sep="\n")
print(f"Count: {df['embarked'].nunique()}")

Unique values in 'sex':
['female' 'male']
Count: 2

Unique values in 'embarked':
['S' 'C' 'Q']
Count: 3


In [60]:
# Snapshot of the text-valued 'sex' column (next to 'age' and 'survived')
# before it is converted to numbers.
print("Before encoding:")
print(df.head()[['sex', 'age', 'survived']])

Before encoding:
      sex    age  survived
0  female  29.00         1
1    male   0.92         1
2  female   2.00         0
3    male  30.00         0
4  female  25.00         0


In [62]:
# One-hot encode 'sex'. drop_first=True drops the first category's column,
# leaving the single indicator column 'sex_male'.
df = pd.get_dummies(
    data=df,
    columns=['sex'],
    drop_first=True,
)

print("Columns after converting sex:", df.columns.tolist(), sep="\n")

Columns after converting sex:
['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'sex_male']


In [64]:
# One-hot encode 'embarked'. With drop_first=True the first category is
# dropped, leaving indicator columns 'embarked_Q' and 'embarked_S'.
df = pd.get_dummies(
    data=df,
    columns=['embarked'],
    drop_first=True,
)

print("Columns after converting embarked:", df.columns.tolist(), sep="\n")

Columns after converting embarked:
['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_male', 'embarked_Q', 'embarked_S']


In [66]:
# Target: the 'survived' column on its own
y = df['survived']

# Features: every column except the target
X = df.drop(columns='survived')

# Report the shapes
print(f"X shape: {X.shape}")  # (rows, features): a 2-D table of inputs
print(f"y shape: {y.shape}")  # (rows,): one target value per row

# List the feature names
print(f"\nFeature names: {X.columns.tolist()}")

X shape: (1307, 8)
y shape: (1307,)

Feature names: ['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_male', 'embarked_Q', 'embarked_S']


In [68]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report how many passengers ended up on each side of the split
print(
    f"Training set: {X_train.shape[0]} passengers",
    f"Testing set: {X_test.shape[0]} passengers",
    sep="\n",
)

Training set: 1045 passengers
Testing set: 262 passengers


In [70]:
from sklearn.tree import DecisionTreeClassifier

# Build an (untrained) decision tree:
#   max_depth=5     caps how deep the tree may grow
#   random_state=42 makes the result reproducible
model = DecisionTreeClassifier(max_depth=5, random_state=42)

print("✓ Model created!", f"Model type: {type(model)}", sep="\n")

✓ Model created!
Model type: <class 'sklearn.tree._classes.DecisionTreeClassifier'>


In [None]:
# Fit the decision tree to the training features and labels
model.fit(X=X_train, y=y_train)

print("✓ Model trained successfully!")

In [72]:
# Verify X_train and X_test have the same columns.
# Comparing only the column counts would report a match even if the two
# frames held different (or differently ordered) features, so the final
# check compares the column labels themselves.
print(f"X_train columns: {X_train.shape[1]}")      # Number of feature columns in training data
print(f"X_test columns: {X_test.shape[1]}")        # Number of feature columns in testing data
print(f"\nColumns match: {X_train.columns.equals(X_test.columns)}")   # Same labels, same order

X_train columns: 8
X_test columns: 8

Columns match: True


In [74]:
# Look at a few training rows: the feature table first ...
print("First 5 rows of X_train:", X_train.head(), sep="\n")

# ... then the matching start of the label vector, as a plain list
print("\nFirst 10 values of y_train:")
print(y_train.iloc[:10].tolist())

First 5 rows of X_train:
      pclass   age  sibsp  parch     fare  sex_male  embarked_Q  embarked_S
1292       3  28.5      0      0   16.100      True       False        True
543        2  30.0      3      0   21.000     False       False        True
289        1  39.0      1      1   79.650     False       False        True
10         1  47.0      1      0  227.525      True       False       False
147        1  28.0      0      0   42.400      True       False        True

First 10 values of y_train:
[0, 1, 1, 0, 0, 1, 0, 0, 1, 0]


In [76]:
# Create a DecisionTreeClassifier with max_depth=5 and random_state=42.
# This rebinds `model` to a new, untrained tree.
model = DecisionTreeClassifier(max_depth=5, random_state=42)

print(
    "✓ Model created!",
    f"Model type: {type(model)}",
    f"Max depth: {model.max_depth}",
    sep="\n",
)

✓ Model created!
Model type: <class 'sklearn.tree._classes.DecisionTreeClassifier'>
Max depth: 5


In [80]:
# Train the model with .fit(), passing the training features and labels
model.fit(X=X_train, y=y_train)

# Report the size of the fitted tree
print(
    "✓ Model trained successfully!",
    f"Tree depth: {model.get_depth()}",
    f"Number of leaves: {model.get_n_leaves()}",
    sep="\n",
)

✓ Model trained successfully!
Tree depth: 5
Number of leaves: 29


In [82]:
# Train the model on the training split (same call as the previous cell;
# the model was created with a fixed random_state and is fitted on the
# same data).
model.fit(X_train, y_train)

print("✓ Model trained successfully!")
# Size of the fitted tree: depth, then leaf count
print(f"Tree depth: {model.get_depth()}", f"Number of leaves: {model.get_n_leaves()}", sep="\n")

✓ Model trained successfully!
Tree depth: 5
Number of leaves: 29


In [84]:
# Inspect what the fitted model recorded about its training data
# (the trailing-underscore attributes are read after .fit()).
print(
    "Model attributes:",
    f"Number of features: {model.n_features_in_}",
    f"Feature names: {model.feature_names_in_}",
    f"Number of outputs: {model.n_outputs_}",
    f"Number of classes: {model.n_classes_}",
    f"Classes: {model.classes_}",
    sep="\n",
)

Model attributes:
Number of features: 8
Feature names: ['pclass' 'age' 'sibsp' 'parch' 'fare' 'sex_male' 'embarked_Q'
 'embarked_S']
Number of outputs: 1
Number of classes: 2
Classes: [0 1]


In [86]:
# Predict the first 5 test passengers and line the predictions up with
# the true labels.
actual_values = y_test.head(5).tolist()
sample_predictions = model.predict(X_test.head(5))

print("Sample predictions vs actual:")
for i, (pred, actual) in enumerate(zip(sample_predictions, actual_values)):
    # Mark each row as a hit or a miss
    if pred == actual:
        result = "✓"
    else:
        result = "✗"
    print(f"Passenger {i+1}: Predicted={pred}, Actual={actual} {result}")

Sample predictions vs actual:
Passenger 1: Predicted=0, Actual=0 ✓
Passenger 2: Predicted=0, Actual=1 ✗
Passenger 3: Predicted=0, Actual=0 ✓
Passenger 4: Predicted=0, Actual=0 ✓
Passenger 5: Predicted=0, Actual=0 ✓
