## **Import Required Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## **Read the Data**

In [2]:
# Load the Titanic passenger records from the local CSV file and display the frame
titanic_csv_path = r"D:\Coding\Datasets\titanic.csv"
df = pd.read_csv(titanic_csv_path)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Drop the columns that are not carried forward as model features.
# Reassigning (instead of `inplace=True`) together with `errors="ignore"` keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError
# on the already-removed columns.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## **Train Test Split**

In [4]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the 'Survived' label, then hold out 30% of the
# rows for testing (fixed seed so the split is reproducible)
features = df.drop(columns="Survived")
target = df["Survived"]

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=0
)
x_train.shape, x_test.shape

((623, 7), (268, 7))

In [5]:
# Preview the first rows of the training predictors
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
857,1,male,51.0,0,0,26.55,S
52,1,female,49.0,1,0,76.7292,C
386,3,male,1.0,5,2,46.9,S
124,1,male,54.0,0,1,77.2875,S
578,3,female,,1,0,14.4583,C


In [6]:
# Preview the first rows of the training labels
y_train.head()

857    1
52     1
386    0
124    0
578    0
Name: Survived, dtype: int64

## **Data Preprocessing**

In [7]:
# Summarise dtypes and non-null counts of the full (pre-split) frame to spot
# the columns that contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


### **Apply SimpleImputer on 'Age' and 'Embarked' Columns**

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# 'Age': default SimpleImputer strategy (mean); statistics are learned from the
# training split only and then reused for the test split
simple_imputer_age = SimpleImputer()
x_train_age = simple_imputer_age.fit_transform(x_train[["Age"]])
x_test_age = simple_imputer_age.transform(x_test[["Age"]])

# 'Embarked': fill gaps with the most frequent training value, again fitting on
# the training split only
simple_imputer_embarked = SimpleImputer(strategy="most_frequent")
x_train_embarked = simple_imputer_embarked.fit_transform(x_train[["Embarked"]])
x_test_embarked = simple_imputer_embarked.transform(x_test[["Embarked"]])

In [10]:
# Preview the first five rows of the imputed 'Age' array for the training split
x_train_age[:5]

array([[51.        ],
       [49.        ],
       [ 1.        ],
       [54.        ],
       [29.91533865]])

In [11]:
# Preview the first five rows of the imputed 'Embarked' array for the training split
x_train_embarked[:5]

array([['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C']], dtype=object)

### **Apply OneHot Encoder on 'Sex' and 'Embarked' Columns**

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# One encoder per categorical column: dense output, and categories unseen during
# fit are encoded as an all-zero row instead of raising
one_hot_encoder_sex = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
one_hot_encoder_embarked = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Re-derive the imputed 'Embarked' labels from the fitted imputer instead of
# reading x_train_embarked / x_test_embarked: those names are rebound to the
# encoded arrays below, so reading them here would feed one-hot output back into
# the encoder if this cell were executed a second time.
embarked_train_imputed = simple_imputer_embarked.transform(x_train[["Embarked"]])
embarked_test_imputed = simple_imputer_embarked.transform(x_test[["Embarked"]])

# Fit on the training data only
one_hot_encoder_sex.fit(x_train[["Sex"]])
one_hot_encoder_embarked.fit(embarked_train_imputed)

# Transform the 'Sex' and 'Embarked' columns of the training data
x_train_sex = one_hot_encoder_sex.transform(x_train[["Sex"]])
x_train_embarked = one_hot_encoder_embarked.transform(embarked_train_imputed)

# Transform the 'Sex' and 'Embarked' columns of the testing data
x_test_sex = one_hot_encoder_sex.transform(x_test[["Sex"]])
x_test_embarked = one_hot_encoder_embarked.transform(embarked_test_imputed)

In [14]:
# Preview the first five one-hot rows produced for 'Sex' on the training split
x_train_sex[:5]

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [15]:
# Preview the first five one-hot rows for 'Embarked' on the training split
# (x_train_embarked now holds the encoded array, no longer the imputed labels)
x_train_embarked[:5]

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [16]:
# Keep only the training columns that were not imputed or encoded separately
x_train_remaining = x_train.drop(["Age", "Sex", "Embarked"], axis="columns")
x_train_remaining.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare
857,1,0,0,26.55
52,1,1,0,76.7292
386,3,5,2,46.9
124,1,0,1,77.2875
578,3,1,0,14.4583


In [17]:
# Keep only the testing columns that were not imputed or encoded separately
x_test_remaining = x_test.drop(["Age", "Sex", "Embarked"], axis="columns")
x_test_remaining.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare
495,3,0,0,14.4583
648,3,0,0,7.55
278,3,4,1,29.125
31,1,1,0,146.5208
255,3,0,2,15.2458


In [18]:
# Merge the processed columns with the remaining dataframe, column-wise, in the
# order: untouched columns, imputed 'Age', encoded 'Sex', encoded 'Embarked'
train_blocks = [x_train_remaining, x_train_age, x_train_sex, x_train_embarked]
test_blocks = [x_test_remaining, x_test_age, x_test_sex, x_test_embarked]

x_train_transformed = np.concatenate(train_blocks, axis=1)
x_test_transformed = np.concatenate(test_blocks, axis=1)

In [19]:
# Display the merged training feature matrix
x_train_transformed

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 1., 0., ..., 1., 0., 0.],
       [3., 5., 2., ..., 0., 0., 1.],
       ...,
       [3., 0., 0., ..., 0., 1., 0.],
       [3., 1., 0., ..., 0., 0., 1.],
       [2., 1., 1., ..., 0., 0., 1.]])

In [20]:
# Inspect the output column names generated by the 'Embarked' encoder
one_hot_encoder_embarked.get_feature_names_out()

array(['x0_C', 'x0_Q', 'x0_S'], dtype=object)

In [21]:
# Build the column labels in the same order the feature blocks were concatenated:
# untouched columns, imputed 'Age', encoded 'Sex', encoded 'Embarked'
x_transformed_columns = np.concatenate([
    np.array(x_train_remaining.columns),
    simple_imputer_age.get_feature_names_out(),
    one_hot_encoder_sex.get_feature_names_out(),
    one_hot_encoder_embarked.get_feature_names_out(),
])

In [22]:
# Show the assembled column names for the transformed feature matrix
x_transformed_columns

array(['Pclass', 'SibSp', 'Parch', 'Fare', 'Age', 'Sex_female',
       'Sex_male', 'x0_C', 'x0_Q', 'x0_S'], dtype=object)

In [23]:
# Convert the transformed arrays into pandas dataframes labelled with the
# assembled column names
x_train_transformed, x_test_transformed = (
    pd.DataFrame(matrix, columns=x_transformed_columns)
    for matrix in (x_train_transformed, x_test_transformed)
)

In [24]:
# Preview the first rows of the transformed training features
x_train_transformed.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Age,Sex_female,Sex_male,x0_C,x0_Q,x0_S
0,1.0,0.0,0.0,26.55,51.0,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,0.0,76.7292,49.0,1.0,0.0,1.0,0.0,0.0
2,3.0,5.0,2.0,46.9,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,1.0,77.2875,54.0,0.0,1.0,0.0,0.0,1.0
4,3.0,1.0,0.0,14.4583,29.915339,1.0,0.0,1.0,0.0,0.0


In [25]:
# Preview the first rows of the transformed testing features
x_test_transformed.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Age,Sex_female,Sex_male,x0_C,x0_Q,x0_S
0,3.0,0.0,0.0,14.4583,29.915339,0.0,1.0,1.0,0.0,0.0
1,3.0,0.0,0.0,7.55,29.915339,0.0,1.0,0.0,0.0,1.0
2,3.0,4.0,1.0,29.125,7.0,0.0,1.0,0.0,1.0,0.0
3,1.0,1.0,0.0,146.5208,29.915339,1.0,0.0,1.0,0.0,0.0
4,3.0,0.0,2.0,15.2458,29.0,1.0,0.0,1.0,0.0,0.0


## **Build a DecisionTree Classifier**

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
# Build the decision tree with a fixed seed so training is reproducible
dt_classifier = DecisionTreeClassifier(random_state=0)

# Train on the transformed training features and their labels
dt_classifier.fit(X=x_train_transformed, y=y_train)

## **Accuracy Assessment**

In [28]:
# Score the held-out features with the trained tree and display the predictions
y_pred = dt_classifier.predict(X=x_test_transformed)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
# Overall accuracy of the decision tree on the held-out test split
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7761194029850746

## **Export the Model**

In [31]:
import pickle

In [32]:
# Persist the fitted encoders and the trained classifier with pickle.
# Raw strings keep the Windows backslashes literal (the unprefixed form relied on
# invalid escape sequences such as "\C" and "\d", which newer Python versions
# warn about), and each `with` block closes its file handle once the dump is done.

# Exporting the one_hot_encoder_sex
with open(r"D:\Coding\Models\ohe_sex.pkl", "wb") as ohe_sex_file:
    pickle.dump(one_hot_encoder_sex, file=ohe_sex_file)

# Exporting the one_hot_encoder_embarked
with open(r"D:\Coding\Models\ohe_embarked.pkl", "wb") as ohe_embarked_file:
    pickle.dump(one_hot_encoder_embarked, file=ohe_embarked_file)

# Exporting the decision tree classifier
with open(r"D:\Coding\Models\decision_tree_model.pkl", "wb") as model_file:
    pickle.dump(dt_classifier, file=model_file)

# NOTE(review): simple_imputer_age and simple_imputer_embarked are fitted above
# but not exported here; a consumer that receives missing 'Age'/'Embarked' values
# would need them to reproduce the preprocessing -- confirm whether that is intended.