In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OneHotEncoder

# Load the Titanic dataset from a local CSV file.
# Alternative remote source, kept for reference:
# url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
# df = pd.read_csv(url)
df = pd.read_csv('./titanic_embarked.csv')

# Display the first few rows of the dataset
print(df.head())

# Handle missing values: drop rows with missing 'Age', 'Embarked', or 'Survived'.
# The surviving rows keep their original index labels, so df.index now has gaps.
df = df.dropna(subset=['Age', 'Embarked', 'Survived'])

# Using pd.get_dummies()
# df_dummies = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

# Using OneHotEncoder as an alternative.
# drop='first' removes one category per feature; sparse_output=False yields a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(df[['Sex', 'Embarked']])

# Reuse df's index for the encoded frame. pd.concat(axis=1) aligns rows by index
# label, so a fresh 0..n-1 RangeIndex here would pair encoded values with the
# wrong passengers and pad the result with NaN rows.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(['Sex', 'Embarked']),
    index=df.index,
)

# Concatenate encoded features with the original dataframe (row-aligned on index)
df_encoded = pd.concat([df.drop(columns=['Sex', 'Embarked']), encoded_df], axis=1)

# Columns used as model inputs. The encoded names assume the categories in the
# CSV produce 'Sex_male', 'Embarked_Q' and 'Embarked_S' after drop='first'.
feature_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']

# Drop any row that still has a missing value in a feature or in the target
df_encoded = df_encoded.dropna(subset=feature_columns + ['Survived'])

# Define features (X) and target (y); both come from df_encoded, so they share rows
X = df_encoded[feature_columns]
y = df_encoded['Survived']

# Split the dataset into training (70%) and testing (30%) sets.
# random_state is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Decision Tree Classifier.
# The classifier is seeded as well: without a fixed random_state the fitted
# tree (and therefore the reported accuracy) can differ between runs.
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Plot the fitted decision tree.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    clf,
    # Pass a plain list rather than a pandas Index: some scikit-learn releases
    # validate feature_names and accept only a list (or None).
    feature_names=list(X.columns),
    # Label order is assumed to match the target encoding 0 = not survived,
    # 1 = survived; confirm against the 'Survived' column.
    class_names=['Not Survived', 'Survived'],
    filled=True,
)
plt.title("Decision Tree Visualization")
plt.show()

# Persist the trained classifier to disk with pickle.
model_path = 'decision_tree_titanic_model.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(clf, model_file)
print("Model saved to decision_tree_titanic_model.pkl")

# Read the classifier back from the file that was just written.
with open(model_path, mode='rb') as model_file:
    loaded_clf = pickle.load(model_file)

# Smoke-test the round trip: predict on the first row of the test set.
sample_prediction = loaded_clf.predict(X_test.iloc[:1])
print(f"Sample prediction: {sample_prediction}")


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


ValueError: Found input variables with inconsistent numbers of samples: [860, 712]