In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Load the dataset
data = pd.read_csv('Titanic-Dataset.csv')
# Separate features and target variable
X = data.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])  # Dropping columns not used
y = data['Survived']
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# Identify numeric and categorical columns
numeric_features = ['Pclass', 'SibSp', 'Parch', 'Age', 'Fare']
categorical_features = ['Sex', 'Embarked']

# Handle missing values for numeric features
imputer_num = SimpleImputer(strategy='mean')
X_train[numeric_features] = imputer_num.fit_transform(X_train[numeric_features])
X_test[numeric_features] = imputer_num.transform(X_test[numeric_features])

# Handle missing values for categorical features
imputer_cat = SimpleImputer(strategy='most_frequent')
X_train[categorical_features] = imputer_cat.fit_transform(X_train[categorical_features])
X_test[categorical_features] = imputer_cat.transform(X_test[categorical_features])

# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

# Get column names for the encoded features
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Create DataFrame for encoded features
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_feature_names, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_feature_names, index=X_test.index)

# Drop original categorical columns and concatenate encoded columns
X_train = X_train.drop(columns=categorical_features).join(X_train_encoded_df)
X_test = X_test.drop(columns=categorical_features).join(X_test_encoded_df)
# Scale numeric features
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train the model
model = RandomForestClassifier(n_estimators=3000, max_depth=20, random_state=1)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Print the first few predictions
print("First few predictions:", y_pred[:])

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy:", accuracy)


First few predictions: [0 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 1 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0
 0 1 0 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0
 1 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1
 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0]
Model accuracy: 0.8491620111731844
