In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

df = pd.read_csv(r'G:\AI\AI\Homework AI\lesson-15\homework\titanic.csv')

df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

# Handle Missing Values 
imputer_age = SimpleImputer(strategy='median')
df['Age'] = imputer_age.fit_transform(df[['Age']])

imputer_embarked = SimpleImputer(strategy='most_frequent')
df['Embarked'] = df['Embarked'].astype(object) 
df['Embarked'] = imputer_embarked.fit_transform(df[['Embarked']]).flatten()

# Prepare data for model
df = pd.get_dummies(df, columns=['Embarked'],  prefix='', prefix_sep='', dtype=int)
df['Pclass'] = (df['Pclass'] == 1).astype(int)
df['Age'] = (df['Age'] < 25).astype(int)
df['Sex'] = (df['Sex'] == 'male').astype(int)

print(df.head())

# Split data into training and testing sets
X, y = df.drop(columns='Survived'), df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=42)


# Scale numerical features (for automatic binarization)
scaler = StandardScaler()
float_cols = X_train.select_dtypes(include=['float64']).columns
X_train[float_cols] = scaler.fit_transform(X_train[float_cols])
X_test[float_cols] = scaler.transform(X_test[float_cols])

# Train the model
nb_clf = BernoulliNB()
nb_clf.fit(X_train, y_train)

# Make predictions
y_pred = nb_clf.predict(X_test)

# Check accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

   Pclass  Sex  Age  SibSp  Parch     Fare  Survived  C  Q  S
0       0    1    1      1      0   7.2500         0  0  0  1
1       1    0    0      1      0  71.2833         1  1  0  0
2       0    0    0      0      0   7.9250         1  0  0  1
3       1    0    0      1      0  53.1000         1  0  0  1
4       0    1    0      0      0   8.0500         0  0  0  1
Accuracy: 0.76


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

df = pd.read_csv(r'G:\AI\AI\Homework AI\lesson-15\homework\titanic.csv')

df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

# Handle Missing Values 
imputer_age = SimpleImputer(strategy='median')
df['Age'] = imputer_age.fit_transform(df[['Age']])

imputer_embarked = SimpleImputer(strategy='most_frequent')
df['Embarked'] = df['Embarked'].astype(object) 
df['Embarked'] = imputer_embarked.fit_transform(df[['Embarked']]).flatten()

# Prepare data for model
df = pd.get_dummies(df, columns=['Embarked'],  prefix='', prefix_sep='', dtype=int)
df['Sex'] = (df['Sex'] == 'male').astype(int)

print(df.head())

# Split data into training and testing sets
X, y = df.drop(columns='Survived'), df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=42)

# Scale numerical features (for automatic binarization)
scaler = StandardScaler()
float_cols = X_train.select_dtypes(include=['float64']).columns
X_train[float_cols] = scaler.fit_transform(X_train[float_cols])
X_test[float_cols] = scaler.transform(X_test[float_cols])

# Train the model
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# Make predictions
y_pred = nb_clf.predict(X_test)

# Check accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

   Pclass  Sex   Age  SibSp  Parch     Fare  Survived  C  Q  S
0       3    1  22.0      1      0   7.2500         0  0  0  1
1       1    0  38.0      1      0  71.2833         1  1  0  0
2       3    0  26.0      0      0   7.9250         1  0  0  1
3       1    0  35.0      1      0  53.1000         1  0  0  1
4       3    1  35.0      0      0   8.0500         0  0  0  1
Accuracy: 0.76


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

df = pd.read_csv(r'G:\AI\AI\Homework AI\lesson-15\homework\titanic.csv')

df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

# --- Handle Missing Values ---
# 'Age' often has missing values (NaNs). Impute with the median.
imputer_age = SimpleImputer(strategy='median')
df['Age'] = imputer_age.fit_transform(df[['Age']])

# 'Embarked' sometimes has missing values. Impute with the most frequent value (mode).
imputer_embarked = SimpleImputer(strategy='most_frequent')
df['Embarked'] = df['Embarked'].astype(str) # Ensure it's string before imputation
df['Embarked'] = imputer_embarked.fit_transform(df[['Embarked']]).flatten()

# --- Feature Engineering / Preprocessing ---
# One-hot encode 'Embarked' categorical feature
df = pd.get_dummies(df, columns=['Embarked'], prefix='', prefix_sep='', dtype=int)

# 'Pclass' is kept as numerical (1, 2, 3)
# 'Age' is kept as numerical (after imputation)
# Binarize 'Sex': Convert 'male' to 1, 'female' to 0.
df['Sex'] = (df['Sex'] == 'male').astype(int)

print("DataFrame head after preprocessing and imputation:")
print(df.head())

# --- Split data into training and testing sets ---
X, y = df.drop(columns='Survived'), df['Survived']
# IMPORTANT: shuffle=True is default, random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=42)


# --- Scale numerical features ---
scaler = StandardScaler()

# Identify numerical columns that are NOT already binary (0 or 1)
# This ensures 'Age', 'Fare', and 'Pclass' (if not already 0/1) are scaled.
numerical_cols_to_scale = []
for col in X_train.columns:
    # Check if the column is numeric (int or float)
    if pd.api.types.is_numeric_dtype(X_train[col]):
        # Check if it's not already a binary (0 or 1) column
        if not ((X_train[col].isin([0, 1]).all()) and (X_train[col].min() == 0 and X_train[col].max() == 1)):
            numerical_cols_to_scale.append(col)


if numerical_cols_to_scale:
    X_train[numerical_cols_to_scale] = scaler.fit_transform(X_train[numerical_cols_to_scale])
    X_test[numerical_cols_to_scale] = scaler.transform(X_test[numerical_cols_to_scale])
    print(f"\nScaled numerical features: {numerical_cols_to_scale}")
else:
    print("\nNo continuous numerical features found for explicit scaling.")
    print("All features are likely binary or one-hot encoded.")


# --- Train the model ---
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# --- Make predictions ---
y_pred = nb_clf.predict(X_test)

# --- Check accuracy ---
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Training and Prediction Complete.")
print(f"Accuracy of Gaussian Naive Bayes model: {accuracy:.2f}") # Format to 2 decimal places


DataFrame head after preprocessing and imputation:
   Pclass  Sex   Age  SibSp  Parch     Fare  Survived  C  Q  S  nan
0       3    1  22.0      1      0   7.2500         0  0  0  1    0
1       1    0  38.0      1      0  71.2833         1  1  0  0    0
2       3    0  26.0      0      0   7.9250         1  0  0  1    0
3       1    0  35.0      1      0  53.1000         1  0  0  1    0
4       3    1  35.0      0      0   8.0500         0  0  0  1    0

Scaled numerical features: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'nan']

Model Training and Prediction Complete.
Accuracy of Gaussian Naive Bayes model: 0.76
