In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

import joblib

In [2]:
# Load the datasets
train_data = pd.read_csv('D:/Hochschule Fresenius notes (sem3)/Technical Applications and Data Management/Final_project/train.csv')
test_data = pd.read_csv('D:/Hochschule Fresenius notes (sem3)/Technical Applications and Data Management/Final_project/test.csv')

In [3]:
# Data Understanding
print("Training Data Info:")
print(train_data.info())
print("Test Data Info:")
print(test_data.info())

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passenger

In [4]:
# Data Preparation

def preprocess_data(data_prep, is_train=True):
    """Preprocess the Titanic dataset."""
    # Handle missing 'Age' using median imputation
    age_imputer = SimpleImputer(strategy='median')
    data_prep['Age'] = age_imputer.fit_transform(data_prep[['Age']])

    # Fill missing 'Embarked' with mode (for training set)
    if is_train:
        embarked_imputer = SimpleImputer(strategy='most_frequent')
        data_prep['Embarked'] = embarked_imputer.fit_transform(data_prep[['Embarked']])
    else:
        data_prep['Embarked'].fillna("Unknown", inplace=True)

    # Fill missing 'Fare' (only in test data) using median
    if 'Fare' in data_prep.columns:
        fare_imputer = SimpleImputer(strategy='median')
        data_prep['Fare'] = fare_imputer.fit_transform(data_prep[['Fare']])

    # Drop columns with too many missing values or irrelevant information
    data_prep.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

    # Convert categorical variables to numeric (one-hot encoding)
    data_prep = pd.get_dummies(data_prep, columns=['Sex', 'Embarked'], drop_first=True)

    return data_prep

In [5]:
# Preprocess the datasets
train_cleaned = preprocess_data(train_data, is_train=False)
test_cleaned = preprocess_data(test_data, is_train=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_prep['Embarked'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_prep['Embarked'].fillna("Unknown", inplace=True)


In [6]:
# Split the training data into train and validation sets
X = train_cleaned.drop(columns=['Survived'])
y = train_cleaned['Survived']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Modeling
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

model_results = {}

In [9]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, output_dict=True)
    model_results[name] = {
        "Accuracy": acc,
        "Classification Report": report
    }
    print(f"{name} Model:\n")
    print(f"Accuracy: {acc:.2f}")
    print(classification_report(y_val, y_pred))


Logistic Regression Model:

Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       157
           1       0.79      0.73      0.76       111

    accuracy                           0.81       268
   macro avg       0.81      0.80      0.80       268
weighted avg       0.81      0.81      0.81       268

Decision Tree Model:

Accuracy: 0.76
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       157
           1       0.71      0.71      0.71       111

    accuracy                           0.76       268
   macro avg       0.75      0.75      0.75       268
weighted avg       0.76      0.76      0.76       268

Random Forest Model:

Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       157
           1       0.76      0.72      0.74       111

    accuracy                           0.79       268
   macro av

In [10]:
# Select the Decision Tree model
best_model = models["Decision Tree"]

In [11]:
# Predictions on the test dataset
test_predictions = best_model.predict([X.iloc[0]])



In [12]:
train_cleaned.head(1)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Embarked_Unknown
0,0,3,22.0,1,0,7.25,True,False,True,False


In [13]:
# Print prediction result and file save confirmation
print("Prediction for the first row:", test_predictions[0])

Prediction for the first row: 0


In [14]:
model_filename = 'titanic_suvival_model.pkl'
joblib.dump(model, model_filename)

['titanic_suvival_model.pkl']