# Project 4: Classification - Logistic Regression on the Titanic Dataset
• Description: Build a logistic regression model to predict survival on the Titanic based on passenger features such as age, sex, and class.
• Skills: Classification techniques, model evaluation, and feature engineering using Python and scikit-learn.


In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

In [25]:
# Load the Titanic passenger data from a local CSV file into a DataFrame.
# NOTE(review): this is an absolute, user-specific Windows path, so the
# notebook only runs on a machine that has the file at this exact location.
df= pd.read_csv(r"C:\Users\harsh soni\Downloads\PROJECTS\titanic3.csv")

In [None]:
# Remove Irrelevant Columns

In [27]:
# Discard, in place, the columns that are not used anywhere in the
# modelling steps below.
unused_columns = ['body', 'cabin', 'boat', 'home.dest']
df.drop(columns=unused_columns, inplace=True)

# extract title from name

In [29]:
# Capture the honorific from each name: the text that follows a comma
# (and any whitespace) up to the next period, e.g. "Mr" in "Doe, Mr. John".
# expand=False returns the single capture group as a Series.
title_pattern = r',\s*([^\.]+)\.'
df['title'] = df['name'].str.extract(title_pattern, expand=False)

# Map Rare Titles

In [31]:
# Collapse the raw honorifics into a small set of groups (Mr, Mrs, Miss,
# Master, Officer, Noble) so that infrequent titles share a category.
title_map = {
    'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Dr': 'Officer', 'Rev': 'Officer', 'Col': 'Officer',
    'Major': 'Officer', 'Capt': 'Officer', 'Sir': 'Noble',
    'Don': 'Noble', 'Dona': 'Noble', 'Lady': 'Noble',
    'the Countess': 'Noble', 'Jonkheer': 'Noble'
}
# Series.map() yields NaN for any title that is not a key of title_map (and
# for names from which no title was extracted). Put those rows in a 'Rare'
# bucket instead: groupby('title') excludes NaN keys by default, so a NaN
# title would drop the passenger from the per-title age imputation.
df['title'] = df['title'].map(title_map).fillna('Rare')


# Fill Missing age by title Median

In [35]:
# Impute missing ages with the median age of passengers sharing the same
# title. Known ages are never overwritten because the fill is applied to the
# original column. If a title group has no known ages, or the title itself is
# missing, the per-title median is NaN, so fall back to the overall median
# age to guarantee that no NaN is left in the column.
title_median_age = df.groupby('title')['age'].transform('median')
df['age'] = df['age'].fillna(title_median_age).fillna(df['age'].median())

# Fill Missing embarked with Mode

In [37]:
# Missing 'embarked' entries receive the column's most frequent value
# (the first entry returned by mode()).
most_frequent_embarked = df['embarked'].mode()[0]
df['embarked'] = df['embarked'].fillna(most_frequent_embarked)

# Replace 0 fare with Median

In [41]:
# Treat a fare of exactly 0 as missing, then fill every missing fare with
# the median of the remaining (non-zero, non-missing) fares.
fare_without_zeros = df['fare'].mask(df['fare'] == 0)
df['fare'] = fare_without_zeros
df['fare'] = df['fare'].fillna(fare_without_zeros.median())

# Encode Categorical Variables

In [43]:
# Integer-encode the categorical columns. LabelEncoder numbers the classes
# in sorted order, so with the usual values: sex -> female=0, male=1 and
# embarked -> C=0, Q=1, S=2.
#
# Each column gets its own encoder, stored in `encoders`, so the
# code -> label mapping of every column stays recoverable through
# encoders[column].classes_ / .inverse_transform(). A single shared encoder
# refitted per column would only remember the last column it saw.
#
# NOTE(review): integer codes impose an arbitrary order on the nominal
# 'embarked' and 'title' columns; one-hot encoding may suit a linear model
# better, but would change the feature list used further down.
encoders = {}
for column in ('sex', 'embarked', 'title'):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le  # after the loop `le` is the 'title' encoder

# Select Features and Target

In [45]:
# Target vector: the 'survived' column.
y = df['survived']

# Predictor matrix: the six columns named in `features`, in this order.
features = [
    'pclass',
    'sex',
    'age',
    'fare',
    'embarked',
    'title',
]
X = df[features]

# Split the data

In [47]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Scale the features

In [49]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data does not influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Train the logistic regression Model

In [51]:
# Create a logistic regression classifier with scikit-learn's default
# settings and fit it on the scaled training features and training labels.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

# Predict on the Test Set

In [53]:
# Predict a class label for every row of the scaled test features using the
# fitted model; y_pred is compared against y_test in the evaluation step.
y_pred = model.predict(X_test_scaled)

# Evaluate the Model

In [55]:
# Report accuracy, the confusion matrix and the per-class
# precision/recall/F1 report for the test-set predictions, in that order.
evaluation_steps = (
    ("Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
)
for heading, metric in evaluation_steps:
    print(heading, metric(y_test, y_pred))

Accuracy: 0.7748091603053435

Confusion Matrix:
 [[126  18]
 [ 41  77]]

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.88      0.81       144
           1       0.81      0.65      0.72       118

    accuracy                           0.77       262
   macro avg       0.78      0.76      0.77       262
weighted avg       0.78      0.77      0.77       262

