# Imports and Data Loading

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import statsmodels.api as sm
#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc



In [None]:
# Both Titanic CSV splits live in the same public bucket folder.
_TITANIC_BASE_URL = "https://storage.googleapis.com/edulabs-public-datasets/titanic"

train_data_path = f"{_TITANIC_BASE_URL}/titanic-train.csv"
test_data_path = f"{_TITANIC_BASE_URL}/titanic-test.csv"

In [None]:
# Load Titanic dataset
df = pd.read_csv(train_data_path)

df

# sibsp: number of siblings/spouses aboard.
# parch: number of parents/children aboard.

# Quick data cleaning

**Note: this is not optimal data cleaning, we will improve this later.**

For now, we just want to move straight on to classification with logistic regression.

In [None]:
# Check missing values
df.isnull().sum()

In [None]:
# Cabin has too many missing values to be useful, so remove the column in place
df.drop('Cabin', axis=1, inplace=True)

# fill age with median age by groups


In [None]:
df.Age.plot(kind='hist', bins=30)

In [None]:
df.groupby(['Sex'])['Age'].median()

In [None]:
df.groupby(['Pclass'])['Age'].median()

In [None]:
df.groupby(['Sex', 'Pclass', 'Parch'])['Age'].median()

In [None]:
# Fill missing ages with the median age of the passenger's (Sex, Pclass, Parch) group.
# transform('median') uses the built-in aggregation (no per-group Python lambda)
# and returns one group median per row, aligned to df's index; fillna then only
# writes the rows where Age is missing, leaving known ages untouched.
group_median_age = df.groupby(['Sex', 'Pclass', 'Parch'])['Age'].transform('median')
df['Age'] = df['Age'].fillna(group_median_age)

In [None]:
df.Age.plot(kind='hist', bins=30)

In [None]:
df.isnull().sum()

In [None]:
# drop the remaining rows that still contain nulls (2 rows, per the check above)
df.dropna(axis=0, how='any', inplace=True)

In [None]:
df.isnull().sum()

## Remove / transform columns

In [None]:
# these columns are removed before modelling and are not used as features
non_feature_cols = ['PassengerId', 'Name', 'Ticket']
df.drop(columns=non_feature_cols, inplace=True)

In [None]:
# inspect column types before converting them
df.info()

In [None]:
# encode Sex as a binary is_male flag (male -> 1, female -> 0);
# pop() removes the original Sex column from df in the same step
df['is_male'] = df.pop('Sex').map({'male': 1, 'female': 0})

In [None]:
df['Embarked'].unique()

In [None]:
# One-hot encode Embarked into 0/1 integer columns.
# drop_first=True drops the first category, so the remaining dummies are not
# redundant with each other.
# dtype=int pins the dummy columns to integers on every pandas version (the
# default dummy dtype is bool on pandas >= 2.0), matching the 0/1 encoding
# used for is_male.
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True, dtype=int)

In [None]:
df

# Data Exploration

## Target distribution

In [None]:
# count of passengers per target class
# (pass histnorm='percent' to show shares instead of raw counts)
fig = px.histogram(df, x='Survived', width=600)
fig.show()

## Correlations

In [None]:
# get correlation matrix
df.corr()

In [None]:
# Rank the features by absolute correlation with the target variable.
# The target's own self-correlation (always 1.0) is dropped so that the
# ranking only contains actual features.
df.corr()['Survived'].drop('Survived').abs().sort_values(ascending=False)

# Split dataset

In [None]:
df.columns

In [None]:
# separate the target column from the feature matrix
target_col = 'Survived'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Hold out 15% of the rows for validation.
# IMPORTANT: stratify=y keeps the class ratio of the target the same in the
# training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=101,
)


# Train Logistic Regression

Train on the features as they are. Note that the solver fails to converge without normalization.

In [None]:
# baseline: default logistic regression fitted on the raw, unscaled features
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

**Add normalization**

In [None]:
# Standardize the features before training: the scaling statistics are
# learned from the training split only and then re-used on the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# refit the classifier on the standardized features
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Explore predictions

In [None]:
# predict
model.predict(X_val_scaled)

In [None]:
# predict_proba
model.predict_proba(X_val_scaled)

# Explore metrics

In [None]:
metrics.accuracy_score(y_val, model.predict(X_val_scaled))

In [None]:
model.score(X_val_scaled, y_val)

In [None]:
metrics.confusion_matrix(y_val, model.predict(X_val_scaled))

In [None]:
# show the confusion matrix as a labelled DataFrame
# (rows = actual class, columns = predicted class)
val_pred = model.predict(X_val_scaled)
conf_mat = metrics.confusion_matrix(y_val, val_pred)
pd.DataFrame(
    conf_mat,
    index=['Actual Not Survived (0)', 'Actual Survived (1)'],
    columns=['Predicted Not Survived (0)', 'Predicted Survived (1)'],
)

In [None]:
print(metrics.classification_report(y_val, model.predict(X_val_scaled)))

In [None]:
metrics.recall_score(y_val, model.predict(X_val_scaled))

In [None]:
metrics.precision_score(y_val, model.predict(X_val_scaled))

In [None]:
# double-check overfit
print(metrics.classification_report(y_train, model.predict(X_train_scaled)))

# Decision boundary - THRESHOLD

In [None]:
model.predict(X_val_scaled[:5])

In [None]:
model.predict_proba(X_val_scaled[:5])

In [None]:
# default threshold is 0.5
model.predict_proba(X_val_scaled[:5])[:,1] > 0.5

In [None]:
# let's try changing the threshold

# ROC-AUC

In [None]:
# keep only column 1 of predict_proba: the predicted probability of class 1
val_proba = model.predict_proba(X_val_scaled)
probabilities = val_proba[:, 1]

In [None]:
# ROC curve points on the validation set, and the area under that curve
fpr, tpr, thresholds = metrics.roc_curve(y_val, probabilities)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# Plot the ROC curve with plotly; hovering over a point shows its threshold.

# model ROC curve, with the threshold of every point attached as hover text
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC Curve (AUC = {roc_auc:.2f})',
    text=thresholds,
    hovertemplate='False Positive Rate: %{x:.3f}<br>True Positive Rate: %{y:.3f}<br>Threshold: %{text:.3f}<extra></extra>',
)

# dashed diagonal: reference line for random guessing
chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    line=dict(dash='dash'),
    name='Random Guessing',
)

fig = go.Figure(data=[roc_trace, chance_trace])
fig.update_layout(
    title='Receiver Operating Characteristic (ROC) Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    width=800,
    height=600,
)
fig.show()


In [None]:
# AUC on the validation set.
# Note: AUC is an overall measure of the model's performance; it does not
# depend on which decision threshold is selected.
val_scores = model.predict_proba(X_val_scaled)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_val, val_scores)
metrics.auc(fpr, tpr)

In [None]:
# Same AUC computation on the training set, for comparison with validation.
# Note: fpr / tpr / thresholds are overwritten with the training-set values here.
train_scores = model.predict_proba(X_train_scaled)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_train, train_scores)
metrics.auc(fpr, tpr)

# More

In [None]:
model.predict_proba(X_val_scaled[:2])

In [None]:
model.decision_function(X_val_scaled[:2])

In [None]:
# calc sigmoid of decision function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of *x*, element-wise.

    The value is computed as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp`` so that large-magnitude negative inputs do not overflow
    inside ``np.exp`` (the naive formula triggers an overflow RuntimeWarning
    for them).

    Args:
        x: A scalar or array-like of real numbers, e.g. the output of a
            classifier's ``decision_function``.

    Returns:
        A NumPy scalar or array with the same shape as *x*, with values
        in the closed interval [0, 1].
    """
    x = np.asarray(x, dtype=float)
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), evaluated stably
    return np.exp(-np.logaddexp(0.0, -x))

In [None]:
sigmoid(model.decision_function(X_val_scaled[:2]))

In [None]:
model.coef_

In [None]:
model.intercept_