In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

In [None]:
# Load seaborn's "titanic" example dataset into a DataFrame and preview
# the first 20 rows.
df = sns.load_dataset("titanic")

df.head(n=20)

## Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# Concise overview of the frame: columns, non-null counts and dtypes
df.info()

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Descriptive summary statistics produced by pandas' describe()
df.describe()

In [None]:
# Class balance of the target: number of passengers per `survived` value.
sns.countplot(data=df, x="survived")
plt.title("Survival Count")
# Render explicitly so the cell does not echo the Text object returned by
# plt.title(), matching the other plotting cells in this notebook.
plt.show()

In [None]:
# Numeric features that the univariate plots below iterate over.
numeric_cols = ["age", "fare", "sibsp", "parch"]

# One 30-bin histogram per numeric feature, drawn as a single figure.
df.loc[:, numeric_cols].hist(figsize=(12, 8), bins=30)
plt.gcf().suptitle("Distribution of Numeric Features")
plt.gcf().tight_layout()
plt.show()

In [None]:
# One boxplot per numeric feature, each rendered as its own figure.
for col in numeric_cols:
    box_ax = sns.boxplot(x=df[col])
    box_ax.set_title(f"Boxplot of {col}")
    plt.show()

In [None]:
# Categorical features that the count plots iterate over.
categorical_cols = ["sex", "pclass", "embarked", "alone"]

# Frequency of each category, one figure per feature.
for col in categorical_cols:
    count_ax = sns.countplot(data=df, x=col)
    count_ax.set_title(f"Countplot of {col}")
    plt.show()

In [None]:
# Histogram grid generated by DataFrame.hist for the whole frame (30 bins).
df.hist(figsize=(12, 8), bins=30)
plt.gcf().tight_layout()
plt.show()

In [None]:
# Density histograms of each numeric feature, split by survival outcome.
# common_norm=False normalises each survival group independently.
for col in numeric_cols:
    hist_ax = sns.histplot(
        data=df,
        x=col,
        hue="survived",
        element="step",
        stat="density",
        common_norm=False,
    )
    hist_ax.set_title(f"{col} by Survival")
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns, annotated with the
# coefficient in every cell.
plt.figure(figsize=(8, 6))  # extra room so the annotations stay legible
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix of Numeric Features")
plt.tight_layout()
# Render explicitly so the cell does not echo the Axes repr.
plt.show()

In [None]:
# Pairwise relationship grid for the frame (seaborn pairplot, default settings)
sns.pairplot(df)

In [None]:
# Same pairwise grid, with points coloured by the `survived` column
sns.pairplot(df, hue="survived")

In [None]:
# Category frequencies broken down by survival outcome, one figure per feature.
for col in categorical_cols:
    grouped_ax = sns.countplot(data=df, x=col, hue="survived")
    grouped_ax.set_title(f"{col} vs. Survival")
    plt.show()

## Models

In [None]:
# Modelling setup: predict `survived` from three passenger attributes.
target = "survived"
features = ["pclass", "age", "fare"]

# Feature matrix and label vector taken from the current state of `df`.
X = df.loc[:, features]
y = df.loc[:, target]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline linear regression fitted BEFORE any missing-value handling.
# LinearRegression rejects NaN inputs and the feature matrix may still
# contain missing values at this point (imputation happens further down),
# so this baseline is restricted to complete rows on both sides of the split.
train_complete = X_train.notna().all(axis=1)
test_complete = X_test.notna().all(axis=1)

lin_model = LinearRegression()
lin_model.fit(X_train[train_complete], y_train[train_complete])

# Evaluate only on test rows that have every feature present, keeping the
# labels aligned with the predictions.
y_test_complete = y_test[test_complete]
y_pred = lin_model.predict(X_test[test_complete])

print("MSE:", mean_squared_error(y_test_complete, y_pred))
print("R² Score:", r2_score(y_test_complete, y_pred))

# Plot predicted vs. actual
plt.figure(figsize=(6, 4))
plt.scatter(y_test_complete, y_pred, alpha=0.6)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Linear Regression Predictions")

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
plot_path = Path("results/linear_regression_plot.png")
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path)
plt.show()

In [None]:
# Missing values per column, re-checked before the imputation step
df.isnull().sum()

In [None]:
# Visual overview of missing entries in the frame.
# `msno` (missingno) is imported together with the other dependencies in
# the import cell at the top of the notebook.
msno.matrix(df)

In [None]:
# (rows, columns) before the missing-value handling below
df.shape

In [None]:
# Fill missing ages with the column median rather than dropping those rows,
# so the row count of the frame is unchanged.
# NOTE(review): the median is computed on the full frame before the
# train/test split that follows, so held-out rows influence the fill value;
# consider deriving it from the training split only.
df["age"] = df["age"].fillna(df["age"].median())

# Guard the invariant the modelling cells rely on: no NaN left in `age`.
assert df["age"].notna().all(), "age still contains missing values after imputation"

In [None]:
# Rebuild the feature matrix and labels so they reflect the updated `df`.
X = df.loc[:, features]
y = df.loc[:, target]

In [None]:
# Same 80/20 split with the same seed, now applied to the rebuilt X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Refit the linear model on the current split and score it on the hold-out rows.
lin_model = LinearRegression().fit(X_train, y_train)
y_pred = lin_model.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# Predicted vs. actual scatter for the hold-out rows.
scatter_fig, scatter_ax = plt.subplots(figsize=(6, 4))
scatter_ax.scatter(y_test, y_pred, alpha=0.6)
scatter_ax.set_xlabel("Actual")
scatter_ax.set_ylabel("Predicted")
scatter_ax.set_title("Linear Regression Predictions")
plt.show()

In [None]:
# Logistic regression classifier on the same split; max_iter is raised to
# 1000 iterations for the solver.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class predictions for the hold-out rows (overwrites the regression y_pred).
y_pred = log_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Confusion matrix of the latest predictions as an annotated heatmap
# (integer counts in every cell).
cm_ax = sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt="d",
    cmap="Blues",
)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Confusion Matrix")
plt.show()