In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import re

## Subtask 1

In [67]:
df = pd.read_csv('Titanic Dataset.csv')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [68]:
missing_percentage = (df.isnull().sum() / len(df)) * 100

print(missing_perc)

pclass        0.000000
survived      0.000000
name          0.000000
sex           0.000000
age          20.091673
sibsp         0.000000
parch         0.000000
ticket        0.000000
fare          0.076394
cabin        77.463713
embarked      0.152788
boat         62.872422
body         90.756303
home.dest    43.086325
dtype: float64


## Subtask 2

In [69]:
# Tworzenie zmiennej binarnej dla braku wartości w kolumnie 'age' (1, jeśli brakuje; 0, jeśli jest)
df['age_missing'] = df['age'].isnull().astype(int)

# Analiza zależności 'age_missing' od zmiennej 'sex'
sex_dependency = df.groupby('sex')['age_missing'].mean() * 100

# Analiza zależności 'age_missing' od zmiennej 'pclass' (klasa pasażerska)
pclass_dependency = df.groupby('pclass')['age_missing'].mean() * 100

print("\nZależność braku wartości 'age' od zmiennej 'sex' (procent braków w każdej grupie):")
print(sex_dependency)

print("\nZależność braku wartości 'age' od zmiennej 'pclass' (procent braków w każdej grupie):")
print(pclass_dependency)


Zależność braku wartości 'age' od zmiennej 'sex' (procent braków w każdej grupie):
sex
female    16.738197
male      21.945433
Name: age_missing, dtype: float64

Zależność braku wartości 'age' od zmiennej 'pclass' (procent braków w każdej grupie):
pclass
1    12.074303
2     5.776173
3    29.337094
Name: age_missing, dtype: float64


## Subtask 3

In [70]:
## podpkt 3
threshold = 70
high_missing_cols = missing_percentage[missing_percentage > threshold].index.tolist()
df_baseline = df.drop(columns=high_missing_cols, inplace=False)
print(f"✅ Usunięto kolumny o brakach > {threshold}%: {high_missing_cols}")

y = df_baseline['survived']
X = df_baseline.drop('survived', axis=1)

numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)
baseline_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])
baseline_pipeline.fit(X_train, y_train)

print("✅ Główny Pipeline (Preprocessing + Logistic Regression) został wytrenowany.")

y_pred = baseline_pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print("\n--- Wyniki Oceny Modelu Baselinowego z Pipeline ---")
print(f"Dokładność (Accuracy) na zbiorze testowym: {accuracy:.4f}")
print("\nRaport Klasyfikacji:")
print(classification_report(y_test, y_pred))

✅ Usunięto kolumny o brakach > 70%: ['cabin', 'body']
✅ Główny Pipeline (Preprocessing + Logistic Regression) został wytrenowany.

--- Wyniki Oceny Modelu Baselinowego z Pipeline ---
Dokładność (Accuracy) na zbiorze testowym: 0.9542

Raport Klasyfikacji:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       162
           1       0.97      0.91      0.94       100

    accuracy                           0.95       262
   macro avg       0.96      0.95      0.95       262
weighted avg       0.95      0.95      0.95       262





## Subtask 4

In [71]:
df_advanced = df.copy()

df_advanced['Cabin_Missing'] = df_advanced['cabin'].isnull().astype(int)
df_advanced.drop(columns=['cabin'], inplace=True)
print("✅ Utworzono cechę 'Cabin_Missing' (1 - brak wartości w Cabin, 0 - jest wartość). i usunieto kolumnę 'cabin'.")


def extract_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

df_advanced['Title'] = df_advanced['name'].apply(extract_title)

rare_titles = df_advanced['Title'].value_counts() < 10
df_advanced['Title'] = df_advanced['Title'].apply(lambda x: 'Rare' if rare_titles[x] else x)

df_advanced['Title'] = df_advanced['Title'].replace(['Mlle', 'Ms'], 'Miss')
df_advanced['Title'] = df_advanced['Title'].replace('Mme', 'Mrs')

print(f"✅ Utworzono i skorygowano cechę 'Title'. Unikalne tytuły: {df_advanced['Title'].unique().tolist()}")


title_age_median = df_advanced.groupby('Title')['age'].median()
def impute_age_by_title(row):
    if pd.isnull(row['age']):
        return title_age_median[row['Title']]
    else:
        return row['age']

df_advanced['age'] = df_advanced.apply(impute_age_by_title, axis=1)

print("✅ Kolumna 'age' uzupełniona medianą zależną od 'Title'.")


#usuniecie kolumn ktore maja duzo brakow
threshold = 70
high_missing_cols = missing_percentage[missing_percentage > threshold].index.tolist()
df_advanced = df.drop(columns=high_missing_cols, inplace=False)


y = df_advanced['survived']
X = df_advanced.drop('survived', axis=1)


numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, [col for col in numerical_features if col in X_train.columns]),
        ('cat', categorical_transformer, [col for col in categorical_features if col in X_train.columns])
    ],
    remainder='drop'
)

advanced_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42, max_iter=1000))
])

advanced_pipeline.fit(X_train, y_train)

print("\n✅ Zaawansowany Pipeline (Feature Engineering + Imputacja + Regresja Logistyczna) został wytrenowany.")

y_pred = advanced_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- Wyniki Oceny Modelu Zaawansowanego ---")
print(f"Dokładność (Accuracy) na zbiorze testowym: {accuracy:.4f}")
print("\nRaport Klasyfikacji:")
print(classification_report(y_test, y_pred))

✅ Utworzono cechę 'Cabin_Missing' (1 - brak wartości w Cabin, 0 - jest wartość). i usunieto kolumnę 'cabin'.
✅ Utworzono i skorygowano cechę 'Title'. Unikalne tytuły: ['Miss', 'Master', 'Mr', 'Mrs', 'Rare']
✅ Kolumna 'age' uzupełniona medianą zależną od 'Title'.

✅ Zaawansowany Pipeline (Feature Engineering + Imputacja + Regresja Logistyczna) został wytrenowany.

--- Wyniki Oceny Modelu Zaawansowanego ---
Dokładność (Accuracy) na zbiorze testowym: 0.9542

Raport Klasyfikacji:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       162
           1       0.97      0.91      0.94       100

    accuracy                           0.95       262
   macro avg       0.96      0.95      0.95       262
weighted avg       0.95      0.95      0.95       262



  title_search = re.search(' ([A-Za-z]+)\.', name)
