In [2]:
import pandas as pd
import numpy as np
import pathlib as path
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
# Absolute locations of the Titanic train/test CSV files (machine-specific paths).
train_path, test_path = (
    r'C:\Users\user\Documents\GitHub\Notebooks-on-ml\TITANIC\Data\train.csv',
    r'C:\Users\user\Documents\GitHub\Notebooks-on-ml\TITANIC\Data\test.csv',
)
# Read both files up front; every later step works on these two frames.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

def get_safe_survival_mapping(train_fs, test_fs):
    """Compute a ``Family_Survival`` score for every train and test passenger.

    The score starts at 0.5 ("unknown") and becomes 1 or 0 when the known
    outcomes of related passengers allow it:

    * Train rows look at the *other* members of their (Surname, Fare) group,
      and, if still undecided, at the other holders of the same Ticket.
      A row's own ``Survived`` value is never used for its own score.
    * Test rows whose name contains ``Mr.`` are set to 0 outright; the
      remaining test rows look at train rows sharing (Surname, Fare) or Ticket.

    Args:
        train_fs: Frame with at least ``PassengerId``, ``Name``, ``Fare``,
            ``Ticket`` and ``Survived`` columns. Not modified.
        test_fs: Frame with at least ``PassengerId``, ``Name``, ``Fare`` and
            ``Ticket`` columns. Not modified.

    Returns:
        A two-column frame (``PassengerId``, ``Family_Survival``) with the train
        rows followed by the test rows; the original indices are kept as-is.
    """

    def _group_outcome(survived):
        # 1 if somebody in the series survived, 0 if nobody did,
        # None when the maximum is neither (e.g. NaN) and the score must stay put.
        best = survived.max()
        if best == 1.0:
            return 1
        if best == 0.0:
            return 0
        return None

    # Work on copies so the helper columns never leak into the caller's frames.
    train_fs = train_fs.copy()
    test_fs = test_fs.copy()

    for frame in (train_fs, test_fs):
        frame['Surname'] = frame['Name'].apply(lambda full_name: full_name.split(',')[0])
        frame['Is_Adult_Male'] = frame['Name'].str.contains(r'Mr\.').astype(int)
        frame['Family_Survival'] = 0.5

    # Pass 1: relatives, identified by a shared surname and fare.
    for _, relatives in train_fs.groupby(['Surname', 'Fare']):
        if len(relatives) <= 1:
            continue
        for idx in relatives.index:
            outcome = _group_outcome(relatives.drop(idx)['Survived'])
            if outcome is not None:
                train_fs.loc[idx, 'Family_Survival'] = outcome

    # Pass 2: ticket companions, only for rows pass 1 left undecided.
    for _, companions in train_fs.groupby('Ticket'):
        if len(companions) <= 1:
            continue
        for idx in companions.index:
            if train_fs.loc[idx, 'Family_Survival'] != 0.5:
                continue
            outcome = _group_outcome(companions.drop(idx)['Survived'])
            if outcome is not None:
                train_fs.loc[idx, 'Family_Survival'] = outcome

    # Pass 3: test rows borrow outcomes from matching train rows.
    for idx, passenger in test_fs.iterrows():
        if passenger['Is_Adult_Male'] == 1:
            test_fs.loc[idx, 'Family_Survival'] = 0
            continue

        same_family = (train_fs['Surname'] == passenger['Surname']) & (train_fs['Fare'] == passenger['Fare'])
        same_ticket = train_fs['Ticket'] == passenger['Ticket']
        known = pd.concat([train_fs[same_family], train_fs[same_ticket]])
        if len(known) == 0:
            continue

        outcome = _group_outcome(known['Survived'])
        if outcome is not None:
            test_fs.loc[idx, 'Family_Survival'] = outcome

    id_and_score = ['PassengerId', 'Family_Survival']
    return pd.concat([train_fs[id_and_score], test_fs[id_and_score]])

# Training frame plus a 'Deck' column: first letter of Cabin, 'M' when Cabin is missing.
train_stats = df_train.assign(Deck=df_train['Cabin'].str[0].fillna('M'))
# Mean survival for every (Pclass, Deck) combination seen in the training data.
stats_lookup = (
    train_stats.groupby(['Pclass', 'Deck'])['Survived']
    .mean()
    .reset_index()
    .rename(columns={'Survived': 'Group_Survival_Rate'})
)

def prepare_data(df_input, survival_mapping, train_stats=None):
    """Convert a raw passenger frame into an all-float feature matrix.

    Args:
        df_input: Raw frame with ``PassengerId``, ``Pclass``, ``Name``, ``Sex``,
            ``Age``, ``SibSp``, ``Parch``, ``Ticket``, ``Fare`` and ``Cabin``
            columns (any further columns, e.g. ``Survived``, pass through).
            Not modified.
        survival_mapping: Frame keyed by ``PassengerId`` whose other columns
            are left-joined onto the result.
        train_stats: Optional lookup with ``Pclass``, ``Deck`` and
            ``Group_Survival_Rate`` columns. When given, it is left-joined on
            (Pclass, Deck) and gaps are filled with fixed per-class rates.

    Returns:
        A new frame, same row order as ``df_input``, with identifier/text
        columns dropped, remaining non-numeric columns one-hot encoded and
        every column cast to float.
    """

    def _log1p_positive(value):
        # log(1 + value) for positive values; zero, negative and NaN inputs give 0.
        return np.log1p(value) if value > 0 else 0

    df = df_input.copy()

    df['Is_Adult_Male'] = df['Name'].str.contains(r'Mr\.').astype(int)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(float)
    # The age-based flags read the raw Age column (imputation happens further
    # down), so rows with a missing Age get 0 for both IsMother and IsChild.
    df['IsMother'] = ((df['Sex'] == 1) & (df['Parch'] > 0) & (df['Age'] > 18)).astype(float)
    df['IsChild'] = (df['Age'] < 14).astype(float)
    df['IsFather'] = ((df['Is_Adult_Male'] == 1) & (df['Parch'] > 0)).astype(float)

    # Missing fares are filled with the 3rd-class median. If the frame has no
    # 3rd-class rows, fall back to the overall median instead of raising KeyError.
    fare_by_class = df.groupby('Pclass')['Fare'].median()
    if 3 in fare_by_class.index:
        median_fare = fare_by_class.loc[3]
    else:
        median_fare = df['Fare'].median()
    df['Fare'] = df['Fare'].fillna(median_fare)
    df['Ticket_Group_Size'] = df.groupby('Ticket')['Ticket'].transform('count')
    # The fillna binds to FamilySize (the divisor), not to the quotient.
    df['Fare_Individual'] = df['Fare'] / df['FamilySize'].fillna(df['Fare'])
    # _log1p_positive never returns NaN, so no NaN back-fill is needed afterwards.
    df['Fare_Individual_Log'] = df['Fare_Individual'].apply(_log1p_positive)
    df['Fare_Log'] = df['Fare'].apply(_log1p_positive)

    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    df['Deck'] = df['Cabin'].str[0].fillna('M')
    deck_mapping = {'B': 0.74, 'D': 0.75, 'E': 0.75, 'C': 0.59, 'F': 0.61, 'Rare': 0.45, 'M': 0.23}
    # Deck letters without an entry in the mapping receive the same 0.23 as 'M'.
    df['Deck_Score'] = df['Deck'].map(deck_mapping).fillna(0.23)

    if train_stats is not None:
        df = df.merge(train_stats, on=['Pclass', 'Deck'], how='left')
        # (Pclass, Deck) pairs absent from the lookup fall back to a per-class rate.
        pclass_fill = {1: 0.63, 2: 0.47, 3: 0.24}
        df['Group_Survival_Rate'] = df['Group_Survival_Rate'].fillna(df['Pclass'].map(pclass_fill))

    # Title is the text between the comma and the first period of the name.
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    title_mapping = {
        'Rev': 'Service', 'Dr': 'Service',
        'Jonkheer': 'Noble', 'Don': 'Noble', 'Sir': 'Noble', 'Lady': 'Noble', 'Countess': 'Noble', 'Dona': 'Noble',
        'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    }
    df['Title'] = df['Title'].replace(title_mapping)
    df['Title'] = df['Title'].replace(['Noble', 'Officer', 'Service'], 'Elite&Other')

    # Impute Age with the median of the passenger's (Pclass, Sex, Title) group.
    # A group with no observed age at all would stay NaN, so those rows fall
    # back to the median of all observed ages in this frame.
    overall_age_median = df['Age'].median()
    df['Age'] = df.groupby(['Pclass', 'Sex', 'Title'])['Age'].transform(lambda x: x.fillna(x.median()))
    df['Age'] = df['Age'].fillna(overall_age_median)

    df = df.merge(survival_mapping, on='PassengerId', how='left')
    cols_to_drop = ['Name', 'Ticket', 'Cabin', 'PassengerId', 'SibSp', 'Parch', 'Is_Adult_Male']
    df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])
    # One-hot encode whatever non-numeric columns remain (e.g. Deck, Title).
    df = pd.get_dummies(df)
    return df.astype(float)

# --- Feature matrices -------------------------------------------------------
full_survival_map = get_safe_survival_mapping(df_train, df_test)

y = df_train['Survived']
X = prepare_data(df_train, full_survival_map).drop(columns=['Survived'])
# Align the test matrix to the training columns; dummy columns that only
# exist in the training data are added as zeros.
X_test_final = prepare_data(df_test, full_survival_map).reindex(columns=X.columns, fill_value=0)

# --- Base learners ----------------------------------------------------------
rf_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_split': 17,
    'min_samples_leaf': 16,
    'max_features': 'sqrt',
    'criterion': 'entropy',
    'random_state': 42,
}
model = RandomForestClassifier(**rf_params)

xgb_params = {
    'n_estimators': 327,
    'max_depth': 5,
    'learning_rate': 0.013337104067134765,
    'subsample': 0.778773230181126,
    'colsample_bytree': 0.6683926691242267,
    'reg_alpha': 0.009709205846637647,
    'reg_lambda': 0.04033145090390927,
    'gamma': 1.4180766941558238,
    'min_child_weight': 9,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
xgb_model = XGBClassifier(**xgb_params)

# Each base learner is wrapped in sigmoid calibration fitted with 5-fold CV.
calibrated_rf = CalibratedClassifierCV(model, method='sigmoid', cv=5)
calibrated_xgb = CalibratedClassifierCV(xgb_model, method='sigmoid', cv=5)

# --- Stacked ensemble -------------------------------------------------------
base_learners = [('rf', calibrated_rf), ('xgb', calibrated_xgb)]
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1,
    passthrough=False,
)
stacking_model.fit(X, y)

# --- Prediction and submission file -----------------------------------------
# Column 1 of predict_proba is the probability of the positive class.
probs = stacking_model.predict_proba(X_test_final)[:, 1]
predictions = (probs >= 0.5).astype(int)

submission = pd.DataFrame(
    {
        "PassengerId": df_test["PassengerId"],
        "Survived": predictions,
    }
)
submission.to_csv('submission_final.csv', index=False)
# The message below is Russian for "The file is ready to submit!".
print("Файл готов к отправке!")

Файл готов к отправке!
