In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

# 🔹 1. Загружаем данные
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 🔹 2. Создаём новые признаки (Feature Engineering)
# 2.1 Извлекаем титул из имени (Mr, Miss, Mrs и т. д.)
train_df['Title'] = train_df['Name'].apply(lambda x: x.split(', ')[1].split('.')[0])
test_df['Title'] = test_df['Name'].apply(lambda x: x.split(', ')[1].split('.')[0])

# Группируем редкие титулы
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
train_df['Title'] = train_df['Title'].replace(rare_titles, 'Rare')
test_df['Title'] = test_df['Title'].replace(rare_titles, 'Rare')

# 2.2 Создаём признак "Deck" (палуба) из номера каюты
train_df['Deck'] = train_df['Cabin'].fillna('M').apply(lambda x: x[0])  # M = No Cabin
test_df['Deck'] = test_df['Cabin'].fillna('M').apply(lambda x: x[0])

# 2.3 Флаг "Была ли каюта"
train_df['HasCabin'] = (train_df['Cabin'].notna()).astype(int)
test_df['HasCabin'] = (test_df['Cabin'].notna()).astype(int)

# 2.4 Создаём семейные признаки
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

train_df['IsAlone'] = (train_df['FamilySize'] == 1).astype(int)
test_df['IsAlone'] = (test_df['FamilySize'] == 1).astype(int)

# 2.5 Другие полезные признаки
train_df['Fare_Per_Person'] = train_df['Fare'] / train_df['FamilySize']
test_df['Fare_Per_Person'] = test_df['Fare'] / test_df['FamilySize']

train_df['Age*Class'] = train_df['Age'] * train_df['Pclass']
test_df['Age*Class'] = test_df['Age'] * test_df['Pclass']

# 🔹 3. Сохраняем PassengerId для финального сабмита
passenger_ids = test_df['PassengerId']

# 🔹 4. Убираем ненужные колонки
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_df = train_df.drop(drop_cols, axis=1)
test_df = test_df.drop(drop_cols, axis=1)

# 🔹 5. Разделяем на признаки (X) и целевую переменную (y)
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']
X_test = test_df

# 🔹 6. Заполняем пропущенные значения (Age, Fare, Embarked)
imputer = SimpleImputer(strategy="median")
X_train[['Age', 'Fare', 'Age*Class']] = imputer.fit_transform(X_train[['Age', 'Fare', 'Age*Class']])
X_test[['Age', 'Fare', 'Age*Class']] = imputer.transform(X_test[['Age', 'Fare', 'Age*Class']])

# Пересчитываем Fare_Per_Person после заполнения Fare
X_train['Fare_Per_Person'] = X_train['Fare'] / X_train['FamilySize']
X_test['Fare_Per_Person'] = X_test['Fare'] / X_test['FamilySize']

# 🔹 7. Заполняем пропущенные значения в категориальных признаках
categorical_features = ['Sex', 'Embarked', 'Pclass', 'Title', 'Deck']
for col in categorical_features:
    most_frequent = X_train[col].mode()[0]
    X_train[col].fillna(most_frequent, inplace=True)
    X_test[col].fillna(most_frequent, inplace=True)

# 🔹 8. Кодируем категории для **RandomForest** и **GradientBoosting**
le = LabelEncoder()
for col in categorical_features:
    X_train[col + '_num'] = le.fit_transform(X_train[col])
    X_test[col + '_num'] = le.transform(X_test[col])

# Удаляем исходные категориальные колонки
X_train = X_train.drop(categorical_features, axis=1)
X_test = X_test.drop(categorical_features, axis=1)

# 🔹 9. Обучаем CatBoost
# Для CatBoost используем исходные категориальные признаки
X_train_catboost = X_train.copy()
X_test_catboost = X_test.copy()

catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.02, depth=8, l2_leaf_reg=3, random_seed=42, verbose=100)
catboost_model.fit(X_train_catboost, y_train)

# 🔹 10. Обучаем RandomForest и GradientBoosting
rf_model = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

gb_model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42)
gb_model.fit(X_train, y_train)

# 🔹 11. Усредняем предсказания (CatBoost + RF + GB)
cat_preds = catboost_model.predict_proba(X_test_catboost)[:, 1]
rf_preds = rf_model.predict_proba(X_test)[:, 1]
gb_preds = gb_model.predict_proba(X_test)[:, 1]

final_predictions = (cat_preds * 0.5 + rf_preds * 0.3 + gb_preds * 0.2) > 0.5
final_predictions = final_predictions.astype(int)

# 🔹 12. Создаём файл gender_submission.csv
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': final_predictions
})

# Сохраняем предсказания в файл
submission.to_csv('gender_submission.csv', index=False)

# 🔹 13. Проверяем точность на обучающих данных (для оценки)
train_predictions = catboost_model.predict(X_train_catboost)
accuracy = accuracy_score(y_train, train_predictions)
print(f"🔹 Точность на обучающих данных: {accuracy:.2f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(most_frequent, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(most_frequent, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting val

0:	learn: 0.6788850	total: 3.15ms	remaining: 3.15s
100:	learn: 0.3417354	total: 243ms	remaining: 2.17s
200:	learn: 0.2870961	total: 498ms	remaining: 1.98s
300:	learn: 0.2550872	total: 752ms	remaining: 1.75s
400:	learn: 0.2211030	total: 1.01s	remaining: 1.51s
500:	learn: 0.1969145	total: 1.28s	remaining: 1.28s
600:	learn: 0.1742177	total: 1.56s	remaining: 1.04s
700:	learn: 0.1556685	total: 1.83s	remaining: 780ms
800:	learn: 0.1401056	total: 2.1s	remaining: 521ms
900:	learn: 0.1276031	total: 2.36s	remaining: 260ms
999:	learn: 0.1170511	total: 2.62s	remaining: 0us
🔹 Точность на обучающих данных: 0.97


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from catboost import CatBoostClassifier

class TitanicPreprocessor:
    def __init__(self):
        self.categorical_features = ['Sex', 'Embarked', 'Pclass', 'Title', 'Deck']
        self.numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
        self.label_encoders = {}
        self.numerical_imputer = SimpleImputer(strategy='median')
        self.categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    def extract_title(self, name):
        title = name.split(', ')[1].split('.')[0]
        rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 
                      'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
        return 'Rare' if title in rare_titles else title
    
    def create_features(self, df):
        """Create all feature engineering in one place"""
        df = df.copy()
        
        # Extract title from name
        df['Title'] = df['Name'].apply(self.extract_title)
        
        # Cabin features
        df['Deck'] = df['Cabin'].fillna('M').str[0]
        df['HasCabin'] = df['Cabin'].notna().astype(int)
        
        # Family features
        df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
        df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
        
        # Fare features
        df['Fare_Per_Person'] = df['Fare'] / df['FamilySize']
        
        # Age interaction will be computed after imputation
        return df
    
    def fit_transform(self, df):
        """Full preprocessing pipeline for training data"""
        # Create initial features
        df = self.create_features(df)
        
        # Handle missing values in numerical features
        numerical_features_matrix = self.numerical_imputer.fit_transform(df[self.numerical_features])
        df[self.numerical_features] = numerical_features_matrix
        
        # Now we can safely compute Age*Class
        df['Age*Class'] = df['Age'] * df['Pclass']
        
        # Handle missing values in categorical features
        categorical_features_matrix = self.categorical_imputer.fit_transform(df[self.categorical_features])
        df[self.categorical_features] = categorical_features_matrix
        
        # Encode categorical features
        for col in self.categorical_features:
            self.label_encoders[col] = LabelEncoder()
            df[col + '_encoded'] = self.label_encoders[col].fit_transform(df[col])
        
        return df
    
    def transform(self, df):
        """Transform test data using fitted preprocessor"""
        # Create initial features
        df = self.create_features(df)
        
        # Handle missing values using fitted imputers
        numerical_features_matrix = self.numerical_imputer.transform(df[self.numerical_features])
        df[self.numerical_features] = numerical_features_matrix
        
        # Now we can safely compute Age*Class
        df['Age*Class'] = df['Age'] * df['Pclass']
        
        categorical_features_matrix = self.categorical_imputer.transform(df[self.categorical_features])
        df[self.categorical_features] = categorical_features_matrix
        
        # Use fitted label encoders
        for col in self.categorical_features:
            df[col + '_encoded'] = self.label_encoders[col].transform(df[col])
        
        return df

def train_models(X_train, y_train):
    """Train multiple models with cross-validation"""
    models = {
        'catboost': CatBoostClassifier(
            iterations=1000,
            learning_rate=0.02,
            depth=8,
            l2_leaf_reg=3,
            random_seed=42,
            verbose=False
        ),
        'random_forest': RandomForestClassifier(
            n_estimators=300,
            max_depth=10,
            random_state=42
        ),
        'gradient_boosting': GradientBoostingClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=5,
            random_state=42
        )
    }
    
    trained_models = {}
    for name, model in models.items():
        # Perform cross-validation
        scores = cross_val_score(model, X_train, y_train, cv=5)
        print(f"{name} CV Score: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
        
        # Train on full training data
        model.fit(X_train, y_train)
        trained_models[name] = model
    
    return trained_models

def main():
    # Load data
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    
    # Initialize preprocessor
    preprocessor = TitanicPreprocessor()
    
    # Preprocess training data
    processed_train = preprocessor.fit_transform(train_df)
    
    # Prepare features for training
    feature_cols = ([f"{col}_encoded" for col in preprocessor.categorical_features] + 
                   preprocessor.numerical_features + 
                   ['HasCabin', 'IsAlone', 'Fare_Per_Person', 'Age*Class'])
    
    X_train = processed_train[feature_cols]
    y_train = processed_train['Survived']
    
    # Verify no missing values
    assert not X_train.isnull().any().any(), "Missing values found in training features"
    
    # Train models
    trained_models = train_models(X_train, y_train)
    
    # Process test data
    processed_test = preprocessor.transform(test_df)
    X_test = processed_test[feature_cols]
    
    # Verify no missing values in test data
    assert not X_test.isnull().any().any(), "Missing values found in test features"
    
    # Make predictions
    weights = {'catboost': 0.5, 'random_forest': 0.3, 'gradient_boosting': 0.2}
    weighted_predictions = np.zeros(len(X_test))
    
    for name, model in trained_models.items():
        weighted_predictions += model.predict_proba(X_test)[:, 1] * weights[name]
    
    final_predictions = (weighted_predictions > 0.5).astype(int)
    
    # Create submission file
    submission = pd.DataFrame({
        'PassengerId': test_df['PassengerId'],
        'Survived': final_predictions
    })
    submission.to_csv('submission.csv', index=False)
    
    # Print training metrics
    print("\nTraining Metrics:")
    for name, model in trained_models.items():
        train_preds = model.predict(X_train)
        print(f"\n{name} Performance:")
        print(classification_report(y_train, train_preds))

if __name__ == "__main__":
    main()

catboost CV Score: 0.8294 (+/- 0.0812)
random_forest CV Score: 0.8350 (+/- 0.0706)
gradient_boosting CV Score: 0.8317 (+/- 0.0585)


AssertionError: Missing values found in test features