In [None]:
# Important: some Kaggle data sources are private.
# Run this cell to log in to Kaggle so that the notebook's data sources can
# be imported.
import kagglehub
kagglehub.login()


In [None]:
# Important: run this cell to import the Kaggle data sources used by this
# notebook; afterwards the cell may be deleted.
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.

# Download the 'spaceship-titanic' competition data and keep the value that
# kagglehub returns for it.
spaceship_titanic_path = kagglehub.competition_download('spaceship-titanic')

print('Data source import complete.')


In [None]:
"""
Preprocessing:
Missing values are imputed
Composite features are split (Cabin and PassengerId)

Engineered features:
GroupMeanAge - mean age within the travel group
IsSingle - the passenger travels alone
TotalSpent - the passenger's total spending
ServicesUsed - number of services used
GroupSize - size of the travel group
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Locate the competition files. The standard Kaggle mount is preferred; when
# it is absent (notebook running outside Kaggle) fall back to the directory
# returned by kagglehub.competition_download() in the data-import cell.
DATA_DIR = '/kaggle/input/spaceship-titanic'
if not os.path.isdir(DATA_DIR):
    try:
        DATA_DIR = spaceship_titanic_path
    except NameError:
        # The import cell was removed; keep the default path so that
        # read_csv below reports the missing file, exactly as before.
        pass

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Opt in to pandas' future no-silent-downcasting behaviour.
# NOTE(review): presumably set for the fillna calls on object columns in
# preprocess_data - confirm.
pd.set_option('future.no_silent_downcasting', True)

def preprocess_data(df):
    """Split composite columns and impute missing values.

    Works on a copy; the caller's frame is left untouched.

    Steps:
        * ``Cabin`` (when present) is split on ``/`` into ``Deck``,
          ``Cabin_num`` and ``Side``.
        * ``PassengerId`` is split on ``_`` into ``Group`` and ``Number``.
        * Numeric columns that are present are filled with their own median.
        * Categorical columns that are present are filled with their own
          most frequent value.

    Args:
        df: DataFrame containing at least a string ``PassengerId`` column.

    Returns:
        A new DataFrame with the extra columns added and gaps filled. A
        categorical column that contains no non-missing value at all has
        nothing to impute from and is returned unchanged.
    """
    df = df.copy()

    if 'Cabin' in df.columns:
        # n=2 caps the split at three parts, and reindex pads absent parts
        # with NaN, so the assignment always receives exactly three columns
        # even when no cabin in the frame carries all three components.
        cabin_parts = (df['Cabin'].str.split('/', n=2, expand=True)
                       .reindex(columns=range(3)))
        df[['Deck', 'Cabin_num', 'Side']] = cabin_parts

    df[['Group', 'Number']] = df['PassengerId'].str.split('_', expand=True)

    numeric_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in numeric_columns:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
    for col in categorical_columns:
        if col in df.columns:
            modes = df[col].mode()
            # mode() is empty when every value is missing; indexing it would
            # raise, so skip imputation for such a column.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    return df

def create_features(df):
    """Return a copy of ``df`` extended with spending and group features.

    Columns added, in this order:
        TotalSpent: sum of the five amenity spending columns.
        ServicesUsed: how many of those columns are strictly positive.
        GroupSize: count of ``PassengerId`` values within the row's ``Group``.
        GroupMeanAge: mean ``Age`` within the row's ``Group``.
        IsSingle: 1 when ``GroupSize`` equals 1, otherwise 0.

    The input frame is not modified.
    """
    enriched = df.copy()
    amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Accumulate with ``+`` so that a missing value in any amenity column
    # propagates into the total instead of being skipped.
    running_total = enriched[amenities[0]]
    for amenity in amenities[1:]:
        running_total = running_total + enriched[amenity]
    enriched['TotalSpent'] = running_total

    enriched['ServicesUsed'] = enriched[amenities].gt(0).sum(axis=1)

    by_group = enriched.groupby('Group')
    enriched['GroupSize'] = by_group['PassengerId'].transform('count')
    enriched['GroupMeanAge'] = by_group['Age'].transform('mean')

    enriched['IsSingle'] = enriched['GroupSize'].eq(1).astype(int)

    return enriched

# Run both datasets through the same two stages: cleaning first, then
# feature engineering on the cleaned frames (train before test each time).
train_processed, test_processed = map(preprocess_data, (train_df, test_df))
train_featured, test_featured = map(create_features, (train_processed, test_processed))

# Feature definitions.
# NOTE: 'IsSingle' (built in create_features) is not listed here, so the model
# never sees it; it is fully determined by GroupSize (GroupSize == 1).
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
                   'TotalSpent', 'ServicesUsed', 'GroupSize', 'GroupMeanAge']
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

# Column-wise preparation for the model: scale numeric inputs, one-hot encode
# categorical ones. handle_unknown='ignore' keeps predict() from failing when
# a category shows up that was absent from the data the encoder was fitted
# on; such a value is encoded as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat',
         OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
         categorical_features)
    ])

# Full pipeline: preprocessing followed by a random forest classifier.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Model inputs: the same columns, in the same order, for both datasets.
feature_columns = numeric_features + categorical_features
X_train = train_featured[feature_columns]
y_train = train_featured['Transported']
X_test = test_featured[feature_columns]

# Fit on the full training set, then predict the test set.
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Assemble the submission (passenger id + predicted label) and write it out.
submission = test_df[['PassengerId']].assign(Transported=predictions)
submission.to_csv('submission.csv', index=False)

# Recover the one-hot column names from the fitted encoder itself instead of
# re-deriving them by hand: the encoder knows exactly which categories it
# learned and which one it dropped, so the names cannot drift out of sync
# with the columns the classifier actually received.
fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
categorical_names = fitted_encoder.get_feature_names_out(categorical_features).tolist()

# The ColumnTransformer emits the 'num' block first, then the 'cat' block.
feature_names = numeric_features + categorical_names

# Table of feature importances, most important first.
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': model.named_steps['classifier'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nТоп-15 самых важных признаков:")
print(feature_importance.head(15))

# Training-set report (kept for reference). These predictions are made on the
# very rows the model was fitted on, so the scores are optimistic and say
# little about performance on unseen passengers.
train_predictions = model.predict(X_train)
print("\nОценка качества модели на тренировочном наборе:")
print(classification_report(y_train, train_predictions))

# Hold-out estimate: refit an unfitted copy of the pipeline on 80% of the
# training data and score it on the remaining 20%. A clone is used so the
# submission model fitted on the full training set stays untouched.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
# A rare category may land only in the validation part; let the encoder of
# the validation copy ignore it instead of raising.
validation_model = clone(model).set_params(
    preprocessor__cat__handle_unknown='ignore')
validation_model.fit(X_fit, y_fit)
val_predictions = validation_model.predict(X_val)
print("\nОценка качества модели на отложенной выборке (20%):")
print(f"Accuracy: {accuracy_score(y_val, val_predictions):.4f}")
print(classification_report(y_val, val_predictions))