In [47]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib



In [2]:
# Загрузка данных
df = pd.read_csv("train.csv")

In [3]:
df[['Survived', 'Pclass', 'SibSp', 'Parch']] = df[['Survived', 'Pclass', 'SibSp', 'Parch']].astype('int8')

In [4]:
# Удаление столбца PassengerId
df.drop(columns=['PassengerId', 'Ticket'], inplace=True)

In [5]:
df["Sex"] = df["Sex"].map({"male": 0, "female": 1}).astype("int8")

In [6]:
# Вручную заполняем пропущенные значения
df.loc[df['Name'].str.contains('Icard, Miss. Amelie'), 'Embarked'] = 'C'  # Шербур
df.loc[df['Name'].str.contains('Stone, Mrs. George Nelson'), 'Embarked'] = 'S'  # Саутгемптон

In [7]:
# Создаём объект KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Применяем imputer к данным
df[['Age']] = imputer.fit_transform(df[['Age']])

In [8]:
df = pd.get_dummies(df, columns=['Embarked', 'Pclass'], dtype=np.int8)

In [9]:
# Define age group boundaries and labels
age_bins = [0, 1, 5, 10, 14, 18, 30, 50, 70, df['Age'].max()]
age_labels = [
    "Infant", 
    "Toddler", 
    "Child", 
    "Young_Teen", 
    "Teenager", 
    "Young_Adult", 
    "Adult", 
    "Senior", 
    "Elderly"
]

# Create a new column with age groups
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

In [10]:
df = pd.get_dummies(df, columns=['AgeGroup'], prefix='AgeGroup', dtype=np.int8)

In [11]:
# Создание нового признака FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Признак одиночного пассажира
df['IsAlone'] = (df['FamilySize'] == 1).astype(np.int8)

# Стоимость билета на человека
df['FarePerPerson'] = df['Fare'] / df['FamilySize']

In [12]:
# Извлечение титула (обращения) из имени
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.')

# Удаление титула из Name
df['Name'] = df['Name'].str.replace(r' [A-Za-z]+\.', '', regex=True).str.strip()



In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Survived              891 non-null    int8   
 1   Name                  891 non-null    int32  
 2   Sex                   891 non-null    int8   
 3   Age                   891 non-null    float64
 4   SibSp                 891 non-null    int8   
 5   Parch                 891 non-null    int8   
 6   Fare                  891 non-null    float64
 7   Embarked_C            891 non-null    int8   
 8   Embarked_Q            891 non-null    int8   
 9   Embarked_S            891 non-null    int8   
 10  Pclass_1              891 non-null    int8   
 11  Pclass_2              891 non-null    int8   
 12  Pclass_3              891 non-null    int8   
 13  AgeGroup_Infant       891 non-null    int8   
 14  AgeGroup_Toddler      891 non-null    int8   
 15  AgeGroup_Child        8

In [14]:
title_mapping = {
    'Mr': 'Mr',
    'Miss': 'Miss',
    'Mrs': 'Mrs',
    'Master': 'Master',
    'Dr': 'Rare',
    'Rev': 'Rare',
    'Mlle': 'Miss',  # Mlle - это французский аналог Miss
    'Major': 'Rare',
    'Col': 'Rare',
    'Countess': 'Noble',
    'Capt': 'Rare',
    'Ms': 'Miss',  # Ms можно приравнять к Miss
    'Sir': 'Noble',
    'Lady': 'Noble',
    'Mme': 'Mrs',  # Mme - это французский аналог Mrs
    'Don': 'Noble',
    'Jonkheer': 'Noble'
}

df['Title'] = df['Title'].map(title_mapping)


In [15]:
# Кодируем титулы с помощью LabelEncoder
le = LabelEncoder()
df['Title'] = le.fit_transform(df['Title'])

In [16]:
le = LabelEncoder()
df['Name'] = le.fit_transform(df['Name'].str.split(',').str[0])  # Кодируем только фамилию

In [31]:
df.head()

Unnamed: 0,Survived,Name,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,...,AgeGroup_Adult,AgeGroup_Senior,AgeGroup_Elderly,FamilySize,IsAlone,FarePerPerson,Title,Cabin_Count,Cabin_Letter,Cabin_Number
0,0,73,0,22.0,1,0,7.25,0,0,1,...,0,0,0,2,0,3.625,2,1.0,5,40.5
1,1,136,1,38.0,1,0,71.2833,1,0,0,...,1,0,0,2,0,35.64165,3,1.0,2,85.0
2,1,251,1,26.0,0,0,7.925,0,0,1,...,0,0,0,1,1,7.925,1,1.0,5,40.5
3,1,198,1,35.0,1,0,53.1,0,0,1,...,1,0,0,2,0,26.55,3,1.0,2,123.0
4,0,11,0,35.0,0,0,8.05,0,0,1,...,1,0,0,1,1,8.05,2,1.0,5,40.5


In [18]:
    # Создаем признак количества кают
df['Cabin_Count'] = df['Cabin'].str.split().str.len().fillna(1)

In [19]:
# Разделяем множественные каюты и берем первую
df['Primary_Cabin'] = df['Cabin'].str.split().str[0]

In [20]:
# Разделяем номер основной каюты на букву и число
df['Cabin_Letter'] = df['Primary_Cabin'].str[0]
df['Cabin_Number'] = df['Primary_Cabin'].str.extract('(\d+)').astype(float)

  df['Cabin_Number'] = df['Primary_Cabin'].str.extract('(\d+)').astype(float)


In [30]:
df.filter(like='Cabin', axis=1)


Unnamed: 0,Cabin_Count,Cabin_Letter,Cabin_Number
0,1.0,5,40.5
1,1.0,2,85.0
2,1.0,5,40.5
3,1.0,2,123.0
4,1.0,5,40.5
...,...,...,...
886,1.0,3,40.5
887,1.0,1,42.0
888,1.0,5,40.5
889,1.0,2,148.0


In [22]:
# Удаление столбца Primary_Cabin
df.drop(columns=['Primary_Cabin','Cabin'], inplace=True)

In [23]:
# Разделение данных на train и test (train - с известными Cabin_Letter, test - с NaN)
train_df_Cabin_Letter = df.dropna(subset=['Cabin_Letter'])

# Формирование test_df для строк, где 'Cabin_Letter' содержит NaN
test_df_Cabin_Letter = df[df['Cabin_Letter'].isna()]  # строки с NaN в 'Cabin_Letter'

In [25]:
# Шаг 1: Подготовка данных
X = train_df_Cabin_Letter.drop(columns=['Cabin_Letter'])  # Убираем целевой столбец и 'Cabin'
y = train_df_Cabin_Letter['Cabin_Letter']  # Целевая переменная

# Шаг 3: Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Шаг 4: Обучение модели (например, случайный лес)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Шаг 5: Оценка модели
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Точность модели: {accuracy:.4f}")

Точность модели: 0.7317


In [27]:
# Шаг 1: Подготовка данных для тестовой выборки
X_test_df_Cabin_Letter = test_df_Cabin_Letter.drop(columns=['Cabin_Letter'])  # Убираем целевой столбец и 'Cabin'

# Шаг 2: Предсказания для test_df_Cabin_Letter
predictions = model.predict(X_test_df_Cabin_Letter)

# Шаг 3: Запись предсказаний в основной столбец 'Cabin_Letter'
df.loc[test_df_Cabin_Letter.index, 'Cabin_Letter'] = predictions



In [28]:
# Кодируем титулы с помощью LabelEncoder
le = LabelEncoder()
df['Cabin_Letter'] = le.fit_transform(df['Cabin_Letter'])

In [29]:
# Заполняем пропуски медианой
median_cabin = df['Cabin_Number'].median()
df['Cabin_Number'].fillna(median_cabin, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin_Number'].fillna(median_cabin, inplace=True)


In [None]:
# Определяем столбцы с бинарными значениями (0 и 1)
binary_columns = [col for col in df.columns if set(df[col].dropna()) <= {0, 1}]

# Столбцы, которые нужно нормализовать (не бинарные)
columns_to_normalize = [col for col in df.columns if col not in binary_columns]

# Инициализация MinMaxScaler для нормализации
scaler = MinMaxScaler()

# Нормализуем только числовые столбцы (не бинарные)
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

     Survived      Name  Sex       Age  SibSp     Parch      Fare  Embarked_C  \
0           0  0.109610    0  0.271174  0.125  0.000000  0.014151           0   
1           1  0.204204    1  0.472229  0.125  0.000000  0.139136           1   
2           1  0.376877    1  0.321438  0.000  0.000000  0.015469           0   
3           1  0.297297    1  0.434531  0.125  0.000000  0.103644           0   
4           0  0.016517    0  0.434531  0.000  0.000000  0.015713           0   
..        ...       ...  ...       ...    ...       ...       ...         ...   
886         0  0.609610    0  0.334004  0.000  0.000000  0.025374           0   
887         1  0.331832    1  0.233476  0.000  0.000000  0.058556           0   
888         0  0.439940    1  0.367921  0.125  0.333333  0.045771           0   
889         1  0.078078    0  0.321438  0.000  0.000000  0.058556           1   
890         0  0.238739    0  0.396833  0.000  0.000000  0.015127           0   

     Embarked_Q  Embarked_S

In [48]:

# Разделим данные на признаки (X) и целевую переменную (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

# Разделим данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Инициализация моделей
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(silent=True, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1, max_depth=5, num_leaves=31)  # ограничение max_depth и num_leaves
}

# Оценка всех моделей
results = []

for model_name, model in models.items():
    # Обучение модели
    model.fit(X_train, y_train)
    
    # Предсказания на тестовых данных
    y_pred = model.predict(X_test)
    
    # Оценка метрик модели
    accuracy = accuracy_score(y_test, y_pred)
    full_precision = precision_score(y_test, y_pred, average='binary')  # Точность по всем данным
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Сохраняем результаты
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Full Precision': full_precision,
        'Recall': recall,
        'F1 Score': f1
    })

# Преобразуем результаты в DataFrame
results_df = pd.DataFrame(results)

# Выводим таблицу с результатами
print(results_df)

# Сохраняем лучшую модель (по Full Precision или другой метрике, если нужно)
best_model_name = results_df.loc[results_df['Full Precision'].idxmax(), 'Model']
best_model = models[best_model_name]

# Сохраняем модель
joblib.dump(best_model, f"{best_model_name}_model.pkl")

print(f"Лучшая модель: {best_model_name}")

                    Model  Accuracy  Full Precision    Recall  F1 Score
0     Logistic Regression  0.817164        0.816327  0.720721  0.765550
1           Random Forest  0.824627        0.820000  0.738739  0.777251
2  Support Vector Machine  0.809701        0.875000  0.630631  0.732984
3     K-Nearest Neighbors  0.783582        0.804598  0.630631  0.707071
4       Gradient Boosting  0.854478        0.875000  0.756757  0.811594
5                CatBoost  0.839552        0.854167  0.738739  0.792271
6                 XGBoost  0.817164        0.816327  0.720721  0.765550
7                LightGBM  0.835821        0.845361  0.738739  0.788462
Лучшая модель: Support Vector Machine
