In [4]:
import pandas as pd
import pathlib as path
from sklearn.ensemble import RandomForestClassifier
# Absolute locations of the Kaggle Titanic CSV files on the author's machine.
train_path = r'C:\Users\user\Documents\GitHub\Notebooks-on-ml\TITANIC\Data\train.csv'
test_path = r'C:\Users\user\Documents\GitHub\Notebooks-on-ml\TITANIC\Data\test.csv'

# Load both splits in the same order as before: train first, then test.
df_train, df_test = (pd.read_csv(csv_file) for csv_file in (train_path, test_path))

# Ordinal codes for the age bands produced by _age_label.
_AGE_BAND_CODES = {'Child': 1, 'Adolescent': 2, 'Young': 3, 'Mature': 4, 'Old': 5}


def _size_label(count):
    """Bucket a group size: 1 -> 'Single', up to 4 -> 'Small', otherwise 'Large'."""
    if count == 1:
        return 'Single'
    return 'Small' if count <= 4 else 'Large'


def _age_label(age):
    """Map an age to a band with upper bounds 10 / 17 / 31 / 54, else 'Old'."""
    if age <= 10:
        return 'Child'
    if age <= 17:
        return 'Adolescent'
    if age <= 31:
        return 'Young'
    if age <= 54:
        return 'Mature'
    # NaN also ends up here because every comparison with NaN is False.
    return 'Old'


def _fare_label(fare):
    """Map a fare to 'Small' (<= 10), 'Medium' (<= 55) or 'High'."""
    if fare <= 10:
        return 'Small'
    return 'Medium' if fare <= 55 else 'High'


def _ticket_prefix(ticket):
    """Return the first whitespace-separated token of a ticket, or 'None' if it is all digits."""
    return 'None' if ticket.isdigit() else ticket.split()[0]


def _fill_with_class_median(df, column):
    """Fill NaNs in ``column`` with the per-``Pclass`` median, then the overall median."""
    filled = df.groupby('Pclass')[column].transform(lambda s: s.fillna(s.median()))
    # A class whose values are all missing has a NaN median; fall back to the
    # median of the whole column so the value does not stay missing.
    return filled.fillna(df[column].median())


def prepare_data(df):
    """Build the model feature matrix from a raw Titanic passenger frame.

    The input is not modified. The function reads the columns ``PassengerId``,
    ``Pclass``, ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``, ``Ticket``,
    ``Fare``, ``Cabin`` and ``Embarked``; any other column (for example
    ``Survived``) is passed through unchanged.

    Steps:
      * ``Age`` and ``Fare`` are imputed with the median of the passenger's
        ``Pclass``; ``Embarked`` is imputed with its mode.
      * Family size, ticket group size, fare and ticket prefix are turned
        into categorical labels and one-hot encoded with ``drop_first=True``.
      * ``Deck`` is the first letter of ``Cabin`` ('M' when missing) and
        ``Has_Cabin`` flags whether a cabin was recorded.
      * The age band is stored as the ordinal column ``AgeW_encoded``.

    Imputation runs *before* any banding. Previously the age band was derived
    from the raw ``Age`` column and ``Fare`` was never imputed, so passengers
    with a missing age were labelled 'Old' and passengers with a missing fare
    were labelled 'High'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data.

    Returns
    -------
    pandas.DataFrame
        Encoded features, indexed like ``df``.
    """
    df = df.copy()

    # Fill missing values first so every derived feature sees imputed data.
    df['Age'] = _fill_with_class_median(df, 'Age')
    df['Fare'] = _fill_with_class_median(df, 'Fare')
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Family size counts the passenger themself, hence the + 1.
    family_size = df['SibSp'] + df['Parch'] + 1
    df['FamilyType'] = family_size.apply(_size_label)

    # Number of passengers in this frame that share the same ticket.
    ticket_group_size = df.groupby('Ticket')['Ticket'].transform('count')
    df['Ticket_Group_size'] = ticket_group_size.apply(_size_label)

    df['Deck'] = df['Cabin'].str[0].fillna('M')
    df['Has_Cabin'] = df['Cabin'].notna().astype('int64')

    # Keep only prefixes that occur at least 10 times; everything else is 'Rare'.
    # NOTE(review): the popular set is computed from the frame passed in, so
    # two different frames can produce different prefix columns.
    prefixes = df['Ticket'].apply(_ticket_prefix)
    prefix_counts = prefixes.value_counts()
    popular_prefixes = prefix_counts[prefix_counts >= 10].index
    df['Ticket_Prefix'] = prefixes.where(prefixes.isin(popular_prefixes), 'Rare')

    age_codes = df['Age'].apply(_age_label).map(_AGE_BAND_CODES)
    df['Fare'] = df['Fare'].apply(_fare_label)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    df_clean = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    df_final = pd.get_dummies(
        df_clean,
        columns=['Sex', 'Ticket_Prefix', 'Deck', 'Ticket_Group_size',
                 'FamilyType', 'Embarked', 'Fare'],
        drop_first=True,
    )
    df_final['AgeW_encoded'] = age_codes

    return df_final

# Training matrix: engineered features without the target column.
X = prepare_data(df_train).drop('Survived', axis=1)
y = df_train['Survived']

# Give the test matrix exactly the training columns; any column that is
# missing from the test features is created and filled with 0.
X_test_final = prepare_data(df_test)
X_test_final = X_test_final.reindex(columns=X.columns, fill_value=0)

model = RandomForestClassifier(
    n_estimators=352,
    max_depth=6,
    min_samples_split=20,
    min_samples_leaf=6,
    random_state=42,
)
model.fit(X, y)

# Second column of predict_proba, thresholded at 0.5.
probs = model.predict_proba(X_test_final)[:, 1]
predictions = (probs >= 0.5).astype(int)

submission = pd.DataFrame(
    {"PassengerId": df_test["PassengerId"], "Survived": predictions}
)
submission.to_csv('submission_final.csv', index=False)
print("Файл готов к отправке!")

Файл готов к отправке!
