In [1]:
import os, sys
import numpy as np
import pandas as pd
import joblib

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def concat_df(train_data, test_data):
    """Stack the training and test frames into a single frame.

    The frames are concatenated row-wise (training rows first) with
    ``sort=True``, and the row index is rebuilt as 0..n-1.
    """
    frames = [train_data, test_data]
    stacked = pd.concat(frames, sort=True)
    return stacked.reset_index(drop=True)

def divide_df(all_data, n_train=891):
    """Split a combined frame back into its training and test parts.

    Parameters
    ----------
    all_data : DataFrame
        Combined frame whose index labels 0..n_train-1 are the training rows
        and whose remaining labels are the test rows. Must contain a
        'Survived' column.
    n_train : int, default 891
        Number of training rows. The default matches the boundary that was
        previously hard-coded.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        Independent copies ``(train, test)``; 'Survived' is dropped from the
        test part.
    """
    # .loc slicing is label-based and inclusive on both ends, so this relies
    # on the 0..n-1 index that concat_df produces.
    last_train_label = n_train - 1
    train_part = all_data.loc[:last_train_label].copy()
    test_part = all_data.loc[n_train:].drop(columns=['Survived']).copy()
    return train_part, test_part

# Load train.csv and test.csv (paths relative to the working directory) and
# stack them into df_all via concat_df so later steps can work on every row.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_all = concat_df(df_train, df_test)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line.
print(df_train.info())
# Bare expression: it has no effect here beyond (presumably) notebook display.
df_train.head()

print(df_test.info())
df_test.head()

# Per-column count of missing values in each frame.
df_train.isna().sum()

df_test.isna().sum()

# Age: fill gaps with the median age of passengers sharing Pclass and Sex.
age_group_median = df_all.groupby(['Pclass', 'Sex'])['Age'].transform('median')
df_all['Age'] = df_all['Age'].fillna(age_group_median)

# Embarked: fill gaps with the most frequent value.
most_common_port = df_all['Embarked'].mode()[0]
df_all['Embarked'] = df_all['Embarked'].fillna(most_common_port)

# Fare: fill gaps with the median fare of the same (Pclass, Parch, SibSp) group.
fare_group_median = df_all.groupby(['Pclass', 'Parch', 'SibSp'])['Fare'].transform('median')
df_all['Fare'] = df_all['Fare'].fillna(fare_group_median)

# Deck is the first character of Cabin; 'U' marks an unknown (missing) cabin.
df_all['Deck'] = df_all['Cabin'].str[0].fillna('U')
df_all['Deck'].value_counts()

# Fold deck 'T' into 'A' before the decks are grouped.
df_all.loc[df_all['Deck'] == 'T', 'Deck'] = 'A'

# Collapse individual decks into three coarse groups; 'U' is left as is.
deck_groups = {
    'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
df_all['Deck'] = df_all['Deck'].replace(deck_groups)

df_all['Deck'].value_counts()

# Cabin is no longer needed once Deck has been derived from it.
df_all.drop(columns='Cabin', inplace=True)

# Quantile binning: 10 bins for Age, 13 bins for Fare.
df_all['AgeBin'] = pd.qcut(df_all['Age'], 10)

df_all['FareBin'] = pd.qcut(df_all['Fare'], 13)

# Interaction features: female indicator (0/1) and age, each divided by Pclass.
df_all['Sex_Pclass'] = (df_all['Sex'] == 'female').astype(int) / df_all['Pclass']

df_all['Age_Pclass'] = df_all['Age'] / df_all['Pclass']

# Family size counts the passenger themselves, hence the +1.
df_all['Family_Size'] = df_all['SibSp'] + df_all['Parch'] + 1

# Coarse family-size groups; sizes missing from the map (e.g. 9 or 10) become NaN.
family_map = {1: 'Alone', 2: 'Small', 3: 'Small', 4: 'Small', 5: 'Large', 6: 'Large', 7: 'Large', 8: 'Large', 11: 'Large'}
df_all['Family_Size_Grouped'] = df_all['Family_Size'].map(family_map)

df_all['Fare_Per_Person'] = df_all['Fare'] / df_all['Family_Size']

# Family_Size is at least 1 because of the +1 above, so a passenger travelling
# alone has size 1. Comparing against 0 would make this flag 0 for every row.
df_all['Is_Alone'] = (df_all['Family_Size'] == 1).astype(int)

def extract_ticket_prefix(ticket):
    """Derive a short grouping key from a raw ticket string.

    All-digit tickets are keyed by their first two characters. Otherwise the
    first whitespace-separated token is taken, dots are removed, everything
    from the first '/' onward is discarded, and the result is upper-cased.
    """
    if ticket.isdigit():
        return ticket[:2]

    head = ticket.split()[0].replace('.', '')
    head, _, _ = head.partition('/')
    return head.upper().strip()

# Attach the prefix key derived from each raw ticket string.
ticket_prefixes = df_all['Ticket'].apply(extract_ticket_prefix)
df_all['Ticket_Prefix'] = ticket_prefixes

df_all['Ticket_Prefix'].value_counts()

# Refresh the train/test views; the statistic below is still taken over df_all.
df_train, df_test = divide_df(df_all)

# Mean of 'Survived' per ticket prefix, highest first. Rows with a missing
# 'Survived' value do not contribute to the mean (pandas skips NaN by default).
survived_by_prefix = df_all.groupby('Ticket_Prefix')['Survived']
prefix_survival = survived_by_prefix.mean().sort_values(ascending=False)
print(prefix_survival)

def categorize_survival(rate):
    """Bucket a survival rate into 'Unknown', 'High', 'Medium' or 'Low'.

    A missing rate maps to 'Unknown'; a rate of at least 0.8 is 'High', at
    least 0.5 is 'Medium', and anything lower is 'Low'.
    """
    if pd.isna(rate):
        return 'Unknown'
    # Thresholds are checked from highest to lowest; the first match wins.
    for cutoff, label in ((0.8, 'High'), (0.5, 'Medium')):
        if rate >= cutoff:
            return label
    return 'Low'

# Turn each prefix's mean survival into a coarse label and attach it per row.
prefix_group = prefix_survival.apply(categorize_survival)

df_all['Prefix_Survival_Group'] = df_all['Ticket_Prefix'].map(prefix_group)

# Number of rows in df_all that share the same ticket string.
df_all['Ticket_Frequency'] = df_all.groupby('Ticket')['Ticket'].transform('count')

# Title is the run of letters that follows a space and ends with a period.
# The pattern is a raw string: in a normal string literal '\.' is an invalid
# escape sequence, which Python reports with a warning.
df_all['Title'] = df_all['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
df_all['Title'].value_counts()

# Collapse titles into two groups: the female titles (including 'Mrs') are all
# folded into 'Miss', and the listed professional/noble titles into 'Elite'.
# Titles that are not listed are left unchanged by replace().
title_mapping = {
    'Miss': 'Miss',
    'Mrs': 'Miss',
    'Ms': 'Miss',
    'Mlle': 'Miss',
    'Lady': 'Miss',
    'Mme': 'Miss',
    'Countess': 'Miss',
    'Dona': 'Miss',
    'Dr': 'Elite',
    'Col': 'Elite',
    'Major': 'Elite',
    'Jonkheer': 'Elite',
    'Capt': 'Elite',
    'Sir': 'Elite',
    'Don': 'Elite',
    'Rev': 'Elite'
}

# Flag rows whose name contains 'Mrs'. This is taken from Name rather than
# Title, so it is unaffected by 'Mrs' being folded into 'Miss' below.
df_all['IsMarry'] = 0
df_all.loc[df_all['Name'].str.contains('Mrs'), 'IsMarry'] = 1

df_all['Title'] = df_all['Title'].replace(title_mapping)
df_all['Title'].value_counts()

import string

def extract_surname(names):
    """Return a list with the family name taken from each full name.

    For every name, text from the first '(' onward is ignored, the part before
    the first comma is kept, surrounding whitespace is trimmed, and all
    characters in ``string.punctuation`` are removed.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return [
        name.partition('(')[0].partition(',')[0].strip().translate(strip_punctuation)
        for name in names
    ]

df_all['Family'] = extract_surname(df_all['Name'])

# Re-split so both halves carry the columns engineered so far.
df_train, df_test = divide_df(df_all)
mean_survival_rate = df_train['Survived'].mean()

# Surnames / tickets that occur in both the training and the test rows.
common_families = set(df_train['Family']) & set(df_test['Family'])
common_tickets = set(df_train['Ticket']) & set(df_test['Ticket'])

# Per-group medians computed on training rows only. Note that the value used
# as the group's "rate" is the median of 'Survived', not its mean.
family_groups = df_train.groupby('Family')[['Survived', 'Family_Size']]
family_rate_df = family_groups.median().reset_index()
ticket_groups = df_train.groupby('Ticket')[['Survived', 'Ticket_Frequency']]
ticket_rate_df = ticket_groups.median().reset_index()

family_rate_df

# Keep only groups with more than one member that also appear in the test rows,
# and expose them as {group key: median Survived} lookups.
family_keep = (family_rate_df['Family_Size'] > 1) & family_rate_df['Family'].isin(common_families)
family_rates = family_rate_df[family_keep].set_index('Family')['Survived'].to_dict()

ticket_keep = (ticket_rate_df['Ticket_Frequency'] > 1) & ticket_rate_df['Ticket'].isin(common_tickets)
ticket_rates = ticket_rate_df[ticket_keep].set_index('Ticket')['Survived'].to_dict()

def add_group_rate_features(df, family_rates, ticket_rates, mean_survival_rate):
    """Return a copy of *df* with family- and ticket-level survival columns.

    For each of 'Family' and 'Ticket', two columns are added:

    * ``<Key>_Survival_Rate`` - the value looked up for the row's key, or
      *mean_survival_rate* when the key has no entry in the lookup;
    * ``<Key>_Survival_Rate_NA`` - 1 when the key has an entry in the lookup
      and 0 otherwise (despite the suffix, 1 means a rate was found).

    The input frame is not modified.
    """
    enriched = df.copy()
    column_plan = (
        ('Family', family_rates, 'Family_Survival_Rate', 'Family_Survival_Rate_NA'),
        ('Ticket', ticket_rates, 'Ticket_Survival_Rate', 'Ticket_Survival_Rate_NA'),
    )
    for key_col, lookup, rate_col, flag_col in column_plan:
        looked_up = enriched[key_col].map(lookup)
        enriched[rate_col] = looked_up.fillna(mean_survival_rate)
        enriched[flag_col] = enriched[key_col].isin(lookup).astype(int)
    return enriched

# Add the family/ticket lookup columns to both frames.
df_train, df_test = (
    add_group_rate_features(frame, family_rates, ticket_rates, mean_survival_rate)
    for frame in (df_train, df_test)
)

# Average the ticket- and family-level columns into one rate and one flag.
for df in (df_train, df_test):
    rate_total = df['Ticket_Survival_Rate'] + df['Family_Survival_Rate']
    flag_total = df['Ticket_Survival_Rate_NA'] + df['Family_Survival_Rate_NA']
    df['Survival_Rate'] = rate_total / 2
    df['Survival_Rate_NA'] = flag_total / 2

# Integer-encode the non-numeric columns. The encoder is fitted on the
# training rows and the same mapping is then applied to the test rows.
non_numeric_features = ['Embarked', 'Sex', 'Deck', 'Title', 'Family_Size_Grouped', 'AgeBin', 'FareBin']
le = LabelEncoder()
for feature in non_numeric_features:
    df_train[feature] = le.fit_transform(df_train[feature])
    df_test[feature] = le.transform(df_test[feature])

# One-hot encode the nominal columns on the recombined frame, dropping the
# first level of each.
df_all = concat_df(df_train, df_test)
cat_features = ['Sex', 'Deck', 'Embarked', 'Title', 'Family_Size_Grouped']
df_all = pd.get_dummies(df_all, columns=cat_features, drop_first=True)

# Pclass is compared against string categories below.
df_all['Pclass'] = df_all['Pclass'].astype(str)

survival_order = ['Unknown', 'Low', 'Medium', 'High']  # from low to high
pclass_order = ['3', '2', '1']  # ticket class: 3 < 2 < 1

# Ordinal-encode both columns using the explicit category orders above.
ordinal_spec = {
    'Prefix_Survival_Group': survival_order,
    'Pclass': pclass_order,
}
cols_to_encode = list(ordinal_spec)
encoder = OrdinalEncoder(categories=list(ordinal_spec.values()))

encoded_values = encoder.fit_transform(df_all[cols_to_encode])
df_all[cols_to_encode] = encoded_values
print(df_all[cols_to_encode].head())

# Re-split and report the columns that still have object dtype.
df_train, df_test = divide_df(df_all)
object_cols = df_train.select_dtypes(include='object').columns
df_train[object_cols].info()

from sklearn.feature_selection import RFE

# Seed the ranking estimator. An unseeded forest can produce a different
# ranking on every run, which changes the order of rfe_cols even though the
# rest of the pipeline uses random_state=42.
model = RandomForestRegressor(random_state=42)

# Raw, identifier and intermediate columns that are not used as model inputs.
drop_cols = ['Age', 'Fare', 'Family', 'Family_Size', 'Survived',
             'Name', 'Parch', 'PassengerId', 'Pclass', 'SibSp', 'Ticket', 'Ticket_Prefix',
            'Ticket_Survival_Rate', 'Family_Survival_Rate', 'Ticket_Survival_Rate_NA', 'Family_Survival_Rate_NA']
df_features = df_train.drop(columns=drop_cols)

# Eliminating down to one feature gives every feature its own rank.
rfe = RFE(estimator=model, n_features_to_select=1)
rfe.fit(df_features, df_train['Survived'])

ranking = rfe.ranking_
features = df_features.columns

rfe_df = pd.DataFrame({
    'Feature': features,
    'RFE_Rank': ranking
}).sort_values(by='RFE_Rank')

print(rfe_df)

# Every feature is kept; the RFE result only determines their order.
rfe_cols = rfe_df['Feature'].tolist()
rfe_cols

# Design matrices: target and ranked features for training, features for test.
y = df_train['Survived']
X = df_train[rfe_cols]
X_test = df_test[rfe_cols]

# 80/20 hold-out split. It is not referenced again in the visible code, and
# y_val is reassigned inside the cross-validation loop.
X_train, x_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=42)

# Settings shared by both classifiers.
shared_params = dict(min_samples_split=6, min_samples_leaf=6, random_state=42)

# Original Random Forest model
rf_params = dict(
    n_estimators=1750,
    max_depth=7,
    max_features='sqrt',
    n_jobs=-1,
    **shared_params,
)
rf_model = RandomForestClassifier(**rf_params)

# Added Gradient Boosting model
gb_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    **shared_params,
)
gb_model = GradientBoostingClassifier(**gb_params)

from sklearn.model_selection import StratifiedKFold


def _fit_and_score(model, X_fit, y_fit, X_holdout, y_holdout, X_submit):
    """Fit *model* on one fold and evaluate it.

    Returns a tuple ``(accuracy, auc, submit_probs)``: accuracy and ROC AUC on
    the hold-out part, and the class-1 probabilities predicted for *X_submit*.
    """
    model.fit(X_fit, y_fit)
    holdout_labels = model.predict(X_holdout)
    accuracy = accuracy_score(y_holdout, holdout_labels)
    holdout_probs = model.predict_proba(X_holdout)[:, 1]
    auc = roc_auc_score(y_holdout, holdout_probs)
    return accuracy, auc, model.predict_proba(X_submit)[:, 1]


N = 5
# Stratified, shuffled 5-fold split.
skf = StratifiedKFold(n_splits=N, shuffle=True, random_state=42)

# One column of class-1 test probabilities per fold, for each model.
probs_test_rf = np.zeros((len(X_test), N))
probs_test_gb = np.zeros((len(X_test), N))
auc_scores_rf, acc_scores_rf = [], []
auc_scores_gb, acc_scores_gb = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}/{N}")
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Random Forest: fit, score on the validation part, predict the test set.
    acc_rf, auc_rf, probs_test_rf[:, fold - 1] = _fit_and_score(
        rf_model, X_tr, y_tr, X_val, y_val, X_test
    )
    acc_scores_rf.append(acc_rf)
    auc_scores_rf.append(auc_rf)
    print(f"  RF - AUC: {auc_rf:.4f}, ACC: {acc_rf:.4f}")

    # Gradient Boosting: same procedure on the same fold.
    acc_gb, auc_gb, probs_test_gb[:, fold - 1] = _fit_and_score(
        gb_model, X_tr, y_tr, X_val, y_val, X_test
    )
    acc_scores_gb.append(acc_gb)
    auc_scores_gb.append(auc_gb)
    print(f"  GB - AUC: {auc_gb:.4f}, ACC: {acc_gb:.4f}")

# Average the per-fold test probabilities of each model.
probs_mean_rf = probs_test_rf.mean(axis=1)
probs_mean_gb = probs_test_gb.mean(axis=1)

print(f"\nRandom Forest - Average CV AUC: {np.mean(auc_scores_rf):.4f}")
print(f"Random Forest - Average ACC: {np.mean(acc_scores_rf):.4f}")
print(f"Gradient Boosting - Average CV AUC: {np.mean(auc_scores_gb):.4f}")
print(f"Gradient Boosting - Average ACC: {np.mean(acc_scores_gb):.4f}")

# Submit the model with the higher mean CV AUC (ties go to Random Forest),
# thresholding its averaged probabilities at 0.5.
rf_is_better = np.mean(auc_scores_rf) >= np.mean(auc_scores_gb)
if rf_is_better:
    print("Using Random Forest for final submission")
    chosen_probs = probs_mean_rf
else:
    print("Using Gradient Boosting for final submission")
    chosen_probs = probs_mean_gb
preds = (chosen_probs >= 0.5).astype(int)

submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": preds
})

submission.to_csv("submission_0.csv", index=False)

print("Submission file 'submission_0.csv' created.")

  df_all['Title'] = df_all['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl