In [25]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb


In [12]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [16]:
test_df.shape, train_df.shape

((418, 11), (891, 12))

In [19]:
def preprocess_data(df):
    df = df.copy()
    
    def extract_last_name_and_title(name):
        # Regex to match last name and title
        match = re.match(r'^([^,]+),\s*(\w+\.?)\s*.*$', name.strip())
        if match:
            last_name = match.group(1).strip()
            title = match.group(2).strip().replace('.', '') if match.group(2) else np.nan
            return last_name, title
        return name, np.nan

    def extract_ticket_number(x):
        try:
            # Extract the last part after splitting by space
            ticket_str = x.split(" ")[-1]
            # Convert to float
            return float(ticket_str)
        except ValueError:
            # Return NaN if conversion fails
            return np.nan

    def extract_ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return np.nan
        return "_".join(items[:-1])
    
    def extract_cabin_letter(x):
        return x.split(" ")[-1]

    def extract_cabin_number(x):
        items = x.split(" ")
        if len(items) == 1:
            return np.nan
        return "_".join(items[:-1])
    
    def count_words(name):
        # Count the number of words in the name
        return len(name.split())

    def impute_numeric_mean(df):
        numeric_cols = df.select_dtypes(include=['number']).columns
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
        
    def split_cabin(cabin):
        if pd.isna(cabin):
            return np.nan, np.nan
        match = re.match(r'([A-Za-z])(\d+)', cabin)
        if match:
            letter = match.group(1)
            number = match.group(2)
            return letter, number
        return np.nan, np.nan
    
    # Impute missing values in numeric columns with mean
    impute_numeric_mean(df)
    
    # Apply transformations
    df["Ticket_number"] = df["Ticket"].apply(extract_ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(extract_ticket_item)
    df[["Last_Name", "Title"]] = df["Name"].apply(extract_last_name_and_title).apply(pd.Series)
    df["Name_Word_Count"] = df["Name"].apply(count_words)
    df[["Cabin_Letter", "Cabin_Number"]] = df["Cabin"].apply(split_cabin).apply(pd.Series)
    
    # Remove the specified columns
    df.drop(columns=["Name", "Ticket", "Cabin", "PassengerId"], inplace=True)
    
    # Convert to floats
    df['Cabin_Number'] = df['Cabin_Number'].astype(float)
    
    
    return df

In [20]:
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

In [21]:
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Ticket_number,Ticket_item,Last_Name,Title,Name_Word_Count,Cabin_Letter,Cabin_Number
0,0,3,male,22.000000,1,0,7.2500,S,21171.0,A/5,Braund,Mr,4,,
1,1,1,female,38.000000,1,0,71.2833,C,17599.0,PC,Cumings,Mrs,7,C,85.0
2,1,3,female,26.000000,0,0,7.9250,S,3101282.0,STON/O2.,Heikkinen,Miss,3,,
3,1,1,female,35.000000,1,0,53.1000,S,113803.0,,Futrelle,Mrs,7,C,123.0
4,0,3,male,35.000000,0,0,8.0500,S,373450.0,,Allen,Mr,4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,211536.0,,Montvila,Rev,3,,
887,1,1,female,19.000000,0,0,30.0000,S,112053.0,,Graham,Miss,4,B,42.0
888,0,3,female,29.699118,1,2,23.4500,S,6607.0,W./C.,Johnston,Miss,5,,
889,1,1,male,26.000000,0,0,30.0000,C,111369.0,,Behr,Mr,4,C,148.0


In [22]:
def create_dummies_with_consistency(train_df, test_df, categorical_cols):
    # Concatenate train and test datasets to get the complete set of categories
    combined_df = pd.concat([train_df, test_df], axis=0, ignore_index=False)
    
    # Create dummy variables for the combined dataset
    combined_dummies = pd.get_dummies(combined_df, columns=categorical_cols)
    
    # Determine the complete set of dummy columns
    dummy_columns = combined_dummies.columns
    
    # Apply the same dummy columns to both train and test datasets
    train_dummies = pd.get_dummies(train_df, columns=categorical_cols)
    test_dummies = pd.get_dummies(test_df, columns=categorical_cols)
    
    # Reindex both train and test datasets to ensure they have the same dummy columns
    train_dummies = train_dummies.reindex(columns=dummy_columns, fill_value=0)
    test_dummies = test_dummies.reindex(columns=dummy_columns, fill_value=0)
    
    return train_dummies, test_dummies

columns=['SibSp','Sex', 'Parch', 'Pclass', 'Embarked', 'Ticket_item', 'Last_Name', 'Title', 'Cabin_Letter']

processed_train, processed_test = create_dummies_with_consistency(train_df, test_df, columns)

In [27]:
validate_df, test_df = train_test_split(processed_train, test_size = .3, random_state = 24)

y_train = test_df['Survived']
x_train = test_df.drop(['Survived'], axis=1)
y_val = validate_df['Survived']
x_val = validate_df.drop(['Survived'], axis=1)

In [28]:
# Train XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(x_train, y_train)

# Fill NaNs with 0s
x_train = x_train.fillna(0)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)

# Predict using XGBoost
xgb_predictions = xgb_model.predict(x_val)

# Fill NaNs with 0s
x_val = x_val.fillna(0)

# Predict using Random Forest
rf_predictions = rf_model.predict(x_val)

# Evaluate XGBoost model
xgb_accuracy = accuracy_score(y_val, xgb_predictions)
xgb_precision = precision_score(y_val, xgb_predictions)
xgb_recall = recall_score(y_val, xgb_predictions)
xgb_f1 = f1_score(y_val, xgb_predictions)

# Evaluate Random Forest model
rf_accuracy = accuracy_score(y_val, rf_predictions)
rf_precision = precision_score(y_val, rf_predictions)
rf_recall = recall_score(y_val, rf_predictions)
rf_f1 = f1_score(y_val, rf_predictions)

# Print Evaluation Metrics
print("XGBoost Evaluation Metrics:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

print("\nRandom Forest Evaluation Metrics:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 Score: {rf_f1:.4f}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Evaluation Metrics:
Accuracy: 0.8154
Precision: 0.8173
Recall: 0.6883
F1 Score: 0.7473

Random Forest Evaluation Metrics:
Accuracy: 0.7978
Precision: 0.7951
Recall: 0.6599
F1 Score: 0.7212


In [29]:
# Remove the survived column (this was introduced in consistency function)
test_df = processed_test.drop(columns=['Survived'])

# Apply the trained XGBoost model to test_df
xgb_predictions_test = xgb_model.predict(test_df)

# Convert predictions to DataFrame and set index as 'Passenger Id'
predictions_df = pd.DataFrame({
    'PassengerId': test_df.index,
    'Survived': xgb_predictions_test
})

predictions_df['PassengerId'] = predictions_df['PassengerId'] + 892
predictions_df.to_csv('final.csv', index = False)

In [30]:
df_ = pd.read_csv('final.csv')
df_

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
