In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier # Or XGBoostClassifier
from sklearn.metrics import accuracy_score

# 1. Load Data
# Read the training and test CSVs in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Stack the training features (label column removed) on top of the test rows so
# that every preprocessing step below is applied to both sets identically.
combined_df = pd.concat(
    [train_df.drop(columns='Survived'), test_df],
    ignore_index=True,
)

# 2. Feature Engineering
# Name -> Title: capture the run of letters that follows a space and is
# immediately followed by a period (e.g. "Surname, Mr. Given Names" -> "Mr").
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped
# literal dot instead of relying on an invalid string escape, which emits a
# SyntaxWarning on Python 3.12+.
combined_df['Title'] = combined_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the less common honorifics into a few broader buckets.
# Any title that is not a key in this mapping is left unchanged by replace().
title_mapping = {
    'Mlle': 'Miss', 'Major': 'Officer', 'Col': 'Officer', 'Capt': 'Officer',
    'Ms': 'Miss', 'Countess': 'Royalty', 'Mme': 'Mrs', 'Don': 'Royalty',
    'Lady': 'Royalty', 'Sir': 'Royalty', 'Dr': 'Officer', 'Rev': 'Officer',
    'Jonkheer': 'Royalty', 'Dona': 'Royalty'
}
combined_df['Title'] = combined_df['Title'].replace(title_mapping)

# Names in which the pattern matched nothing come back as NaN; default those to 'Mr'.
# Note: this only fills missing values -- it does not regroup unmapped titles.
combined_df['Title'] = combined_df['Title'].fillna('Mr')

# Derive the family and cabin features in a single chain, then discard the raw
# columns they were built from (plus Name and Ticket, which are not used as features).
combined_df = (
    combined_df
    .assign(
        # Siblings/spouses + parents/children + the passenger themself.
        FamilySize=lambda df: df['SibSp'] + df['Parch'] + 1,
        # 1 when the passenger has no relatives aboard, else 0.
        IsAlone=lambda df: (df['FamilySize'] == 1).astype(int),
        # First character of the cabin string; 'U' marks a missing cabin.
        CabinDeck=lambda df: df['Cabin'].str[0].fillna('U'),
    )
    .drop(columns=['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'])
)

# 3. Missing Value Imputation
# Embarked: fill gaps with the most frequent value. The mode is taken over the
# combined train+test frame; restrict it to the training rows if strict
# separation from the test set is required.
most_common_embarked = combined_df['Embarked'].mode()[0]
combined_df['Embarked'] = combined_df['Embarked'].fillna(most_common_embarked)

# Fare: fill gaps with the median fare of the combined frame.
combined_df['Fare'] = combined_df['Fare'].fillna(combined_df['Fare'].median())

# Age: fill gaps with the median age of passengers sharing the same
# (Sex, Pclass, Title). A group whose ages are all missing has a NaN median and
# would otherwise stay unfilled, so any leftovers fall back to the overall median
# (computed from the observed ages before any filling).
overall_age_median = combined_df["Age"].median()
group_age_median = combined_df.groupby(["Sex", "Pclass", "Title"])["Age"].transform("median")
combined_df["Age"] = combined_df["Age"].fillna(group_age_median).fillna(overall_age_median)


# 4. Convert Categorical Features to Numerical (One-Hot Encoding)
# Encoding the combined frame gives train and test the same dummy columns.
# drop_first removes one level per feature to avoid perfectly collinear columns.
combined_df = pd.get_dummies(
    combined_df,
    columns=['Sex', 'Embarked', 'Title', 'CabinDeck'],
    drop_first=True,
)

# Re-separate train and test data: the first len(train_df) rows of the combined
# frame are the training rows, the remainder are the test rows.
n_train = len(train_df)
X_train = combined_df.iloc[:n_train].drop(columns='PassengerId')
X_test = combined_df.iloc[n_train:].drop(columns='PassengerId')
y_train = train_df['Survived']
test_passenger_ids = test_df['PassengerId']  # Keep for submission

# Align columns: force X_test to carry exactly X_train's columns, in X_train's
# order. Columns absent from X_test are created and filled with 0; columns that
# exist only in X_test are dropped. X_train is never modified, so the two
# matrices cannot end up with different feature sets.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Sanity check: one label per training row.
assert len(X_train) == len(y_train), "feature/label row count mismatch"

# 5. Model Training and Prediction
# Baseline: a 1000-tree random forest with a fixed seed. fit() returns the fitted
# estimator, so training and test-set prediction are chained.
model = RandomForestClassifier(n_estimators=1000, random_state=42)
predictions = model.fit(X_train, y_train).predict(X_test)

# 6. Create Submission File (if this is for a Kaggle competition)
# Two columns: the test-set passenger ids and the predicted label for each of them.
submission = (
    test_passenger_ids
    .to_frame(name='PassengerId')
    .assign(Survived=predictions)
)
submission.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
