In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# The training and test sets are read into 'train_data' and 'test_data' in the next cell

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load Data: read the Titanic training and test sets from the Kaggle input directory
train_data, test_data = map(
    pd.read_csv,
    ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv"),
)

In [3]:
# Feature Engineering
def feature_engineering(df):
    """Return a copy of ``df`` with family-size and title features added.

    Added (or overwritten) columns:
      * ``FamilySize`` - ``SibSp + Parch + 1`` (the passenger counts as one).
      * ``IsAlone``    - 1 when ``FamilySize`` equals 1, otherwise 0.
      * ``Title``      - the part of ``Name`` after the first comma and before
        the next period, stripped of surrounding whitespace, e.g. ``"Mr"``
        for ``"Braund, Mr. Owen Harris"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SibSp``, ``Parch`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched, so calling this
        function never changes data held by the caller.

    Notes
    -----
    A name that is not a string (e.g. missing) or that contains no comma gets
    the title ``"Unknown"`` instead of raising.
    """
    def _title_from_name(name):
        # Expected layout is "<surname>, <title>. <given names>".
        if not isinstance(name, str) or ',' not in name:
            return 'Unknown'
        return name.split(',')[1].split('.')[0].strip()

    family_size = df['SibSp'] + df['Parch'] + 1
    # assign() builds a new DataFrame rather than writing into ``df``.
    return df.assign(
        FamilySize=family_size,
        IsAlone=(family_size == 1).astype(int),
        Title=df['Name'].map(_title_from_name),
    )

In [4]:
# Add the engineered columns to both the training and the test frame.
train_data, test_data = (
    feature_engineering(frame) for frame in (train_data, test_data)
)

In [5]:
# Select the model inputs. Missing ages and the text columns are dealt with
# in the cells that follow.
features = [
    "Pclass", "Sex", "Age", "SibSp",
    "Parch", "FamilySize", "IsAlone", "Title",
]
X, X_test = train_data[features], test_data[features]

In [6]:
# One-hot encode the categorical columns of each feature matrix
X, X_test = (pd.get_dummies(frame) for frame in (X, X_test))

In [7]:
# Align columns: the training matrix defines the column set. Columns that
# exist only in the test matrix are dropped; columns that exist only in the
# training matrix are added to the test matrix filled with 0.
train_columns = X.columns
X = X.reindex(columns=train_columns)
X_test = X_test.reindex(columns=train_columns, fill_value=0)

In [8]:
# Impute missing values: learn the median Age from the training set only,
# then fill the gaps in both sets with that value.
imputer = SimpleImputer(strategy='median').fit(X[['Age']])
X['Age'] = imputer.transform(X[['Age']])
X_test['Age'] = imputer.transform(X_test[['Age']])

In [9]:
# Standardize features: the scaling statistics come from the training set
# and are reused unchanged on the test set.
scaler = StandardScaler()
scaled_columns = ['Age', 'FamilySize']
X[scaled_columns] = scaler.fit_transform(X[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

In [10]:
# Model
# Target vector: the "Survived" column of the training data.
y = train_data["Survived"]
# Base estimator for the search below; random_state is fixed so repeated
# runs build the same forest.
model = RandomForestClassifier(random_state=1)

In [11]:
# Hyperparameter Tuning
# Exhaustive search over 2 * 3 * 3 * 3 = 54 parameter combinations, each
# scored by 5-fold cross-validated accuracy (270 fits before the final refit).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# n_jobs=-1 spreads the independent fits over all available CPU cores. The
# forest's random_state is fixed, so this changes run time only, not the
# selected parameters or scores.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

In [12]:
# Predictions
# Take the best estimator found by the grid search and use it to label the
# test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

In [13]:
# Output: one row per test passenger with its PassengerId and the predicted
# Survived label, written without the index column.
output = test_data[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
