 # Data Preprocessing and Exploratory Data Analysis (EDA)

Download the dataset from Kaggle

In [None]:
# In the first cell, upload kaggle.json (the Kaggle API token file)
from google.colab import files

# Bind the result to a name instead of leaving the call as the cell's last
# expression: files.upload() returns a dict of {filename: raw file bytes}, and
# an unbound last expression is echoed into the cell output, which would store
# the token's contents in the saved notebook.
uploaded = files.upload()  # Select the `kaggle.json` file from your computer

# The setup cell that follows moves a file literally named kaggle.json, so fail
# here with a clear message if something else was uploaded.
assert 'kaggle.json' in uploaded, 'Expected an uploaded file named kaggle.json'



Set Up Kaggle API and Download Dataset

In [None]:
# Create the Kaggle config directory in the home folder (-p: no error if it already exists)
!mkdir -p ~/.kaggle
# Move the uploaded token from the working directory into that folder
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json


Download the dataset using the Kaggle API

In [None]:
# Download the files of the `titanic` competition with the Kaggle CLI.
# NOTE(review): presumably needs the token installed under ~/.kaggle and the
# competition rules accepted on kaggle.com beforehand — confirm.
!kaggle competitions download -c titanic

Unzip the downloaded files

In [None]:
!unzip titanic.zip

# Import Libraries and Load Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Read the training CSV into a DataFrame and preview its first five rows
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head(n=5)


# Data Preprocessing and Exploratory Data Analysis (EDA)

In [None]:
# Show column dtypes and non-null counts before any cleaning
train_data.info()

# Drop the columns that are not used as features (Cabin, Ticket, Name).
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for columns already gone.
train_data = train_data.drop(columns=['Cabin', 'Ticket', 'Name'], errors='ignore')

# Fill missing 'Age' values with the median and missing 'Embarked' values with the mode
imputer = SimpleImputer(strategy='median')
# fit_transform returns a 2-D (n, 1) array; flatten it so a plain 1-D column is assigned
train_data['Age'] = imputer.fit_transform(train_data[['Age']]).ravel()
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# Encode the categorical variables as integer codes.
# Each column gets its own LabelEncoder, kept in `label_encoders`, so the fitted
# classes of every column stay available; refitting one shared encoder would
# discard the 'Sex' mapping. After the loop `label_encoder` is the encoder
# fitted on 'Embarked', as before.
label_encoders = {}
for column in ['Sex', 'Embarked']:
    label_encoder = LabelEncoder()
    train_data[column] = label_encoder.fit_transform(train_data[column])
    label_encoders[column] = label_encoder

# Define features and target
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# Data Visualization


Import Visualization Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the "whitegrid" style to every plot drawn after this cell.
# set_theme() is seaborn's preferred interface; sns.set() is only an alias of it.
sns.set_theme(style="whitegrid")


Visualization of Survivors vs. Non-Survivors

In [None]:
# Plot how many passengers survived vs. did not survive
fig, ax = plt.subplots(figsize=(6, 4))

# Colour the bars through `hue`: passing `palette` without `hue` is deprecated
# in seaborn 0.13. dodge=False keeps one centred bar per class, as before.
sns.countplot(
    data=train_data,
    x='Survived',
    hue='Survived',
    palette='viridis',
    dodge=False,
    ax=ax,
)

# hue repeats the x variable, so a legend adds nothing; remove it if one was drawn
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Distribusi Penumpang yang Selamat vs. Tidak Selamat')
ax.set_xlabel('Survived (1 = Yes, 0 = No)')
ax.set_ylabel('Count')
plt.show()


Visualization of the Relationship Between Gender and Survival

In [None]:
# Plot survival counts split by gender.
# 'Sex' was label-encoded earlier in the notebook, so the x ticks show integer
# codes (presumably 0 = female, 1 = male, since LabelEncoder sorts its classes —
# confirm against the fitted encoder).
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_data, x='Sex', hue='Survived', palette='viridis', ax=ax)
ax.set_title('Keselamatan Berdasarkan Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')

# Relabel the legend using the handles seaborn registered for the hue levels
# (0 first, then 1). A labels-only legend([...]) call pairs labels with artists
# by implicit order, which matplotlib discourages because it is easy to mix up.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ['Tidak Selamat', 'Selamat'], title='Survived')
plt.show()


Visualization of the Passenger Age Distribution

In [None]:
# Histogram of passenger ages (30 bins) with a KDE curve drawn on top
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=train_data, x='Age', kde=True, bins=30, color='blue', ax=ax)
ax.set(title='Distribusi Usia Penumpang', xlabel='Age', ylabel='Count')
plt.show()


Visualization of Survival Based on Port of Embarkation

In [None]:
# Plot survival counts split by port of embarkation.
# 'Embarked' was label-encoded earlier in the notebook, so the x ticks show
# integer codes rather than the original port letters.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_data, x='Embarked', hue='Survived', palette='viridis', ax=ax)
ax.set_title('Survival Based on Port of Embarkation')
ax.set_xlabel('Port of Embarkation')
ax.set_ylabel('Count')

# Relabel the legend using the handles seaborn registered for the hue levels
# (0 first, then 1). A labels-only legend([...]) call pairs labels with artists
# by implicit order, which matplotlib discourages because it is easy to mix up.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ['Did Not Survive', 'Survived'], title='Survived')
plt.show()


# Plot correlation heatmap

In [None]:

# Annotated heatmap of the pairwise correlations between train_data's columns
correlations = train_data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Between Features')
plt.show()

#  Advanced Feature Engineering for Improved Model Performance

Create the "FamilySize" Feature

In [None]:
# FamilySize counts everyone travelling together: siblings/spouses (SibSp)
# plus parents/children (Parch) plus one for the passenger themselves.
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)

Create the "IsAlone" Feature

In [None]:
# Create IsAlone feature: 0 when FamilySize > 1 (travelling with family), else 1.
# It is built in one vectorised assignment. The earlier chained form
# (`train_data['IsAlone'].loc[mask] = 0`) writes through an intermediate Series,
# which raises chained-assignment warnings and, under pandas Copy-on-Write,
# leaves the DataFrame unchanged (every row would stay 1).
train_data['IsAlone'] = np.where(train_data['FamilySize'] > 1, 0, 1)


In [None]:
# Preview the first five rows to check the new FamilySize and IsAlone columns
train_data.head(n=5)


Create "FarePerPerson" Feature

In [None]:
# FarePerPerson: the ticket fare divided by the passenger's family size
train_data['FarePerPerson'] = train_data['Fare'].div(train_data['FamilySize'])


Categorize age into groups

In [None]:
# Bucket Age into five labelled bands. With pd.cut's default right-closed bins
# the bands are (0, 12], (12, 18], (18, 35], (35, 60], (60, 80]; an age outside
# (0, 80] gets no band (NaN).
age_bin_edges = [0, 12, 18, 35, 60, 80]
age_bin_labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
train_data['AgeGroup'] = pd.cut(train_data['Age'], bins=age_bin_edges, labels=age_bin_labels)


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each column so the remaining indicators are not redundant.
train_data = pd.get_dummies(
    data=train_data,
    columns=['Sex', 'Embarked', 'AgeGroup'],
    drop_first=True,
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous columns (Fare, Age, FarePerPerson).
# NOTE(review): the scaler is fitted on all of train_data, before the
# train/validation split made later in the notebook, so validation rows
# influence the scaling statistics — confirm this is intended.
scaler = StandardScaler()
scaler.fit(train_data[['Fare', 'Age', 'FarePerPerson']])
train_data[['Fare', 'Age', 'FarePerPerson']] = scaler.transform(
    train_data[['Fare', 'Age', 'FarePerPerson']]
)


In [None]:
# Preview the first five rows after encoding and scaling
train_data.head(n=5)


# Split Data and Build the Model

In [None]:
# Rebuild the target and feature matrix now that the engineered columns exist;
# PassengerId is left out of the features together with the target.
y = train_data['Survived']
X = train_data.drop(columns=['Survived', 'PassengerId'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest with default hyperparameters (seeded for repeatability)
best_rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the validation split and report the accuracy
y_pred = best_rf_model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Model Accuracy After Feature Engineering: {accuracy:.2f}')
