In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# TITANIC
![](https://duniadanmisteri.files.wordpress.com/2019/02/405029.jpg)

# Input Data
Muat dataset Titanic menggunakan `pandas`.

In [None]:
# Load the Titanic training set from the Kaggle input directory and display it.
train_csv_path = "/kaggle/input/titanic/train.csv"
df = pd.read_csv(train_csv_path)
df

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns as well.
df.describe(include='all')

In [None]:
# Preview the first five rows of the dataset.
df.head()

In [None]:
# Show basic information about the dataset.
df.info()

# PREPROCESSING DATA
## Mengecek Data Yang Hilang

In [None]:
# Count the missing values in each column and print the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)


Output ini menunjukkan berapa banyak nilai yang hilang di setiap kolom. Terlihat bahwa kolom Age, Cabin, dan Embarked memiliki nilai yang hilang.

## Menangani Nilai yang Hilang
* Kolom Age: Memiliki 177 nilai yang hilang. Nilai yang hilang bisa diisi dengan median, mean, atau nilai yang diprediksi.

* Kolom Cabin: Memiliki 687 nilai yang hilang. Kolom ini bisa dihapus jika terlalu banyak data yang hilang.

* Kolom Embarked: Memiliki 2 nilai yang hilang. Nilai yang hilang bisa diisi dengan modus (nilai yang paling sering muncul).

In [None]:
# Fill the missing 'Age' values with the median age.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

In [None]:
# Fill the missing 'Embarked' values with the mode (the most frequent value).
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

In [None]:
# New feature 'HasCabin': 1 when the passenger has a 'Cabin' value, 0 when it is missing.
# Vectorized notna() replaces the row-by-row apply(lambda ...) and yields the same 0/1 values.
df['HasCabin'] = df['Cabin'].notna().astype(int)

In [None]:
# Drop the 'Cabin' column because too many of its values are missing.
df = df.drop(columns='Cabin')

In [None]:
# Re-check the per-column missing-value counts after processing.
df.isnull().sum()

# FEATURE ENGINEERING

## Mengubah Tipe Data
Beberapa kolom mungkin perlu diubah menjadi tipe data yang lebih sesuai, seperti mengubah kolom kategorikal menjadi numerik atau tipe kategori.

* Kolom Sex: Mengubah dari string ke numerik (misalnya, 'male' menjadi 0 dan 'female' menjadi 1).
* Kolom Embarked: Menggunakan encoding untuk mengubah data kategorikal menjadi numerik.

In [None]:
# Encode 'Sex' as a number: 'male' -> 0, 'female' -> 1.
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# column would match no dictionary key and silently turn every value into NaN.
sex_codes = {'male': 0, 'female': 1}
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map(sex_codes)

In [None]:
# One-hot encode 'Embarked' into dummy columns; drop_first=True omits the
# indicator column of the first category.
df = pd.get_dummies(data=df, columns=['Embarked'], drop_first=True)

## Menghapus Kolom yang Tidak Relevan
Beberapa kolom mungkin tidak relevan untuk analisis atau pembuatan model prediksi. Misalnya, Ticket dan Name tidak memberikan informasi yang signifikan untuk prediksi kelangsungan hidup.

In [None]:
# Drop the 'Ticket' and 'Name' columns.
# The result is rebound to df instead of using inplace=True, matching how the
# 'Cabin' column is dropped earlier in the notebook.
df = df.drop(columns=['Ticket', 'Name'])


## Memeriksa Duplikasi
Periksa apakah ada duplikasi dalam data, yang mungkin perlu dihapus.

In [None]:
# Count fully duplicated rows and report the total.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Jumlah duplikasi: {duplicates}")

## Memeriksa Kembali Data yang Bersih

In [None]:
# Preview the first rows of the dataset after cleaning.
df.head()


# FEATURE SELECTION

In [None]:
# Select the model inputs (X) and the prediction target (y).
# NOTE(review): only these four columns are used; the other prepared columns
# (e.g. 'Age', 'HasCabin', the 'Embarked' dummies) are not passed to the models.
features = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = df.loc[:, features]
y = df.loc[:, 'Survived']

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target column.
y

## Train-Test Split: Memisahkan Data untuk Dilatih dan Diuji

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Data train

In [None]:
# Display the training features.
X_train

In [None]:
# Display the training target.
y_train

### Data test

In [None]:
# Display the test features.
X_test

In [None]:
# Display the test target.
y_test

# MODELING

# Random Forest

In [None]:
# Initialise the (not yet fitted) random-forest classifier: 100 trees, fixed seed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

rf_params = dict(n_estimators=100, random_state=42)
rf_model = RandomForestClassifier(**rf_params)

In [None]:
# Train the random forest on the training split.
rf_model.fit(X_train, y_train)

In [None]:
# Predict labels for the test split with the random forest.
rf_predictions = rf_model.predict(X_test)

In [None]:
# Evaluate the random forest on the test split: per-class report plus overall accuracy.
rf_report = classification_report(y_test, rf_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Classifier")
print(rf_report)
print("Accuracy:", rf_accuracy)

# Logistic Regression

In [None]:
# Initialise the (not yet fitted) logistic-regression model: up to 1000 solver
# iterations, fixed seed.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr_params = dict(max_iter=1000, random_state=42)
lr_model = LogisticRegression(**lr_params)

In [None]:
# Train the logistic regression on the training split.
lr_model.fit(X_train, y_train)

In [None]:
# Predict labels for the test split with the logistic regression.
lr_predictions = lr_model.predict(X_test)

In [None]:
# Evaluate the logistic regression on the test split: per-class report plus overall accuracy.
lr_report = classification_report(y_test, lr_predictions)
lr_accuracy = accuracy_score(y_test, lr_predictions)
print("\nLogistic Regression")
print(lr_report)
print("Accuracy:", lr_accuracy)

# VISUALISASI

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# ROC inputs for the random forest. Column 1 of predict_proba (the second entry
# of rf_model.classes_) is used as the score for the curve and the AUC.
rf_class_probs = rf_model.predict_proba(X_test)
rf_probs = rf_class_probs[:, 1]
rf_roc_auc = roc_auc_score(y_test, rf_probs)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)

In [None]:
# ROC inputs for the logistic regression. Column 1 of predict_proba (the second
# entry of lr_model.classes_) is used as the score for the curve and the AUC.
lr_class_probs = lr_model.predict_proba(X_test)
lr_probs = lr_class_probs[:, 1]
lr_roc_auc = roc_auc_score(y_test, lr_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

In [None]:
# Confusion matrix for the random forest, drawn as an annotated heatmap
# (rows = actual class, columns = predicted class).
class_labels = ['Not Survived', 'Survived']
rf_conf_matrix = confusion_matrix(y_test, rf_predictions)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    rf_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Random Forest')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Confusion matrix for the logistic regression, drawn as an annotated heatmap
# (rows = actual class, columns = predicted class).
class_labels = ['Not Survived', 'Survived']
lr_conf_matrix = confusion_matrix(y_test, lr_predictions)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    lr_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Plot both models' ROC curves on shared axes; the dashed diagonal is the
# reference line from (0, 0) to (1, 1).
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(rf_fpr, rf_tpr, color='darkorange', lw=2, label='Random Forest (area = %0.2f)' % rf_roc_auc)
ax.plot(lr_fpr, lr_tpr, color='blue', lw=2, label='Logistic Regression (area = %0.2f)' % lr_roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve - Semua Model')
ax.legend(loc="lower right")
plt.show()

## Feature importances (khusus random forest)

In [None]:
# Feature importances from the fitted random forest, sorted largest first.
# The column labels get their own name so the `features` list defined in the
# feature-selection cell is not overwritten with a pandas Index.
importances = rf_model.feature_importances_
feature_names = X.columns
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x=feature_importances, y=feature_importances.index)
plt.title('Fitur Penting berdasarkan Random Forest')
# Label both axes so the chart can be read on its own.
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# An explicit, larger figure and two-decimal annotations keep the cell labels
# from crowding; the other plotting cells also size their figures explicitly.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# EDA

## Analisis kolom Age

In [None]:
#import library untuk EDA
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram of 'Age' split by 'Survived', with a KDE overlay.
# Missing ages were filled with the median in an earlier cell.
age_ax = sns.histplot(data=df, x='Age', hue='Survived', kde=True)
age_ax.set_title('Distribution of Age with Survived')
plt.show()

## Analisis Kolom Sex

In [None]:
# Count of passengers per 'Sex' value, split by 'Survived'.
# 'Sex' was encoded earlier in the notebook as 0 = male, 1 = female.
sex_count_ax = sns.countplot(data=df, x='Sex', hue='Survived')
sex_count_ax.set_title('Sex Distribution')
plt.show()

In [None]:
# Bar height is the mean of 'Survived' for each 'Sex' value (barplot's default estimator).
gender_ax = sns.barplot(data=df, x='Sex', y='Survived')
gender_ax.set_title('Survival Rate by Gender')
plt.show()

In [None]:
# Distribution of 'Sex' split by 'Survived'.
# 'Sex' was encoded earlier as 0 = male, 1 = female, so it is drawn as a discrete
# histogram (one bar group per code, no KDE curve) and the ticks are relabelled.
sns.histplot(data=df, x='Sex', hue='Survived', discrete=True, multiple='dodge')
plt.xticks([0, 1], ['male', 'female'])
plt.title('Distribution of Sex with Survived')
plt.show()

## Analisis Kolom Pclass

In [None]:
# Bar height is the mean of 'Survived' for each 'Pclass' value (barplot's default estimator).
pclass_ax = sns.barplot(data=df, x='Pclass', y='Survived')
pclass_ax.set_title('Survival Rate by Passenger Class')
plt.show()