<a href="https://colab.research.google.com/github/dipit099/Deep_Learning-Colab/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime so that the CSV files stored under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Locations of the two Titanic CSV files on the mounted drive.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/datasets/titanic/train.csv'
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/datasets/titanic/test.csv'

# Load each file into its own DataFrame.
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)


In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Column labels of the training frame.
train_data.columns

In [None]:
# Labels of the columns that pandas stores with the generic `object` dtype.
object_columns = train_data.select_dtypes(include='object').columns
print(object_columns)

In [None]:
# Summary statistics of the training frame.
train_data.describe()

In [None]:
# Scatter plot of ticket fare against the survival flag, one colour per outcome.
fig, ax = plt.subplots(figsize=(10, 6))

sns.scatterplot(
    data=train_data,
    x='Fare',
    y='Survived',
    hue='Survived',
    palette={0: 'red', 1: 'green'},
    ax=ax,
)

# Title and axis labels.
ax.set_title('Fare vs Survived (Scatter Plot)')
ax.set_xlabel('Fare')
ax.set_ylabel('Survived (0 = No, 1 = Yes)')

plt.show()


In [None]:
# Data loading and preprocessing.
# Drop the columns that are not used as model features.
train_data = train_data.drop(['Name', 'Parch', 'SibSp', 'Ticket', 'Cabin'], axis=1)

# Decide which rows belong to the training / hold-out split *before* fitting any
# statistics, so the imputer and the scaler never see hold-out rows.
# Splitting row positions with the same test_size / random_state selects the same
# rows as splitting the feature tensors directly.
row_positions = list(range(len(train_data)))
train_pos, holdout_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Fill missing ports of embarkation with the most frequent training value before
# label encoding; otherwise a missing value is either encoded as a class of its
# own or rejected, depending on the scikit-learn version.
embarked_mode = train_data['Embarked'].iloc[train_pos].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)

# Mean-impute Age using the training rows only; the fitted imputer is reused
# later for the Kaggle test set.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_data.iloc[train_pos][['Age']])
train_data['Age'] = imputer.transform(train_data[['Age']])[:, 0]

# Encode the categorical columns as integers and keep the fitted encoders so the
# same mapping can be applied to the Kaggle test set.
label_encoders = {}
for column in ['Sex', 'Embarked']:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    label_encoders[column] = le

X = train_data.drop(['PassengerId', 'Survived'], axis=1)
y = train_data['Survived']

# Standardise the features with statistics computed on the training rows only.
scaler = StandardScaler()
scaler.fit(X.iloc[train_pos])
X = scaler.transform(X)

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.int64)

# Materialise the split chosen above.
X_train, X_test = X_tensor[train_pos], X_tensor[holdout_pos]
y_train, y_test = y_tensor[train_pos], y_tensor[holdout_pos]

# Wrap the tensors in datasets / loaders so they can be iterated in mini-batches.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Model definition
class TitanicModel(nn.Module):
    """Three-layer fully connected network that outputs one logit per sample.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            notebook-level ``X_tensor`` is used, which keeps the original
            ``TitanicModel()`` call working.

    The forward pass returns raw logits of shape ``(batch, 1)``; no sigmoid is
    applied, so the output is meant for a logits-based loss such as
    ``nn.BCEWithLogitsLoss``.
    """

    def __init__(self, input_dim=None):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: infer the width from the training tensor.
            input_dim = X_tensor.shape[1]
        self.layer_1 = nn.Linear(input_dim, 128)
        self.layer_2 = nn.Linear(128, 64)
        self.layer_3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()  # activation shared by both hidden layers

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` logits."""
        x = self.relu(self.layer_1(x))
        x = self.relu(self.layer_2(x))
        # No activation on the last layer: the loss expects raw logits.
        return self.layer_3(x)

# Seed PyTorch's global random generator so that the weight initialisation below
# is repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(42)

model = TitanicModel()
criterion = nn.BCEWithLogitsLoss()      # binary cross-entropy computed on raw logits
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimiser with learning rate 1e-3


In [None]:
# Training loop
# Number of full passes over the training data.
num_epochs = 10

# Per-epoch history, recorded so the training curves can be plotted afterwards.
epoch_losses = []
epoch_accuracies = []

for epoch in range(num_epochs):
    # Put the model in training mode.
    model.train()

    # Running totals for this epoch.
    epoch_loss = 0.0
    epoch_correct = 0

    for batch_X, batch_y in train_loader:
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass. Only the trailing feature dimension is squeezed, so a
        # final batch holding a single sample keeps shape (1,) and still matches
        # the target shape expected by the loss.
        outputs = model(batch_X).squeeze(1)

        loss = criterion(outputs, batch_y.float())

        # Back-propagate and update the parameters.
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size.
        epoch_loss += loss.item() * batch_X.size(0)

        # Training accuracy from the same logits (no gradient needed).
        batch_pred = torch.round(torch.sigmoid(outputs.detach()))
        epoch_correct += (batch_pred == batch_y).sum().item()

    # Average over all training samples.
    epoch_loss /= len(train_loader.dataset)
    epoch_losses.append(epoch_loss)
    epoch_accuracies.append(epoch_correct / len(train_loader.dataset))

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


In [None]:
# # Plot the training loss and accuracy
# fig, ax1 = plt.subplots(figsize=(12, 6))

# # Plot loss
# color = 'tab:red'
# ax1.set_xlabel('Epoch')
# ax1.set_ylabel('Loss', color=color)
# ax1.plot(range(1, num_epochs + 1), epoch_losses, marker='o', linestyle='-', color=color)
# ax1.tick_params(axis='y', labelcolor=color)

# # Create a second y-axis to plot accuracy
# ax2 = ax1.twinx()
# color = 'tab:blue'
# ax2.set_ylabel('Accuracy', color=color)
# ax2.plot(range(1, num_epochs + 1), epoch_accuracies, marker='x', linestyle='--', color=color)
# ax2.tick_params(axis='y', labelcolor=color)

# fig.tight_layout()
# plt.title('Training Loss and Accuracy over Epochs')
# plt.show()

In [None]:
# Function to evaluate the model on held-out data
def evaluate_model(model, test_loader, criterion):
    """Compute and print the average loss and accuracy of a binary classifier.

    Args:
        model: Module producing one logit per sample with shape ``(batch, 1)``.
        test_loader: Iterable of ``(features, labels)`` batches; labels are 0/1.
        criterion: Loss applied to ``(logits, float_labels)`` that returns the
            mean loss of the batch.

    Returns:
        ``(average_loss, accuracy)``, both averaged over the samples seen.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    # Evaluation mode; gradients are not needed here.
    model.eval()

    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            # Squeeze only the trailing dimension: a plain squeeze() would turn
            # a batch of one sample into a 0-d tensor that no longer matches the
            # (1,) target shape.
            outputs = model(batch_X).squeeze(-1)

            loss = criterion(outputs, batch_y.float())

            batch_size = batch_y.size(0)
            # The loss is a batch mean, so weight it by the batch size.
            total_loss += loss.item() * batch_size

            # Logits -> probabilities -> hard 0/1 predictions.
            predictions = torch.round(torch.sigmoid(outputs))
            correct += (predictions == batch_y).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError('test_loader yielded no samples to evaluate')

    # Divide by the number of samples actually processed.
    average_test_loss = total_loss / total
    accuracy = correct / total

    print(f'Average Test Loss: {average_test_loss:.4f}')
    print(f'Accuracy: {accuracy:.4f}')

    return average_test_loss, accuracy

# Evaluate the trained model on the hold-out loader (test_loader) built from the
# train/test split of the training data.
evaluate_model(model, test_loader, criterion)


In [None]:
# Predictions: apply the training-time preprocessing to the Kaggle test set.

# Drop the same unused columns as for the training data.
test_data = test_data.drop(['Name', 'Parch', 'SibSp', 'Ticket', 'Cabin'], axis=1)

# Impute Age with the imputer fitted on the training data.
test_data['Age'] = imputer.transform(test_data[['Age']])[:, 0]

# Fare is a model feature but was never imputed; a missing fare would turn the
# prediction for that passenger into NaN. Fill it with the training median
# (rows must not be dropped, every passenger needs a prediction).
test_data['Fare'] = test_data['Fare'].fillna(train_data['Fare'].median())

# Encode the categorical columns with the encoders fitted on the training data.
for column in ['Sex', 'Embarked']:
    test_data[column] = label_encoders[column].transform(test_data[column])



In [None]:
# Do not remove rows from the test data: every passenger needs a prediction.
# Number of missing values left in each test column.
test_data.isnull().sum()

In [None]:
# (rows, columns) of the preprocessed test frame.
test_data.shape

In [None]:
# Preview the first rows of the preprocessed test frame.
test_data.head()

In [None]:
# Prepare the Kaggle test data for prediction.

# Keep 'PassengerId' aside for the output file; it is an identifier, not a feature.
PassengerId = test_data['PassengerId']

# A separate name is used so the hold-out tensor X_test from the train/test
# split is not overwritten.
X_submission = test_data.drop(['PassengerId'], axis=1)

# Apply the scaler fitted on the training data: the model was trained on
# standardised features, so it must be fed standardised features here as well.
# transform() also returns a NumPy array, which torch.tensor can convert.
X_submission_scaled = scaler.transform(X_submission)

X_test_tensor = torch.tensor(X_submission_scaled, dtype=torch.float32)

# Guard against missing feature values: in standardised space 0.0 is the
# training mean of each feature, so it is a neutral replacement for NaN.
X_test_tensor[torch.isnan(X_test_tensor)] = 0.0

# Evaluation mode, no gradients needed for inference.
model.eval()

with torch.no_grad():
    # Squeeze only the trailing dimension so the result stays 1-D even for a
    # single-row input.
    outputs = model(X_test_tensor).squeeze(1)

    # Logits -> probabilities -> hard 0/1 predictions.
    predictions = torch.round(torch.sigmoid(outputs)).long()



In [None]:
# Destination of the submission file on the mounted drive.
submission_path = '/content/drive/MyDrive/Colab Notebooks/datasets/titanic/output.csv'

# One row per passenger: the id next to its predicted label.
submission_columns = {'PassengerId': PassengerId, 'Survived': predictions.numpy()}
output_df = pd.DataFrame(submission_columns)

# Write the table without the DataFrame index.
output_df.to_csv(submission_path, index=False)

# Confirm that the file has been written.
print('Output saved to output.csv')
