# Titanic with Pytorch
Hello there! We'll try to solve the Titanic survival problem, first as a linear regression problem and then as a classification problem.

- We'll take a look at our data.
- Choose the features (feature selection).
- Create our custom dataset and batches using the PyTorch DataLoader, which makes this an easy task.

Let's dive in!

In [None]:
# Import all the necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch 
from torch import nn 
import matplotlib.pyplot as plt 
import torch.optim
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader, random_split
from numpy import array

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_inputs:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Exploring the data

In [None]:
# Load the training CSV (it contains the `Survived` column used as target below)
# and display its first rows.
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
train_data.head()

In [None]:
# Load the test CSV (the rows we will predict for the submission)
# and display its first rows.
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data.head()

In [None]:
# Number of rows in the training data
train_data.shape[0]

In [None]:
# Number of columns in the training data
train_data.shape[1]

After I created the batches I noticed `nan` values in the `Age` column, so I went back here and counted how many there are.

In [None]:
# Count the missing (NaN) values in the Age column
print(train_data['Age'].isna().sum())

We can either drop the `Age` column from training or drop only the rows that contain NaN values.

I chose to exclude `Age` from training.

In [None]:
# Find survival rate for women.
# `Survived` is a 0/1 flag, so its sum divided by the row count is the
# fraction of women who survived.
women = train_data.loc[train_data.Sex=='female']["Survived"]
rate_women = women.sum() / len(women)
# The label says "%", so render the fraction as a percentage.
F"% of women who survived: {rate_women:.2%}"

In [None]:
# Find survival rate for men.
# `Survived` is a 0/1 flag, so its sum divided by the row count is the
# fraction of men who survived.
men = train_data.loc[train_data.Sex=='male']['Survived']
men_rate = men.sum() / len(men)
# The label says "%", so render the fraction as a percentage.
F"% of men who survived: {men_rate:.2%}"

In [None]:
# Find the average ticket fare in each passenger class.
# One grouped mean replaces three separate filters; reindex keeps the
# 1st/2nd/3rd order and yields NaN (not a KeyError) if a class is absent.
fare_by_class = train_data.groupby("Pclass")["Fare"].mean().reindex([1, 2, 3])
fare_mean_1st, fare_mean_2nd, fare_mean_3rd = fare_by_class
F"Average cost of tickets for 1st, 2nd, 3rd classes: \
{fare_mean_1st} || {fare_mean_2nd} || {fare_mean_3rd}"


Now let's see how other factors affected the rate of survival.
The code below might not be the easiest to read, but if we take a second look it becomes clear that we are dividing the number of survivors (men/women) in a specific class by the total number of passengers (survived or not) of that sex in the same class.

In [None]:
# Survival rate of women within each passenger class:
# (female survivors in the class) / (all female passengers in the class).
female = train_data[train_data["Sex"] == "female"]
woman_survived_1st, woman_survived_2nd, woman_survived_3rd = (
    len(female[(female["Survived"] == 1) & (female["Pclass"] == pclass)].index)
    / len(female[female["Pclass"] == pclass].index)
    for pclass in (1, 2, 3)
)

F"Rate of Survival for women in different classes: {woman_survived_1st} || {woman_survived_2nd} || {woman_survived_3rd}"

In [None]:
# Survival rate of men within each passenger class:
# (male survivors in the class) / (all male passengers in the class).
# Stored under man_* names so the female rates computed earlier are not
# overwritten by the male ones.
male = train_data[train_data["Sex"] == "male"]
man_survived_1st, man_survived_2nd, man_survived_3rd = (
    len(male[(male["Survived"] == 1) & (male["Pclass"] == pclass)].index)
    / len(male[male["Pclass"] == pclass].index)
    for pclass in (1, 2, 3)
)

F"Rate of Survival for men in different classes: {man_survived_1st} || {man_survived_2nd} || {man_survived_3rd}"

In [None]:
# The average age
age_mean = train_data.Age.mean()

# Age statistics of the passengers who survived (filter once, reuse the subset)
survivor_ages = train_data.loc[train_data["Survived"] == 1, "Age"]
survived_age_mean = survivor_ages.mean()
survived_age_std = survivor_ages.std()
survived_min_age = survivor_ages.min()
survived_max_age = survivor_ages.max()

print("The average of survivals age ", survived_age_mean)
print("The STD of survivals age ", survived_age_std)
print("The min age of survivals ", survived_min_age)
print("The max age of survivals ", survived_max_age)

# The same statistics for the passengers who did not survive
deceased_ages = train_data.loc[train_data["Survived"] == 0, "Age"]
deceased_age_mean = deceased_ages.mean()
deceased_age_std = deceased_ages.std()
deceased_min_age = deceased_ages.min()
deceased_max_age = deceased_ages.max()

print()
print("The average of deceased age", deceased_age_mean)
print("The STD of deceased age", deceased_age_std)
print("The min age of deceased ", deceased_min_age)
print("The max age of deceased ", deceased_max_age)

As you can see, we could go on exploring the data forever. Now let's start preparing our data for machine learning.
We'll divide our data into features and labels, and also into training and testing sets.

In [None]:
# Splitting data into features and targets
features = ["Pclass", "Sex", "Fare", "SibSp", "Parch"]


# The target is the 0/1 Survived flag
y = train_data['Survived']

# Convert the Sex column into one-hot encoding.
# dtype=float keeps every column numeric so that .to_numpy() returns a float
# array (boolean dummy columns would otherwise produce an object array).
X = pd.get_dummies(train_data[features], dtype=float)
X_test = pd.get_dummies(test_data[features], dtype=float)
# Make sure the test matrix has exactly the training columns, in the same order
X_test = X_test.reindex(columns=X.columns, fill_value=0.0)


# Fill missing fares with the training median; a NaN input would otherwise
# propagate to a NaN prediction.
fare_median = X["Fare"].median()
X["Fare"] = X["Fare"].fillna(fare_median)
X_test["Fare"] = X_test["Fare"].fillna(fare_median)

# Min-max normalization. The statistics come from the training data only and
# are reused for the test data, so both sets are on the same scale.
fare_min = X["Fare"].min()
fare_range = (X["Fare"].max() - fare_min) or 1.0  # avoid 0/0 when all fares are equal
X["Fare"] = (X["Fare"] - fare_min) / fare_range
X_test["Fare"] = (X_test["Fare"] - fare_min) / fare_range
       

In [None]:
# Convert the pandas objects to NumPy arrays; the target gets an explicit
# trailing axis so it becomes a column of shape (n_rows, 1)
X, X_test = X.to_numpy(), X_test.to_numpy()
y = y.to_numpy()[:, np.newaxis]

In [None]:
# Row counts of the training and test arrays
F"Length of train data: {X.shape[0]}  Length of test data: {X_test.shape[0]} "

In [None]:
# Check the types of the data and shapes
print("The type of our data:\n", type(X))
print(type(X_test))



# Print the shapes
for caption, array_ in (
    ("\nThe shape of our training data: \n", X),
    ("\nThe shape of our targets: \n", y),
    ("\nThe shape of our test data: \n", X_test),
):
    print(caption, array_.shape)




## Batching the Data

In [None]:
# Batch the data for the linear regression
def batch_data(batch_size, input_data, target, test_data, train_type = "regression"):
    '''
    Build the training and test DataLoaders.

    batch_size: number of samples per training batch
    input_data: array-like with our input features, one row per sample
    target: array-like with our targets, one row per sample
    test_data: array-like of our test data (doesn't contain targets)
    train_type: "regression" keeps the targets as float32 in their given shape;
                "classification" converts them to a flat int64 vector of class ids

    Returns (train_loader, test_loader). The train loader shuffles and drops the
    last incomplete batch; the test loader yields the whole test set as one batch.

    Raises ValueError if train_type is neither "regression" nor "classification".
    '''
    if train_type == "regression":
        target_tensor = torch.tensor(np.asarray(target, dtype=np.float32))
    elif train_type == "classification":
        # reshape(-1) instead of squeeze(): a single-sample target must stay 1-D
        target_tensor = torch.tensor(np.asarray(target, dtype=np.int64)).reshape(-1)
    else:
        raise ValueError(
            "train_type must be 'regression' or 'classification', got {!r}".format(train_type)
        )

    # Going through np.asarray(..., float32) also accepts bool/object arrays
    # (e.g. from a DataFrame with mixed column dtypes).
    input_tensor = torch.tensor(np.asarray(input_data, dtype=np.float32))
    test_tensor = torch.tensor(np.asarray(test_data, dtype=np.float32))

    # Create our custom dataset with input and corresponding targets
    train_dataset = TensorDataset(input_tensor, target_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    # No need to create multiple batches for the test loader;
    # max(..., 1) keeps DataLoader from rejecting batch_size=0 on empty test data
    test_loader = DataLoader(test_tensor, batch_size=max(len(test_tensor), 1))

    return train_loader, test_loader
    
    
    

In [None]:
# Mini-batch size used for training; build the loaders for the regression model
batch_size = 64
train_loader, test_loader = batch_data(
    batch_size, input_data=X, target=y, test_data=X_test, train_type="regression"
)

In [None]:
# Check our data loader: pull one batch and look at its shapes and values.
# Use the built-in next(); DataLoader iterators no longer provide a .next() method.
data_iter = iter(train_loader)
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

## Building the Linear Regression Model

Now that everything is looking good, let's build our training model!

In [None]:

class LinearRegression(nn.Module):
    """Regression network with one hidden ReLU layer.

    Despite the name this is not a purely linear model: the input goes through
    a fully connected layer, a ReLU, and a second fully connected layer.

    in_features: number of input features per sample (default 6)
    hidden_size: width of the hidden layer (default 128)
    out_features: number of regression outputs per sample (default 1)
    """

    def __init__(self, in_features=6, hidden_size=128, out_features=1):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, out_features)

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, out_features) predictions."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)
    

In [None]:
# Instantiate the regression network and print its layer summary
linear_regression_model = LinearRegression()
print(linear_regression_model)

In [None]:
# Initialize the loss and optimization functions
lr = 0.003
criterion = nn.MSELoss() # mean square error
optimizer = torch.optim.SGD(linear_regression_model.parameters(), lr=lr)

## Training the Model

In [None]:
def train_model(model, epochs, cost_function, print_every = 10, loader=None, opt=None):
    """Train `model` and return the per-epoch losses together with the model.

    model: the nn.Module to optimize (updated in place)
    epochs: number of passes over the training loader
    cost_function: callable(prediction, labels) returning a scalar loss tensor
    print_every: print the epoch loss every `print_every` epochs (0/None disables)
    loader: iterable of (inputs, labels) batches; defaults to the module-level
            `train_loader`
    opt: optimizer stepping the model parameters; defaults to the module-level
         `optimizer`

    Returns (losses, model) where losses[i] is the sample-weighted average of the
    batch losses in epoch i.
    """
    # Fall back to the notebook globals so existing calls keep working
    if loader is None:
        loader = train_loader
    if opt is None:
        opt = optimizer

    losses = []
    model.train()  # make sure layers such as dropout are in training mode
    for e in range(epochs):
        running_loss = 0.0
        samples_seen = 0
        for inputs, labels in loader:
            opt.zero_grad()
            y_pred = model(inputs)
            loss = cost_function(y_pred, labels)

            # Perform the backpropagation and the optimization step
            loss.backward()
            opt.step()

            # Weight by the actual batch size and count the samples really seen
            # (the loader may drop an incomplete last batch).
            # NOTE(review): assumes cost_function averages over the batch - confirm.
            n_batch = inputs.size(0)
            running_loss += loss.item() * n_batch
            samples_seen += n_batch

        epoch_loss = running_loss / max(samples_seen, 1)
        losses.append(epoch_loss)

        # Test the current epoch index, not the total number of epochs
        if print_every and e % print_every == 0:
            print("Epoch: {} || Loss: {}".format(e, epoch_loss))

    return losses, model
 
    

In [None]:
# Train the regression network for 100 epochs with the MSE criterion
losses, linear_regression_model = train_model(
    model=linear_regression_model,
    epochs=100,
    cost_function=criterion,
)

## Testing

In [None]:
# Predict on the test data with the regression model
linear_regression_model.eval()  # inference mode for any mode-dependent layers
with torch.no_grad():
    # Collect every batch instead of keeping only the last one
    output = torch.cat([linear_regression_model(data) for data in test_loader])
# Threshold at 0.5 to get integer 0/1 labels. Unlike torch.round this cannot
# produce values outside {0, 1}, and a NaN output maps to 0.
preds = (output > 0.5).long().reshape(-1)
survived = preds.numpy()
        

In [None]:
# Sanity check: shape of the prediction array built from the test loader
survived.shape

In [None]:
# Build the submission table. The regression predictions come from a float
# network output, so cast them to int to write 0/1 rather than 0.0/1.0.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': np.asarray(survived).astype(int),
})
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the saved file back and show its first rows as a quick check
df = pd.read_csv("submission.csv")
df.head()

## Building the Classification  Model
- The number of outputs changes to 2 (one score per class)
- We'll use CrossEntropyLoss instead of MSELoss

In [None]:
# Rebuild the loaders with integer class-id targets for the classifier
train_loader, test_loader = batch_data(
    batch_size=64,
    input_data=X,
    target=y,
    test_data=X_test,
    train_type="classification",
)

In [None]:

class Clasification(nn.Module):
    """Classifier with two hidden ReLU layers, each followed by dropout.

    The forward pass returns raw class scores (no softmax is applied).

    in_features: number of input features per sample (default 6)
    hidden_size: width of both hidden layers (default 512)
    n_classes: number of output scores per sample (default 2)
    dropout: dropout probability applied after each hidden layer (default 0.2)
    """

    def __init__(self, in_features=6, hidden_size=512, n_classes=2, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, n_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, n_classes) scores."""
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        return self.fc3(x)
    

In [None]:
# Instantiate the classifier; the bare name on the last line shows its layer summary
classification_model = Clasification()
classification_model

In [None]:
# Initialize the loss and optimization functions
lr = 0.0005
criterion = nn.CrossEntropyLoss() # cross-entropy loss over the class scores
optimizer = torch.optim.SGD(classification_model.parameters(), lr=lr, momentum=0.9)

In [None]:
# Train the classifier for 1000 epochs with the cross-entropy criterion
losses, classification_model = train_model(
    model=classification_model,
    epochs=1000,
    cost_function=criterion,
)

## Testing & Submission
Here I'll test the classification model and save the submission.

In [None]:
# Predict on the test data with the classifier.
# eval() switches dropout off; without it units are still dropped at random
# during inference and the predictions are noisy.
classification_model.eval()
with torch.no_grad():
    # Collect every batch instead of keeping only the last one
    output = torch.cat([classification_model(data.float()) for data in test_loader])
    # The predicted class is the index of the largest score in each row
    preds = output.argmax(dim=1)
survived = preds.numpy()

In [None]:
# Start from the passenger ids, attach the predictions, and save the CSV
submission = test_data[['PassengerId']].copy()
submission['Survived'] = survived
submission.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Read the saved file back and show its first rows as a quick check
df = pd.read_csv("submission.csv")
df.head()