# Titanic with Pytorch
Hello there! We'll try to solve the Titanic survival problem first as a linear regression problem and then as a classification problem.

- We'll take a look at our data.
- Choose the features (feature selection)
- Creating our custom dataset and batches using Pytorch DataLoader which will make it an easy task.

Let's dive in!

In [None]:
# Import all the necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# This is for data exploration and visualization
import seaborn as sns 
from scipy import stats
import matplotlib.pyplot as plt 

# The following is basically for building and training our models
import torch 
from torch import nn 
import torch.optim
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.ensemble import RandomForestRegressor

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in files]
    for path in full_paths:
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Exploring the data

In [None]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
train_data.head()

In [None]:
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data.head()

In [None]:
# Check the length and columns
len(train_data.index)

In [None]:
len(train_data.columns)

In [None]:
# Find the survival rate for women (the mean of the 0/1 `Survived` flag)
women = train_data.loc[train_data.Sex=='female']["Survived"]
rate_women = women.mean()
# `:.2%` renders the fraction as a real percentage so the number matches the label
F"% of women who survived: {rate_women:.2%}"

In [None]:
# Find the survival rate for men (the mean of the 0/1 `Survived` flag)
men = train_data.loc[train_data.Sex=='male']['Survived']
men_rate = men.mean()
# `:.2%` renders the fraction as a real percentage so the number matches the label
F"% of men who survived: {men_rate:.2%}"

In [None]:
# Find the average ticket fare in each passenger class (1st, 2nd, 3rd)
fare_mean_1st = train_data.loc[train_data["Pclass"] == 1, "Fare"].mean()
fare_mean_2nd = train_data.loc[train_data["Pclass"] == 2, "Fare"].mean()
fare_mean_3rd = train_data.loc[train_data["Pclass"] == 3, "Fare"].mean()
F"Average cost of tickets for 1st, 2nd, 3rd classes: \
{fare_mean_1st} || {fare_mean_2nd} || {fare_mean_3rd}"


Now let's see how other factors affected the survival rate.
The code below might not be the easiest to read, but on a second look it becomes clear that we are dividing the number of survivors (women or men) by the total number of passengers of that sex (survived or not) in each class.

In [None]:
# Survival rate for women in each passenger class:
# (women in the class who survived) / (all women in the class)
is_female = train_data["Sex"] == "female"
did_survive = train_data["Survived"] == 1

female_rates = {}
for pclass in (1, 2, 3):
    women_in_class = is_female & (train_data["Pclass"] == pclass)
    # Convert the counts to plain ints so the ratio is an ordinary Python float
    female_rates[pclass] = int((women_in_class & did_survive).sum()) / int(women_in_class.sum())

woman_survived_1st = female_rates[1]
woman_survived_2nd = female_rates[2]
woman_survived_3rd = female_rates[3]

F"Rate of Survival for women in different classes: {woman_survived_1st} || {woman_survived_2nd} || {woman_survived_3rd}"

In [None]:
# Survival rate for men in each passenger class:
# (men in the class who survived) / (all men in the class).
# Stored under `man_survived_*` so the women's results are not overwritten.
is_male = train_data["Sex"] == "male"
survived_mask = train_data["Survived"] == 1

male_rates = {}
for pclass in (1, 2, 3):
    men_in_class = is_male & (train_data["Pclass"] == pclass)
    male_rates[pclass] = int((men_in_class & survived_mask).sum()) / int(men_in_class.sum())

man_survived_1st = male_rates[1]
man_survived_2nd = male_rates[2]
man_survived_3rd = male_rates[3]

F"Rate of Survival for men in different classes: {man_survived_1st} || {man_survived_2nd} || {man_survived_3rd}"

In [None]:
# The average age over every row of the training data
age_mean = train_data["Age"].mean()

# Age statistics for the passengers who survived
survivor_ages = train_data.loc[train_data["Survived"] == 1, "Age"]
survived_age_mean = survivor_ages.mean()
survived_age_std = survivor_ages.std()
survived_min_age = survivor_ages.min()
survived_max_age = survivor_ages.max()

for label, value in (
    ("The average of survivals age ", survived_age_mean),
    ("The STD of survivals age ", survived_age_std),
    ("The min age of survivals ", survived_min_age),
    ("The max age of survivals ", survived_max_age),
):
    print(label, value)

# The same statistics for the passengers who did not survive
deceased_ages = train_data.loc[train_data["Survived"] == 0, "Age"]
deceased_age_mean = deceased_ages.mean()
deceased_age_std = deceased_ages.std()
deceased_min_age = deceased_ages.min()
deceased_max_age = deceased_ages.max()

print()
for label, value in (
    ("The average of deceased age", deceased_age_mean),
    ("The STD of deceased age", deceased_age_std),
    ("The min age of deceased ", deceased_min_age),
    ("The max age of deceased ", deceased_max_age),
):
    print(label, value)

## Investigating Deeper into the data
I actually added this part after unsuccessfully trying to increase the accuracy of my models (both linear regression and classification). So I decided to explore the data further and check whether there are any null values and/or outliers that could affect the results.


In [None]:
# Show Any Null values in the data
train_data.isnull().sum()

In [None]:
test_data.isnull().sum()

A box plot is a good way to detect outliers: they show up as individual dots lying away from the quartiles.

In [None]:
sns.boxplot(x=train_data['Age'])

In [None]:
sns.boxplot(x=train_data['SibSp'])

Just to have a better visualization for our data let's have a better look with some histograms

In [None]:
# Bar chart: number of distinct PassengerId values for each value of Sex
fig= plt.figure(figsize=(10,5))
# groupby + nunique counts the unique passengers per group; plotted as bars on the current figure
train_data.groupby('Sex')['PassengerId'].nunique().plot(kind='bar')
plt.xlabel('Sex')
plt.title('Number of records by Sex')
plt.show()

In [None]:
# Bar chart: number of distinct PassengerId values for each distinct Age value
# (a wider figure than the previous chart)
fig= plt.figure(figsize=(20,10))
train_data.groupby('Age')['PassengerId'].nunique().plot(kind='bar')
plt.xlabel('Age')
plt.title('Number of records by Age')
plt.show()

In [None]:
# Before fixing our data, drop the columns we do not want to keep
unwanted_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
X = train_data.drop(columns=unwanted_columns)
X_test = test_data.drop(columns=unwanted_columns)

### Preprocessing Data
But first make sure that we convert categorical data into one hot encoding and fix any NAN values in the train data

In [None]:
# One-hot encode the categorical columns of both frames
X, X_test = pd.get_dummies(X), pd.get_dummies(X_test)

# Replace every missing value with the mean of its own column
X = X.fillna(X.mean())
X_test = X_test.fillna(X_test.mean())

In [None]:
X.isnull().sum()

In [None]:
X_test.isnull().sum()

Awesome! No more null values

In [None]:
# Split the data into input features and the target column
features = ["Pclass", "Sex_female", "Age", "Fare", "SibSp", "Parch"]

# The target must be read before X is narrowed down to the feature columns
y = X['Survived']

# Cast to float so boolean/integer dummy columns support the arithmetic below
X = pd.DataFrame(X, columns=features).astype(float)
X_test = pd.DataFrame(X_test, columns=features).astype(float)

# Min-max scale every feature to [0, 1] using the TRAINING minimum and range.
# - A z-score step before min-max scaling has no effect on the result (min-max
#   scaling cancels any prior shift and rescale of the column), so it is omitted.
# - The test set reuses the training statistics so that the same raw value maps
#   to the same model input in both sets. Test values outside the training range
#   therefore fall slightly outside [0, 1].
train_min = X.min()
train_range = X.max() - train_min

X = (X - train_min) / train_range
X_test = (X_test - train_min) / train_range
       

## Training the Model with Random Forest

In [None]:

# Fit a random forest regressor on the training features and score the test set
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
model.fit(X, y)
predictions = model.predict(X_test)

# The regressor returns continuous scores, while `Survived` is a 0/1 label:
# threshold at 0.5 before writing the submission
survived_rf = (predictions >= 0.5).astype(int)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': survived_rf})
output.to_csv('random_forest_submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
df = pd.read_csv("random_forest_submission.csv")
df

In [None]:
# Switch from pandas objects to plain numpy arrays
X, X_test = X.to_numpy(), X_test.to_numpy()
# The targets become a column vector with one row per sample
y = y.to_numpy().reshape(-1, 1)

In [None]:
F"Length of train data: {len(X)}  Length of test data: {len(X_test)} "

In [None]:
# Check the types and shapes of the data
print("The type of our data:\n", type(X))
print(type(X_test))

# Print the shapes
for heading, array in (
    ("\nThe shape of our training data: \n", X),
    ("\nThe shape of our targets: \n", y),
    ("\nThe shape of our test data: \n", X_test),
):
    print(heading, array.shape)




In [None]:
X_test

## Batching the Data

In [None]:
# Batch the data for either the regression or the classification model
def batch_data(batch_size, input_data, target, test_data, train_type = "regression", val_size=0.1):
    '''
    Build the training, validation and test DataLoaders.

    batch_size: number of samples in each training/validation batch
    input_data: numpy array with our input features
    target: numpy array with our targets, one entry per row of input_data
    test_data: numpy array of our test data (doesn't contain targets)
    train_type: "regression" keeps the targets as a float tensor in their
        original shape; "classification" converts them to a 1-D long tensor
    val_size: fraction of the training samples held out for validation

    Returns (train_loader, val_loader, test_loader). The train and validation
    loaders sample from disjoint random subsets of the same dataset; the test
    loader yields every test row in one single batch.

    Raises ValueError when train_type is not one of the two supported values.
    '''
    if train_type == "regression":
        target_tensor = torch.FloatTensor(target)
    elif train_type == "classification":
        # Flatten to 1-D (also safe for a single-row target)
        target_tensor = torch.LongTensor(target).reshape(-1)
    else:
        raise ValueError(
            "train_type must be 'regression' or 'classification', got {!r}".format(train_type)
        )

    input_tensor = torch.FloatTensor(input_data)
    test_tensor = torch.FloatTensor(test_data)

    # Create our custom dataset with inputs and their corresponding targets
    train_dataset = TensorDataset(input_tensor, target_tensor)

    # Shuffle the sample indices, then hold out the first `val_split` for validation
    num_train = len(train_dataset)
    indices = list(range(num_train))
    np.random.shuffle(indices)
    val_split = int(np.floor(val_size * num_train))

    train_idx, val_idx = indices[val_split:], indices[:val_split]

    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
    val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)
    # No need to create multiple batches for the test loader
    test_loader = DataLoader(test_tensor, batch_size=len(test_data))

    return train_loader, val_loader, test_loader
    
    
    

In [None]:
batch_size = 64
train_loader, val_loader, test_loader = batch_data(batch_size, X, y, X_test)

In [None]:
# Check our data loader by pulling a single batch from it
data_iter = iter(train_loader)
# Use the built-in next(): the loader iterator follows the standard iterator
# protocol, and `.next()` is not part of that protocol
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

## Building the Linear Regression Model

Now that everything is looking good, let's build our training model!

In [None]:

class LinearRegression(nn.Module):
    '''
    A single linear layer mapping the input features to the regression output.

    in_features: number of input features per sample (defaults to 6)
    out_features: number of values predicted per sample (defaults to 1)
    '''
    def __init__(self, in_features=6, out_features=1):
        super().__init__()
        # Keep the attribute name `fc1` so previously saved state dicts still load
        self.fc1 = nn.Linear(in_features, out_features)

    def forward(self, x):
        '''Return the raw, unbounded linear prediction for a batch x.'''
        return self.fc1(x)
    

In [None]:
linear_regression_model = LinearRegression()
print(linear_regression_model)

In [None]:
# Initialize the loss and optimization functions
lr = 0.001
criterion = nn.MSELoss() # mean square error
# SGD with momentum over the parameters of the linear regression model
optimizer = torch.optim.SGD(linear_regression_model.parameters(), lr=lr, momentum=0.9)
batch_size = 64

## Training the Model

In [None]:
def train_model(model, batch_size, epochs, cost_function, print_every = 100,
                train_loader=None, val_loader=None, optimizer=None,
                save_path='model_linear.pt'):
    '''
    Train `model`, validate it after every epoch and checkpoint the best weights.

    model: the nn.Module to train
    batch_size: kept for backward compatibility and no longer used; each batch
        loss is weighted by the real number of rows in that batch
    epochs: number of passes over the training loader
    cost_function: loss called as cost_function(output, labels)
    print_every: print the losses every `print_every` epochs (starting at epoch 0)
    train_loader, val_loader, optimizer: objects to train with; when left as
        None, the notebook-level variables of the same name are used
    save_path: file that receives the state dict whenever the validation loss
        is at least as good as the best one seen so far
    '''
    # Fall back to the notebook-level objects so existing calls keep working
    module_scope = globals()
    if train_loader is None:
        train_loader = module_scope['train_loader']
    if val_loader is None:
        val_loader = module_scope['val_loader']
    if optimizer is None:
        optimizer = module_scope['optimizer']

    val_loss_min = float('inf')

    for e in range(epochs):
        val_loss = 0.0
        train_loss = 0.0

        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            output = model(inputs)
            loss = cost_function(output, labels)

            # Perform the backpropagation and the optimization step
            loss.backward()
            optimizer.step()

            # Weight by the real batch size: the last batch is usually smaller
            train_loss += loss.item() * inputs.size(0)

        # Evaluate the model; no gradients are needed for validation
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                output = model(inputs)
                loss = cost_function(output, labels)

                val_loss += loss.item() * inputs.size(0)

        # Per-sample averages over the subsets the samplers draw from
        train_loss = train_loss / len(train_loader.sampler)
        val_loss = val_loss / len(val_loader.sampler)

        # Save the model if the validation loss has decreased
        if val_loss <= val_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            val_loss_min, val_loss))
            val_loss_min = val_loss
            torch.save(model.state_dict(), save_path)

        # Test the epoch index, not the total epoch count
        if e % print_every == 0:
            print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(e, train_loss, val_loss))

    print("Best model with validation loss: {}". format(val_loss_min))

 
    

In [None]:
train_model(linear_regression_model, batch_size, epochs=3000, cost_function=criterion)

In [None]:
linear_regression_model.load_state_dict(torch.load('model_linear.pt'))

## Testing

In [None]:
# Predict on the test data (the test loader yields all rows as one batch)
linear_regression_model.eval()
with torch.no_grad():
    for data in test_loader:
        output = linear_regression_model(data)
        # The linear output is unbounded, so clamp it to [0, 1] before rounding;
        # otherwise a prediction could round to a label other than 0 or 1
        preds = torch.round(torch.clamp(output, 0.0, 1.0))
    # Flatten the predictions to one value per test row
    preds = preds.reshape(-1)
    survived = preds.numpy()
        

In [None]:
survived = survived.astype('int')

In [None]:
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': survived})
submission.to_csv('submission_regression.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
df = pd.read_csv("submission_regression.csv")
df

## Building the Classification  Model
- The number of outputs changes to 2 here (one score per class)
- We'll use CrossEntropyLoss instead of MSELoss

In [None]:
train_loader, val_loader, test_loader = batch_data(batch_size, X, y, X_test, train_type="classification", val_size=0.2)

In [None]:

class Clasification(nn.Module):
    '''
    Fully connected classifier: two hidden ReLU layers, each followed by
    dropout, then a linear layer producing one raw score (logit) per class.

    in_features: number of input features per sample (defaults to 6)
    hidden_sizes: widths of the two hidden layers (defaults to (512, 128))
    n_classes: number of output scores per sample (defaults to 2)
    dropout: dropout probability applied after each hidden layer (defaults to 0.5)
    '''
    def __init__(self, in_features=6, hidden_sizes=(512, 128), n_classes=2, dropout=0.5):
        super().__init__()
        first_hidden, second_hidden = hidden_sizes
        # Keep the attribute names fc1/fc2/fc3 so saved state dicts still load
        self.fc1 = nn.Linear(in_features, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, n_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''Return the class scores for a batch x; no softmax is applied here.'''
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        return self.fc3(x)
    

In [None]:
classification_model = Clasification()
classification_model

In [None]:
# Initialize the loss and optimization functions
lr = 0.0005
criterion = nn.CrossEntropyLoss() # cross-entropy over the class scores (not mean square error)
# SGD with momentum over the parameters of the classification model
optimizer = torch.optim.SGD(classification_model.parameters(), lr=lr, momentum=0.9)
# NOTE(review): data loaders that already exist keep the batch size they were built with;
# confirm this value matches it before relying on it
batch_size = 128

In [None]:
train_model(classification_model, batch_size, 1000, criterion)

As a side note, deep learning is better suited to large datasets; handling large amounts of data is the main reason it is so widely used. So it is understandable that it might not perform perfectly on a dataset this small.

## Testing & Submission
Here I'll test the classification model and save the submission

In [None]:
# Predict on the test data with the classification model
# Restore the best checkpoint first, as is done for the regression model
# (train_model writes its checkpoint to 'model_linear.pt' by default)
classification_model.load_state_dict(torch.load('model_linear.pt'))
# Make sure dropout is switched off for inference
classification_model.eval()
with torch.no_grad():
    for data in test_loader:
        output = classification_model(data.float())
        # The predicted class is the index of the largest score in each row
        preds = output.argmax(dim=1)
survived = preds.numpy()

In [None]:
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': survived})
submission.to_csv('submission_classification.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
df = pd.read_csv("submission_classification.csv")
df.head()

References: 
- https://www.kaggle.com/frtgnn/introduction-to-pytorch-a-very-gentle-start
- https://www.kaggle.com/kiranscaria/titanic-pytorch
- https://www.kaggle.com/alexisbcook/getting-started-with-titanic
- https://www.analyticsvidhya.com/blog/2015/12/improve-machine-learning-results/
- https://androidkt.com/detect-and-remove-outliers-from-pandas-dataframe/