<a href="https://www.kaggle.com/code/hetarthchopra/neural-network-feature-selection?scriptVersionId=114404956" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory 

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook is a code walkthrough of the Captum tutorial at https://captum.ai/tutorials/Titanic_Basic_Interpret. 
I have used it to get up to speed with Captum, PyTorch and Model Interpretability. 

Apart from that, I will first build a model with all the features, and then try to improve the accuracy by removing the less important features, as identified with the Integrated Gradients approach.

In [None]:
!pip install captum

In [None]:
import numpy as np
import torch
import torch.nn as nn

from captum.attr import IntegratedGradients
from captum.attr import LayerConductance
from captum.attr import NeuronConductance

import matplotlib
import matplotlib.pyplot as plt
!matplotlib.inline

from scipy import stats
import pandas as pd

In [None]:
# Read the Titanic training set.
titanic_data = pd.read_csv('/kaggle/input/titanic/train.csv')

# One-hot encode the categorical columns (Sex, Embarked, Pclass).
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 returns bool
# dummies by default, and a frame mixing bool and float columns converts to an
# object-dtype NumPy array, which PyTorch cannot turn into a tensor.
titanic_data = pd.concat([titanic_data,
                          pd.get_dummies(titanic_data['Sex'], dtype=float),
                          pd.get_dummies(titanic_data['Embarked'], prefix="embark", dtype=float),
                          pd.get_dummies(titanic_data['Pclass'], prefix="pclass", dtype=float)], axis=1)

# Fill missing Age / Fare values with the respective column mean.
titanic_data["Age"] = titanic_data["Age"].fillna(titanic_data["Age"].mean())
titanic_data["Fare"] = titanic_data["Fare"].fillna(titanic_data["Fare"].mean())

# Remove the raw categorical columns (now one-hot encoded) together with the
# Name / Ticket / Cabin / PassengerId columns, which are not used as features.
titanic_data = titanic_data.drop(['Name','Ticket','Cabin','Sex','Embarked','Pclass','PassengerId'], axis=1)

In [None]:
# Preview the first rows of titanic_data.
titanic_data.head()

In [None]:
# Seed NumPy's global RNG so that subsequent random draws are repeatable.
np.random.seed(100)

# Split off the target column, remember the predictor names, and move
# both the labels and the predictors to NumPy arrays.
labels = titanic_data['Survived'].to_numpy()
titanic_data = titanic_data.drop(columns=['Survived'])
feature_names = titanic_data.columns.tolist()
data = titanic_data.to_numpy()

In [None]:
# Sample 70% of the row indices (without replacement) for training and use
# every remaining index for testing.
n_rows = len(labels)
train_indices = np.random.choice(n_rows, int(0.7*n_rows), replace=False)
test_indices = list(set(range(n_rows)) - set(train_indices))
train_features, train_labels = data[train_indices], labels[train_indices]
test_features, test_labels = data[test_indices], labels[test_indices]

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label."""

    def __init__(self, x,y):
        """Keep references to the features ``x`` and the labels ``y``."""
        super().__init__()
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``x``."""
        return len(self.x)

    def __getitem__(self,idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample, target = self.x[idx], self.y[idx]
        return sample, target

# Wrap each split in a Dataset, then in a DataLoader using the DataLoader
# defaults (batch_size=1, shuffle=False).
train_dataset = Dataset(train_features,train_labels)
test_dataset = Dataset(test_features,test_labels)
train_loader = torch.utils.data.DataLoader(train_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset)

## Build a Baseline Model

In [None]:
# Seed PyTorch's RNG so the randomly initialised layer weights are repeatable.
torch.manual_seed(1)

class Titanic_Model(nn.Module):
    """Fully connected classifier: in_features -> 12 -> 8 -> 2.

    Both hidden layers use a sigmoid activation and the output layer is
    followed by a softmax over dim=1, so ``forward`` returns one row of two
    class probabilities per input sample.

    Args:
        in_features: number of input features per sample. Defaults to 12,
            the width this model was originally hard-coded for; pass another
            value to reuse the class on a dataset with a different number of
            columns.
    """

    def __init__(self, in_features=12):
        super().__init__()
        # Layers are kept as separately named attributes so that each one can
        # be addressed individually (e.g. by layer-level attribution tools).
        self.linear1 = nn.Linear(in_features,12)
        self.sigmoid1 = nn.Sigmoid()
        self.linear2 = nn.Linear(12,8)
        self.sigmoid2 = nn.Sigmoid()
        self.linear3 = nn.Linear(8,2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self,x):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, 2)`` probabilities."""
        lin1_out = self.linear1(x)
        sigmoid1_out = self.sigmoid1(lin1_out)
        lin2_out = self.linear2(sigmoid1_out)
        sigmoid2_out = self.sigmoid2(lin2_out)
        lin3_out = self.linear3(sigmoid2_out)
        softmax_out = self.softmax(lin3_out)
        return softmax_out

In [None]:
# Training setup: fresh model, cross-entropy loss and the Adam optimiser.
model = Titanic_Model()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# total_loss collects the mean loss of every epoch. total_acc and feat_imp are
# only initialised here; this loop does not update them.
total_loss, total_acc = list(),list()
feat_imp = np.zeros(train_features.shape[1])
num_epochs = 200

model.train()
for epoch in range(num_epochs):
    losses = 0
    for x, y in train_loader:
        # The model expects float32 inputs; CrossEntropyLoss expects int64 class indices.
        x, y = x.float(), y.long()
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        # The inputs do not need requires_grad: only the parameters are optimised.
        preds = model(x)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        losses += loss.item()
    # Mean loss over all batches of this epoch.
    total_loss.append(losses/len(train_loader))
    if epoch%5==0:
        print("Epoch:", str(epoch+1), "\tLoss:", total_loss[-1])

In [None]:
# save the model
# Only the state_dict (the parameter tensors) is written, not the model class,
# so loading it back requires constructing the model first and then calling
# load_state_dict on it.
torch.save(model.state_dict(), '/kaggle/working/titanic_model.pt')

In [None]:
# Evaluate on the held-out split: count correct predictions over all batches.
model.eval()
correct=0
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.float(), y.long()
        pred = model(x)
        # argmax along the class dimension gives one predicted class per sample,
        # so the count stays correct for any batch size (not only batch_size=1).
        preds_class = torch.argmax(pred, dim=1)
        correct += (preds_class == y).sum().item()
# Divide by the number of evaluated samples.
print("Accuracy = ", correct/len(test_loader.dataset))

In [None]:
# Convert the NumPy test features into a float32 tensor for attribution.
test_input_tensor = torch.from_numpy(test_features).float()

## Calculate the Integrated Gradients

In [None]:
# Build a Captum Integrated Gradients attribution object around the current model.
ig = IntegratedGradients(model)

In [None]:
# Enable gradient tracking on the input tensor (in place).
test_input_tensor.requires_grad_()
# Attribute the model output at index 1 to every input feature; `delta` is the
# convergence delta returned because return_convergence_delta=True.
attr, delta = ig.attribute(test_input_tensor, target = 1, return_convergence_delta  = True)
# Detach from the autograd graph and convert to NumPy for analysis and plotting.
attr = attr.detach().numpy()

In [None]:
# Helper to print per-feature importances and optionally draw them as a bar chart
def visualize_importances(feature_names, importances, title="Average Feature Importances", plot=True, axis_title="Features"):
    """Print every feature's importance and, if requested, plot them.

    Args:
        feature_names: sequence of feature labels.
        importances: sequence of scores, indexed in the same order as ``feature_names``.
        title: heading printed first and used as the plot title.
        plot: when True, also draw a bar chart with matplotlib.
        axis_title: label for the x-axis of the bar chart.
    """
    print(title)
    for position, name in enumerate(feature_names):
        print(name, ": ", '%.3f'%(importances[position]))
    if not plot:
        return
    x_pos = np.arange(len(feature_names))
    plt.figure(figsize=(12,6))
    plt.bar(x_pos, importances, align='center')
    plt.xticks(x_pos, feature_names, wrap=True)
    plt.xlabel(axis_title)
    plt.title(title)
# Average the absolute attributions over all test samples (axis 0), per feature.
mean_abs_attr = np.abs(attr).mean(axis=0)
visualize_importances(feature_names, mean_abs_attr)

## Get Top K Least Important Features

In [None]:
k_features=4
# Rank the features by their mean absolute Integrated Gradients attribution.
# feat_imp used to be the all-zeros placeholder created in the training cell,
# which made this "ranking" purely alphabetical by feature name.
feat_imp = np.mean(np.abs(attr), axis=0)
# Stable ascending sort: the k least important features come first.
least_important = np.argsort(feat_imp, kind='stable')[:k_features]
features_to_be_dropped = [feature_names[i] for i in least_important]

## Make the New Dataset

In [None]:
# Rebuild the dataset without the low-importance features: the Embarked
# dummies are never created and Parch is dropped. Adjust this list to drop
# other features that show low importance.
# Read the dataset again from disk.
titanic_data = pd.read_csv('/kaggle/input/titanic/train.csv')

# One-hot encode Sex and Pclass only.
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 returns bool
# dummies by default, and a frame mixing bool and float columns converts to an
# object-dtype NumPy array, which PyTorch cannot turn into a tensor.
titanic_data = pd.concat([titanic_data,
                          pd.get_dummies(titanic_data['Sex'], dtype=float),
                          pd.get_dummies(titanic_data['Pclass'], prefix="pclass", dtype=float)], axis=1)

# Fill missing Age / Fare values with the respective column mean.
titanic_data["Age"] = titanic_data["Age"].fillna(titanic_data["Age"].mean())
titanic_data["Fare"] = titanic_data["Fare"].fillna(titanic_data["Fare"].mean())

# Drop the raw categorical columns, the unused text / id columns, and Parch.
titanic_data = titanic_data.drop(['Name','Ticket','Cabin','Sex','Embarked','Pclass','PassengerId','Parch'], axis=1)

In [None]:
# Split off the target column, remember the remaining predictor names, and
# move both the labels and the predictors to NumPy arrays.
labels = titanic_data['Survived'].to_numpy()
titanic_data = titanic_data.drop(columns=['Survived'])
feature_names = titanic_data.columns.tolist()
data = titanic_data.to_numpy()

In [None]:
# separate train and test data
# Re-seed with the same value used before the baseline split (100). Without
# this the global RNG has already advanced, so the reduced model would be
# trained and tested on different rows than the baseline and the two
# accuracies would not be comparable.
np.random.seed(100)
train_indices = np.random.choice(len(labels), int(0.7*len(labels)), replace=False)
test_indices = list(set(range(len(labels))) - set(train_indices))
train_features = data[train_indices]
train_labels = labels[train_indices]
test_features = data[test_indices]
test_labels = labels[test_indices]

In [None]:
# Wrap each split of the reduced dataset in a Dataset, then in a DataLoader
# using the DataLoader defaults (batch_size=1, shuffle=False).
train_dataset = Dataset(train_features,train_labels)
test_dataset = Dataset(test_features,test_labels)
train_loader = torch.utils.data.DataLoader(train_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset)

## Retrain model with different dataset

In [None]:
# Seed PyTorch's RNG so the randomly initialised layer weights are repeatable.
torch.manual_seed(1)

class Titanic_Model(nn.Module):
    """Fully connected classifier for the reduced feature set: in_features -> 12 -> 8 -> 2.

    Both hidden layers use a sigmoid activation and the output layer is
    followed by a softmax over dim=1, so ``forward`` returns one row of two
    class probabilities per input sample.

    Args:
        in_features: number of input features per sample. Defaults to 8, the
            number of columns left after the low-importance features were
            dropped; pass another value if a different set of features is kept.
    """

    def __init__(self, in_features=8):
        super().__init__()
        # The input layer width follows the number of remaining features.
        self.linear1 = nn.Linear(in_features,12)
        self.sigmoid1 = nn.Sigmoid()
        self.linear2 = nn.Linear(12,8)
        self.sigmoid2 = nn.Sigmoid()
        self.linear3 = nn.Linear(8,2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self,x):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, 2)`` probabilities."""
        lin1_out = self.linear1(x)
        sigmoid1_out = self.sigmoid1(lin1_out)
        lin2_out = self.linear2(sigmoid1_out)
        sigmoid2_out = self.sigmoid2(lin2_out)
        lin3_out = self.linear3(sigmoid2_out)
        softmax_out = self.softmax(lin3_out)
        return softmax_out

In [None]:
# Training setup for the reduced feature set: fresh model, cross-entropy loss
# and the Adam optimiser.
model = Titanic_Model()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# total_loss collects the mean loss of every epoch. total_acc and feat_imp are
# only initialised here; this loop does not update them.
total_loss, total_acc = list(),list()
feat_imp = np.zeros(train_features.shape[1])
num_epochs = 200

model.train()
for epoch in range(num_epochs):
    losses = 0
    for x, y in train_loader:
        # The model expects float32 inputs; CrossEntropyLoss expects int64 class indices.
        x, y = x.float(), y.long()
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        # The inputs do not need requires_grad: only the parameters are optimised.
        preds = model(x)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        losses += loss.item()
    # Mean loss over all batches of this epoch.
    total_loss.append(losses/len(train_loader))
    if epoch%5==0:
        print("Epoch:", str(epoch+1), "\tLoss:", total_loss[-1])

In [None]:
# Evaluate the retrained model on the held-out split of the reduced dataset.
model.eval()
correct=0
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.float(), y.long()
        pred = model(x)
        # argmax along the class dimension gives one predicted class per sample,
        # so the count stays correct for any batch size (not only batch_size=1).
        preds_class = torch.argmax(pred, dim=1)
        correct += (preds_class == y).sum().item()
# Divide by the number of evaluated samples.
print("Accuracy = ", correct/len(test_loader.dataset))

### *We can see from here that the test accuracy increases when we use fewer features after feature selection.*