# Predicting Pulsar Stars
#### Use the HTRU 2 dataset to predict pulsars.

In [4]:
#installs
!pip install jovian --upgrade --quiet
#Imports
import jovian
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch
import torchvision
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split, DataLoader, TensorDataset
from torchvision.utils import make_grid
from torchvision.datasets.utils import download_url
from sklearn.preprocessing import StandardScaler    
from sklearn.metrics import confusion_matrix, classification_report
import zipfile
%matplotlib inline

<IPython.core.display.Javascript object>

In [5]:
project_name="Predicting-Pulsars" # Project name passed as `project=` to the optional jovian.commit call (Jovian.ml)

## Download the dataset and initialize the dataset.
 - If you are on Kaggle, add [this](https://www.kaggle.com/pavanraj159/predicting-a-pulsar-star) dataset to the notebook; this avoids re-downloading the dataset on every run.
 - Otherwise uncomment the next two lines of code.

In [6]:
#data_url="https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip"
#download_url(data_url, ".")

#### Extract the csv file - if you downloaded the dataset.

In [7]:
# Uncomment as needed.
#with zipfile.ZipFile("./HTRU2.zip", 'r') as zip_ref:
#    zip_ref.extractall(".")
#!rm -rf HTRU2.zip

### Load the data from the .csv file 
We just need to use the pandas library's `read_csv()` function.


In [8]:
# Path to the Kaggle copy of the HTRU2 CSV. Change as needed when running elsewhere.
filename = "../input/predicting-a-pulsar-star/pulsar_stars.csv"

df = pd.read_csv(filename)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0    Mean of the integrated profile                17898 non-null  float64
 1    Standard deviation of the integrated profile  17898 non-null  float64
 2    Excess kurtosis of the integrated profile     17898 non-null  float64
 3    Skewness of the integrated profile            17898 non-null  float64
 4    Mean of the DM-SNR curve                      17898 non-null  float64
 5    Standard deviation of the DM-SNR curve        17898 non-null  float64
 6    Excess kurtosis of the DM-SNR curve           17898 non-null  float64
 7    Skewness of the DM-SNR curve                  17898 non-null  float64
 8   target_class                                   17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB
None


Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


## Prepare Dataset for Training
We need to convert the dataframe to Pytorch Tensors using numpy arrays.

In [None]:
# Separate the label column from the feature columns, then expose both as NumPy arrays.
targets_df = df["target_class"]  # labels: only the target_class column
inputs_df = df.drop(columns="target_class")  # features: every column except the label
targets_arr = targets_df.to_numpy()
inputs_arr = inputs_df.to_numpy()

In [None]:
# Convert the NumPy arrays to tensors with dtypes the model can consume.
# nn.Linear / nn.BatchNorm1d parameters default to float32, so float64 inputs
# raise a dtype-mismatch error in the first layer; cast the features to float32.
inputs = torch.from_numpy(inputs_arr).type(torch.float32)
# int16 labels are not accepted by PyTorch's classification losses; int64 is the
# class-index dtype they expect.
# NOTE(review): the loss function is not defined in this part of the notebook. If a
# BCE-style loss is used, cast the targets to float where the loss is computed.
targets = torch.from_numpy(targets_arr).type(torch.long)
inputs.shape, targets.shape

### Commit to Jovian(optional)

In [15]:
#jovian.commit(project=project_name, environment=None)

### Create the dataset.

In [16]:
# Pair each feature row with its label so both are indexed (and later split) together.
dataset=TensorDataset(inputs, targets)

### Split the dataset into training and validation

In [17]:
val_percent = .1  # fraction of the rows held out for validation
num_rows = len(df)
val_size = int(val_percent * num_rows)
train_size = num_rows - val_size  # everything that is not validation is training data

Use the random_split function to split dataset into 2 parts of the desired length

In [18]:
# Fix the RNG seed so the train/validation partition is identical on every run.
torch.manual_seed(2)
train_ds, val_ds = random_split(dataset, [train_size, val_size])
train_ds

<torch.utils.data.dataset.Subset at 0x7f5bfabebb90>

### Set a batch size.  
I am going to pick 200, but adjust this to your needs.

In [None]:
batch_size=200 # samples per training batch; the validation loader uses twice this value

## Load Data and transfer data to GPU, if available.

In [None]:
# PyTorch data loaders: only the training set is shuffled; validation uses double-size batches.
train_dl = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=3,
    pin_memory=True,
)
val_dl = DataLoader(
    val_ds,
    batch_size=batch_size * 2,
    num_workers=3,
    pin_memory=True,
)

### Transfer to GPU

In [None]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, to `device`.

    Lists and tuples are processed element by element and always come back as
    a list; anything else is moved with a non-blocking `.to(device)` call.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Thin wrapper around a data loader that places every batch on `device`."""

    def __init__(self, dl, device):
        self.device = device
        self.dl = dl

    def __len__(self):
        """Return the number of batches in the wrapped loader."""
        return len(self.dl)

    def __iter__(self):
        """Iterate over the wrapped loader, moving each batch to the device first."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [None]:
# Select the compute device once (GPU if available, else CPU);
# the bare name on the last line displays it as the cell output.
device=get_default_device()
device

We can now wrap our training and validation data loaders using DeviceDataLoader for automatically transferring batches of data to the GPU (if available).


In [None]:
# Rebind the loader names to wrappers that move each batch to `device` on iteration.
# NOTE: this cell is not idempotent. Re-running it without re-creating the
# DataLoaders first wraps the existing wrappers a second time.
train_dl = DeviceDataLoader(train_dl, device)
val_dl = DeviceDataLoader(val_dl, device)

## Create a Model
We are going to be creating a Model with Residual Blocks and Batch Normalization. Roughly based on ResNet5 architecture.

### Linear Block

In [None]:
def logistic_block(in_features=8, out_features=8):
    """Build a Linear -> BatchNorm1d -> ReLU (in-place) stack as one nn.Sequential.

    Args:
        in_features: width of the input to the linear layer.
        out_features: width of the linear output and of the batch-norm layer.
    """
    return nn.Sequential(
        nn.Linear(in_features, out_features),
        nn.BatchNorm1d(out_features),
        nn.ReLU(inplace=True),
    )

### Residual Block

In [None]:
class ResidualBlockLogistic(nn.Module):
    """Residual wrapper around a single default `logistic_block`.

    The forward pass returns `block(x) + x`. The wrapped block is built with
    its default sizes (8 -> 8), so input and output widths match and the
    element-wise addition with the input is well defined.
    """
    def __init__(self):
        super().__init__()
        self.Res_Block=nn.Sequential(logistic_block())

    def forward(self, x):
        # The original line read `self.(out) + x`, which is a syntax error and
        # referenced an undefined `out`; apply the wrapped block to the input.
        # Non-linear can be applied before or after adding the input
        return self.Res_Block(x) + x

### Create The model Class

In [None]:
class HTRU2Model(nn.Module):
    """Network for the HTRU2 features built from `logistic_block` layers.

    Two plain blocks are followed by two two-block stacks that are each used
    with a skip connection, then dropout.

    Args:
        in_features: number of input columns per sample (default 8); only the
            first block depends on it, all later blocks use the default width.

    NOTE(review): the original `forward` stopped after `logistic1` and returned
    None. The remaining layers are applied here in declaration order; confirm
    this is the intended wiring.
    NOTE(review): no output/classification layer is defined, and the
    `training_step` / `validation_step` / `validation_epoch_end` / `epoch_end`
    hooks that `evaluate` and `fit_one_cycle` call are not part of this class
    as shown. They still need to be provided.
    """
    def __init__(self,in_features=8):
        # `super.__init__()` (without the call parentheses) raises TypeError.
        super().__init__()
        self.logistic1=logistic_block(in_features=in_features)
        self.logistic2=logistic_block()
        self.res1 = nn.Sequential(logistic_block(),logistic_block())
        self.res2 = nn.Sequential(logistic_block(),logistic_block())
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        out=self.logistic1(x)
        out=self.logistic2(out)
        # Skip connections: add each stack's input back onto its output.
        out=self.res1(out) + out
        out=self.res2(out) + out
        out=self.dropout(out)
        return out



In [None]:
@torch.no_grad()
def evaluate(model, val_loader):
    """Run one validation pass with gradient tracking disabled.

    Switches the model to eval mode, calls `model.validation_step` on every
    batch of `val_loader`, and returns whatever `model.validation_epoch_end`
    produces from the list of per-batch results.
    """
    model.eval()
    step_results = []
    for batch in val_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Yields None when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

def fit_one_cycle(epochs, max_lr, model, train_loader, val_loader, 
                  weight_decay=0, grad_clip=None, opt_func=optim.Adam):
    """Train `model` under a one-cycle learning-rate schedule, validating after each epoch.

    Args:
        epochs: number of passes over `train_loader`.
        max_lr: learning rate given to the optimizer and peak rate of the
            OneCycleLR schedule.
        model: object providing `parameters()`, `train()`, `training_step(batch)`
            and `epoch_end(epoch, result)`, plus the hooks used by `evaluate`.
        train_loader: iterable of training batches; must support `len()`.
        val_loader: iterable of validation batches passed to `evaluate`.
        weight_decay: weight decay forwarded to the optimizer.
        grad_clip: if truthy, gradients are clipped to this absolute value.
        opt_func: optimizer constructor, called as
            `opt_func(params, max_lr, weight_decay=...)`.

    Returns:
        A list with one entry per epoch: the result of `evaluate` with
        `'train_loss'` (mean batch loss as a float) and `'lrs'` (learning rate
        used at each step) added to it.
    """
    torch.cuda.empty_cache()
    history = []
    
    # Set up custom optimizer with weight decay
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    # Set up one-cycle learning rate scheduler
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs, 
                                                steps_per_epoch=len(train_loader))
    
    for epoch in range(epochs):
        # Training Phase 
        model.train()
        train_losses = []
        lrs = []
        for batch in tqdm(train_loader):
            loss = model.training_step(batch)
            # Store a detached copy: only the value is needed for the epoch
            # mean, and keeping the attached tensor would hold on to each
            # batch's autograd history until the epoch ends.
            train_losses.append(loss.detach())
            loss.backward()
            
            # Gradient clipping
            if grad_clip: 
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            
            optimizer.step()
            optimizer.zero_grad()
            
            # Record & update learning rate
            lrs.append(get_lr(optimizer))
            sched.step()
        
        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['lrs'] = lrs
        model.epoch_end(epoch, result)
        history.append(result)
    return history
