In [None]:
# importing packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset
from pickle import dump, load
from scipy.stats import pearsonr

In [None]:
# helper functions
def count_parameters(model):
    """Return the number of scalar parameters in ``model`` that require gradients."""
    trainable = 0
    for param in model.parameters():
        # Parameters with requires_grad=False (frozen) are not counted.
        if param.requires_grad:
            trainable += param.numel()
    return trainable

In [None]:
# load the training CSV and preview its first rows
df = pd.read_csv(filepath_or_buffer='/Users/SnehPandya/Desktop/nn/train_drw_full.csv')
df.head(n=5)

In [None]:
# Black Hole dataset--default parameters are for training and predicting AGN mass.  Pass 'train=False' 
# for test-set and 'mass=False' for AGN redshift prediction.

class BHDataset(Dataset):
    """Dataset of standardised features and a regression target read from CSV.

    With ``mass=True`` (default) the target is CSV column 1 and the features
    are columns 14-22; with ``mass=False`` the target is column 14 and the
    features are columns 9-13 and 16-20.  Column 0 is returned as the object
    ID.  Features are standardised with a ``StandardScaler`` that is fitted on
    the training CSV and pickled to ``scaler_path``; the test dataset reloads
    that pickle and applies it without refitting.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly with the CSV file name, so it
        must end with a path separator.
    train : bool, default True
        Read ``train_drw_full.csv`` and fit/save the scaler when True;
        read ``test_drw_full.csv`` and load the saved scaler when False.
    mass : bool, default True
        Selects the target/feature columns as described above.
    scaler_path : str, optional
        Where the fitted scaler is pickled to / loaded from.  Defaults to
        ``'train_scaler_drw_full.pkl'`` in the current working directory.
        NOTE: with the default, the mass and redshift datasets share one
        file, so each training dataset overwrites the other's scaler; build
        the matching test dataset right after its training dataset, or pass
        distinct ``scaler_path`` values.
    """

    # Positional column layout of the CSV files.
    ID_COL = 0
    MASS_TARGET_COL = 1
    Z_TARGET_COL = 14
    MASS_FEATURE_COLS = slice(14, 23)
    Z_FEATURE_COLS = [9, 10, 11, 12, 13, 16, 17, 18, 19, 20]

    TRAIN_CSV = 'train_drw_full.csv'
    TEST_CSV = 'test_drw_full.csv'
    DEFAULT_SCALER_PATH = 'train_scaler_drw_full.pkl'

    def __init__(self, path, train=True, mass=True, scaler_path=None):
        self.path = path
        self.train = train
        self.mass = mass
        self.scaler_path = self.DEFAULT_SCALER_PATH if scaler_path is None else scaler_path
        self.sc = StandardScaler()

        csv_name = self.TRAIN_CSV if self.train else self.TEST_CSV
        feature_cols = self.MASS_FEATURE_COLS if self.mass else self.Z_FEATURE_COLS

        self.data = pd.read_csv(self.path + csv_name)
        raw_features = np.asarray(self.data.iloc[:, feature_cols])

        if self.train:
            # Fit on the training features only, then persist the scaler so the
            # test dataset can apply exactly the same transformation.
            self.features = self.sc.fit_transform(raw_features)
            with open(self.scaler_path, 'wb') as scaler_file:
                dump(self.sc, scaler_file)
        else:
            # SECURITY: unpickling executes arbitrary code; only load scaler
            # files that were written by the training branch above.
            with open(self.scaler_path, 'rb') as scaler_file:
                self.sc = load(scaler_file)
            self.features = self.sc.transform(raw_features)

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(ID, features, target)`` tensors for row ``index``."""
        target_col = self.MASS_TARGET_COL if self.mass else self.Z_TARGET_COL

        ID = torch.from_numpy(np.asarray(self.data.iloc[index, self.ID_COL]))
        target = torch.from_numpy(np.asarray(self.data.iloc[index, target_col]))
        # reshape + squeeze yields a flat feature vector for the row.
        features = torch.from_numpy(self.features[index].reshape(1, -1).squeeze())
        return (ID, features, target)

        
# define train and test datasets.  Train test split was done previously using sklearn.
# mass datasets: the training set is built first so its scaler exists for the test set
train_mass = BHDataset(path='/Users/SnehPandya/Desktop/nn/', train=True, mass=True)
test_mass = BHDataset(path='/Users/SnehPandya/Desktop/nn/', train=False, mass=True)

# redshift datasets, same ordering
train_z = BHDataset(path='/Users/SnehPandya/Desktop/nn/', train=True, mass=False)
test_z = BHDataset(path='/Users/SnehPandya/Desktop/nn/', train=False, mass=False)

In [None]:
# Wrap each dataset in a DataLoader (batches of 256); only the training loaders shuffle.
train_dl_mass = DataLoader(dataset=train_mass, batch_size=256, shuffle=True)
test_dl_mass = DataLoader(dataset=test_mass, batch_size=256, shuffle=False)

train_dl_z = DataLoader(dataset=train_z, batch_size=256, shuffle=True)
test_dl_z = DataLoader(dataset=test_z, batch_size=256, shuffle=False)

In [None]:
# default architecture is to predict AGN mass.  Pass 'mass=False' to predict redshift.
class Net(nn.Module):
    """Fully connected regressor: input -> 32 -> 64 -> 64 -> 32 -> 1.

    The input width is 9 when ``mass`` is truthy (default) and 10 otherwise;
    the hidden and output layers are the same in both cases.
    """

    def __init__(self, mass=True):
        super().__init__()
        self.mass = mass

        # Only the first layer's input width depends on the prediction target.
        n_inputs = 9 if self.mass else 10

        self.fc1 = nn.Linear(n_inputs, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, 1)

    def forward(self, x):
        """Apply ReLU after each hidden layer; the output layer is linear."""
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = F.relu(hidden(x))
        return self.fc5(x)

# instantiate one network per prediction target
net_mass = Net(mass=True)
net_z = Net(mass=False)

In [None]:
# count parameters for reference.  For both redshift and mass, (training size/model parameters) < 1.
tuple(count_parameters(model) for model in (net_mass, net_z))

In [None]:
# Loss and optimizer.  The optimizer is bound to one network's parameters:
# pass the parameters of whichever network is about to be trained.
lr = 1e-2
loss_function = F.mse_loss
optimizer = optim.AdamW(params=net_z.parameters(), lr=lr)

In [None]:
# training loop takes in epoch #, network, train loader, loss function, and optimizer.  
# Plots RMSE per epoch and returns RMSE/loss vs. epoch plots at end

def train(num_epochs, net, dataloader, loss_fn, optimizer):
    """Train ``net`` for ``num_epochs`` epochs and plot its progress.

    After every epoch a scatter plot of the predictions made during that
    epoch against the ground truth is shown, titled with the epoch's RMSE.
    When training finishes, the loss and the RMSE are each plotted against
    the epoch number in their own figure.  Nothing is returned.

    Parameters
    ----------
    num_epochs : int
        Number of passes over ``dataloader``.
    net : torch.nn.Module
        Model to train; it is called on ``features.float()``.
    dataloader : iterable
        Yields ``(ID, features, ground_truth)`` batches.
    loss_fn : callable
        ``loss_fn(prediction, target)`` returning a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer whose ``step()`` is called after each backward pass; it
        must have been built from ``net``'s parameters for training to
        have any effect.

    Raises
    ------
    ValueError
        If ``dataloader`` yields no batches in an epoch.
    """
    epoch_list = np.arange(1, num_epochs + 1)
    loss_list, rmse_list = [], []

    for epoch in range(num_epochs):

        outputs_pred, outputs_gt, batch_losses = [], [], []

        for _, features, ground_truth in dataloader:
            # Flatten both sides so a batch of size one cannot cause a
            # prediction/target shape mismatch.
            target = ground_truth.float().reshape(-1)
            net.zero_grad()
            output = net(features.float()).reshape(-1)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()

            # Keep plain floats / detached tensors so autograd graphs are not
            # kept alive for the whole epoch.
            batch_losses.append(loss.item())
            outputs_gt.append(target.detach())
            outputs_pred.append(output.detach().float())

        if not batch_losses:
            raise ValueError('dataloader yielded no batches; nothing to train on')

        # Epoch loss is the mean over batches rather than the last batch only.
        loss_list.append(sum(batch_losses) / len(batch_losses))
        ground_truth = torch.cat(outputs_gt).cpu().numpy()
        predictions = torch.cat(outputs_pred).cpu().numpy()
        rmse = np.sqrt(metrics.mean_squared_error(ground_truth, predictions))
        rmse_list.append(rmse)

        # Black identity line = perfect prediction; blue points = network output.
        plt.plot(ground_truth, ground_truth, color='black', label = 'Mass Ground Truth')
        plt.scatter(ground_truth, predictions, s=2, color='blue', label = 'NN prediction', alpha=.5)
        plt.title('EPOCH:' + str(epoch+1) +'/'+ str(num_epochs) + ', RMSE:' + str(rmse))
        plt.xlabel('AGN Mass')
        plt.ylabel('AGN Mass')
        plt.legend()
        plt.show()

    # Loss curve in its own figure so its title, labels and scale are not
    # overwritten by the RMSE plot below.
    plt.plot(epoch_list, loss_list, color='green', label = 'LOSS')
    plt.xticks(epoch_list)
    plt.title('LOSS VS. # EPOCHS')
    plt.xlabel('EPOCH')
    plt.ylabel('LOSS')
    plt.legend()
    plt.show()

    plt.plot(epoch_list, rmse_list, color = 'orange', label = 'RMSE')
    plt.ylim(0,.45)  # fixed y-range kept from the original notebook
    plt.xticks(epoch_list)
    plt.title("RMSE VS. # EPOCHS")
    plt.xlabel('EPOCH')
    plt.ylabel("RMSE")
    plt.legend()
    plt.show()

In [None]:
# run 35 training epochs on the redshift network
train(num_epochs=35, net=net_z, dataloader=train_dl_z, loss_fn=loss_function, optimizer=optimizer)

In [None]:
# save model
# torch.save(net_z.state_dict(), '/Users/SnehPandya/Desktop/nn/AGNet_z_50_.353_10_32_64_64_32.mdl')

In [None]:
# test loop outputs plot of results + dataframe of object ID, ground truth values, and network predictions
def test(dataloader, net):

    with torch.no_grad():
        outputs_ID, outputs_pred, outputs_mass = ([] for i in range(3))

        for data in dataloader:
            ID, X, mass = data  
            outputs_ID.append(ID)
            outputs_mass.append(mass.float())
            output = net(X.float()) 
            outputs_pred.append(output.float())
            loss = loss_function(output.squeeze(), mass.float())

        ground_truth = torch.cat(outputs_mass).data 
        predictions = torch.cat(outputs_pred).data.flatten()
        ID = torch.cat(outputs_ID).data
        rmse = np.sqrt(metrics.mean_squared_error(ground_truth, predictions))
        plt.plot(ground_truth, ground_truth,color='black', label = 'Mass Ground Truth')
        plt.scatter(ground_truth, predictions,s=2, color='blue', label = 'NN prediction')
        plt.title('RMSE:' + str(rmse) + ' | LOSS: ' + str(loss.data.numpy()))
        plt.xlabel('AGN Mass')
        plt.ylabel('AGN Mass')
        plt.legend()
        plt.show()
        
        df = pd.DataFrame({'ID':ID.numpy(), 'ground truth':ground_truth.numpy(), 'network predictions':predictions.numpy() })
        return df

df = test(dataloader=test_dl_z, net=net_z)

In [None]:
# output test-set results to csv
# df.to_csv('/Users/SnehPandya/Desktop/NN/best_mass_results_DRW_3.csv')