In [1]:
import pandas as pd
import torch
import os
import numpy as np
from torch.utils.data import DataLoader
from sklearn import metrics
import math

from datasets import DensitySurvey



### Neural Network Final Predictions


In [2]:

areas = ['north', 'south', 'des']
galaxies = ['lrg', 'elg', 'qso', 'glbg', 'rlbg']
device = 'cpu'
max_set_len = 0

df_north = pd.read_csv(f'../regression/results/north_complete.csv')
df_south = pd.read_csv(f'../regression/results/south_complete.csv')
df_des = pd.read_csv(f'../regression/results/des_complete.csv')

df_north.columns



Index(['pixel_id', 'lrg', 'elg', 'qso', 'exposures', 'stellar', 'EBV',
       'airmass', 'ccdskysb_g', 'ccdskysb_r', 'ccdskysb_z', 'exptime_g',
       'exptime_r', 'exptime_z', 'meansky_g', 'meansky_r', 'meansky_z',
       'seeing_g', 'seeing_r', 'seeing_z', 'lrg_lin', 'elg_lin', 'qso_lin'],
      dtype='object')

In [5]:
for area in areas:

    df_test = pd.read_csv(f'data/{area}/{area}_test.csv')
    df_train = pd.read_csv(f'data/{area}/{area}.csv')
    df_test = df_test.append(df_train)
    pixel_id = df_test.pixel_id

    df_test = df_test.drop(columns=['pixel_id', 'exposures'], axis=1, inplace=False)

    for gal in galaxies:
        testdata = DensitySurvey(df_test, gal)

        best_val = -100
        for model in os.listdir(f"trained_models/{area}/{gal}"):
            val = float(model[:-3])
            best_val = max(val, best_val)
        print(best_val)
        model = torch.load(f"trained_models/{area}/{gal}/{best_val}.pt",
                           map_location=torch.device('cpu'))

        testloader = torch.utils.data.DataLoader(testdata, batch_size=128, shuffle=False)

        model.eval()
        y_pred = np.array([])
        y_gold = np.array([])

        with torch.no_grad():

            for i, (inputs, labels) in enumerate(testloader):
                # Split dataloader
                inputs = inputs.to(device)
                # Forward pass through the trained network
                outputs = model(inputs)

                # Get predictions and append to label array + count number of correct and total
                y_pred = np.append(y_pred, outputs.cpu().detach().numpy())
                y_gold = np.append(y_gold, labels.cpu().detach().numpy())

            print("Target", len(y_gold), np.isnan(y_gold).sum(), np.max(y_gold), np.min(y_gold), np.mean(y_gold))
            print(y_gold)
            print("Prediction", len(y_pred), np.isnan(y_pred).sum(), np.max(y_pred), np.min(y_pred), np.mean(y_pred))
            print(y_pred)

            r2, rmse, mae = 0, 0, 0

            try:
                r2 = metrics.r2_score(y_gold, y_pred)
                rmse = math.sqrt(metrics.mean_squared_error(y_gold, y_pred))
                mae = metrics.mean_absolute_error(y_gold, y_pred)

            except:
                print("++++++++++++++++++++")
                print("   NaN Predicted    ")
                print("++++++++++++++++++++")

            print()
            print(f" XXXXXX======== TRIAL {area} - {gal} ended")
            print()
            print("Test Set - R-squared: ", r2)
            print("Test Set - RMSE: ", rmse)
            print("Test Set - MAE: ", mae)
            print()
            print()
            print()

        ax = np.stack((pixel_id, y_pred), axis=1)
        df_deep = pd.DataFrame(ax, columns=['pixel_id', f'{gal}_nn'])

        df_deep.pixel_id = df_deep.pixel_id.astype(int)

        if area == 'north':
            df_north = df_north.merge(df_deep, how='inner', on='pixel_id')
        elif area == 'south':
            df_south = df_south.merge(df_deep, how='inner', on='pixel_id')
        else:
            df_des = df_des.merge(df_deep, how='inner', on='pixel_id')

0.022140414501169103
Target 97537 0 131.0 1.0 35.28534812430155
[37. 27. 21. ... 38. 33. 32.]
Prediction 97537 0 44.132564544677734 24.010534286499023 35.739814353490054
[36.98944092 34.81783676 34.24811935 ... 38.37955093 34.14669037
 34.37604523]


Test Set - R-squared:  0.028040983466964375
Test Set - RMSE:  11.82057481221346
Test Set - MAE:  8.723459963911601



0.20695097948924068
Target 97537 0 285.0 1.0 124.99900550560301
[ 63. 156.  95. ... 115. 152. 107.]
Prediction 97537 0 168.24081420898438 0.0 126.76191536012362
[106.44763947 137.95901489 124.56479645 ... 120.54670715 129.02919006
 133.43963623]


Test Set - R-squared:  0.21045661624063428
Test Set - RMSE:  19.7623431743357
Test Set - MAE:  15.405403179407



0.17315364401834488
Target 97537 0 334.0 1.0 153.21741492971898
[166. 146. 114. ... 193. 188. 170.]
Prediction 97537 0 181.35073852539062 44.17581558227539 152.8061077480042
[151.08296204 148.48658752 162.21138    ... 166.54684448 154.95848083
 159.01893616]


Test Set

In [6]:
print((df_north.head()))
print((df_south.head()))
print((df_des.head()))

df_north = df_north.drop(columns=['y_gold', 'y_gold_x', 'y_gold_y'], axis=1, inplace=False)
df_south = df_south.drop(columns=['y_gold', 'y_gold_x', 'y_gold_y'], axis=1, inplace=False)
df_des = df_des.drop(columns=['y_gold', 'y_gold_x', 'y_gold_y'], axis=1, inplace=False)

   pixel_id  lrg  elg  qso  exposures   stellar       EBV   airmass  \
0     20930   37   63  166   0.230769  0.271967  0.578446  0.447089   
1    128915   27  156  146   0.209790  0.062762  0.060792  0.066278   
2     22899   21   95  114   0.188811  0.092050  0.049091  0.371570   
3    105934   38  100  150   0.181818  0.104603  0.051809  0.056693   
4     14825   26  113  121   0.181818  0.083682  0.077518  0.410721   

   ccdskysb_g  ccdskysb_r  ...  meansky_z  seeing_g  seeing_r  seeing_z  \
0    0.762753    0.492800  ...   0.022182  0.162460  0.173328  0.090772   
1    0.720468    0.591275  ...   0.004364  0.223187  0.069517  0.100978   
2    0.772063    0.579600  ...   0.055957  0.178862  0.207623  0.080423   
3    0.862060    0.616452  ...   0.006092  0.089822  0.177267  0.064683   
4    0.817044    0.601693  ...   0.020293  0.234317  0.191866  0.090556   

     lrg_lin     elg_lin     qso_lin     lrg_nn      elg_nn      qso_nn  
0  37.940550   92.576883  147.112149  36.989441 

KeyError: "['y_gold' 'y_gold_x' 'y_gold_y'] not found in axis"

In [9]:
df_north.columns

Index(['pixel_id', 'lrg', 'elg', 'qso', 'exposures', 'stellar', 'EBV',
       'airmass', 'ccdskysb_g', 'ccdskysb_r', 'ccdskysb_z', 'exptime_g',
       'exptime_r', 'exptime_z', 'meansky_g', 'meansky_r', 'meansky_z',
       'seeing_g', 'seeing_r', 'seeing_z', 'lrg_lin', 'elg_lin', 'qso_lin',
       'lrg_nn', 'elg_nn', 'qso_nn'],
      dtype='object')

In [7]:
df_north.to_csv(f'../regression/results/north_complete.csv', index=False)
df_south.to_csv(f'../regression/results/south_complete.csv', index=False)
df_des.to_csv(f'../regression/results/des_complete.csv', index=False)

In [8]:

print(len(df_north))

97537
