In [1]:

import torch
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing, metrics
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm


# Print the installed PyTorch version.
print(torch.__version__)


1.9.0


# Data Preprocessing


In [2]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
train_csv, val_csv, test_csv = (
    '../../bricks_data/train.csv',
    '../../bricks_data/val.csv',
    '../../bricks_data/test.csv',
)
df_train, df_val, df_test = map(pd.read_csv, (train_csv, val_csv, test_csv))


In [3]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,pixel_id,lrg,elg,qso,exposures,stellar,EBV,airmass,ccdskysb_g,ccdskysb_r,...,galdepth_g,galdepth_r,galdepth_z,seeing_g,seeing_r,seeing_z,mjd_obs_g,mjd_obs_r,mjd_obs_z,Z
0,639531,0.143713,0.216019,0.338174,0.564626,0.055556,0.034658,0.115825,0.713032,0.650563,...,0.661819,0.764081,0.710622,0.192107,0.180748,0.116854,0.392091,0.389434,0.459966,0.109627
1,438841,0.311377,0.32767,0.325726,0.554422,0.055556,0.156182,0.121236,0.673496,0.669451,...,0.780645,0.838735,0.731335,0.205265,0.183886,0.132229,0.50693,0.573248,0.526755,0.019657
2,223392,0.317365,0.337379,0.379668,0.217687,0.095238,0.128167,0.624725,0.689262,0.570709,...,0.79614,0.790286,0.752132,0.210586,0.187711,0.122527,0.711217,0.623392,0.492222,0.047189
3,212211,0.179641,0.286408,0.371369,0.292517,0.464286,0.110097,0.648073,0.687629,0.59411,...,0.746999,0.791229,0.785403,0.193327,0.224447,0.114192,0.544153,0.71076,0.706703,0.367989
4,605797,0.251497,0.371359,0.365145,0.496599,0.043651,0.04114,0.184091,0.686366,0.668034,...,0.687693,0.857195,0.756373,0.163397,0.193967,0.124396,0.41073,0.441666,0.415508,0.195688


In [4]:
# Remove the columns that are not used by the regressions below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns
# have already been dropped.
DROPPED_COLUMNS = ['pixel_id', 'exposures', 'Z']
df_train = df_train.drop(columns=DROPPED_COLUMNS, errors='ignore')
df_val = df_val.drop(columns=DROPPED_COLUMNS, errors='ignore')
df_test = df_test.drop(columns=DROPPED_COLUMNS, errors='ignore')


# Regression

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# Names of the target columns; each one is modelled separately in the cells below.
galaxy_types = ['lrg','elg','qso']


In [6]:
# Feature frame = training frame without the three target columns.
target_columns = ['lrg','elg','qso']
X_train = df_train.drop(target_columns, axis=1)
X_train.columns

Index(['stellar', 'EBV', 'airmass', 'ccdskysb_g', 'ccdskysb_r', 'ccdskysb_z',
       'exptime_g', 'exptime_r', 'exptime_z', 'meansky_g', 'meansky_r',
       'meansky_z', 'galdepth_g', 'galdepth_r', 'galdepth_z', 'seeing_g',
       'seeing_r', 'seeing_z', 'mjd_obs_g', 'mjd_obs_r', 'mjd_obs_z'],
      dtype='object')

In [7]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the training features.
poly = PolynomialFeatures(degree=2)

feature_frame = df_train.drop(columns=['lrg','elg','qso'])
X_train = poly.fit_transform(feature_frame)

# Show (n_rows, n_expanded_features) of the expanded matrix.
X_train.shape

(220300, 253)

### Geometrics

### Geometric - Linear Regression

In [8]:
# Fit an ordinary least-squares model per galaxy type on the polynomial
# features of the training split and score it on the validation split.
#
# The design matrices do not depend on the target, so they are built once
# outside the loop instead of once per galaxy type. The validation features
# are expanded with transform() so that the expansion fitted on the training
# split is reused rather than refitted on evaluation data.
X_train = poly.fit_transform(df_train.drop(columns=galaxy_types))
X_test = poly.transform(df_val.drop(columns=galaxy_types))

for gal in galaxy_types:
    y_train = df_train[gal]
    y_gold = df_val[gal]

    reg = LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Linear Regression R^2 for lrg, Geometric :  0.03949795070079565.
Linear Regression MSE for lrg, Geometric :  0.005258890403115084.

Linear Regression R^2 for elg, Geometric :  0.15000624428060305.
Linear Regression MSE for elg, Geometric :  0.0032096153062344526.

Linear Regression R^2 for qso, Geometric :  0.12898609443771436.
Linear Regression MSE for qso, Geometric :  0.003810218584450977.



### Geometric - Ridge Regression

In [9]:
# Fit a Ridge model (scikit-learn default settings) per galaxy type on the
# polynomial features of the training split and score it on the validation split.
#
# The design matrices do not depend on the target, so they are built once
# outside the loop instead of once per galaxy type. The validation features
# are expanded with transform() so that the expansion fitted on the training
# split is reused rather than refitted on evaluation data.
X_train = poly.fit_transform(df_train.drop(columns=galaxy_types))
X_test = poly.transform(df_val.drop(columns=galaxy_types))

for gal in galaxy_types:
    y_train = df_train[gal]
    y_gold = df_val[gal]

    reg = Ridge()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    print(f"Ridge Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Ridge Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Ridge Regression R^2 for lrg, Geometric :  0.03835915830399017.
Ridge Regression MSE for lrg, Geometric :  0.005265125459470322.

Ridge Regression R^2 for elg, Geometric :  0.14667374101971764.
Ridge Regression MSE for elg, Geometric :  0.0032221989909994794.

Ridge Regression R^2 for qso, Geometric :  0.12567472799427004.
Ridge Regression MSE for qso, Geometric :  0.00382470403627002.



### Geometric - Lasso Regression

In [10]:
# Fit a Lasso model per galaxy type on the polynomial features of the
# training split and score it on the validation split.
#
# The design matrices do not depend on the target, so they are built once
# outside the loop instead of once per galaxy type. The validation features
# are expanded with transform() so that the expansion fitted on the training
# split is reused rather than refitted on evaluation data.
#
# NOTE(review): Lasso() is used with its default penalty strength. The
# recorded R^2 values are ~0 for every target, which suggests the penalty may
# be shrinking all coefficients to zero -- consider tuning `alpha` (TODO confirm).
X_train = poly.fit_transform(df_train.drop(columns=galaxy_types))
X_test = poly.transform(df_val.drop(columns=galaxy_types))

for gal in galaxy_types:
    y_train = df_train[gal]
    y_gold = df_val[gal]

    reg = Lasso()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    print(f"Lasso Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Lasso Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()


Lasso Regression R^2 for lrg, Geometric :  -2.418299939410673e-06.
Lasso Regression MSE for lrg, Geometric :  0.0054751607500748145.

Lasso Regression R^2 for elg, Geometric :  -3.152215577451045e-08.
Lasso Regression MSE for elg, Geometric :  0.0037760458659981213.

Lasso Regression R^2 for qso, Geometric :  -1.135833249299445e-05.
Lasso Regression MSE for qso, Geometric :  0.00437451324008519.



In [11]:
# Use the best-performing regressor to evaluate on the test set

In [12]:
poly = PolynomialFeatures(2)

# Fit a degree-2 polynomial linear regression per galaxy type on the training
# split and evaluate it on the held-out test split.
#
# NOTE(review): the previous comment here read "Stacking Train and Val Set",
# but only df_train is used for fitting. If training on train + validation is
# intended, concatenate the two frames before building X_train / y_train.
#
# The design matrices do not depend on the target, so they are built once
# outside the loop; the test features reuse the fitted expansion via transform().
print("Results for full dataset on unseen test-set")
X_train = poly.fit_transform(df_train.drop(columns=galaxy_types))
X_test = poly.transform(df_test.drop(columns=galaxy_types))

for gal in galaxy_types:
    y_train = df_train[gal]
    y_gold = df_test[gal]

    reg = LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Results for full dataset on unseen test-set
Linear Regression R^2 for lrg, Geometric :  0.038215184475231534.
Linear Regression MSE for lrg, Geometric :  0.005213885938982841.

Linear Regression R^2 for elg, Geometric :  0.14893673657901474.
Linear Regression MSE for elg, Geometric :  0.0031791694040346386.

Linear Regression R^2 for qso, Geometric :  0.1328978735435351.
Linear Regression MSE for qso, Geometric :  0.00372192164034895.



In [13]:
poly = PolynomialFeatures(3)

# Fit a degree-3 polynomial Ridge regression per galaxy type on the training
# split and evaluate it on the held-out test split.
#
# Fixes relative to the previous version of this cell:
#   * the printed label said "Linear Regression" although the model is Ridge;
#   * the degree-3 design matrices were rebuilt for every galaxy type even
#     though they do not depend on the target -- they are now built once;
#   * the test features reuse the fitted expansion via transform().
#
# NOTE(review): the previous comment here read "Stacking Train and Val Set",
# but only df_train is used for fitting. If training on train + validation is
# intended, concatenate the two frames before building X_train / y_train.
print("Results for full dataset on unseen test-set")
X_train = poly.fit_transform(df_train.drop(columns=galaxy_types))
X_test = poly.transform(df_test.drop(columns=galaxy_types))

for gal in galaxy_types:
    y_train = df_train[gal]
    y_gold = df_test[gal]

    reg = Ridge()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    print(f"Ridge Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Ridge Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Results for full dataset on unseen test-set
Linear Regression R^2 for lrg, Geometric :  0.04386685498643783.
Linear Regression MSE for lrg, Geometric :  0.005183247936662057.

Linear Regression R^2 for elg, Geometric :  0.18290334475943082.
Linear Regression MSE for elg, Geometric :  0.0030522862378503223.

Linear Regression R^2 for qso, Geometric :  0.16021354807630606.
Linear Regression MSE for qso, Geometric :  0.0036046727061550916.



### Running Regression on the TrainSet to compare to previous results

In [13]:
# Load the training CSV into its own frame, then drop the pixel_id,
# exposures and Z columns to obtain the frame used for the OLS fits.
geometric_csv = '../../bricks_data/train.csv'
df_geometric = pd.read_csv(geometric_csv)

df_geometric_reg = df_geometric.drop(['pixel_id', 'exposures','Z'], axis=1)
df_geometric_reg.head()

Unnamed: 0,lrg,elg,qso,stellar,EBV,airmass,ccdskysb_g,ccdskysb_r,ccdskysb_z,exptime_g,...,meansky_z,galdepth_g,galdepth_r,galdepth_z,seeing_g,seeing_r,seeing_z,mjd_obs_g,mjd_obs_r,mjd_obs_z
0,0.255556,0.402256,0.409396,0.205357,0.171079,0.140918,0.578889,0.518889,0.592086,0.175793,...,0.5513443,0.480686,0.67459,0.283245,0.301927,0.311998,0.129479,0.388261,0.465611,0.436698
1,0.288889,0.421053,0.40604,0.049107,0.01643,0.013747,0.630872,0.672316,0.438492,0.168644,...,2.840472e-11,0.511487,0.582933,0.750504,0.211973,0.09155,0.120701,0.518587,0.498003,0.44311
2,0.688889,0.503759,0.704698,0.058036,0.06264,0.152529,0.62611,0.577056,0.605068,0.175793,...,0.3579294,0.596021,0.695755,0.299528,0.269886,0.283358,0.172838,0.510175,0.522231,0.454674
3,0.255556,0.255639,0.228188,0.0625,0.618593,0.411554,0.617269,0.568405,0.628472,0.548694,...,5.566131e-11,0.832612,0.766486,0.719828,0.299032,0.254469,0.209068,0.861329,0.798231,0.806972
4,0.411111,0.488722,0.503356,0.120536,0.200576,0.304894,0.651783,0.620234,0.540708,0.405109,...,8.777782e-11,0.748213,0.680751,0.705337,0.411908,0.393542,0.130524,0.764569,0.798398,0.271826


In [14]:
# Pull each target column out of the regression frame as its own Series.
y_lrg, y_elg, y_qso = (df_geometric_reg[name] for name in ('lrg', 'elg', 'qso'))

df_geometric_reg.columns

Index(['lrg', 'elg', 'qso', 'stellar', 'EBV', 'airmass', 'ccdskysb_g',
       'ccdskysb_r', 'ccdskysb_z', 'exptime_g', 'exptime_r', 'exptime_z',
       'meansky_g', 'meansky_r', 'meansky_z', 'galdepth_g', 'galdepth_r',
       'galdepth_z', 'seeing_g', 'seeing_r', 'seeing_z', 'mjd_obs_g',
       'mjd_obs_r', 'mjd_obs_z'],
      dtype='object')

In [20]:
# Feature frame for the OLS fits: everything except the three target columns.
X = df_geometric_reg.drop(['lrg','elg','qso'], axis=1)
print(X.head(5))

    stellar       EBV   airmass  ccdskysb_g  ccdskysb_r  ccdskysb_z  \
0  0.205357  0.171079  0.140918    0.578889    0.518889    0.592086   
1  0.049107  0.016430  0.013747    0.630872    0.672316    0.438492   
2  0.058036  0.062640  0.152529    0.626110    0.577056    0.605068   
3  0.062500  0.618593  0.411554    0.617269    0.568405    0.628472   
4  0.120536  0.200576  0.304894    0.651783    0.620234    0.540708   

   exptime_g  exptime_r  exptime_z     meansky_g  ...     meansky_z  \
0   0.175793   0.214407   0.277244  4.425635e-01  ...  5.513443e-01   
1   0.168644   0.112169   0.236833  2.562594e-10  ...  2.840472e-11   
2   0.175793   0.214407   0.277244  2.103379e-01  ...  3.579294e-01   
3   0.548694   0.164378   0.473245  7.195705e-11  ...  5.566131e-11   
4   0.405109   0.131026   0.439217  6.489902e-11  ...  8.777782e-11   

   galdepth_g  galdepth_r  galdepth_z  seeing_g  seeing_r  seeing_z  \
0    0.480686    0.674590    0.283245  0.301927  0.311998  0.129479   
1   

In [21]:
# Row counts of the lrg target and of the feature frame, one per line.
for sized in (y_lrg, X):
    print(len(sized))

217027
217027


In [25]:
# Fit one statsmodels OLS model per galaxy type on the feature frame X, print
# its summary, and report the R^2 of its predictions on df_test.
#
# The three copy-pasted stanzas are collapsed into a single loop, and the
# test feature frame (identical for all three fits) is built once.
#
# NOTE(review): X is passed to sm.OLS without a constant column, so no
# intercept is fitted and the summary reports the *uncentered* R-squared.
# That figure is not directly comparable with the r2_score values printed
# here; add a constant (e.g. sm.add_constant) if an intercept is wanted.
X_test_geometric = df_test.drop(columns=['lrg','elg','qso'])

ols_geometric = {}
for gal, y_target in (('lrg', y_lrg), ('elg', y_elg), ('qso', y_qso)):
    ols_geometric[gal] = sm.OLS(y_target, X).fit()
    y_pred = ols_geometric[gal].predict(X_test_geometric)
    y_gold = df_test[gal]
    print(ols_geometric[gal].summary())
    print()
    print(f"Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print()

# Keep the per-target names bound so other cells can still refer to each model.
ols_lrg_geometric = ols_geometric['lrg']
ols_elg_geometric = ols_geometric['elg']
ols_qso_geometric = ols_geometric['qso']


                                 OLS Regression Results                                
Dep. Variable:                    lrg   R-squared (uncentered):                   0.884
Model:                            OLS   Adj. R-squared (uncentered):              0.884
Method:                 Least Squares   F-statistic:                          7.840e+04
Date:                Thu, 05 Aug 2021   Prob (F-statistic):                        0.00
Time:                        19:21:44   Log-Likelihood:                      1.3521e+05
No. Observations:              217027   AIC:                                 -2.704e+05
Df Residuals:                  217006   BIC:                                 -2.702e+05
Df Model:                          21                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [None]:
y_train = df_train['lrg']#.to_numpy(copy=True)
X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
y_gold = df_val[gal]
X_test = df_val.drop(columns=['lrg','elg','qso'])
    reg = Lasso()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Lasso Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Lasso Regression MSE for {gal}, Geometric :