In [2]:

import torch
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing, metrics
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt


print(torch.__version__)


1.9.0


# Data Preprocessing


In [3]:
df_train = pd.read_csv('../../bricks_data/train.csv')
df_val = pd.read_csv('../../bricks_data/val.csv')
df_test = pd.read_csv('../../bricks_data/test.csv')


In [4]:
df_train.head()

Unnamed: 0,pixel_id,lrg,elg,qso,exposures,stellar,EBV,airmass,ccdskysb_g,ccdskysb_r,...,galdepth_g,galdepth_r,galdepth_z,seeing_g,seeing_r,seeing_z,mjd_obs_g,mjd_obs_r,mjd_obs_z,Z
0,738618,0.255556,0.402256,0.409396,0.462475,0.205357,0.171079,0.140918,0.578889,0.518889,...,0.480686,0.67459,0.283245,0.301927,0.311998,0.129479,0.388261,0.465611,0.436698,0.157027
1,133202,0.288889,0.421053,0.40604,0.054767,0.049107,0.01643,0.013747,0.630872,0.672316,...,0.511487,0.582933,0.750504,0.211973,0.09155,0.120701,0.518587,0.498003,0.44311,0.144649
2,635461,0.688889,0.503759,0.704698,0.334686,0.058036,0.06264,0.152529,0.62611,0.577056,...,0.596021,0.695755,0.299528,0.269886,0.283358,0.172838,0.510175,0.522231,0.454674,0.122328
3,291288,0.255556,0.255639,0.228188,0.099391,0.0625,0.618593,0.411554,0.617269,0.568405,...,0.832612,0.766486,0.719828,0.299032,0.254469,0.209068,0.861329,0.798231,0.806972,0.557726
4,315867,0.411111,0.488722,0.503356,0.109533,0.120536,0.200576,0.304894,0.651783,0.620234,...,0.748213,0.680751,0.705337,0.411908,0.393542,0.130524,0.764569,0.798398,0.271826,0.475285


In [5]:
df_train.drop(columns=['pixel_id', 'exposures','Z'], axis=1, inplace=True)
df_val.drop(columns=['pixel_id', 'exposures','Z'], axis=1, inplace=True)
df_test.drop(columns=['pixel_id', 'exposures','Z'], axis=1, inplace=True)


# Regression

In [6]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
galaxy_types = ['lrg','elg','qso']


### Geometrics

### Geometric - Linear Regression

In [7]:
for gal in galaxy_types:
    y_train = df_train[gal]#.to_numpy(copy=True)
    X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_val[gal]
    X_test = df_val.drop(columns=['lrg','elg','qso'])
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Linear Regression R^2 for lrg, Geometric :  0.045674483731205195.
Linear Regression MSE for lrg, Geometric :  0.01632737441867394.

Linear Regression R^2 for elg, Geometric :  0.10881169349197217.
Linear Regression MSE for elg, Geometric :  0.0106577879384463.

Linear Regression R^2 for qso, Geometric :  0.07608241730362908.
Linear Regression MSE for qso, Geometric :  0.012969421194633212.



### Geometric - Ridge Regression

In [8]:
for gal in galaxy_types:
    y_train = df_train[gal]#.to_numpy(copy=True)
    X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_val[gal]
    X_test = df_val.drop(columns=['lrg','elg','qso'])
    reg = Ridge()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Ridge Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Ridge Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Ridge Regression R^2 for lrg, Geometric :  0.04567581297642975.
Ridge Regression MSE for lrg, Geometric :  0.01632735167686928.

Ridge Regression R^2 for elg, Geometric :  0.10881184750573003.
Ridge Regression MSE for elg, Geometric :  0.010657786096584194.

Ridge Regression R^2 for qso, Geometric :  0.07607979005652432.
Ridge Regression MSE for qso, Geometric :  0.012969458074410068.



### Geometric - Lasso Regression

In [9]:
for gal in galaxy_types:
    y_train = df_train[gal]#.to_numpy(copy=True)
    X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_val[gal]
    X_test = df_val.drop(columns=['lrg','elg','qso'])
    reg = Lasso()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Lasso Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Lasso Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()


Lasso Regression R^2 for lrg, Geometric :  -1.6872004765300375e-05.
Lasso Regression MSE for lrg, Geometric :  0.017109099165712866.

Lasso Regression R^2 for elg, Geometric :  -1.4132761307683595e-05.
Lasso Regression MSE for elg, Geometric :  0.011959244173861135.

Lasso Regression R^2 for qso, Geometric :  -1.3532442602715022e-05.
Lasso Regression MSE for qso, Geometric :  0.014037612169616371.



In [None]:
# Using the Best classifier to test on the testset

In [10]:
# Stacking Train and Val Set
print("Results for full dataset on unseen test-set")
for gal in galaxy_types:
    y_train = df_train[gal]#.to_numpy(copy=True)
    X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_test[gal]
    X_test = df_test.drop(columns=['lrg','elg','qso'])
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()


Results for full dataset on unseen test-set
Linear Regression R^2 for lrg, Geometric :  0.04619606201951554.
Linear Regression MSE for lrg, Geometric :  0.01605861376099732.

Linear Regression R^2 for elg, Geometric :  0.10681381503967935.
Linear Regression MSE for elg, Geometric :  0.01058674458720245.

Linear Regression R^2 for qso, Geometric :  0.07376991226263807.
Linear Regression MSE for qso, Geometric :  0.012816235046243353.



### Kitanidis


### Kitanidis - Linear Regression

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")

### Kitanidis - Ridge Regression

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = Ridge()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Ridge Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Ridge Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")

### Kitanidis - Lasso Regression

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = Lasso()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Lasso Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Lasso Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")


### Kitanidis - AdaBoost-Regressor

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = AdaBoostRegressor(n_estimators=100)
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"AdaBoost Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"AdaBoost Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")



