In [1]:

import torch
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing, metrics
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt


print(torch.__version__)


1.9.0


# Data Preprocessing


In [37]:
df_train = pd.read_csv('../../bricks_data/train.csv')
df_val = pd.read_csv('../../bricks_data/val.csv')
df_test = pd.read_csv('../../bricks_data/test.csv')


In [38]:
df_train.head()

Unnamed: 0,pixel_id,lrg,elg,qso,exposures,stellar,EBV,airmass,ccdskysb_g,ccdskysb_r,...,galdepth_g,galdepth_r,galdepth_z,seeing_g,seeing_r,seeing_z,mjd_obs_g,mjd_obs_r,mjd_obs_z,Z
0,0.2359,0.133333,0.292135,0.263333,0.089249,0.218978,0.642731,0.983753,0.475857,0.319229,...,0.648952,0.599763,0.378826,0.772369,0.586223,0.565264,0.882539,0.886028,0.83057,0.600523
1,0.292224,0.444444,0.464419,0.55,0.123732,0.10219,0.158408,0.593908,0.603885,0.506017,...,0.642801,0.584043,0.490447,0.506086,0.414464,0.265659,0.772584,0.698062,0.435535,0.160716
2,0.542426,0.355556,0.505618,0.67,0.41785,0.10219,0.169403,0.161007,0.481019,0.516146,...,0.443395,0.702748,0.439294,0.534037,0.437317,0.272643,0.449691,0.482133,0.507679,0.021694
3,0.751257,0.322222,0.456929,0.636667,0.279919,0.109489,0.214556,0.184611,0.66512,0.591438,...,0.358622,0.739656,0.556167,0.456429,0.415118,0.215858,0.500379,0.457761,0.478062,0.078766
4,0.673988,0.422222,0.531835,0.563333,0.492901,0.094891,0.201247,0.169394,0.61475,0.516285,...,0.484669,0.693925,0.50837,0.465233,0.363679,0.202216,0.397811,0.411058,0.465827,0.102343


In [39]:
df_train.drop(columns=['pixel_id', 'exposures','Z'], axis=1, inplace=True)
df_val.drop(columns=['pixel_id', 'exposures','Z'], axis=1, inplace=True)
df_test.drop(columns=['pixel_id', 'exposures','Z'], axis=1, inplace=True)


# Regression

In [40]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
galaxy_types = ['lrg','elg','qso']


### Geometrics

### Geometric - Linear Regression

In [41]:
for gal in galaxy_types:
    y_train = df_train[gal]#.to_numpy(copy=True)
    X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_val[gal]
    X_test = df_val.drop(columns=['lrg','elg','qso'])
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Linear Regression R^2 for lrg, Geometric :  0.047569415581821706.
Linear Regression MSE for lrg, Geometric :  0.016023777999455358.

Linear Regression R^2 for elg, Geometric :  0.11569014023728696.
Linear Regression MSE for elg, Geometric :  0.010353972372662978.

Linear Regression R^2 for qso, Geometric :  0.08781718005332029.
Linear Regression MSE for qso, Geometric :  0.012399955112600352.



### Geometric - Ridge Regression

In [42]:
for gal in galaxy_types:
    y_train = df_train[gal]#.to_numpy(copy=True)
    X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_val[gal]
    X_test = df_val.drop(columns=['lrg','elg','qso'])
    reg = Ridge()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Ridge Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Ridge Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()

Ridge Regression R^2 for lrg, Geometric :  0.0475678736619779.
Ridge Regression MSE for lrg, Geometric :  0.01602380394085379.

Ridge Regression R^2 for elg, Geometric :  0.11568839873859216.
Ridge Regression MSE for elg, Geometric :  0.010353992763060274.

Ridge Regression R^2 for qso, Geometric :  0.08781386964766325.
Ridge Regression MSE for qso, Geometric :  0.01240000011331803.



### Geometric - Lasso Regression

In [43]:
for gal in galaxy_types:
    y_train = df_train[gal]#.to_numpy(copy=True)
    X_train = df_train.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_val[gal]
    X_test = df_val.drop(columns=['lrg','elg','qso'])
    reg = Lasso()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Lasso Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Lasso Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()


Lasso Regression R^2 for lrg, Geometric :  -1.5241256349884935e-05.
Lasso Regression MSE for lrg, Geometric :  0.016824346555137464.

Lasso Regression R^2 for elg, Geometric :  -8.833241829098171e-07.
Lasso Regression MSE for elg, Geometric :  0.011708544696488457.

Lasso Regression R^2 for qso, Geometric :  -1.8542019498024587e-05.
Lasso Regression MSE for qso, Geometric :  0.013593969061525034.



In [None]:
# Using the Best classifier to test on the testset

In [44]:
# Stacking Train and Val Set
df = pd.concat((df_train, df_val), axis=0)
print("Results for full dataset on unseen test-set")
for gal in galaxy_types:
    y_train = df[gal]#.to_numpy(copy=True)
    X_train = df.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = df_test[gal]
    X_test = df_test.drop(columns=['lrg','elg','qso'])
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Geometric :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Geometric :  {metrics.mean_squared_error(y_gold, y_pred)}.")
    print()


Results for full dataset on unseen test-set
Linear Regression R^2 for lrg, Geometric :  0.04300889757493065.
Linear Regression MSE for lrg, Geometric :  0.015931073930618617.

Linear Regression R^2 for elg, Geometric :  0.11224466323756643.
Linear Regression MSE for elg, Geometric :  0.010422090163892258.

Linear Regression R^2 for qso, Geometric :  0.0852139658161416.
Linear Regression MSE for qso, Geometric :  0.012461355098423905.



### Kitanidis


### Kitanidis - Linear Regression

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = LinearRegression()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Linear Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Linear Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")

### Kitanidis - Ridge Regression

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = Ridge()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Ridge Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Ridge Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")

### Kitanidis - Lasso Regression

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = Lasso()
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"Lasso Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"Lasso Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")


### Kitanidis - AdaBoost-Regressor

In [None]:
for gal in galaxy_types:
    y_train = train_df_kit[gal]#.to_numpy(copy=True)
    X_train = train_df_kit.drop(columns=['lrg','elg','qso'])#.to_numpy(copy=True)
    y_gold = test_df_kit[gal]
    X_test = test_df_kit.drop(columns=['lrg','elg','qso'])
    reg = AdaBoostRegressor(n_estimators=100)
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    print(f"AdaBoost Regression R^2 for {gal}, Kitanidis :  {metrics.r2_score(y_gold, y_pred)}.")
    print(f"AdaBoost Regression MSE for {gal}, Kitanidis :  {metrics.mean_squared_error(y_gold, y_pred)}.")



