# Estimativa de redshift para mapeamento 3D do universo

### O redshift (desvio do vermelho) é utilizado na Astronomia como métrica para estimar o quão distante um astro está da Terra.
### Ele é estimado a partir da comprimentos de onda emitidos pelos astros utilizando normalmente um espectrômetro.


In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('input/data.csv',sep=',')

print(df.columns.values)

['ID' 'u' 'g' 'r' 'i' 'z' 'modelmagerr_u' 'modelmagerr_g' 'modelmagerr_r'
 'modelmagerr_i' 'modelmagerr_z' 'redshift']


In [3]:
df.head()

Unnamed: 0,ID,u,g,r,i,z,modelmagerr_u,modelmagerr_g,modelmagerr_r,modelmagerr_i,modelmagerr_z,redshift
0,1,20.05579,19.03638,18.33059,17.91299,17.70723,0.158691,0.026021,0.017158,0.015686,0.040499,0.186443
1,2,23.89449,19.98589,18.34053,17.721,17.40883,1.95608,0.062189,0.018963,0.014417,0.032033,0.296161
2,3,20.48605,18.54287,17.45982,17.04516,16.80097,0.246165,0.01818,0.009551,0.008721,0.019908,0.178013
3,4,20.37043,18.14693,16.9575,16.53085,16.26944,0.17942,0.011766,0.006171,0.005637,0.011508,0.181958
4,5,17.67912,15.79491,14.93857,14.54253,14.23595,0.022103,0.003372,0.002708,0.002611,0.003838,0.039847


In [4]:
df.describe()

Unnamed: 0,ID,u,g,r,i,z,modelmagerr_u,modelmagerr_g,modelmagerr_r,modelmagerr_i,modelmagerr_z,redshift
count,436275.0,436275.0,436275.0,436275.0,436275.0,436275.0,436275.0,436275.0,436275.0,436275.0,436275.0,436275.0
mean,218138.0,19.974695,18.271918,17.33751,16.915777,16.636066,0.20275,0.019408,0.009979,0.010528,0.027695,0.149871
std,125941.888683,1.677817,1.354717,1.132568,1.091613,1.102611,0.42893,0.185063,0.016162,0.113036,0.159566,0.11532
min,1.0,11.39764,10.58824,9.731096,9.788093,8.984495,0.002102,0.001434,0.001426,0.001432,0.001461,0.000503
25%,109069.5,18.97939,17.546245,16.784735,16.39157,16.10152,0.043858,0.006758,0.005241,0.005263,0.011819,0.074648
50%,218138.0,19.72274,18.14421,17.33247,16.92838,16.64133,0.0718,0.009268,0.006808,0.006824,0.016625,0.11662
75%,327206.5,20.565335,18.70855,17.697865,17.3089,17.07657,0.136097,0.013675,0.009293,0.009639,0.026264,0.178712
max,436275.0,32.76723,32.49146,27.83899,29.51774,30.12782,69.46073,95.77759,4.801591,54.12327,53.62858,1.645


## Performance metrics:

In [5]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    score = r2_score(y_true, y_predict)
    
    # Return the score
    return score

In [6]:
from sklearn.metrics import mean_squared_error

def mean_metric(y_test, pred):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    score = mean_squared_error(y_test, pred)
    
    # Return the score
    return score

## Model trainning:

In [7]:
#Dividing data in train/test
target = df['redshift']

In [8]:
df.drop('redshift', axis=1, inplace=True)

In [9]:
features = df.drop('ID', 1)
features

Unnamed: 0,u,g,r,i,z,modelmagerr_u,modelmagerr_g,modelmagerr_r,modelmagerr_i,modelmagerr_z
0,20.05579,19.03638,18.33059,17.91299,17.70723,0.158691,0.026021,0.017158,0.015686,0.040499
1,23.89449,19.98589,18.34053,17.72100,17.40883,1.956080,0.062189,0.018963,0.014417,0.032033
2,20.48605,18.54287,17.45982,17.04516,16.80097,0.246165,0.018180,0.009551,0.008721,0.019908
3,20.37043,18.14693,16.95750,16.53085,16.26944,0.179420,0.011766,0.006171,0.005637,0.011508
4,17.67912,15.79491,14.93857,14.54253,14.23595,0.022103,0.003372,0.002708,0.002611,0.003838
5,21.40325,19.70875,18.66151,18.07688,17.68414,0.416773,0.036234,0.018863,0.015361,0.033190
6,19.87126,19.19771,18.54866,18.22217,18.16141,0.097503,0.021330,0.015394,0.015270,0.043037
7,21.95616,19.72914,18.42817,17.84306,17.49233,0.618033,0.033743,0.014466,0.011817,0.024614
8,19.70225,18.57548,17.84305,17.52988,17.35520,0.112133,0.017219,0.011826,0.011809,0.030111
9,22.58988,18.71655,17.57445,17.13871,16.84473,1.171129,0.020667,0.010230,0.009220,0.018824


In [10]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=42)

In [11]:
#Random Forest Regression

regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train,y_train)
regressor.predict(X_test)

print('Model successfully trained.')

Model successfully trained.


In [12]:
random_forest_score = performance_metric(y_test,regressor.predict(X_test))
random_forest_error = mean_metric(y_test,regressor.predict(X_test))

print('R2 Score: ', random_forest_score * 100, '\nMean Error: ', random_forest_error * 100)

Score:  91.19191944308258 
Mean Error:  0.11705452719345918


## Evaluation:

In [13]:
test_df = pd.read_csv('input/query.csv',sep=',')
test_df.head()

Unnamed: 0,ID,u,g,r,i,z,modelmagerr_u,modelmagerr_g,modelmagerr_r,modelmagerr_i,modelmagerr_z
0,1,18.44576,17.42792,17.04441,16.84457,16.65202,0.059144,0.010632,0.011514,0.012076,0.030227
1,2,19.53878,18.54303,17.92247,17.52453,17.39197,0.085554,0.017542,0.011329,0.010053,0.030612
2,3,20.01489,18.17974,17.29804,16.89536,16.62627,0.113736,0.011341,0.007241,0.006545,0.015118
3,4,19.24964,17.25186,16.19505,15.76581,15.42115,0.09077,0.008102,0.004773,0.004271,0.008688
4,5,19.60299,18.11447,17.30755,16.89753,16.63263,0.166126,0.01849,0.011482,0.010039,0.026445


In [14]:
test_df = test_df.drop('ID', 1)

In [15]:
pred = regressor.predict(test_df)
test_df['redshift_pred'] = pred
test_df

Unnamed: 0,u,g,r,i,z,modelmagerr_u,modelmagerr_g,modelmagerr_r,modelmagerr_i,modelmagerr_z,redshift_pred
0,18.44576,17.42792,17.04441,16.84457,16.65202,0.059144,0.010632,0.011514,0.012076,0.030227,0.036516
1,19.53878,18.54303,17.92247,17.52453,17.39197,0.085554,0.017542,0.011329,0.010053,0.030612,0.152059
2,20.01489,18.17974,17.29804,16.89536,16.62627,0.113736,0.011341,0.007241,0.006545,0.015118,0.097233
3,19.24964,17.25186,16.19505,15.76581,15.42115,0.090770,0.008102,0.004773,0.004271,0.008688,0.130300
4,19.60299,18.11447,17.30755,16.89753,16.63263,0.166126,0.018490,0.011482,0.010039,0.026445,0.108345
5,22.50312,21.57445,19.88077,19.04446,18.71502,0.866988,0.162885,0.044713,0.027349,0.069670,0.524135
6,21.30931,20.03998,19.10053,18.89430,18.45681,0.407815,0.052247,0.029565,0.028241,0.068129,0.289116
7,19.97236,19.25021,18.68809,18.32016,18.19107,0.109546,0.022996,0.016987,0.014976,0.044322,0.191626
8,19.63229,18.01247,17.18051,16.78523,16.49083,0.136155,0.014569,0.009425,0.008591,0.021854,0.125872
9,20.71001,19.76317,19.48955,19.27511,19.17216,0.096429,0.019095,0.019134,0.021628,0.063311,0.113777


In [17]:
test_df[['redshift_pred'] > 1]

TypeError: '>' not supported between instances of 'list' and 'int'