In [21]:
#!pip install -r requirements.txt

In [22]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [23]:
def remove_unused(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Remove colunas que nao serao usadas.
    
    :param df: Dataframe alvo da transformacao
    :type df: pd.DataFrame
    :return: Dataframe transformado
    :rtype: pd.DataFrame
    """
    df_cp = df.copy()
    df_cp = df.drop(cols, axis=1)
    return df_cp
    

In [24]:
def angle_to_coord(row):
    alpha = row['alpha']
    delta = row['delta']
    a = np.cos(alpha) * np.cos(delta)
    b = np.sin(alpha) * np.cos(delta)
    c = np.sin(delta)
    return a, b, c

In [25]:
def spherical_to_castesian(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converte grandezas esfericas em grandezas de um plano cartesiano.
    
    :param df: Dataframe alvo da transformacao
    :type df: pd.DataFrame
    :return: Dataframe transformado
    :rtype: pd.DataFrame
    """
    df_unified = df.copy()
    df_unified['abc'] = df.apply(angle_to_coord, axis=1)
    df_split = df_unified.copy()
    df_split[['a', 'b', 'c']] = pd.DataFrame(df_unified['abc'].tolist(), index=df.index)
    df_split = remove_unused(df_split, ['abc'])
    return df_split

In [26]:
def remove_outliers(df: pd.DataFrame, features: list, threshold: int = 3) -> pd.DataFrame:
    """
    Remove outliers de um conjunto definido de features caso seu z-score seja superior a algum limiar.
    
    :param df: Dataframe alvo da transformacao
    :type df: pd.DataFrame
    :param features: 
    :return: Dataframe transformado
    :rtype: pd.DataFrame
    """
    df_cp = df.copy()     
    for feature in features:
        outliers = None
        z_scores = zscore(df[feature])
        outliers = df[(z_scores > threshold) | (z_scores < -threshold)]
        rem_index = outliers.index
        df_cp = df.drop(rem_index, axis=0)
    return df_cp

In [27]:
df = pd.read_csv('../data/star_classification_10.csv', index_col='obj_ID')
df = remove_unused(df, ['run_ID', 'rerun_ID', 'cam_col', 'field_ID', 'spec_obj_ID', 'plate', 'MJD', 'fiber_ID'])
df = spherical_to_castesian(df)
colors = ['u', 'g', 'r', 'i', 'z']
df = remove_outliers(df, colors, 3)

In [28]:
df.head()

Unnamed: 0_level_0,alpha,delta,u,g,r,i,z,class,redshift,a,b,c
obj_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.237663e+18,15.342907,0.794882,18.74547,17.49025,16.89122,16.5735,16.2991,GALAXY,0.042002,-0.654218,0.250033,0.713781
1.237664e+18,120.365538,55.660432,19.99985,19.68133,19.50156,19.17364,19.16122,QSO,1.633797,0.348727,0.525619,-0.77596
1.237655e+18,245.610038,42.974786,23.11792,20.81292,18.88351,18.12335,17.68182,GALAXY,0.454852,0.45075,0.28626,-0.845505
1.23766e+18,127.957356,6.647703,21.94454,21.01012,20.93496,20.93184,20.56855,QSO,2.608515,-0.618062,0.700648,0.356499
1.237665e+18,159.174526,35.881846,18.89945,17.68422,17.02925,16.6,16.36798,GALAXY,0.083804,0.122096,-0.211234,-0.96978


In [29]:
df.describe()

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,a,b,c
count,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0
mean,178.021888,23.780364,22.08724,20.654442,19.666468,19.100444,18.78272,0.577153,0.004225,-0.004881,0.001283
std,97.21061,19.555423,2.236697,2.040018,1.861123,1.756664,1.764802,0.729592,0.495324,0.501098,0.709656
min,0.011684,-16.450911,12.99664,11.33897,10.98255,10.87374,11.19448,-0.004136,-0.99969,-0.99976,-1.0
25%,127.308462,4.774689,20.357625,19.0037,18.181085,17.77666,17.509645,0.056034,-0.356719,-0.367601,-0.714069
50%,180.661805,22.985185,22.19822,21.12764,20.13578,19.42199,19.01234,0.43303,0.003875,-0.004307,0.012168
75%,234.309315,39.856375,23.70728,22.134215,21.06992,20.41133,19.9307,0.711196,0.360388,0.351354,0.721045
max,359.97891,82.5675,29.32565,27.89482,27.39709,25.67336,26.13011,7.011103,0.999693,0.999865,1.0


In [30]:
df.to_csv('../data/star_classification_10_full_p.csv')