## Importar librerías

In [5]:
# !pip install imblearn

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report , confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing

## Importar dataset preprocesado

In [2]:
df = pd.read_pickle('features_extraidas.pkl')

## Tratamiento de datos

### Mantener solo muestras únicas

In [3]:
df = df.drop_duplicates('oid', ignore_index = False).reset_index(drop = True)

In [4]:
df.shape

(2068, 149)

### Eliminación de outliers

In [5]:
# Reemplazar con NaN los valores por sobre o debajo de mean + 3*std de la columna

for i in range(df.iloc[:,1:-1].shape[1]):
    df.iloc[:, i+1] = df.iloc[:, i+1].apply(lambda x: x if df.iloc[:, i+1].mean() - 3*df.iloc[:, i+1].std() <= x <= df.iloc[:, i+1].mean() + 3*df.iloc[:, i+1].std()
                                            else np.NaN)

### Tratamiento de NaNs

#### Eliminar aquellas features con más del 50% de sus valores con NaN

In [6]:
counts = df.isna().sum()
percentages = round(df.isna().mean()*100, 1)
null_values = pd.concat([counts, percentages], axis = 1, keys = ['count', '%'])

In [7]:
null_values[null_values['%'] > 50].sort_values('%', ascending = False)

Unnamed: 0,count,%
MaxSlope_g,2068,100.0
MaxSlope_r,2068,100.0
Eta_e_g,1962,94.9
Eta_e_r,1753,84.8
Period_band_g,1143,55.3
delta_period_g,1143,55.3


In [8]:
df = df.drop('MaxSlope_g', axis=1)
df = df.drop('MaxSlope_r', axis=1)
df = df.drop('Eta_e_g', axis=1)
df = df.drop('Eta_e_r', axis=1)
df = df.drop('Period_band_g', axis=1)
df = df.drop('delta_period_g', axis=1).reset_index(drop = True)

#### Eliminar aquellas muestras con más del 50% de sus features con NaN

In [9]:
counts_row = df.isna().sum(axis = 1)
percentages_row = round(df.isna().mean(axis = 1)*100, 1)
null_values_row = pd.concat([df['oid'], counts_row, percentages_row], axis = 1, keys = ['oid', 'count', '%'])

In [10]:
null_rows_oid = null_values_row[null_values_row['%'] >= 50]['oid'].tolist()

In [11]:
df = df[~df['oid'].isin(null_rows_oid)].reset_index(drop = True)

### Normalización de features

In [12]:
for i in range(df.iloc[:,1:-1].shape[1]):
    df.iloc[:, i+1] = df.iloc[:, i+1].sub(df.iloc[:, i+1].min()).div((df.iloc[:, i+1].max() - df.iloc[:, i+1].min()))

### Cambiar NaN por -999

In [13]:
df = df.fillna(-999)

In [14]:
df.describe()

Unnamed: 0,delta_mag_fid_g,delta_mjd_fid_g,first_mag_g,mean_mag_g,min_mag_g,n_det_g,n_neg_g,n_pos_g,positive_fraction_g,delta_mag_fid_r,...,SPM_tau_fall_g,SPM_chi_g,SPM_A_r,SPM_t0_r,SPM_gamma_r,SPM_beta_r,SPM_tau_rise_r,SPM_tau_fall_r,SPM_chi_r,sgscore1
count,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,...,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0,1920.0
mean,-4.442094,-22.181758,-3.611269,-4.187366,-10.94809,-15.929277,-11.444271,-16.443847,-13.585731,-2.895906,...,-1.277383,-131.636067,-6.698878,-22.935523,-20.521102,0.535474,-29.529305,0.38292,-62.435312,-224.080697
std,68.270793,147.886719,64.403372,68.288222,106.403662,125.968848,106.350125,127.951349,119.908631,55.785825,...,39.480522,337.992411,81.950795,151.250715,142.763147,0.219468,169.620773,0.268161,241.882963,417.023913
min,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,-999.0,0.0,-999.0,-999.0
25%,0.138333,0.101953,0.429885,0.440483,0.404457,0.098425,0.0,0.096774,1.0,0.118562,...,0.103927,6e-06,0.024239,0.454995,0.151027,0.471975,0.0371,0.213892,1.4e-05,0.0
50%,0.226589,0.146994,0.555007,0.513531,0.522159,0.149606,0.0,0.16129,1.0,0.212746,...,0.156722,2e-05,0.039298,0.517161,0.236573,0.57694,0.056751,0.28142,3.1e-05,0.032417
75%,0.328176,0.240837,0.68174,0.5661,0.611217,0.259843,0.0,0.258065,1.0,0.312019,...,0.367172,5.7e-05,0.068125,0.556635,0.395639,0.67209,0.127987,0.454739,8.4e-05,0.354098
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Exportar datos

In [22]:
df.to_pickle('dataset_treated.pkl')

## Feature selection

### Separación en sets de testeo y entrenamiento

In [23]:
X = df.copy()
X = X.drop('oid', axis=1)
X = X.drop('classALeRCE', axis=1)
y = df['classALeRCE']
test_size = 0.2

random_state = 15

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify = y)

### Definir variables según paper

In [24]:
x_train_paper = X_train[['AndersonDarling_r', 'Gskew_r', 'Harmonics_mag_6_g', 'IAR_phi_g', 'IAR_phi_r', 'LinearTrend_g', 
                         'LinearTrend_r', 'MHPS_low_g', 'MHPS_low_r', 'MHPS_ratio_g', 'MHPS_ratio_r', 'Power_rate_2', 
                         'PPE', 'Skew_r', 'SPM_beta_g', 'SPM_beta_r', 'SPM_gamma_g', 'SPM_gamma_r', 'SPM_t0_g',
                         'SPM_t0_r', 'SPM_tau_fall_g', 'SPM_tau_fall_r', 'SPM_tau_rise_g', 'SPM_tau_rise_r', 'Mean_g', 'Mean_r']]

x_test_paper = X_test[['AndersonDarling_r', 'Gskew_r', 'Harmonics_mag_6_g', 'IAR_phi_g', 'IAR_phi_r', 'LinearTrend_g', 
                       'LinearTrend_r', 'MHPS_low_g', 'MHPS_low_r', 'MHPS_ratio_g', 'MHPS_ratio_r', 'Power_rate_2', 
                       'PPE', 'Skew_r', 'SPM_beta_g', 'SPM_beta_r', 'SPM_gamma_g', 'SPM_gamma_r', 'SPM_t0_g',
                       'SPM_t0_r', 'SPM_tau_fall_g', 'SPM_tau_fall_r', 'SPM_tau_rise_g', 'SPM_tau_rise_r', 'Mean_g', 'Mean_r']]

### Selección variables según BRF

In [25]:
clf = SelectFromModel(BalancedRandomForestClassifier(sampling_strategy="all", replacement=True, random_state=0), 
                      max_features = 30)

clf.fit(X_train, y_train)

In [26]:
labels_boolean = clf.get_support()
labels = df.iloc[:,1:-1].columns.tolist()

In [27]:
best_labels = ['oid']

for i in range(len(labels)):
    if labels_boolean[i] == True:
        best_labels.append(labels[i])

best_labels.append('classALeRCE')

In [28]:
df[best_labels]

Unnamed: 0,oid,delta_mjd_fid_g,first_mag_g,delta_mjd_fid_r,first_mag_r,min_mag_r,MHPS_ratio_g,MHPS_low_g,MHPS_high_g,GP_DRW_tau_g,...,SPM_gamma_g,SPM_tau_rise_g,SPM_tau_fall_g,SPM_A_r,SPM_t0_r,SPM_gamma_r,SPM_beta_r,SPM_tau_rise_r,SPM_tau_fall_r,classALeRCE
0,ZTF17aadlxmv,0.158550,0.304372,0.163569,0.204082,0.277004,0.487074,-999.000000,0.016517,0.000003,...,0.259842,0.024749,0.096010,0.044929,0.559577,0.214201,0.416786,0.045428,0.281580,SNIa
1,ZTF18aacdbzx,0.075369,0.431364,0.089039,0.312902,0.451723,0.505206,0.693417,0.005329,0.000012,...,0.000009,0.043638,0.230439,0.012328,0.398924,0.136000,0.063725,0.161447,0.186309,SNIbc
2,ZTF18aadmssd,0.226016,0.119141,0.263749,0.021945,0.112781,0.517870,0.246397,0.026605,0.000003,...,0.347992,0.036090,0.203654,0.334789,0.166160,0.405024,0.755150,0.272980,0.452882,SNII
3,ZTF18aadzfso,0.124407,0.371786,0.121165,0.228238,0.333748,0.518514,0.161356,0.031185,0.000003,...,0.053318,0.047546,0.121977,0.036066,0.524338,0.180310,0.606839,0.050221,0.846331,SNIa
4,ZTF18aaermez,0.101252,0.215388,0.171325,0.119346,0.177456,-999.000000,-999.000000,0.000636,0.000003,...,0.089289,0.134137,0.195692,0.068826,0.323696,0.457520,0.483839,0.418713,0.230962,SNIa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,ZTF20abgbxfm,0.259943,0.556880,0.238811,0.547575,0.640540,0.531472,0.977953,0.117685,0.000499,...,0.348722,0.040510,0.224166,0.040867,0.524424,0.140207,0.570431,0.065246,0.269249,SNIa
1916,ZTF20abgdtmv,0.124489,0.527859,0.138912,0.583717,0.691450,0.531468,0.978164,0.217523,0.000218,...,0.079760,0.098651,0.079317,0.034035,0.628114,0.014571,0.340742,0.086573,0.124703,SNIa
1917,ZTF20abgfekk,0.120369,0.517914,0.103320,0.481904,0.449552,0.531463,0.975646,0.036560,0.000183,...,0.215310,0.044950,0.180829,0.059746,0.542792,0.369028,0.521994,0.048918,0.281303,SNIa
1918,ZTF20abgfljj,0.082687,0.714555,0.088984,0.846128,0.610773,0.531462,0.975642,0.288731,0.000034,...,0.242261,0.379759,0.020620,0.017522,0.532537,0.076601,0.014020,0.026031,0.176169,SNIa


In [29]:
df[best_labels].iloc[:,1:-1].columns

Index(['delta_mjd_fid_g', 'first_mag_g', 'delta_mjd_fid_r', 'first_mag_r',
       'min_mag_r', 'MHPS_ratio_g', 'MHPS_low_g', 'MHPS_high_g',
       'GP_DRW_tau_g', 'GP_DRW_sigma_r', 'Power_rate_4', 'Std_g', 'IAR_phi_g',
       'LinearTrend_g', 'AndersonDarling_r', 'MedianAbsDev_r',
       'PairSlopeTrend_r', 'Q31_r', 'Std_r', 'LinearTrend_r', 'SPM_t0_g',
       'SPM_gamma_g', 'SPM_tau_rise_g', 'SPM_tau_fall_g', 'SPM_A_r',
       'SPM_t0_r', 'SPM_gamma_r', 'SPM_beta_r', 'SPM_tau_rise_r',
       'SPM_tau_fall_r'],
      dtype='object')

## Por correlacion

In [30]:
df[labels].corr()

Unnamed: 0,delta_mag_fid_g,delta_mjd_fid_g,first_mag_g,mean_mag_g,min_mag_g,n_det_g,n_neg_g,n_pos_g,positive_fraction_g,delta_mag_fid_r,...,SPM_tau_fall_g,SPM_chi_g,SPM_A_r,SPM_t0_r,SPM_gamma_r,SPM_beta_r,SPM_tau_rise_r,SPM_tau_fall_r,SPM_chi_r,sgscore1
delta_mag_fid_g,1.000000,0.453235,0.350561,0.888277,0.207087,0.293392,0.565639,0.169551,0.564111,0.269298,...,0.575603,0.041386,-0.005942,0.140584,0.150075,0.037511,0.077787,-0.080740,-0.017502,-0.018770
delta_mjd_fid_g,0.453235,1.000000,0.154273,0.401825,0.215019,0.231545,0.678192,0.145082,0.715682,0.243592,...,0.259288,0.024540,0.159090,0.302078,0.297959,0.107187,0.222026,-0.199297,0.004687,0.045020
first_mag_g,0.350561,0.154273,1.000000,0.350839,0.449320,0.184199,0.220978,0.181054,0.194447,-0.003690,...,0.611375,0.070547,0.192139,-0.009958,-0.009491,-0.044274,-0.011204,-0.013615,0.016682,-0.034975
mean_mag_g,0.888277,0.401825,0.350839,1.000000,0.279471,0.233157,0.493924,0.169875,0.500323,0.269372,...,0.575722,0.063440,0.087675,0.140474,0.149974,0.030950,0.077678,-0.078219,0.013908,-0.037188
min_mag_g,0.207087,0.215019,0.449320,0.279471,1.000000,0.219163,0.126264,0.215345,0.109232,-0.006528,...,0.367215,0.116918,0.647306,-0.016696,-0.016045,-0.068312,-0.019096,-0.014495,0.113598,-0.034610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SPM_beta_r,0.037511,0.107187,-0.044274,0.030950,-0.068312,-0.002518,0.092241,-0.013104,0.105960,0.041310,...,-0.022977,0.061832,-0.044693,0.114315,0.136276,1.000000,0.073139,0.131643,0.001376,-0.047379
SPM_tau_rise_r,0.077787,0.222026,-0.011204,0.077678,-0.019096,0.050327,0.240578,0.000984,0.234622,0.155106,...,-0.008823,-0.050011,-0.014601,0.256801,0.425236,0.073139,1.000000,-0.150661,-0.019847,0.045603
SPM_tau_fall_r,-0.080740,-0.199297,-0.013615,-0.078219,-0.014495,-0.058901,-0.127196,-0.042572,-0.113656,-0.129099,...,-0.018649,0.040928,-0.018083,-0.214630,-0.149561,0.131643,-0.150661,1.000000,-0.043411,-0.025296
SPM_chi_r,-0.017502,0.004687,0.016682,0.013908,0.113598,-0.032785,-0.027799,-0.033332,-0.031416,-0.014149,...,-0.010204,0.319260,0.136091,-0.011476,-0.037555,0.001376,-0.019847,-0.043411,1.000000,-0.004814


In [31]:
cor = df[labels].corr().abs()

In [32]:
cor

Unnamed: 0,delta_mag_fid_g,delta_mjd_fid_g,first_mag_g,mean_mag_g,min_mag_g,n_det_g,n_neg_g,n_pos_g,positive_fraction_g,delta_mag_fid_r,...,SPM_tau_fall_g,SPM_chi_g,SPM_A_r,SPM_t0_r,SPM_gamma_r,SPM_beta_r,SPM_tau_rise_r,SPM_tau_fall_r,SPM_chi_r,sgscore1
delta_mag_fid_g,1.000000,0.453235,0.350561,0.888277,0.207087,0.293392,0.565639,0.169551,0.564111,0.269298,...,0.575603,0.041386,0.005942,0.140584,0.150075,0.037511,0.077787,0.080740,0.017502,0.018770
delta_mjd_fid_g,0.453235,1.000000,0.154273,0.401825,0.215019,0.231545,0.678192,0.145082,0.715682,0.243592,...,0.259288,0.024540,0.159090,0.302078,0.297959,0.107187,0.222026,0.199297,0.004687,0.045020
first_mag_g,0.350561,0.154273,1.000000,0.350839,0.449320,0.184199,0.220978,0.181054,0.194447,0.003690,...,0.611375,0.070547,0.192139,0.009958,0.009491,0.044274,0.011204,0.013615,0.016682,0.034975
mean_mag_g,0.888277,0.401825,0.350839,1.000000,0.279471,0.233157,0.493924,0.169875,0.500323,0.269372,...,0.575722,0.063440,0.087675,0.140474,0.149974,0.030950,0.077678,0.078219,0.013908,0.037188
min_mag_g,0.207087,0.215019,0.449320,0.279471,1.000000,0.219163,0.126264,0.215345,0.109232,0.006528,...,0.367215,0.116918,0.647306,0.016696,0.016045,0.068312,0.019096,0.014495,0.113598,0.034610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SPM_beta_r,0.037511,0.107187,0.044274,0.030950,0.068312,0.002518,0.092241,0.013104,0.105960,0.041310,...,0.022977,0.061832,0.044693,0.114315,0.136276,1.000000,0.073139,0.131643,0.001376,0.047379
SPM_tau_rise_r,0.077787,0.222026,0.011204,0.077678,0.019096,0.050327,0.240578,0.000984,0.234622,0.155106,...,0.008823,0.050011,0.014601,0.256801,0.425236,0.073139,1.000000,0.150661,0.019847,0.045603
SPM_tau_fall_r,0.080740,0.199297,0.013615,0.078219,0.014495,0.058901,0.127196,0.042572,0.113656,0.129099,...,0.018649,0.040928,0.018083,0.214630,0.149561,0.131643,0.150661,1.000000,0.043411,0.025296
SPM_chi_r,0.017502,0.004687,0.016682,0.013908,0.113598,0.032785,0.027799,0.033332,0.031416,0.014149,...,0.010204,0.319260,0.136091,0.011476,0.037555,0.001376,0.019847,0.043411,1.000000,0.004814


In [33]:
upper = cor.where(np.triu(np.ones(cor.shape), k=1).astype(bool))

In [34]:
upper

Unnamed: 0,delta_mag_fid_g,delta_mjd_fid_g,first_mag_g,mean_mag_g,min_mag_g,n_det_g,n_neg_g,n_pos_g,positive_fraction_g,delta_mag_fid_r,...,SPM_tau_fall_g,SPM_chi_g,SPM_A_r,SPM_t0_r,SPM_gamma_r,SPM_beta_r,SPM_tau_rise_r,SPM_tau_fall_r,SPM_chi_r,sgscore1
delta_mag_fid_g,,0.453235,0.350561,0.888277,0.207087,0.293392,0.565639,0.169551,0.564111,0.269298,...,0.575603,0.041386,0.005942,0.140584,0.150075,0.037511,0.077787,0.080740,0.017502,0.018770
delta_mjd_fid_g,,,0.154273,0.401825,0.215019,0.231545,0.678192,0.145082,0.715682,0.243592,...,0.259288,0.024540,0.159090,0.302078,0.297959,0.107187,0.222026,0.199297,0.004687,0.045020
first_mag_g,,,,0.350839,0.449320,0.184199,0.220978,0.181054,0.194447,0.003690,...,0.611375,0.070547,0.192139,0.009958,0.009491,0.044274,0.011204,0.013615,0.016682,0.034975
mean_mag_g,,,,,0.279471,0.233157,0.493924,0.169875,0.500323,0.269372,...,0.575722,0.063440,0.087675,0.140474,0.149974,0.030950,0.077678,0.078219,0.013908,0.037188
min_mag_g,,,,,,0.219163,0.126264,0.215345,0.109232,0.006528,...,0.367215,0.116918,0.647306,0.016696,0.016045,0.068312,0.019096,0.014495,0.113598,0.034610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SPM_beta_r,,,,,,,,,,,...,,,,,,,0.073139,0.131643,0.001376,0.047379
SPM_tau_rise_r,,,,,,,,,,,...,,,,,,,,0.150661,0.019847,0.045603
SPM_tau_fall_r,,,,,,,,,,,...,,,,,,,,,0.043411,0.025296
SPM_chi_r,,,,,,,,,,,...,,,,,,,,,,0.004814


In [35]:
to_drop = [column for column in upper.columns if any(upper[column] > 0.60)]
df_corr = df.copy()

In [36]:
df_corr

Unnamed: 0,oid,delta_mag_fid_g,delta_mjd_fid_g,first_mag_g,mean_mag_g,min_mag_g,n_det_g,n_neg_g,n_pos_g,positive_fraction_g,...,SPM_chi_g,SPM_A_r,SPM_t0_r,SPM_gamma_r,SPM_beta_r,SPM_tau_rise_r,SPM_tau_fall_r,SPM_chi_r,sgscore1,classALeRCE
0,ZTF17aadlxmv,0.066609,0.158550,0.304372,0.293031,0.305760,0.133858,0.0,0.145161,1.0,...,0.000003,0.044929,0.559577,0.214201,0.416786,0.045428,0.281580,0.000204,0.174589,SNIa
1,ZTF18aacdbzx,0.016737,0.075369,0.431364,0.409841,0.513145,0.102362,0.0,0.112903,1.0,...,0.000014,0.012328,0.398924,0.136000,0.063725,0.161447,0.186309,0.000015,0.032000,SNIbc
2,ZTF18aadmssd,0.138109,0.226016,0.119141,0.259700,0.193830,0.118110,0.0,0.129032,1.0,...,0.000004,0.334789,0.166160,0.405024,0.755150,0.272980,0.452882,0.000381,0.154649,SNII
3,ZTF18aadzfso,0.049566,0.124407,0.371786,0.349156,0.396286,0.181102,0.0,0.193548,1.0,...,0.000012,0.036066,0.524338,0.180310,0.606839,0.050221,0.846331,0.000009,0.024792,SNIa
4,ZTF18aaermez,0.015613,0.101252,0.215388,0.244511,0.285734,0.118110,0.0,0.129032,1.0,...,0.000051,0.068826,0.323696,0.457520,0.483839,0.418713,0.230962,0.005872,0.158232,SNIa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,ZTF20abgbxfm,0.405382,0.259943,0.556880,0.598688,0.550773,0.448819,0.0,0.467742,1.0,...,0.000032,0.040867,0.524424,0.140207,0.570431,0.065246,0.269249,0.000014,0.030833,SNIa
1916,ZTF20abgdtmv,0.362981,0.124489,0.527859,0.665163,0.619823,0.196850,0.0,0.209677,1.0,...,0.000006,0.034035,0.628114,0.014571,0.340742,0.086573,0.124703,0.000014,0.423167,SNIa
1917,ZTF20abgfekk,0.178670,0.120369,0.517914,0.333324,0.374042,0.244094,0.0,0.258065,1.0,...,0.000018,0.059746,0.542792,0.369028,0.521994,0.048918,0.281303,0.000019,0.003750,SNIa
1918,ZTF20abgfljj,0.182761,0.082687,0.714555,0.526268,0.579182,0.196850,0.0,0.209677,1.0,...,0.000553,0.017522,0.532537,0.076601,0.014020,0.026031,0.176169,0.000016,0.500000,SNIa


In [37]:
df_corr.drop(to_drop, axis=1, inplace=True)

In [38]:
df_corr.columns

Index(['oid', 'delta_mag_fid_g', 'delta_mjd_fid_g', 'first_mag_g', 'min_mag_g',
       'n_det_g', 'delta_mag_fid_r', 'first_mag_r', 'MHPS_ratio_g',
       'MHPS_ratio_r', 'MHPS_high_r', 'Multiband_period', 'GP_DRW_tau_g',
       'GP_DRW_tau_r', 'Psi_CS_g', 'Psi_CS_r', 'Psi_eta_r', 'iqr_r',
       'Amplitude_g', 'Amplitude_r', 'Pvar_r', 'SPM_t0_g', 'SPM_gamma_g',
       'SPM_tau_rise_g', 'SPM_t0_r', 'SPM_gamma_r', 'SPM_beta_r',
       'SPM_tau_rise_r', 'SPM_tau_fall_r', 'sgscore1', 'classALeRCE'],
      dtype='object')

## Por Recursive Feature Elimination

In [39]:
from sklearn.feature_selection import RFE

In [40]:
clf = BalancedRandomForestClassifier(sampling_strategy="all", replacement=True, random_state=0)
selector = RFE(clf, n_features_to_select = 30, step = 1)
selector = selector.fit(X_train, y_train)

In [41]:
labels_boolean3 = selector.support_
labels3 = X_train.columns.tolist()

In [42]:
best_labels3 = ['oid']

for i in range(len(labels3)):
    if labels_boolean3[i] == True:
        best_labels3.append(labels3[i])
        
best_labels3.append('classALeRCE')

In [43]:
df[best_labels3].columns

Index(['oid', 'delta_mjd_fid_g', 'first_mag_g', 'delta_mjd_fid_r',
       'first_mag_r', 'min_mag_r', 'MHPS_ratio_g', 'MHPS_high_g',
       'MHPS_high_r', 'GP_DRW_tau_g', 'GP_DRW_tau_r', 'Skew_g', 'Amplitude_r',
       'Gskew_r', 'Meanvariance_r', 'MedianAbsDev_r', 'PairSlopeTrend_r',
       'StetsonK_r', 'LinearTrend_r', 'SPM_t0_g', 'SPM_gamma_g', 'SPM_beta_g',
       'SPM_tau_rise_g', 'SPM_tau_fall_g', 'SPM_A_r', 'SPM_t0_r',
       'SPM_gamma_r', 'SPM_beta_r', 'SPM_tau_rise_r', 'SPM_tau_fall_r',
       'sgscore1', 'classALeRCE'],
      dtype='object')

In [44]:
features_paper = ['AndersonDarling_r', 'Gskew_r', 'Harmonics_mag_6_g', 'IAR_phi_g', 'IAR_phi_r', 'LinearTrend_g', 
                         'LinearTrend_r', 'MHPS_low_g', 'MHPS_low_r', 'MHPS_ratio_g', 'MHPS_ratio_r', 'Power_rate_2', 
                         'PPE', 'Skew_r', 'SPM_beta_g', 'SPM_beta_r', 'SPM_gamma_g', 'SPM_gamma_r', 'SPM_t0_g',
                         'SPM_t0_r', 'SPM_tau_fall_g', 'SPM_tau_fall_r', 'SPM_tau_rise_g', 'SPM_tau_rise_r', 'Mean_g', 'Mean_r']


features1 = list(best_labels)
features2 = list(df_corr.columns)
features3 = list(best_labels3)

features1.remove('oid')
features1.remove('classALeRCE')
features2.remove('oid')
features2.remove('classALeRCE')
features3.remove('oid')
features3.remove('classALeRCE')

print(features1)
print(features2)
print(features3)

['delta_mjd_fid_g', 'first_mag_g', 'delta_mjd_fid_r', 'first_mag_r', 'min_mag_r', 'MHPS_ratio_g', 'MHPS_low_g', 'MHPS_high_g', 'GP_DRW_tau_g', 'GP_DRW_sigma_r', 'Power_rate_4', 'Std_g', 'IAR_phi_g', 'LinearTrend_g', 'AndersonDarling_r', 'MedianAbsDev_r', 'PairSlopeTrend_r', 'Q31_r', 'Std_r', 'LinearTrend_r', 'SPM_t0_g', 'SPM_gamma_g', 'SPM_tau_rise_g', 'SPM_tau_fall_g', 'SPM_A_r', 'SPM_t0_r', 'SPM_gamma_r', 'SPM_beta_r', 'SPM_tau_rise_r', 'SPM_tau_fall_r']
['delta_mag_fid_g', 'delta_mjd_fid_g', 'first_mag_g', 'min_mag_g', 'n_det_g', 'delta_mag_fid_r', 'first_mag_r', 'MHPS_ratio_g', 'MHPS_ratio_r', 'MHPS_high_r', 'Multiband_period', 'GP_DRW_tau_g', 'GP_DRW_tau_r', 'Psi_CS_g', 'Psi_CS_r', 'Psi_eta_r', 'iqr_r', 'Amplitude_g', 'Amplitude_r', 'Pvar_r', 'SPM_t0_g', 'SPM_gamma_g', 'SPM_tau_rise_g', 'SPM_t0_r', 'SPM_gamma_r', 'SPM_beta_r', 'SPM_tau_rise_r', 'SPM_tau_fall_r', 'sgscore1']
['delta_mjd_fid_g', 'first_mag_g', 'delta_mjd_fid_r', 'first_mag_r', 'min_mag_r', 'MHPS_ratio_g', 'MHPS_high

In [45]:
interseccion1 = [feature for feature in features_paper if feature in features1]
interseccion2 = [feature for feature in features_paper if feature in features2]
interseccion3 = [feature for feature in features_paper if feature in features3]

print("\nFeatures obtenidos por BRF")
print(f'Cantidad de features:\t{len(interseccion1)}')
print('Features Comunes:\n',interseccion1)
print('Features fuera de las mejores del paper:\n' , [feature for feature in features1 if feature not in features_paper] )

print("\nFeatures obtenidos por Correlación")
print(f'Cantidad de features:\t{len(interseccion2)}')
print('Features Comunes:\n',interseccion2)
print('Features fuera de las mejores del paper:\n' , [feature for feature in features2 if feature not in features_paper] )

print("\nFeatures obtenidos por Eliminación Recursiva")
print(f'Cantidad de features:\t{len(interseccion3)}')
print('Features Comunes:\n',interseccion3)
print('Features fuera de las mejores del paper:\n' , [feature for feature in features3 if feature not in features_paper] )


Features obtenidos por BRF
Cantidad de features:	15
Features Comunes:
 ['AndersonDarling_r', 'IAR_phi_g', 'LinearTrend_g', 'LinearTrend_r', 'MHPS_low_g', 'MHPS_ratio_g', 'SPM_beta_r', 'SPM_gamma_g', 'SPM_gamma_r', 'SPM_t0_g', 'SPM_t0_r', 'SPM_tau_fall_g', 'SPM_tau_fall_r', 'SPM_tau_rise_g', 'SPM_tau_rise_r']
Features fuera de las mejores del paper:
 ['delta_mjd_fid_g', 'first_mag_g', 'delta_mjd_fid_r', 'first_mag_r', 'min_mag_r', 'MHPS_high_g', 'GP_DRW_tau_g', 'GP_DRW_sigma_r', 'Power_rate_4', 'Std_g', 'MedianAbsDev_r', 'PairSlopeTrend_r', 'Q31_r', 'Std_r', 'SPM_A_r']

Features obtenidos por Correlación
Cantidad de features:	10
Features Comunes:
 ['MHPS_ratio_g', 'MHPS_ratio_r', 'SPM_beta_r', 'SPM_gamma_g', 'SPM_gamma_r', 'SPM_t0_g', 'SPM_t0_r', 'SPM_tau_fall_r', 'SPM_tau_rise_g', 'SPM_tau_rise_r']
Features fuera de las mejores del paper:
 ['delta_mag_fid_g', 'delta_mjd_fid_g', 'first_mag_g', 'min_mag_g', 'n_det_g', 'delta_mag_fid_r', 'first_mag_r', 'MHPS_high_r', 'Multiband_period', 