## Imports & setup

In [1]:
import io
import json

import numpy as np
import pandas as pd
import requests

from magcvs_library.functions import tqdm2

# Feature names from https://fink-broker.readthedocs.io/en/latest/services/search/anomaly_detection/
# Kept under their own name so the long -> short mapping stays visible instead of being
# silently overwritten by the renamed list below.
fink_feature_names = ['mean', 'weighted_mean', 'standard_deviation', 'median', 'amplitude', 'beyond_1_std', 'cusum',
                      'inter_percentile_range_10', 'kurtosis', 'linear_trend', 'linear_trend_sigma', 'linear_trend_noise',
                      'linear_fit_slope', 'linear_fit_slope_sigma', 'linear_fit_reduced_chi2', 'magnitude_percentage_ratio_40_5',
                      'magnitude_percentage_ratio_20_10', 'maximum_slope', 'median_absolute_deviation', 'median_buffer_range_percentage_10',
                      'percent_amplitude', 'mean_variance', 'anderson_darling_normal', 'chi2', 'skew', 'stetson_K']

# Renaming to shorter names (same order as `fink_feature_names`); these are the column names used below:
feature_names = ['mean', 'weightedMean', 'std', 'median', 'amplitude', 'beyond1Std',
                 'cusum', 'IPR10', 'kurtosis', 'linT', 'linT_sigma', 'linT_noise',
                 'linF_slope', 'linF_slope_sigma', 'linF_chi2', 'MPR40_5', 'MPR20_10',
                 'maxSlope', 'medianAbsDev', 'medianBRP10', 'percentAmplitude',
                 'meanVariance', 'andersonDarlingNorm', 'chi2', 'skew', 'stetsonK']

# The two lists are positional aliases of each other, so they must have the same length.
assert len(feature_names) == len(fink_feature_names), "short and long feature-name lists are out of sync"

---
## magCVs data (positive class)

### Getting data of selected magCVs from Fink's API

In [2]:
# Reading IDs of selected magnetic Cataclysmic Variables:
objids_list = list(pd.read_csv('../data/magnetic_cvs_objectId.csv').values.flatten())

# Retrieving full objects data from Fink:
# The (connect, read) timeout keeps the cell from hanging forever on a stalled connection;
# the read timeout is deliberately generous because the query covers many objects.
r = requests.post(
  "https://api.fink-portal.org/api/v1/objects",
  json={
    "objectId": ",".join(objids_list),
    "columns": "i:objectId,d:anomaly_score,d:lc_features_g,d:lc_features_r",
    "output-format": "json"
  },
  timeout=(10, 600),
)
# Fail here with the HTTP status rather than handing an error body to the JSON parser in the next cell.
r.raise_for_status()

### Cleaning data

In [3]:
pdf = pd.read_json(io.BytesIO(r.content))


def _parse_lc_features(raw):
    """Turn one feature string such as '[1.0, NaN, null]' into a float array.

    `json.loads` accepts both `NaN` and `null`; casting to float then maps the
    resulting `None` values to `np.nan`. This replaces `eval`, which would execute
    whatever text the remote service returned.
    """
    return np.array(json.loads(raw), dtype=float)


# Converting values from str to float in the features columns:
for column in ('d:lc_features_g', 'd:lc_features_r'):
    pdf[column] = pdf[column].map(_parse_lc_features)

initial_len = len(pdf) # Number of lines in the dataframe before removing empty, duplicate and NaN lines for reference

pdf.head()

Unnamed: 0,i:objectId,d:anomaly_score,d:lc_features_g,d:lc_features_r
0,ZTF17aaarvmd,,[],[]
1,ZTF18aaadlpa,-0.207629,"[18.805038361213605, 18.97971454799683, 1.4069...","[19.072515062537434, 17.565052863600965, 1.035..."
2,ZTF18abnulwr,-0.019535,"[21.797781262525547, 21.443444600246625, 0.618...","[23.285054981784537, 22.903822703409396, 0.652..."
3,ZTF18aaqphee,-0.012197,"[20.001228861637856, 19.99264607715552, 0.1770...","[19.19776281066766, 19.179895917835594, 0.0864..."
4,ZTF18abwiccd,,[],[]


In [4]:
# Getting rid of lines with NaN in the anomaly_score column:
pdf = pdf[pdf['d:anomaly_score'].notna()]

# Flagging, for each filter, the lines whose feature vector is neither empty nor contains NaN.
# The flags are collected first and applied once afterwards: dropping rows one at a time
# inside the loop copies the whole dataframe on every drop.
keep_g, keep_r = [], []
for _, lc_features_g, lc_features_r in zip(tqdm2(pdf.index), pdf['d:lc_features_g'], pdf['d:lc_features_r']):
    keep_g.append(len(lc_features_g) > 0 and not np.isnan(lc_features_g).any())
    keep_r.append(len(lc_features_r) > 0 and not np.isnan(lc_features_r).any())

# Splitting the dataframe into two, one for each filter (because they will not have the same length after cleaning):
pdf_g = pdf.loc[np.array(keep_g, dtype=bool)].drop(columns=['d:lc_features_r'])
pdf_r = pdf.loc[np.array(keep_r, dtype=bool)].drop(columns=['d:lc_features_g'])

print(f'{len(pdf_g)} and {len(pdf_r)} lines remaining in g and r filters respectively after cleaning out of {initial_len}')

100%|██████████| 9179/9179

4478 and 4436 lines remaining in g and r filters respectively after cleaning out of 24851





In [5]:
# Duplicate removal is done on plain numpy arrays: the per-row feature vectors are stacked
# into one 2-D array per filter and np.unique keeps one copy of each distinct row.
# NOTE(review): the original author reports this is faster than pandas drop_duplicates and
# that the conversion rounds values (yielding more duplicates) - nothing in this cell rounds
# explicitly, so that second point should be confirmed.
features_g, features_r = (
    np.vstack(frame[column].to_numpy())
    for frame, column in ((pdf_g, "d:lc_features_g"), (pdf_r, "d:lc_features_r"))
)

# return_index gives, for each kept row, its position in the stacked array
features_g_reduced, index_g = np.unique(features_g, return_index=True, axis=0)
features_r_reduced, index_r = np.unique(features_r, return_index=True, axis=0)

n_g_after, n_g_before = len(features_g_reduced), len(features_g)
n_r_after, n_r_before = len(features_r_reduced), len(features_r)
print(f'{n_g_after} out of {n_g_before} and {n_r_after} out of {n_r_before} lines remaining in g and r filters respectively after removing duplicates')

3904 out of 4478 and 4067 out of 4436 lines remaining in g and r filters respectively after removing duplicates


In [6]:
# Rebuild one DataFrame per filter from the de-duplicated arrays: one column per feature,
# row labels taken from the cleaned frames at the positions np.unique kept.
dfg = pd.DataFrame(data=features_g_reduced, index=pdf_g.index[index_g], columns=feature_names)
dfr = pd.DataFrame(data=features_r_reduced, index=pdf_r.index[index_r], columns=feature_names)

# Re-attach the object ID and anomaly score of every kept row:
for deduped, cleaned, kept_rows in ((dfg, pdf_g, index_g), (dfr, pdf_r, index_r)):
    for column in ('i:objectId', 'd:anomaly_score'):
        deduped[column] = cleaned[column].to_numpy()[kept_rows]

In [7]:
# Preview of the first rows of the cleaned, de-duplicated g-filter table (positive class)
dfg.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,linT,...,medianAbsDev,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,i:objectId,d:anomaly_score
4814,13.817506,13.828649,0.195803,13.896276,0.234263,0.2,0.346548,0.468526,3.736549,-0.012028,...,0.05048,0.2,0.418046,0.014171,0.515629,143.749041,-1.915839,0.754049,ZTF17aabdpti,-0.133778
19892,13.817509,13.828649,0.195798,13.896276,0.234256,0.2,0.346548,0.468512,3.736476,-0.012028,...,0.05048,0.2,0.418033,0.01417,0.515619,143.744461,-1.915821,0.754053,ZTF17aabdpti,-0.132313
13547,13.86976,13.870895,0.06393,13.895045,0.080594,0.4,0.402923,0.161188,-1.246543,0.00109,...,0.05171,0.4,0.109477,0.004609,0.204758,14.404587,-0.309796,0.915388,ZTF17aabdpti,-0.119457
19485,13.90087,13.893656,0.048767,13.896276,0.060799,0.2,0.329353,0.121598,0.822606,7e-05,...,0.044842,0.4,0.071118,0.003508,0.280313,14.517087,-0.979036,0.762692,ZTF17aabdpti,-0.122106
18467,13.902325,13.893262,0.056184,13.918694,0.060799,0.25,0.370289,0.121598,0.423473,-0.000262,...,0.02524,0.0,0.093537,0.004041,0.147103,19.350766,-1.179132,0.848727,ZTF17aabdpti,-0.11762


In [8]:
# Preview of the first rows of the cleaned, de-duplicated r-filter table (positive class)
dfr.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,linT,...,medianAbsDev,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,i:objectId,d:anomaly_score
4814,13.678092,13.677388,0.110756,13.673155,0.16093,0.5,0.345357,0.301705,-0.571911,-0.008948,...,0.071209,0.25,0.173293,0.008097,0.3199,26.702542,0.271775,0.774002,ZTF17aabdpti,-0.133778
22123,13.689333,13.682039,0.115438,13.677993,0.16093,0.5,0.3922,0.301705,-1.193498,-0.008726,...,0.105357,0.25,0.168455,0.008433,0.214288,27.829847,-0.055104,0.83319,ZTF17aabdpti,-0.1319
19892,13.692617,13.68472,0.124283,13.689638,0.16093,0.428571,0.405006,0.308424,-1.622742,-0.011084,...,0.124217,0.142857,0.165051,0.009077,0.228212,32.342939,-0.163608,0.870325,ZTF17aabdpti,-0.132313
18467,13.692619,13.684722,0.124279,13.689638,0.160921,0.428571,0.405011,0.308409,-1.62289,-0.011084,...,0.124217,0.142857,0.165033,0.009076,0.228234,32.34101,-0.163525,0.870329,ZTF17aabdpti,-0.11762
19485,13.720621,13.711713,0.109305,13.729766,0.143634,0.333333,0.40915,0.275456,-1.122858,-0.010003,...,0.084572,0.0,0.170586,0.007966,0.184656,25.623112,-0.402614,0.888571,ZTF17aabdpti,-0.122106


### Saving the dataframes to csv files:

In [9]:
# Saving is left disabled: uncomment the lines below after replacing
# LOCAL_PATH_TO_CLEAN_DATA with a real local directory.
# dfg.to_csv('LOCAL_PATH_TO_CLEAN_DATA/positive_class_g.csv')
# dfr.to_csv('LOCAL_PATH_TO_CLEAN_DATA/positive_class_r.csv')

---
## Other lc data (negative class)

### Getting other lc data

In [10]:
# Location of the parquet data used as the negative class (presumably a directory of
# parquet files - the path is relative and depends on the local setup).
path_to_other_data = '../../data/lc_features_not_nan_JAN2024/' # <----- To be changed accordingly
# Load everything found at that path into a single DataFrame
pdf2 = pd.read_parquet(path_to_other_data)

In [11]:
# Preview of the raw negative-class data as loaded from parquet
pdf2.head()

Unnamed: 0,objectId,candid,lc_features_g,lc_features_r,cdsxmatch
0,ZTF18abqpjos,2570132662315010001,"{'mean': 17.401908735560085, 'weighted_mean': ...","{'mean': 17.631597001140964, 'weighted_mean': ...",QSO
1,ZTF23aatekmu,2570135064715015025,"{'mean': 19.48988275209835, 'weighted_mean': 1...","{'mean': 18.274583367522656, 'weighted_mean': ...",Unknown
2,ZTF18acdyhbe,2570137543115010004,"{'mean': 19.510292396036682, 'weighted_mean': ...","{'mean': 16.208598134662612, 'weighted_mean': ...",LPV*
3,ZTF19aczmbew,2570137543815010007,"{'mean': 16.886914280166614, 'weighted_mean': ...","{'mean': 15.622438672539376, 'weighted_mean': ...",Unknown
4,ZTF18acmgaps,2570138041515010000,"{'mean': 15.311072617994874, 'weighted_mean': ...","{'mean': 14.604294032777275, 'weighted_mean': ...",Unknown


### Cleaning data

In [12]:
# Converting the lines of dictionaries to lines of arrays with only the values of the dictionaries:
# NOTE(review): this relies on every dictionary listing its values in the same order as
# `feature_names` - confirm against the parquet schema.
# Rows are iterated positionally (no per-row label lookup); the range only drives the progress bar.
placeholder_features_g2 = []
placeholder_features_r2 = []
for _, dict_g, dict_r in zip(tqdm2(range(len(pdf2)), desc='Converting dictionaries to arrays'),
                             pdf2['lc_features_g'], pdf2['lc_features_r']):
    placeholder_features_g2.append(np.array(list(dict_g.values())))
    placeholder_features_r2.append(np.array(list(dict_r.values())))

# Splitting the dataframe into two, one for each filter (because they will not have the same length after cleaning):
pdf_g2 = pdf2.drop(columns=['lc_features_r'])
pdf_r2 = pdf2.drop(columns=['lc_features_g'])
pdf_g2['lc_features_g'] = placeholder_features_g2
pdf_r2['lc_features_r'] = placeholder_features_r2

# Removing lines containing NaN values in the features:
# Flags are gathered in Python lists and converted once; np.append inside the loop would
# re-allocate the whole array on every iteration.
nan_flags_g, nan_flags_r = [], []
for lc_features_g, lc_features_r in zip(tqdm2(pdf_g2["lc_features_g"], desc='Removing lines containing NaN values'), pdf_r2["lc_features_r"]):
    nan_flags_g.append(np.isnan(lc_features_g).any())
    nan_flags_r.append(np.isnan(lc_features_r).any())
nans_g = np.array(nan_flags_g, dtype=bool)
nans_r = np.array(nan_flags_r, dtype=bool)
pdf_g2, pdf_r2 = pdf_g2[~nans_g], pdf_r2[~nans_r]

print(f'{len(pdf_g2)} and {len(pdf_r2)} lines remaining in g and r filters respectively after cleaning out of {len(pdf2)}')

# Removing duplicates with np.unique on the stacked feature arrays.
# NOTE(review): the original author reports this is faster than pandas drop_duplicates and
# that the conversion rounds values (yielding more duplicates) - nothing here rounds
# explicitly, so that second point should be confirmed.
print('\nRemoving duplicates...')
features_g2 = np.vstack(pdf_g2["lc_features_g"].to_numpy())
features_r2 = np.vstack(pdf_r2["lc_features_r"].to_numpy())
features_g_reduced2, index_g2 = np.unique(features_g2, axis=0, return_index=True)
features_r_reduced2, index_r2 = np.unique(features_r2, axis=0, return_index=True)

print(f'{len(features_g_reduced2)} out of {len(features_g2)} and {len(features_r_reduced2)} out of {len(features_r2)} lines remaining in g and r filters respectively after removing duplicates')

# Converting back to pandas DataFrame with feature names as columns:
dfg2 = pd.DataFrame(features_g_reduced2, columns=feature_names, index=pdf_g2.index[index_g2])
dfr2 = pd.DataFrame(features_r_reduced2, columns=feature_names, index=pdf_r2.index[index_r2])

# Retrieving corresponding IDs, candid and cdsxmatch:
for column in ['objectId', 'candid', 'cdsxmatch']:
    dfg2[column] = pdf_g2[column].values[index_g2]
    dfr2[column] = pdf_r2[column].values[index_r2]

# Removing non-interesting features (same list for both filters), then restoring the original row order:
dropped_features = ['linT', 'linT_sigma', 'linT_noise', 'linF_slope', 'linF_slope_sigma', 'linF_chi2']
dfg2 = dfg2.drop(columns=dropped_features).sort_index()
dfr2 = dfr2.drop(columns=dropped_features).sort_index()

Converting dictionaries to arrays: 100%|██████████| 854991/854991
Removing lines containing NaN values: 100%|██████████| 854991/854991


370116 and 476192 lines remaining in g and r filters respectively after cleaning out of 854991

Removing duplicates...
280737 out of 370116 and 458436 out of 476192 lines remaining in g and r filters respectively after removing duplicates


In [13]:
# Preview of the first rows of the cleaned g-filter table (negative class)
dfg2.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,MPR40_5,...,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,objectId,candid,cdsxmatch
4,15.311073,15.311549,0.016083,15.310174,0.019654,0.5,0.319486,0.039307,1.5547,0.012634,...,0.5,0.021452,0.00105,0.120292,0.480684,0.333862,0.753545,ZTF18acmgaps,2570138041515010000,Unknown
5,16.214894,16.187528,0.14257,16.245442,0.16692,0.25,0.346265,0.33384,1.649902,0.132189,...,0.0,0.228017,0.008793,0.116104,43.857147,-1.142769,0.880143,ZTF18admszze,2570138042815010007,EB*_Candidate
6,15.852028,15.825034,0.165751,15.773093,0.187482,0.2,0.422894,0.374964,-2.072773,0.329431,...,0.2,0.3041,0.010456,0.307637,50.931219,0.706914,0.937469,ZTF17aaarukh,2570138043815010009,EB*_Candidate
7,19.438001,19.443232,0.091535,19.441024,0.112977,0.333333,0.262289,0.220397,-1.925554,0.315863,...,0.0,0.118133,0.004709,0.209873,0.169737,-0.072578,0.936406,ZTF18abmeyvg,2570138515615010000,LPV*
10,15.627889,15.568981,0.268735,15.712448,0.395895,0.125,0.307353,0.581005,7.689995,0.016944,...,0.5,0.74533,0.017196,2.03488,217.266066,-2.756536,0.734211,ZTF17aadpfdr,2570140382915010003,RRLyr


In [14]:
# Preview of the first rows of the cleaned r-filter table (negative class)
dfr2.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,MPR40_5,...,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,objectId,candid,cdsxmatch
1,18.274583,18.292172,0.063039,18.300971,0.080741,0.2,0.3411,0.161482,3.327641,0.082961,...,0.4,0.1339,0.00345,0.429565,0.297513,-1.742347,0.824531,ZTF23aatekmu,2570135064715015025,Unknown
2,16.208598,16.206205,0.017464,16.206426,0.021101,0.5,0.333154,0.042202,1.62083,0.06348,...,0.5,0.025445,0.001077,0.108705,0.259096,0.721641,0.762089,ZTF18acdyhbe,2570137543115010004,LPV*
4,14.604294,14.602183,0.029684,14.610426,0.04399,0.333333,0.343673,0.081399,2.381211,0.090717,...,0.333333,0.05944,0.002033,0.346817,1.455762,-1.222128,0.774382,ZTF18acmgaps,2570138041515010000,Unknown
6,15.331138,15.287782,0.126026,15.357968,0.170787,0.4,0.364279,0.341574,0.790907,0.132743,...,0.2,0.211082,0.00822,0.161264,32.199396,-0.487399,0.837663,ZTF17aaarukh,2570138043815010009,EB*_Candidate
7,16.126804,16.102918,0.127338,16.174381,0.173383,0.5,0.25814,0.302922,-1.754535,0.292069,...,0.25,0.224481,0.007896,0.681763,2.105995,-0.223512,0.923711,ZTF18abmeyvg,2570138515615010000,LPV*


### Saving the dataframes to csv files:

In [15]:
# Saving is left disabled: uncomment the lines below after replacing
# LOCAL_PATH_TO_CLEAN_DATA with a real local directory.
# dfg2.to_csv('LOCAL_PATH_TO_CLEAN_DATA/negative_class_g.csv')
# dfr2.to_csv('LOCAL_PATH_TO_CLEAN_DATA/negative_class_r.csv')