##### Imports & setup

In [1]:
import pandas as pd
import numpy as np
import requests
import io
from magcvs_library.functions import tqdm2

# Feature names from https://fink-broker.readthedocs.io/en/latest/services/search/anomaly_detection/
feature_names = ['mean', 'weighted_mean', 'standard_deviation', 'median', 'amplitude', 'beyond_1_std', 'cusum',
                 'inter_percentile_range_10', 'kurtosis', 'linear_trend', 'linear_trend_sigma', 'linear_trend_noise',
                 'linear_fit_slope', 'linear_fit_slope_sigma', 'linear_fit_reduced_chi2', 'magnitude_percentage_ratio_40_5',
                 'magnitude_percentage_ratio_20_10', 'maximum_slope', 'median_absolute_deviation', 'median_buffer_range_percentage_10',
                 'percent_amplitude', 'mean_variance', 'anderson_darling_normal', 'chi2', 'skew', 'stetson_K']

# Renaming to shorter names:
feature_names = ['mean', 'weightedMean', 'std', 'median', 'amplitude', 'beyond1Std',
                 'cusum', 'IPR10', 'kurtosis', 'linT', 'linT_sigma', 'linT_noise',
                 'linF_slope', 'linF_slope_sigma', 'linF_chi2', 'MPR40_5', 'MPR20_10',
                 'maxSlope', 'medianAbsDev', 'medianBRP10', 'percentAmplitude',
                 'meanVariance', 'andersonDarlingNorm', 'chi2', 'skew', 'stetsonK']

---
## magCVs data (positive class)

### Getting data of selected magCVs from Fink's API

In [2]:
# Reading IDs of selected magnetic Cataclysmic Variables:
objids_list = list(pd.read_csv('../data/magnetic_cvs_objectId.csv').values.flatten())

# Retrieving full objects data from Fink:
r = requests.post(
  "https://api.fink-portal.org/api/v1/objects",
  json={
    "objectId": ",".join(objids_list),
    "columns": "i:objectId,d:anomaly_score,d:lc_features_g,d:lc_features_r",
    "output-format": "json"
  }
)

### Cleaning data

In [3]:
pdf = pd.read_json(io.BytesIO(r.content))

# Converting values from str to float in the features columns:
placeholder_features_g, placeholder_features_r = [], []
for i in range(len(pdf)):
    placeholder_features_g.append(np.array(eval(pdf['d:lc_features_g'][i].replace('NaN', 'np.nan').replace('null', 'np.nan'))))
    placeholder_features_r.append(np.array(eval(pdf['d:lc_features_r'][i].replace('NaN', 'np.nan').replace('null', 'np.nan'))))
pdf['d:lc_features_g'] = placeholder_features_g
pdf['d:lc_features_r'] = placeholder_features_r

initial_len = len(pdf) # Number of lines in the dataframe before removing empty, duplicate and NaN lines for reference

pdf.rename(columns={'i:objectId': 'objectId', 'd:anomaly_score': 'anomaly_score'}, inplace=True)

pdf.head()

Unnamed: 0,objectId,anomaly_score,d:lc_features_g,d:lc_features_r
0,ZTF17aaarvmd,,[],[]
1,ZTF18aaadlpa,-0.207629,"[18.805038361213605, 18.97971454799683, 1.4069...","[19.072515062537434, 17.565052863600965, 1.035..."
2,ZTF18abnulwr,-0.019535,"[21.797781262525547, 21.443444600246625, 0.618...","[23.285054981784537, 22.903822703409396, 0.652..."
3,ZTF18aaqphee,-0.012197,"[20.001228861637856, 19.99264607715552, 0.1770...","[19.19776281066766, 19.179895917835594, 0.0864..."
4,ZTF18abwiccd,,[],[]


In [4]:
# Getting rid of lines with NaN in the anomaly_score column:
pdf = pdf[~np.isnan(pdf['anomaly_score'])]

# Splitting the dataframe into two, one for each filter (because they will not have the same length after cleaning):
pdf_g = pdf.drop(columns=['d:lc_features_r'])
pdf_r = pdf.drop(columns=['d:lc_features_g'])

# Getting rid of empty lines or with NaN values in the features:
for index, lc_features_g, lc_features_r in zip(tqdm2(pdf.index), pdf_g['d:lc_features_g'], pdf_r['d:lc_features_r']):
    if len(lc_features_g) == 0 or True in np.isnan(lc_features_g):
        pdf_g = pdf_g.drop([index])
    if len(lc_features_r) == 0 or True in np.isnan(lc_features_r):
        pdf_r = pdf_r.drop([index])

print(f'{len(pdf_g)} and {len(pdf_r)} lines remaining in g and r filters respectively after cleaning out of {initial_len}')

100%|██████████| 9192/9192

4491 and 4446 lines remaining in g and r filters respectively after cleaning out of 24864





In [5]:
# Removing duplicates with np.unique (faster than pandas drop_duplicates but the conversion to numpy array rounds values resulting in more duplicates):
features_g = np.vstack(pdf_g["d:lc_features_g"].to_numpy())
features_r = np.vstack(pdf_r["d:lc_features_r"].to_numpy())
features_g_reduced, index_g = np.unique(features_g, axis=0, return_index=True)
features_r_reduced, index_r = np.unique(features_r, axis=0, return_index=True)

print(f'{len(features_g_reduced)} out of {len(features_g)} and {len(features_r_reduced)} out of {len(features_r)} lines remaining in g and r filters respectively after removing duplicates')

3917 out of 4491 and 4075 out of 4446 lines remaining in g and r filters respectively after removing duplicates


In [6]:
# Converting back to pandas DataFrame with feature names as columns:
dfg = pd.DataFrame(features_g_reduced, columns=feature_names, index=pdf_g.index[index_g])
dfr = pd.DataFrame(features_r_reduced, columns=feature_names, index=pdf_r.index[index_r])

# Retrieving corresponding IDs and anomaly scores:
for column in ['objectId', 'anomaly_score']:
    dfg[column] = pdf_g[column].values[index_g]
    dfr[column] = pdf_r[column].values[index_r]

In [7]:
dfg.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,linT,...,medianAbsDev,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,objectId,anomaly_score
4816,13.817506,13.828649,0.195803,13.896276,0.234263,0.2,0.346548,0.468526,3.736549,-0.012028,...,0.05048,0.2,0.418046,0.014171,0.515629,143.749041,-1.915839,0.754049,ZTF17aabdpti,-0.133778
19903,13.817509,13.828649,0.195798,13.896276,0.234256,0.2,0.346548,0.468512,3.736476,-0.012028,...,0.05048,0.2,0.418033,0.01417,0.515619,143.744461,-1.915821,0.754053,ZTF17aabdpti,-0.132313
13552,13.86976,13.870895,0.06393,13.895045,0.080594,0.4,0.402923,0.161188,-1.246543,0.00109,...,0.05171,0.4,0.109477,0.004609,0.204758,14.404587,-0.309796,0.915388,ZTF17aabdpti,-0.119457
19496,13.90087,13.893656,0.048767,13.896276,0.060799,0.2,0.329353,0.121598,0.822606,7e-05,...,0.044842,0.4,0.071118,0.003508,0.280313,14.517087,-0.979036,0.762692,ZTF17aabdpti,-0.122106
18477,13.902325,13.893262,0.056184,13.918694,0.060799,0.25,0.370289,0.121598,0.423473,-0.000262,...,0.02524,0.0,0.093537,0.004041,0.147103,19.350766,-1.179132,0.848727,ZTF17aabdpti,-0.11762


In [8]:
dfr.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,linT,...,medianAbsDev,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,objectId,anomaly_score
4816,13.678092,13.677388,0.110756,13.673155,0.16093,0.5,0.345357,0.301705,-0.571911,-0.008948,...,0.071209,0.25,0.173293,0.008097,0.3199,26.702542,0.271775,0.774002,ZTF17aabdpti,-0.133778
22135,13.689333,13.682039,0.115438,13.677993,0.16093,0.5,0.3922,0.301705,-1.193498,-0.008726,...,0.105357,0.25,0.168455,0.008433,0.214288,27.829847,-0.055104,0.83319,ZTF17aabdpti,-0.1319
19903,13.692617,13.68472,0.124283,13.689638,0.16093,0.428571,0.405006,0.308424,-1.622742,-0.011084,...,0.124217,0.142857,0.165051,0.009077,0.228212,32.342939,-0.163608,0.870325,ZTF17aabdpti,-0.132313
18477,13.692619,13.684722,0.124279,13.689638,0.160921,0.428571,0.405011,0.308409,-1.62289,-0.011084,...,0.124217,0.142857,0.165033,0.009076,0.228234,32.34101,-0.163525,0.870329,ZTF17aabdpti,-0.11762
19496,13.720621,13.711713,0.109305,13.729766,0.143634,0.333333,0.40915,0.275456,-1.122858,-0.010003,...,0.084572,0.0,0.170586,0.007966,0.184656,25.623112,-0.402614,0.888571,ZTF17aabdpti,-0.122106


### Saving the dataframes to csv files:

In [9]:
# dfg.to_csv('../../data/clean_data/positive_class_g.csv')
# dfr.to_csv('../../data/clean_data/positive_class_r.csv')

---
## Other lc data (negative class)

### Getting other lc data

In [10]:
path_to_other_data = '../../data/lc_features_not_nan_JAN2024/' # <----- To be changed accordingly
pdf2 = pd.read_parquet(path_to_other_data)

In [11]:
pdf2

Unnamed: 0,objectId,candid,lc_features_g,lc_features_r,cdsxmatch
0,ZTF18abqpjos,2570132662315010001,"{'mean': 17.401908735560085, 'weighted_mean': ...","{'mean': 17.631597001140964, 'weighted_mean': ...",QSO
1,ZTF23aatekmu,2570135064715015025,"{'mean': 19.48988275209835, 'weighted_mean': 1...","{'mean': 18.274583367522656, 'weighted_mean': ...",Unknown
2,ZTF18acdyhbe,2570137543115010004,"{'mean': 19.510292396036682, 'weighted_mean': ...","{'mean': 16.208598134662612, 'weighted_mean': ...",LPV*
3,ZTF19aczmbew,2570137543815010007,"{'mean': 16.886914280166614, 'weighted_mean': ...","{'mean': 15.622438672539376, 'weighted_mean': ...",Unknown
4,ZTF18acmgaps,2570138041515010000,"{'mean': 15.311072617994874, 'weighted_mean': ...","{'mean': 14.604294032777275, 'weighted_mean': ...",Unknown
...,...,...,...,...,...
854986,ZTF18acgvhut,2561562322715010007,"{'mean': 14.211665117449408, 'weighted_mean': ...","{'mean': 13.437082811350741, 'weighted_mean': ...",Unknown
854987,ZTF18aafwgob,2561562325315015030,"{'mean': 17.232443687067494, 'weighted_mean': ...","{'mean': 17.52010801900393, 'weighted_mean': 1...",QSO
854988,ZTF22aafdvaa,2561568830215010006,"{'mean': 19.109515534174133, 'weighted_mean': ...","{'mean': 18.219019868362494, 'weighted_mean': ...",Unknown
854989,ZTF20aabqukc,2561570812015010005,"{'mean': 20.228360777250852, 'weighted_mean': ...","{'mean': 20.110789243866186, 'weighted_mean': ...",Unknown


### Cleaning data

Our two final datasets should represent two classes of objects:
1. The positive class (dfg & dfr) which should only contain magnetic cataclysmic variable stars  
2. The negative class (dfg2 & dfr2) which should only contain objects that are not magCVs

Since, for now, the negative class may contain any object, we should check if there are any magCVs in there and remove them.

In [12]:
positive_IDs = np.unique(pdf['objectId'].to_numpy())
negative_IDs = np.unique(pdf2['objectId'].to_numpy())
common_IDs = np.intersect1d(positive_IDs, negative_IDs)

for id in tqdm2(common_IDs):
    pdf2 = pdf2[pdf2['objectId'] != id]
pdf2.reset_index(drop=True, inplace=True)

100%|██████████| 44/44


In [13]:
# Converting the lines of dictionaries to lines of arrays with only the values of the dictionaries:
placeholder_features_g2 = []
placeholder_features_r2 = []
for i in tqdm2(range(len(pdf2)), desc='Converting dictionaries to arrays'):
    placeholder_features_g2.append(np.array(list(pdf2['lc_features_g'][i].values())))
    placeholder_features_r2.append(np.array(list(pdf2['lc_features_r'][i].values())))

# Splitting the dataframe into two, one for each filter (because they will not have the same length after cleaning):
pdf_g2 = pdf2.drop(columns=['lc_features_r'])
pdf_r2 = pdf2.drop(columns=['lc_features_g'])
pdf_g2['lc_features_g'] = placeholder_features_g2
pdf_r2['lc_features_r'] = placeholder_features_r2

# Removing lines containing NaN values in the features:
nans_g = np.array([], dtype=bool)
nans_r = np.array([], dtype=bool)
for lc_features_g, lc_features_r in zip(tqdm2(pdf_g2["lc_features_g"], desc='Removing lines containing NaN values'), pdf_r2["lc_features_r"]):
    nans_g = np.append(nans_g, True in np.isnan(lc_features_g))
    nans_r = np.append(nans_r, True in np.isnan(lc_features_r))
pdf_g2, pdf_r2 = pdf_g2[~nans_g], pdf_r2[~nans_r]

print(f'{len(pdf_g2)} and {len(pdf_r2)} lines remaining in g and r filters respectively after cleaning out of {len(pdf2)}')

# Removing duplicates with np.unique (faster than pandas drop_duplicates but the conversion to numpy array rounds values resulting in more duplicates):
print('\nRemoving duplicates...')
features_g2 = np.vstack(pdf_g2["lc_features_g"].to_numpy())
features_r2 = np.vstack(pdf_r2["lc_features_r"].to_numpy())
features_g_reduced2, index_g2 = np.unique(features_g2, axis=0, return_index=True)
features_r_reduced2, index_r2 = np.unique(features_r2, axis=0, return_index=True)

print(f'{len(features_g_reduced2)} out of {len(features_g2)} and {len(features_r_reduced2)} out of {len(features_r2)} lines remaining in g and r filters respectively after removing duplicates')

# Converting back to pandas DataFrame with feature names as columns:
dfg2 = pd.DataFrame(features_g_reduced2, columns=feature_names, index=pdf_g2.index[index_g2])
dfr2 = pd.DataFrame(features_r_reduced2, columns=feature_names, index=pdf_r2.index[index_r2])

# Retrieving corresponding IDs, candid and cdsxmatch:
for column in ['objectId', 'candid', 'cdsxmatch']:
    dfg2[column] = pdf_g2[column].values[index_g2]
    dfr2[column] = pdf_r2[column].values[index_r2]

Converting dictionaries to arrays: 100%|██████████| 854777/854777
Removing lines containing NaN values: 100%|██████████| 854777/854777


369977 and 476039 lines remaining in g and r filters respectively after cleaning out of 854777

Removing duplicates...
280609 out of 369977 and 458298 out of 476039 lines remaining in g and r filters respectively after removing duplicates


In [14]:
dfg2.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,linT,...,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,objectId,candid,cdsxmatch
83190,11.663722,11.665459,0.071121,11.69372,0.086575,0.5,0.413908,0.167845,-1.621703,0.004334,...,0.5,0.122566,0.006098,0.513783,6.213382,-0.631503,0.899349,ZTF18aabfxtw,2560235106115015006,Mira
350649,11.762719,11.754457,0.105654,11.758624,0.178379,0.333333,0.357947,0.288927,0.503221,0.009617,...,0.333333,0.187469,0.008982,0.261366,11.729162,-0.14626,0.771714,ZTF18aabfxtw,2571234896115015006,Mira
429177,11.806346,11.80628,0.037063,11.811834,0.05483,0.333333,0.362717,0.10226,0.620586,0.000835,...,0.0,0.062462,0.003139,0.193347,11.064746,-0.25163,0.806653,ZTF18achzfzd,2561416034515015045,XB
345336,11.806522,11.806475,0.048839,11.80883,0.054841,0.5,0.401822,0.109682,-2.400176,0.003366,...,0.0,0.059458,0.004137,0.083901,19.266667,-0.194037,0.940179,ZTF18achzfzd,2571272184515015071,XB
797785,11.979817,11.8374,0.419975,11.842549,0.574808,0.142857,0.323382,0.929709,6.919347,0.008603,...,0.714286,1.087953,0.035057,1.841744,114.225226,2.625493,0.524642,ZTF19aanjvsc,2581419000315015014,SB*


In [15]:
dfr2.head()

Unnamed: 0,mean,weightedMean,std,median,amplitude,beyond1Std,cusum,IPR10,kurtosis,linT,...,medianBRP10,percentAmplitude,meanVariance,andersonDarlingNorm,chi2,skew,stetsonK,objectId,candid,cdsxmatch
807666,11.473439,11.396889,0.137957,11.503887,0.15081,0.25,0.387191,0.301619,-0.92801,0.017545,...,0.0,0.211706,0.012024,0.120878,125.707449,-0.871732,0.908868,ZTF19aakrouf,2570346673515015019,Mira
444278,11.501469,11.41423,0.134917,11.566741,0.160703,0.2,0.399836,0.321406,0.227228,0.017339,...,0.2,0.274559,0.01173,0.317539,116.936928,-1.172923,0.896408,ZTF19aakrouf,2572309763515015019,Mira
328548,11.501474,11.414232,0.134921,11.566741,0.160715,0.2,0.399836,0.321429,0.22703,0.01734,...,0.2,0.274559,0.011731,0.31747,116.941764,-1.172821,0.896407,ZTF19aakrouf,2571313344615015018,Mira
843589,11.523382,11.427757,0.132072,11.580271,0.170383,0.166667,0.395678,0.323944,0.978051,0.017259,...,0.333333,0.288089,0.011461,0.495649,109.061102,-1.35913,0.881449,ZTF19aakrouf,2572377513515015021,Mira
417005,11.523383,11.427757,0.132073,11.580271,0.170387,0.166667,0.395678,0.323951,0.977985,0.017259,...,0.333333,0.288089,0.011461,0.495616,109.062581,-1.359097,0.881448,ZTF19aakrouf,2573334564615015012,Mira


### Saving the dataframes to csv files:

In [None]:
# dfg2.to_csv('../../data/clean_data/negative_class_g.csv')
# dfr2.to_csv('../../data/clean_data/negative_class_r.csv')