In [11]:
import pandas as pd
import numpy as np
from actsnclass import DataBase
from classifier_sigmoid import get_sigmoid_features_dev

In [12]:
# this was taken from https://github.com/COINtoolbox/ActSNClass/blob/master/actsnclass/database.py
def build_samples(features: pd.DataFrame, initial_training: int,
                 frac_Ia=0.5, screen=False):
    """Build initial samples for Active Learning loop.

    The training sample is drawn at random (without replacement) from
    `features`, with a controlled number of Ia objects. Every object
    not selected for training goes to the test sample and is marked as
    queryable.

    Parameters
    ----------
    features: pd.DataFrame
        Complete feature matrix. The first two columns must be the
        metadata columns 'id' and 'type'; every column from the third
        onwards is used as a feature, e.g.: ['id', 'type',
        'a_g', 'b_g', 'c_g', 'snratio_g', 'chisq_g', 'nrise_g',
        'a_r', 'b_r', 'c_r', 'snratio_r', 'chisq_r', 'nrise_r']
    initial_training: int
        Number of objects in the training sample.
    frac_Ia: float (optional)
        Fraction of Ia in training. Default is 0.5. The number of Ia
        is floor(initial_training * frac_Ia), but never less than 1.
    screen: bool (optional)
        If True, print intermediary information to screen.
        Default is False.

    Returns
    -------
    actsnclass.DataBase
        DataBase for active learning loop

    Raises
    ------
    ValueError
        If `frac_Ia` is outside [0, 1], or (from np.random.choice) if
        more objects of a class are requested than are available.
    """
    if not 0 <= frac_Ia <= 1:
        raise ValueError('frac_Ia must be in [0, 1], got ' + str(frac_Ia))

    data = DataBase()

    obj_types = features['type'].values
    all_indx = np.arange(features.shape[0])

    # positional indexes of each class
    Ia_flag = obj_types == 'Ia'
    Ia_indx = all_indx[Ia_flag]
    nonIa_indx = all_indx[~Ia_flag]

    # number of objects per class in training; the small epsilon guards
    # against float round-off (e.g. 100 * 0.29 == 28.999999999999996)
    n_Ia = max(1, int(initial_training * frac_Ia + 1e-9))
    n_nonIa = initial_training - n_Ia

    indx_Ia_choice = np.random.choice(Ia_indx, size=n_Ia, replace=False)
    indx_nonIa_choice = np.random.choice(nonIa_indx, size=n_nonIa,
                                         replace=False)
    train_indexes = np.concatenate([indx_Ia_choice, indx_nonIa_choice])

    # labels in the order the objects were drawn (Ia first)
    temp_labels = obj_types[train_indexes]

    if screen:
        print('\n temp_labels = ', temp_labels, '\n')

    # set training; the boolean mask keeps the original row order
    train_flag = np.isin(all_indx, train_indexes)

    data.train_labels = Ia_flag[train_flag].astype(int)
    data.train_features = features[train_flag].values[:, 2:]
    data.train_metadata = features[['id', 'type']][train_flag]

    # set test set as all objs apart from those in training
    data.test_labels = Ia_flag[~train_flag].astype(int)
    data.test_features = features[~train_flag].values[:, 2:]
    data.test_metadata = features[['id', 'type']][~train_flag]

    # set metadata names
    data.metadata_names = ['id', 'type']

    # set everyone to queryable
    data.queryable_ids = data.test_metadata['id'].values

    if screen:
        print('Training set size: ', data.train_metadata.shape[0])
        print('Test set size: ', data.test_metadata.shape[0])
        print('  from which queryable: ', len(data.queryable_ids))

    return data

In [39]:
# Load the whitespace-separated feature table, ignoring '#' comments.
# sep=r'\s+' is the supported equivalent of the deprecated
# delim_whitespace=True option.
matrix_clean = pd.read_csv('data/features0.dat', comment='#', sep=r'\s+')

In [40]:
# Remove the columns that are not needed downstream and expose the
# 'index' column under the name 'id'.
matrix_clean = (
    matrix_clean
    .drop(columns=['id', 'redshift', 'code', 'sample'])
    .rename(columns={'index': 'id'})
)

In [41]:
# (n_rows, n_columns) of the feature matrix after dropping/renaming columns.
matrix_clean.shape

(589, 14)

In [43]:
# Number of distinct 'id' values; compare with the row count to check
# that ids are unique.
np.unique(matrix_clean['id'].values).shape

(589,)

In [27]:
# Build the initial DataBase with 10 training objects; screen=True prints
# the drawn labels and the resulting sample sizes.
data = build_samples(matrix_clean, initial_training=10, screen=True)


 temp_labels =  ['Ia' 'Ia' 'Ia' 'Ia' 'Ia' 'II' 'II' 'II' 'II' 'II'] 

Training set size:  10
Test set size:  579
  from which queryable:  579


In [28]:
# (n_training_objects, n_features) of the training feature array.
data.train_features.shape

(10, 12)

In [29]:
# Inspect the feature names stored in the DataBase; build_samples does
# not assign this attribute.
data.features_names

[]

In [30]:
 features_names = ['a_g', 'b_g', 'c_g', 'snratio_g', 'chisq_g', 'nrise_g', 
                          'a_r', 'b_r', 'c_r', 'snratio_r', 'chisq_r', 'nrise_r']
train = pd.DataFrame(data.train_features, columns=features_names)

In [11]:
# Load the gzip-compressed CSV 'all_nonIa.csv.gz' (pandas infers the
# compression from the file extension).
data_nonIa = pd.read_csv('data/all_nonIa.csv.gz')

In [25]:
# Display the table (pandas shows only the first and last rows).
data_nonIa

Unnamed: 0,objectId,cjd,cfid,cmagpsf,csigmapsf,TNS
0,ZTF19aahkvpi,2.459317e+06,1,18.757772,0.094382,(TNS) AGN
1,ZTF19aahkvpi,2.459317e+06,2,18.033100,0.082408,(TNS) AGN
2,ZTF19aahkvpi,2.459314e+06,1,18.563200,0.105809,(TNS) AGN
3,ZTF19aahkvpi,2.459314e+06,1,18.563190,0.105809,(TNS) AGN
4,ZTF19aahkvpi,2.459308e+06,2,19.194294,0.158532,(TNS) AGN
...,...,...,...,...,...,...
101947,ZTF19aarhrcv,2.458856e+06,2,15.964293,0.026971,(TNS) Varstar
101948,ZTF19aarhrcv,2.458856e+06,2,15.964300,0.026971,(TNS) Varstar
101949,ZTF19aarhrcv,2.458855e+06,1,15.700174,0.024344,(TNS) Varstar
101950,ZTF19aarhrcv,2.458855e+06,1,15.700200,0.024344,(TNS) Varstar


In [17]:
# Number of distinct objectId values in the table.
np.unique(data_nonIa['objectId'].values).shape

(1195,)

In [19]:
# Keep one row per objectId (the first occurrence) so that each object
# is counted once below.
data2 = data_nonIa.drop_duplicates(subset=['objectId'], keep='first')

In [22]:
# Distinct TNS labels (sorted) and how many de-duplicated rows carry each.
types, freq = np.unique(data2['TNS'].values, return_counts=True)

In [24]:
# Print one line per TNS label with its number of objects.
for label, count in zip(types, freq):
    print(label, ' ---  ', count)

(TNS) AGN  ---   30
(TNS) CV  ---   92
(TNS) FRB  ---   2
(TNS) Galaxy  ---   2
(TNS) LBV  ---   4
(TNS) LRN  ---   1
(TNS) M dwarf  ---   6
(TNS) Nova  ---   8
(TNS) Other  ---   19
(TNS) QSO  ---   2
(TNS) SLSN-I  ---   38
(TNS) SLSN-II  ---   22
(TNS) SN  ---   9
(TNS) SN I  ---   16
(TNS) SN II  ---   448
(TNS) SN II-pec  ---   3
(TNS) SN IIL  ---   1
(TNS) SN IIP  ---   54
(TNS) SN IIb  ---   26
(TNS) SN IIn  ---   104
(TNS) SN Ia-91T-like  ---   53
(TNS) SN Ia-91bg-like  ---   12
(TNS) SN Ia-CSM  ---   2
(TNS) SN Ia-pec  ---   21
(TNS) SN Iax[02cx-like]  ---   9
(TNS) SN Ib  ---   41
(TNS) SN Ib-pec  ---   1
(TNS) SN Ib/c  ---   10
(TNS) SN Ibn  ---   8
(TNS) SN Ic  ---   67
(TNS) SN Ic-BL  ---   22
(TNS) SN Icn  ---   2
(TNS) TDE  ---   15
(TNS) Varstar  ---   45


In [2]:
# Path of the feature matrix CSV read in the next cell.
fname = 'data/features_matrix.csv'

In [3]:
# Read the feature matrix. NOTE(review): this rebinds `data`, which an
# earlier cell used for the DataBase returned by build_samples.
data = pd.read_csv(fname)

In [4]:
# (n_rows, n_columns) of the loaded feature matrix.
data.shape

(860, 14)

In [6]:
# Number of rows whose 'type' is 'Ia'.
int((data['type'] == 'Ia').sum())

262

In [7]:
# Hold out 33% of the rows (without replacement) as the validation set.
# A fixed random_state makes the split, and therefore the files written
# from it, reproducible across kernel restarts.
validation = data.sample(frac=0.33, replace=False, random_state=42)

In [8]:
# (n_rows, n_columns) of the validation sample.
validation.shape

(284, 14)

In [10]:
# Number of validation rows whose 'type' is 'Ia'.
int((validation['type'] == 'Ia').sum())

93

In [9]:
# Column labels of the validation sample.
validation.keys()

Index(['id', 'type', 'a_g', 'b_g', 'c_g', 'snratio_g', 'chisq_g', 'nrise_g',
       'a_r', 'b_r', 'c_r', 'snratio_r', 'chisq_r', 'nrise_r'],
      dtype='object')

In [15]:
# Write the validation sample to disk without the DataFrame index.
validation.to_csv('data/validation.csv', index=False)

In [10]:
# Boolean numpy array, one entry per row of `data`: True where the row's
# 'id' is part of the validation sample. Series.isin is vectorised, unlike
# a per-element `in` test against an array.
val_flag = data['id'].isin(validation['id']).values

In [11]:
# The pool is every row that was not selected for validation.
pool = data[~val_flag]

In [12]:
# (n_rows, n_columns) of the pool sample.
pool.shape

(576, 14)

In [13]:
# Fraction of pool rows whose 'type' is 'Ia'.
int((pool['type'] == 'Ia').sum()) / len(pool)

0.3125

In [16]:
# Write the pool sample to disk without the DataFrame index.
pool.to_csv('data/pool.csv', index=False)

In [6]:
import pandas as pd
import numpy as np
import os

In [2]:
# Path of the feature table to process, and the table itself.
fname = 'data/features_42876.csv'
data = pd.read_csv(fname)

In [5]:
# (n_rows, n_columns) of the loaded feature table.
data.shape

(42876, 14)

In [4]:
# Number of distinct 'id' values; compare with the row count to check
# that ids are unique.
np.unique(data['id'].values).shape

(42876,)

In [15]:
# Entries of the raw-data folder, minus the notebook artefacts stored
# alongside the data. Filtering (rather than list.remove) does not raise
# ValueError when one of these artefacts is absent.
not_data = {'generate_al_dataset.ipynb', '.ipynb_checkpoints'}
flist = [entry for entry in os.listdir('../../data/AL_data/')
         if entry not in not_data]

In [24]:
# Find which raw files contain at least one of the ids in `data`.
# `all_ids` holds the ids not yet located; it shrinks as files are matched.
all_ids = data['id'].values
used_files = []

for file_name in flist:

    # every id is accounted for: the remaining files cannot match anything
    if all_ids.size == 0:
        break

    d1 = pd.read_parquet('../../data/AL_data/' + file_name)
    ids = d1['candid'].values

    # vectorised membership test of the still-missing ids against this file
    flag = np.isin(all_ids, ids)
    if flag.any():
        used_files.append(file_name)
        all_ids = all_ids[~flag]

In [30]:
from shutil import copyfile

In [31]:
# Copy every raw file that contributed at least one id into the results
# folder. The destination is created first so the copy does not fail with
# FileNotFoundError when the folder does not exist yet.
os.makedirs('results_42876/raw_data/', exist_ok=True)

for n in used_files:
    copyfile('../../data/AL_data/' + n, 'results_42876/raw_data/' + n)