In [1]:
# Data locations, relative to the notebook's working directory.
# Every path keeps its trailing slash so file names can be appended directly.
DATA_PATH = '../data/'
LIGHTCURVES_PATH = f'{DATA_PATH}lightcurves/'
FEATURES_PATH = f'{DATA_PATH}features/'

In [2]:
import numpy as np
import pandas as pd
import measurements, extract
import matplotlib.pyplot as plt

In [131]:
def deleteCopies(df_lcs, copies_dict):
    """Remove every light curve that is listed as a copy.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Light-curve table whose index has a level named ``'ID'``.
    copies_dict : dict
        Maps an object ID to the list of IDs whose light curve was found to be
        identical to it.  Only IDs appearing in the value lists are dropped.

    Returns
    -------
    pandas.DataFrame
        ``df_lcs`` without the rows of the copied IDs.  The input frame is not
        modified; when there is nothing to drop it is returned as is.
    """
    # Collect all copy IDs first (de-duplicated) so the frame is filtered in a
    # single pass instead of being rebuilt once per dictionary entry.
    ids_to_drop = list(dict.fromkeys(
        copy_id
        for copies_list in copies_dict.values()
        for copy_id in copies_list
    ))
    if not ids_to_drop:
        return df_lcs
    return df_lcs.drop(ids_to_drop, level='ID')

def areEqualLightCurves(df_lcs, id1, id2):
    """Tell whether two objects have exactly the same light curve.

    Each light curve is selected from ``df_lcs`` by its ID, ordered by ``MJD``,
    reduced to the ``Mag``, ``Magerr`` and ``MJD`` columns and re-indexed from
    zero, so the original observation index does not affect the comparison.

    Returns ``True`` only when both normalized tables are equal according to
    ``DataFrame.equals``.
    """
    compared_columns = ['Mag', 'Magerr', 'MJD']
    first, second = (
        df_lcs.loc[object_id]
        .sort_values(by='MJD')[compared_columns]
        .reset_index(drop=True)
        for object_id in (id1, id2)
    )
    return first.equals(second)

def possiblyRepeatedObservations(df_lcs, min_obs = 1):
    """Count how often each exact (Mag, Magerr, MJD) observation occurs.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Table with ``Mag``, ``Magerr`` and ``MJD`` columns.
    min_obs : int, default 1
        Only observations occurring strictly more than ``min_obs`` times are
        returned.

    Returns
    -------
    pandas.DataFrame
        One row per repeated observation with columns ``Mag``, ``Magerr``,
        ``MJD`` and ``ObsCount`` (the number of rows sharing those values).
    """
    # Group with the default ``as_index=True``: ``size()`` then returns a
    # Series on every pandas version, so ``reset_index(name=...)`` is valid.
    # With ``as_index=False`` newer pandas returns a DataFrame, whose
    # ``reset_index`` does not accept ``name``.
    df_counts_by_observation = (
        df_lcs.groupby(['Mag', 'Magerr', 'MJD'])
        .size()
        .reset_index(name="ObsCount")
    )
    return df_counts_by_observation[df_counts_by_observation.ObsCount > min_obs]

def possiblyRepeatedByGroups(df_lcs):
    """Group object IDs that share at least one identical observation.

    Two objects are candidates for being copies of each other when some
    (Mag, Magerr, MJD) triple appears in both of their light curves.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Table with ``Mag``, ``Magerr`` and ``MJD`` columns and an index level
        named ``'ID'``.

    Returns
    -------
    list of list
        Each inner list holds two or more IDs (in order of first appearance in
        ``df_lcs``) that share an observation.  Every distinct ID combination
        is reported once, ordered by the shared observation's
        (Mag, Magerr, MJD) values.
    """
    # One groupby pass collects the IDs behind every distinct observation,
    # instead of re-scanning the whole table once per repeated observation.
    ids_per_observation = (
        df_lcs[['Mag', 'Magerr', 'MJD']]
        .reset_index(level='ID')
        .groupby(['Mag', 'Magerr', 'MJD'])['ID']
        .unique()
    )

    groups = []
    seen = set()
    for ids in ids_per_observation:
        # Keep the IDs as they are stored in the index (no string formatting),
        # so they can be used directly as ``.loc`` labels later on.
        key = tuple(ids)
        if len(key) > 1 and key not in seen:
            seen.add(key)
            groups.append(list(key))
    return groups

def repeatedLightCurves(df_lcs, possibly_by_groups_list):
    """Find the objects whose light curves are exact copies of one another.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Light-curve table, passed unchanged to ``areEqualLightCurves``.
    possibly_by_groups_list : list of list
        Candidate groups of IDs; only IDs within the same group are compared.

    Returns
    -------
    dict
        Maps an ID to the list of later IDs in its group whose light curve is
        identical to it.

    Notes
    -----
    Progress (about every 10% of the groups) and each matching ID pair are
    printed.  ``found_as_copy`` is reset for every group, so an ID recorded as
    a copy in one group can still appear as a key because of another group.
    """
    copies = {}
    len_possibly_by_groups_list = len(possibly_by_groups_list)
    # Clamp the step to 1: with fewer than ten groups the integer division
    # gives 0, which would make the modulo below raise ZeroDivisionError.
    progress_step = max(1, len_possibly_by_groups_list // 10)
    for i_group, group in enumerate(possibly_by_groups_list):
        if i_group % progress_step == 0:
            print(i_group, '/', len_possibly_by_groups_list)
        found_as_copy = set()
        for i, id1 in enumerate(group):
            # An ID already matched to an earlier member of this group does
            # not need to be used as a reference again.
            if id1 in found_as_copy:
                continue
            for id2 in group[i + 1:]:
                if not areEqualLightCurves(df_lcs, id1, id2):
                    continue
                # Skip pairs that were already recorded by a previous group.
                if id1 in copies and id2 in copies[id1]:
                    continue
                found_as_copy.add(id2)
                print(id1, id2)
                copies.setdefault(id1, []).append(id2)
    return copies

### Non-Transient

Import non-transient light curves

In [179]:
# Load the raw non-transient light curves.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
indir = LIGHTCURVES_PATH
filename = 'nontransient_lightcurves.pickle'
filepath = indir + filename
df_nt_lcs = pd.read_pickle(filepath)
df_nt_lcs.shape

(1924409, 4)

Delete rows of blended observations

In [180]:
# When an object has several rows with the same MJD, keep only the first one.
df_nt_lcs = df_nt_lcs.drop_duplicates(subset=['ID', 'MJD'], keep='first')
df_nt_lcs.shape

(1802695, 4)

Replace the index with a MultiIndex: [ID (object), observation_id]

In [181]:
# Name the existing row index 'observation_id', then move the 'ID' column into
# the index so rows are addressed as (ID, observation_id).
df_nt_lcs = (
    df_nt_lcs
    .rename_axis('observation_id')
    .set_index('ID', append=True)
    .reorder_levels(['ID', 'observation_id'])
)
df_nt_lcs.shape

(1802695, 3)

In [182]:
# df_nt_lcs.head()

Find duplicated light-curves in non-transient dataframe

In [183]:
# Build the list of ID groups that share at least one identical
# (Mag, Magerr, MJD) observation; these are the candidate duplicated
# light curves that get compared in full afterwards.
# Takes a while...
possibly_by_groups_list_nt = possiblyRepeatedByGroups(df_nt_lcs)

In [184]:
# Compare the light curves inside every candidate group pairwise.
# The result maps an ID to the list of IDs whose light curve is identical;
# progress and every matching pair are printed while it runs.
copies_nt = repeatedLightCurves(df_nt_lcs, possibly_by_groups_list_nt)

0 / 4955
495 / 4955
990 / 4955
1485 / 4955
1980 / 4955
2475 / 4955
2970 / 4955
3465 / 4955
3960 / 4955
4455 / 4955
4950 / 4955


In [185]:
copies_nt, df_nt_lcs.shape

({}, (1802695, 3))

In [187]:
# Drop the copied light curves. The result must be bound to `df_nt_lcs`
# (not the misspelled `dt_nt_lcs`), because `df_nt_lcs` is the frame that is
# inspected and pickled in the following cells.
df_nt_lcs = deleteCopies(df_nt_lcs, copies_nt)

In [188]:
df_nt_lcs.shape

(1802695, 3)

In [189]:
df_nt_lcs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mag,Magerr,MJD
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CataID1121048016765,0,21.12,0.52,53732.34137
CataID1121048016765,1,20.85,0.45,53788.24432
CataID1121048016765,2,21.05,0.46,54054.45393
CataID1121048016765,3,20.09,0.28,54116.34092
CataID1121048016765,4,20.35,0.33,54185.1445


######  TO PICKLE

In [190]:
# Persist the cleaned non-transient light curves next to the raw ones.
outdir = LIGHTCURVES_PATH
filename = 'nontransient_lightcurves_clean.pickle'
filepath = outdir + filename
df_nt_lcs.to_pickle(filepath)

### Transient Light Curves

Import and filter transients

In [162]:
# Load the raw transient light curves.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
indir = LIGHTCURVES_PATH
filename = 'transient_lightcurves.pickle'
filepath = indir + filename
df_t_lcs = pd.read_pickle(filepath)
df_t_lcs.shape

(451474, 4)

Delete rows of blended observations

In [163]:
# Delete rows of blended observations: when a transient has several rows with
# the same MJD, keep only the first one.
df_t_lcs = df_t_lcs.drop_duplicates(subset=['TransientID', 'MJD'], keep='first')
df_t_lcs.shape

(447260, 4)

Rename `TransientID` to `ID` and replace the index with a MultiIndex: [ID (object), observation_id]

In [164]:
# Name the existing row index 'observation_id', rename 'TransientID' to 'ID',
# then move 'ID' into the index so rows are addressed as (ID, observation_id).
df_t_lcs = (
    df_t_lcs
    .rename_axis('observation_id')
    .rename(columns={'TransientID': 'ID'})
    .set_index('ID', append=True)
    .reorder_levels(['ID', 'observation_id'])
)

In [167]:
# Count the distinct transient objects before removing copies.
# NOTE: despite its name, `num_objects` holds the Index of unique IDs;
# the count itself is the `len(...)` displayed by this cell.
num_objects = df_t_lcs.index.get_level_values('ID').unique()
len(num_objects)

4984

Find copies

In [169]:
# Build the list of transient ID groups that share at least one identical
# (Mag, Magerr, MJD) observation (candidate duplicated light curves).
# Takes a while...
possibly_by_groups_list_t = possiblyRepeatedByGroups(df_t_lcs)

In [170]:
# Compare the light curves inside every candidate group pairwise.
# Each printed line is a pair of IDs with identical light curves; the result
# maps an ID to the list of IDs found to be its copies.
copies_t = repeatedLightCurves(df_t_lcs, possibly_by_groups_list_t)

0 / 211
TranID1010311350054106115 TranID1209261350054109025
TranID1405261260424118499 TranID1201131260424117626
TranID1406251180794109133 TranID1203281180794110813
TranID1209220231174118379 TranID1310240231174115951
TranID1306161011264117300 TranID1607061011264118208
TranID1604051150914112284 TranID1303181150914117439
TranID1301221040864152906 TranID1504011040864145999
TranID1104281380304112024 TranID1310271380304111884
TranID1301171040504112807 TranID1601131040504110352
TranID1601200090724142448 TranID1304290090724141600
TranID1212101070614108872 TranID1211151070614108238
TranID1212101070614108872 TranID1211231070614109198
TranID1212101070614108872 TranID1011161070614109449
TranID1502140070764121999 TranID1206230070764120012
TranID1201250150604137052 TranID1312310150604133926
21 / 211
TranID1211151070614108238 TranID1211231070614109198
TranID1211151070614108238 TranID1011161070614109449
TranID1604061040774129136 TranID1303161040774127865
TranID1512031430414122287 TranID120119143041412

In [171]:
len(copies_t.items())

107

Delete copies

In [172]:
# Drop the rows of every ID that appears in one of the copy lists of `copies_t`.
df_t_lcs = deleteCopies(df_t_lcs, copies_t)

Show results

In [173]:
# Recount the distinct transient objects after removing the copies.
# NOTE: `num_objects` holds the Index of unique IDs; `len(...)` is the count.
num_objects = df_t_lcs.index.get_level_values('ID').unique()
len(num_objects)

4869

In [174]:
df_t_lcs.shape

(440469, 3)

In [175]:
df_t_lcs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mag,Magerr,MJD
ID,observation_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TranID1409030010044114444,0,18.8765,0.166417,53766.089871
TranID1409030010044114444,1,20.0519,0.281733,53990.458866
TranID1409030010044114444,2,20.2199,0.295764,53996.286004
TranID1409030010044114444,3,21.1192,0.49539,54385.205789
TranID1409030010044114444,4,19.3289,0.195002,54355.282285


###### TO PICKLE

In [191]:
# Persist the cleaned transient light curves next to the raw ones.
outdir = LIGHTCURVES_PATH
filename = 'transient_lightcurves_clean.pickle'
filepath = outdir + filename
df_t_lcs.to_pickle(filepath)

Test that the repeated light curves have been removed

In [176]:
# Re-run the candidate search on the cleaned table. Groups may remain, because
# sharing a single observation does not imply the full light curves are equal.
possibly_by_groups_list_t_updated = possiblyRepeatedByGroups(df_t_lcs)

In [177]:
len(possibly_by_groups_list_t_updated)

106

In [178]:
# Verify that no identical light curves remain: apart from the progress lines,
# nothing should be printed and the returned dict should be empty.
# NOTE: this rebinds `copies_t`, discarding the mapping from the first pass.
copies_t = repeatedLightCurves(df_t_lcs, possibly_by_groups_list_t_updated)

0 / 106
10 / 106
20 / 106
30 / 106
40 / 106
50 / 106
60 / 106
70 / 106
80 / 106
90 / 106
100 / 106
