In [1]:
# Root data directory (relative path) and the sub-directories derived from it.
DATA_PATH = '../data/'
LIGHTCURVES_PATH = '{}lightcurves/'.format(DATA_PATH)
FEATURES_PATH = '{}features/'.format(DATA_PATH)

In [2]:
import numpy as np
import pandas as pd
import measurements, extract
import matplotlib.pyplot as plt

In [3]:
# Minimum number of observations a light curve needs in order to be kept
# (rows are later filtered with `ObsCount >= min_obs`).
min_obs = 5

Import non-transient light curves

In [4]:
# Load the pickled non-transient light curves found under LIGHTCURVES_PATH.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files from a trusted source.
filename = 'nontransient_lightcurves.pickle'
indir = LIGHTCURVES_PATH
filepath = indir + filename
df_nt_lcs = pd.read_pickle(filepath)
df_nt_lcs.shape

(1924409, 4)

Filter non-transient light curves

In [5]:
# Delete rows of blended observations: keep the first row of each (ID, MJD) pair.
df_nt_lcs = df_nt_lcs.drop_duplicates(subset=['ID', 'MJD'], keep='first')
# Count observations per object (non-null 'Mag' entries) and attach the count
# to every row as ObsCount. The merge also rebuilds the row index as 0..n-1.
df_count = df_nt_lcs.groupby('ID', as_index=False).count()
df_count = df_count.assign(ObsCount=df_count['Mag'])[['ID', 'ObsCount']]
df_nt_lcs = df_nt_lcs.merge(df_count, how='inner')
# Keep only objects with at least `min_obs` observations.
df_nt_lcs = df_nt_lcs.loc[df_nt_lcs['ObsCount'] >= min_obs]
df_nt_lcs.shape

(1798465, 5)

In [6]:
# Number of distinct object IDs left after the min_obs filter.
len(df_nt_lcs.ID.unique())

15193

In [94]:
df_nt_lcs.head()

Unnamed: 0_level_0,ID,Mag,Magerr,MJD,ObsCount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,CataID1121048016765,21.12,0.52,53732.34137,37
1,CataID1121048016765,20.85,0.45,53788.24432,37
2,CataID1121048016765,21.05,0.46,54054.45393,37
3,CataID1121048016765,20.09,0.28,54116.34092,37
4,CataID1121048016765,20.35,0.33,54185.1445,37


In [100]:
# Re-index a copy by (ID, index): name the existing row index 'index', append
# the ID column as a second level, then make ID the outer level.
df_nt_lcs2 = (
    df_nt_lcs
    .rename_axis('index')
    .set_index('ID', append=True)
    .reorder_levels(['ID', 'index'])
)

In [110]:
df_nt_lcs2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mag,Magerr,MJD,ObsCount
ID,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CataID1121048016765,0,21.12,0.52,53732.34137,37
CataID1121048016765,1,20.85,0.45,53788.24432,37
CataID1121048016765,2,21.05,0.46,54054.45393,37
CataID1121048016765,3,20.09,0.28,54116.34092,37
CataID1121048016765,4,20.35,0.33,54185.1445,37


### Experimental

Find duplicated light curves in the non-transient dataframe

In [185]:
def areEqualLightCurves(df_lcs, id1, id2):
    """Tell whether two light curves hold exactly the same observations.

    Each light curve is selected with ``df_lcs.loc[<id>]``, ordered by 'MJD',
    reduced to the 'Mag', 'Magerr' and 'MJD' columns and re-indexed from zero,
    so only the observation values themselves are compared.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Light curves, selectable by ID through ``.loc``.
    id1, id2 : hashable
        IDs of the two light curves to compare.

    Returns
    -------
    bool
        Result of ``DataFrame.equals`` on the two normalised light curves.
    """
    observation_columns = ['Mag', 'Magerr', 'MJD']

    def normalised(lc_id):
        # Time-ordered observations with a fresh 0..n-1 index.
        ordered = df_lcs.loc[lc_id].sort_values(by='MJD')
        return ordered[observation_columns].reset_index(drop=True)

    first = normalised(id1)
    second = normalised(id2)
    return first.equals(second)

def possiblyRepeatedObservations(df_lcs, obscount = 1):
    """Return the observations that occur in more than ``obscount`` rows.

    Rows are grouped by the exact ('Mag', 'Magerr', 'MJD') triple and counted;
    the count is read from the 'ObsCount' column, so ``df_lcs`` must carry
    that column.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Light curves with 'Mag', 'Magerr', 'MJD' and 'ObsCount' columns.
    obscount : int, optional
        Threshold; only triples counted strictly more often are returned.

    Returns
    -------
    pandas.DataFrame
        One row per retained triple, with the per-triple counts.
    """
    observation_key = ['Mag', 'Magerr', 'MJD']
    tallies = df_lcs.groupby(observation_key, as_index=False).count()
    is_shared = tallies['ObsCount'] > obscount
    return tallies[is_shared]

def possiblyRepeatedByGroups(df_lcs):
    """Group the light-curve IDs that share an identical observation.

    For every ('Mag', 'Magerr', 'MJD') triple returned by
    ``possiblyRepeatedObservations`` the IDs of all rows carrying exactly that
    triple are collected.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Light curves with 'Mag', 'Magerr', 'MJD' columns and an index level
        named 'ID'.

    Returns
    -------
    list of pandas.Index
        One entry per flagged observation, in the order the observations are
        returned by ``possiblyRepeatedObservations``; each entry holds the
        unique IDs containing that observation, in order of first appearance
        in ``df_lcs``.
    """
    key_columns = ['Mag', 'Magerr', 'MJD']
    df_poss_repeat_obs = possiblyRepeatedObservations(df_lcs)
    if len(df_poss_repeat_obs) == 0:
        return []

    # Build the observation -> IDs lookup in a single pass instead of running
    # one full-frame boolean scan per flagged observation.
    observations = df_lcs[key_columns].reset_index(drop=True)
    observations['ID'] = df_lcs.index.get_level_values('ID').values
    # Only triples present in more than one row can be flagged, so everything
    # else is discarded up front to keep the lookup small.
    observations = observations[
        observations.duplicated(subset=key_columns, keep=False)
    ]
    ids_by_observation = (
        observations.groupby(key_columns)['ID'].unique().to_dict()
    )

    groups = []
    for row in df_poss_repeat_obs[key_columns].itertuples(index=False):
        current_ids = ids_by_observation[(row.Mag, row.Magerr, row.MJD)]
        groups.append(pd.Index(current_ids, name='ID'))
    return groups

def repeatedLightCurves(df_lcs, possibly_by_groups_list):
    """Find light curves that are exact copies of one another.

    Every pair of IDs inside each candidate group is compared with
    ``areEqualLightCurves``; matching pairs are printed and recorded.

    Parameters
    ----------
    df_lcs : pandas.DataFrame
        Light curves, passed unchanged to ``areEqualLightCurves``.
    possibly_by_groups_list : sequence
        Candidate groups of IDs; each group must support slicing
        (``group[i+1:]``).

    Returns
    -------
    dict
        Maps an ID to the list of IDs whose light curves compared equal to it.
        An ID that is a copy of an earlier one can still appear as a key when
        a later group contains the same IDs again.
    """
    copies = {}
    len_possibly_by_groups_list = len(possibly_by_groups_list)
    # Report progress about every 10% of the groups. The step is clamped to 1
    # so that fewer than 10 groups no longer triggers a modulo-by-zero.
    progress_step = max(1, len_possibly_by_groups_list // 10)
    for i_group, group in enumerate(possibly_by_groups_list):
        if i_group % progress_step == 0:
            print(i_group, '/', len_possibly_by_groups_list)
        found_as_copy = []
        for i, id1 in enumerate(group):
            if id1 in found_as_copy:
                # Already matched to an earlier ID of this group.
                continue
            for id2 in group[i+1:]:
                # A pair recorded by a previous group changes nothing, so the
                # expensive comparison is skipped for it.
                if id1 in copies and id2 in copies[id1]:
                    continue
                if areEqualLightCurves(df_lcs, id1, id2):
                    found_as_copy.append(id2)
                    print(id1, id2)
                    copies.setdefault(id1, []).append(id2)
    return copies

In [None]:
# Takes a while...
possibly_by_groups_list = possiblyRepeatedByGroups(df_nt_lcs2)

In [149]:
copies = repeatedLightCurves(df_nt_lcs2, possibly_by_groups_list)

0 / 9367
936 / 9367
1872 / 9367
2808 / 9367
3744 / 9367
4680 / 9367
5616 / 9367
6552 / 9367
7488 / 9367
8424 / 9367
9360 / 9367


In [150]:
copies

{}

#### Find transient light curves in non-transient dataframe

Import and filter transients

In [154]:
# Load the pickled transient light curves found under LIGHTCURVES_PATH.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files from a trusted source.
filename = 'transient_lightcurves.pickle'
indir = LIGHTCURVES_PATH
filepath = indir + filename
df_t_lcs = pd.read_pickle(filepath)
df_t_lcs.shape

(451474, 4)

In [159]:
# Delete rows of blended observations: keep the first row of each
# (TransientID, MJD) pair.
df_t_lcs = df_t_lcs.drop_duplicates(subset=['TransientID', 'MJD'], keep='first')
# Count observations per transient (non-null 'Mag' entries) and attach the
# count to every row as ObsCount. The merge also rebuilds the row index as 0..n-1.
df_count = df_t_lcs.groupby('TransientID', as_index=False).count()
df_count = df_count.assign(ObsCount=df_count['Mag'])[['TransientID', 'ObsCount']]
df_t_lcs = df_t_lcs.merge(df_count, how='inner')
# Keep only transients with at least `min_obs` observations.
df_t_lcs = df_t_lcs.loc[df_t_lcs['ObsCount'] >= min_obs]

In [163]:
# Preview only: rename TransientID to ID and append it to the row index.
# df_t_lcs2 is rebuilt in the following cell with a named 'index' level and
# ID as the outer level.
df_t_lcs2 = (
    df_t_lcs
    .rename(columns={'TransientID':'ID'})
    .set_index('ID', append=True)
)
df_t_lcs2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Mag,Magerr,MJD,ObsCount
Unnamed: 0_level_1,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,TranID1409030010044114444,18.8765,0.166417,53766.089871,13
1,TranID1409030010044114444,20.0519,0.281733,53990.458866,13
2,TranID1409030010044114444,20.2199,0.295764,53996.286004,13
3,TranID1409030010044114444,21.1192,0.49539,54385.205789,13
4,TranID1409030010044114444,19.3289,0.195002,54355.282285,13


In [179]:
# Re-index a copy of the transients by (ID, index), matching df_nt_lcs2: name
# the row index 'index', rename TransientID to ID, append ID as an index level
# and make it the outer level.
df_t_lcs2 = (
    df_t_lcs
    .rename_axis('index')
    .rename(columns={'TransientID':'ID'})
    .set_index('ID', append=True)
    .reorder_levels(['ID', 'index'])
)

In [181]:
# Takes a while...
possibly_by_groups_list_t = possiblyRepeatedByGroups(df_t_lcs2)

In [186]:
copies_t = repeatedLightCurves(df_t_lcs2, possibly_by_groups_list_t)

0 / 8663
TranID1010311350054106115 TranID1209261350054109025
TranID1405261260424118499 TranID1201131260424117626
TranID1406251180794109133 TranID1203281180794110813
TranID1209220231174118379 TranID1310240231174115951
TranID1306161011264117300 TranID1607061011264118208
TranID1604051150914112284 TranID1303181150914117439
TranID1301221040864152906 TranID1504011040864145999
TranID1104281380304112024 TranID1310271380304111884
TranID1301171040504112807 TranID1601131040504110352
TranID1601200090724142448 TranID1304290090724141600
TranID1212101070614108872 TranID1211151070614108238
TranID1212101070614108872 TranID1211231070614109198
TranID1212101070614108872 TranID1011161070614109449
TranID1211151070614108238 TranID1211231070614109198
TranID1211151070614108238 TranID1011161070614109449
TranID1211231070614109198 TranID1011161070614109449
TranID1502140070764121999 TranID1206230070764120012
TranID1201250150604137052 TranID1312310150604133926
TranID1604061040774129136 TranID1303161040774127865
866

In [194]:
# Remove every transient recorded as a copy of another one. copies_t maps an
# ID to the IDs whose light curves compared equal to it, so the union of its
# values is the set of rows to drop; the keys themselves are kept.
# (The loop variable used to be called `list`, which shadowed the builtin for
# the rest of the session.)
df_t_lcs3 = df_t_lcs2.copy()
duplicate_ids = sorted(
    {dup_id for dup_ids in copies_t.values() for dup_id in dup_ids}
)
if duplicate_ids:
    df_t_lcs3 = df_t_lcs3.drop(duplicate_ids, level='ID')

In [195]:
# Takes a while...
possibly_by_groups_list_t = possiblyRepeatedByGroups(df_t_lcs3)

In [196]:
copies_t = repeatedLightCurves(df_t_lcs3, possibly_by_groups_list_t)

0 / 4318
431 / 4318
862 / 4318
1293 / 4318
1724 / 4318
2155 / 4318
2586 / 4318
3017 / 4318
3448 / 4318
3879 / 4318
4310 / 4318


#### Test merges

In [198]:
# Stack the transients (copies already dropped) on top of the non-transients
# so the duplicate search can compare light curves across both sets.
df_t_nt_lcs = pd.concat([df_t_lcs3, df_nt_lcs2])

In [201]:
# Takes a while...
possibly_by_groups_list_t_nt = possiblyRepeatedByGroups(df_t_nt_lcs)

In [202]:
copies_t_nt = repeatedLightCurves(df_t_nt_lcs, possibly_by_groups_list_t_nt)

0 / 13685
1368 / 13685
2736 / 13685
4104 / 13685
5472 / 13685
6840 / 13685
8208 / 13685
9576 / 13685
10944 / 13685
12312 / 13685
13680 / 13685
