Visualize output from TabulatorRiseDeclineStat

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Read the transient table from CSV into a DataFrame (machine-specific path).
df = pd.read_csv(filepath_or_buffer='/home/jnordin/tmp/TransientTable.csv')

In [None]:
# Display the column labels of the freshly loaded table.
df.columns

In [None]:
# Remove three columns from the table before any further processing.
df = df.drop(labels=['Unnamed: 0', 'channels', 'sim_model_index'], axis='columns')
# NOTE: sntype and sim_type_index differ; the reason is not yet understood.
# NOTE: t_predetect is all null in this file, so it is skipped for now.

#### 1. Extract redshift information

In [None]:
# Split df into two disjoint parts according to z_source:
#   df_gal   - z_source is one of the host-galaxy sources listed below, so
#              redshift information can be extracted
#   df_nogal - all remaining rows
# Explicit copies are taken because columns are assigned on both parts in the
# following cells; assigning into a plain boolean-mask slice of df triggers
# SettingWithCopyWarning.
iGal = df['z_source'].isin(['HOSTGAL2_ZQUANT', 'HOSTGAL_ZQUANT', 'HOSTGAL_ZSPEC'])
df_gal = df[iGal].copy()
df_nogal = df[~iGal].copy()

In [None]:
# Rows without host-galaxy information get empty redshift columns.
# NaN is used instead of None so that 'z' and 'z_err' are float columns and
# stay numeric when concatenated with the float columns of df_gal.
# assign() returns a new frame, so nothing is written into a slice of df.
df_nogal = (
    df_nogal
    .assign(z=np.nan, z_err=np.nan)
    .drop(columns=['z_source', 'z_samples', 'z_weights'])
)

In [None]:
# Collapse fully identical rows, keeping the first occurrence of each.
df_nogal = df_nogal.drop_duplicates(keep='first')

In [None]:
# Redshift: the first decimal number in z_samples that is preceded by a comma
# plus whitespace and followed by a comma.
# The dot is escaped so it matches only a literal decimal point (an unescaped
# '.' matches any character). expand=False makes extract() return a Series,
# and assign() avoids writing into a slice of df.
df_gal = df_gal.assign(
    z=df_gal['z_samples'].str.extract(r',\s+(\d+\.\d+),', expand=False).astype(float)
)

In [None]:
# Store the first decimal number after the opening parenthesis of z_samples in
# z_err. The dot is escaped so it matches only a literal decimal point;
# expand=False makes extract() return a Series, and assign() avoids writing
# into a slice of df.
df_gal = df_gal.assign(
    z_err=df_gal['z_samples'].str.extract(r'\((\d+\.\d+),', expand=False).astype(float)
)

In [None]:
# Replace z_err by the difference between z and the value currently in z_err.
# Not idempotent: run this cell only once per extraction.
df_gal['z_err'] = df_gal['z'].sub(df_gal['z_err'])

In [None]:
# The raw redshift columns are no longer needed once z and z_err exist.
df_gal = df_gal.drop(labels=['z_source', 'z_samples', 'z_weights'], axis='columns')

In [None]:
# Collapse fully identical rows, keeping the first occurrence of each.
df_gal = df_gal.drop_duplicates(keep='first')

In [None]:
# Stack the two parts back into one table (row-wise; original index labels are kept).
df_all = pd.concat(objs=[df_gal, df_nogal], axis=0)

### 2. Check for duplicates
Meaning rows of the same transient with the same ndet but different other values. 

Remaining duplicates seem to be "true", meaning multiple alerts from the same transient that produce the same features. This happens if the latest detection has a weak significance and does not contribute.



In [None]:
# Don't run this: the result is not used and it takes a lot of time.
# df_dup  = pd.concat(g for _, g in df_all.groupby(["ndet","stock","mag_det", "mag_last"]) if len(g) > 1)

### 3. Correct magnitudes for false zeropoints (skip once corrected).

In [None]:
# These corrections should all be removed once we have a corrected data run.
# The in-place shifts are not idempotent: run this cell exactly once.
df_all['mag_det'] += 2.5
df_all['mag_last'] += 2.5
df_all['t_predetect'] = df_all['jd_det'] - df_all['t_predetect']
# Something in the above probably does not work and yields nonsense values,
# so blank out every |t_predetect| > 999. A single .loc assignment is used
# because chained indexing (df[col][mask] = ...) may write to a temporary copy
# and leave df_all unchanged.
df_all.loc[np.abs(df_all['t_predetect']) > 999, 't_predetect'] = np.nan

### 4. Inspect columns 

In [None]:
# Display the combined table for a visual check.
df_all

In [None]:
# Histogram of the sntype column.
df_all['sntype'].plot(kind='hist')

In [None]:
# Histogram of the number of detections, using 50 bins.
df_all['ndet'].plot(kind='hist', bins=50)

In [None]:
# Temporary pause?
#df_all.to_csv('/home/jnordin/tmp/elasticcRiseDecline_v5.csv')
#df_all = pd.read_csv('/home/jnordin/tmp/elasticcRiseDecline_v5.csv')

One thing we do want to find out is how many detections we typically have for each source. We ask because sources with only a few detections can be kept out of parsnip.

In [None]:
# ndet distribution over the full table, colored by sim_type_index.
plt.figure(num=1, figsize=(12, 12))
sns.histplot(
    data=df_all,
    x='ndet',
    hue='sim_type_index',
    hue_norm=(10, 95),
    palette='viridis',
)

In [None]:
# For every sim_type_index value, print min / mean / median / max of ndet.
# Types whose mean ndet is below 5 additionally get their own histogram.
for sim_type in set(df_all['sim_type_index']):
    type_ndet = df_all.loc[df_all['sim_type_index'] == sim_type, 'ndet']
    print(sim_type, np.min(type_ndet), np.mean(type_ndet), np.median(type_ndet), np.max(type_ndet))
    if not np.mean(type_ndet) < 5:
        continue
    plt.figure()
    plt.title(sim_type)
    plt.hist(type_ndet, bins=20)

### Quick study on some subset

In [None]:
# Number of detections that selects the subset df_det studied in this section.
ndet = 10

In [None]:
# Keep only the rows whose ndet equals the chosen value.
df_det = df_all.loc[df_all['ndet'] == ndet]

In [None]:
# (rows, columns) of the selected subset.
df_det.shape

In [None]:
# Drop columns that are of no immediate use for the subset study.
df_det = df_det.drop(labels=['ndet', 'success', 'sim_peakmjd', 'stock'], axis='columns')

In [None]:
# The sample sizes per simulated type are uneven.
df_det['sim_type_index'].hist(bins=10)

In [None]:
# Types from 
# https://github.com/LSSTDESC/elasticc/blob/main/alert_schema/elasticc_origmap.txt

In [None]:
# Show the t_predetect column of the subset.
df_det.loc[:, 't_predetect']

In [None]:
# t_predetect distribution of the subset, colored by sim_type_index.
plt.figure(num=1, figsize=(12, 12))
sns.histplot(
    data=df_det,
    x='t_predetect',
    hue='sim_type_index',
    hue_norm=(10, 95),
    palette='viridis',
)

In [None]:
# Cumulative mag_det distribution of the subset, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(2,figsize=(12,10))
sns.histplot(df_det, x='mag_det', hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=True)

In [None]:
# t_lc distribution of the subset, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(2,figsize=(12,10))
sns.histplot(df_det, x='t_lc', hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# t_rise distribution of the subset, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(4,figsize=(12,10))
sns.histplot(df_det, x='t_rise', hue='sim_type_index', hue_norm=(10,95), palette='magma', cumulative=False)

In [None]:
# t_fall distribution of the subset, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(4,figsize=(12,5))
sns.histplot(df_det, x='t_fall', hue='sim_type_index', hue_norm=(10,95), palette='magma', cumulative=False)

In [None]:
# List the columns of the subset (df_det; d_det is never defined in this notebook).
df_det.columns

In [None]:
# host_sep distribution of the subset, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(6,figsize=(12,10))
sns.histplot(df_det, x='host_sep', hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# Band suffix appended to 'rise_slope_lsst', 'fall_slope_lsst', ... in the plots below.
band = 'u'

In [None]:
# Rise-slope distribution in the selected band, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(10,figsize=(12,5))
sns.histplot(df_det, x='rise_slope_lsst'+band, hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# rise_slopesig distribution in the selected band, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(11,figsize=(12,5))
sns.histplot(df_det, x='rise_slopesig_lsst'+band, hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# Fall-slope distribution in the selected band, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(13,figsize=(12,5))
sns.histplot(df_det, x='fall_slope_lsst'+band, hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# fall_slopesig distribution in the selected band, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(14,figsize=(12,5))
sns.histplot(df_det, x='fall_slopesig_lsst'+band, hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# Column prefix; the plots below append '_det', '_peak' and '_last' to it.
col = 'lsstu-lsstg'

In [None]:
# Distribution of the '<col>_det' column, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(16,figsize=(12,5))
sns.histplot(df_det, x=col+'_det', hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# Distribution of the '<col>_peak' column, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(16,figsize=(12,5))
sns.histplot(df_det, x=col+'_peak', hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

In [None]:
# Distribution of the '<col>_last' column, colored by sim_type_index.
# Plots df_det; the name d_det is never defined in this notebook.
plt.figure(17,figsize=(12,5))
sns.histplot(df_det, x=col+'_last', hue='sim_type_index', hue_norm=(10,95), palette='viridis', cumulative=False)

### Create a "nice" dataset for training

In [None]:
# Remove three columns that are not written to the training set.
df_all = df_all.drop(labels=['success', 'sim_peakmjd', 'stock'], axis='columns')

In [None]:
# Columns remaining after the drop above.
df_all.columns

In [None]:
# Integer id per LSST band name; each band gets a distinct power of two.
bandval = dict(
    lsstu=1,
    lsstg=2,
    lsstr=4,
    lssti=8,
    lsstz=16,
    lssty=32,
)

In [None]:
# Positional index of the df_all column inspected in the next two cells.
c = 9

In [None]:
# Distinct values found in the c-th column of df_all.
set(df_all.iloc[:, c])

In [None]:
# Histogram of the c-th column, with the column name as the x-axis label.
plt.hist(x=df_all.iloc[:, c])
plt.xlabel(xlabel=df_all.columns[c])

In [None]:
# Translate the band_det name into its integer id (None for names not in bandval).
df_all['band_det_id'] = df_all['band_det'].apply(lambda band_name: bandval.get(band_name))

In [None]:
# Translate the band_last name into its integer id (None for names not in bandval).
df_all['band_last_id'] = df_all['band_last'].apply(lambda band_name: bandval.get(band_name))

In [None]:
# Drop the band-name columns now that the numeric *_id columns replace them.
df_all = df_all.drop(labels=['band_det', 'band_last'], axis='columns')

In [None]:
# Drop the sntype column.
df_all = df_all.drop(labels=['sntype'], axis='columns')

In [None]:
# Columns remaining after the band and sntype columns were dropped.
df_all.columns

In [None]:
# Map each sim_type_index to a coarser three-digit class code.
# The trailing comment on each entry names the simulated model.
# Every key must be unique: a repeated key silently overwrites the earlier
# entry and leaves the intended index unmapped.
sim_to_class = {
    10: 111,  # SNIa-SALT2
    11: 115,  # SNIa-91bg
    12: 114,  # SNIax

    20: 112,  # SNIb-Templates
    21: 112,  # SNIb+HostXT_V19
    25: 112,  # SNIc-Templates
    26: 112,  # SNIc+HostXT_V19
    27: 112,  # SNIcBL+HostXT_V19

    30: 113,  # SNII-NMF
    31: 113,  # SNII-Templates
    32: 113,  # SNII+HostXT_V19
    35: 113,  # SNIIn-MOSFIT
    36: 113,  # SNIIn+HostXT_V19
    37: 113,  # SNIIb+HostXT_V19

    40: 131,  # SLSN-I+host
    41: 131,  # SLSN-I_no_host (was a second key 40, leaving 41 unmapped)
    42: 132,  # TDE
    45: 133,  # ILOT
    46: 134,  # CART

    50: 121,  # KN_K17
    51: 121,  # KN_B19
    59: 135,  # PISN

    60: 221,  # AGN

    80: 212,  # RRL
    82: 122,  # Mdwarf-flare
    83: 214,  # EB
    84: 123,  # dwarf-nova
    87: 124,  # uLens-Single_PyLIMA
    88: 124,  # uLens-Single-GenLens
    89: 124,  # uLens-Binary
    90: 211,  # Cepheid
    91: 213,  # d-Sct
}

In [None]:
def shorten(inclass):
    """Strip the last digit from a class code, e.g. 111 -> 11 and 11 -> 1.

    Parameters
    ----------
    inclass : int, integral float or str
        Class code. Integral floats such as 111.0 are accepted because a
        column holding missing values is stored as float.

    Returns
    -------
    int
        The code without its last character.

    Raises
    ------
    ValueError
        If ``inclass`` is None or NaN (with a message naming the problem), or
        if nothing numeric remains after removing the last character.
    """
    is_nan = isinstance(inclass, float) and inclass != inclass
    if inclass is None or is_nan:
        raise ValueError(f"cannot shorten a missing class code: {inclass!r}")
    if isinstance(inclass, float) and inclass.is_integer():
        # 111.0 would otherwise be rendered as '111.0' and truncated to '111.'
        inclass = int(inclass)
    return int(str(inclass)[:-1])

In [None]:
# Look up the three-digit class code for every sim_type_index.
df_all['class_full'] = df_all['sim_type_index'].apply(lambda sim_index: sim_to_class.get(sim_index))

In [None]:
# Two-digit class: class_full with its last digit removed.
df_all['class_intermediate'] = df_all['class_full'].apply(lambda code: shorten(code))

In [None]:
# One-digit class: class_intermediate with its last digit removed.
df_all['class_short'] = df_all['class_intermediate'].apply(lambda code: shorten(code))

In [None]:
# Finally, a mixed grouping of the class codes, chosen by what we think might
# work for parsnip. Each group of codes shares one label.
class_to_parsnip = {
    # 1: Extragalactic transients with decent lc length
    **dict.fromkeys((110, 111, 112, 113, 114, 115, 131, 132, 133, 134, 135), 1),
    # 2: Extragalactic transients too fast to see with lsst cadence
    121: 2,
    # 3: Galactic transients
    **dict.fromkeys((122, 123, 124), 3),
    # 4: Galactic repeaters
    **dict.fromkeys((211, 212, 213, 214, 215), 4),
    # 5: Extragalactic repeaters
    221: 5,
}

In [None]:
# Assign the parsnip grouping label for every class_full code.
df_all['class_parsnip'] = df_all['class_full'].apply(lambda code: class_to_parsnip.get(code))

In [None]:
# Number of rows per parsnip grouping label.
plt.hist(x=df_all['class_parsnip'])

In [None]:
# A variant of the grouping that assumes a series of classifiers.
# The idea is to first compare {1,2,3} with 4, then
# {1,2} against 3,
# and finally 1 against 2.
class_to_stepwise = {
    # 1: Extragalactic transients with decent lc length
    **dict.fromkeys((110, 111, 112, 113, 114, 115, 131, 132, 133, 134, 135), 1),
    # 2: Microlenses, where the fit fails
    124: 2,
    # 3: Extragalactic transients too fast to see with lsst cadence (121), plus
    #    122 and 123; 123 are the dwarf novae, which could go both ways, or go to 4
    **dict.fromkeys((121, 122, 123), 3),
    # 4: Galactic repeaters (211-215) and extragalactic repeaters (221)
    **dict.fromkeys((211, 212, 213, 214, 215, 221), 4),
}

In [None]:
# Assign the stepwise grouping label for every class_full code.
df_all['class_aggregate'] = df_all['class_full'].apply(lambda code: class_to_stepwise.get(code))

In [None]:
# Number of rows per stepwise grouping label.
plt.hist(x=df_all['class_aggregate'])

In [None]:
# Should do this already in prep notebook.
# Cast the flag columns to bool in a single astype() call. The comprehension
# keeps its loop variable local, so the notebook-level `c` (the column index
# used by the inspection cells above) is not overwritten with a column name.
df_all = df_all.astype({
    flag_column: bool
    for flag_column in ['bool_rise', 'bool_fall', 'bool_peaked', 'bool_pure',
                        'bool_fastrise', 'bool_fastfall', 'bool_hasgaps']
})

In [None]:
# The original classification column is no longer needed once the class_* columns exist.
df_all = df_all.drop(labels=['sim_type_index'], axis='columns')

In [None]:
# Write the training set to CSV (machine-specific path; the index is written too).
df_all.to_csv(path_or_buf='/home/jnordin/tmp/elasticc_feature_trainingset.csv')