## Zooniverse Analysis

In [None]:
import shutil
import json
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import numpy as np
import seaborn as sns
from IPython.display import Image, display, HTML


In [None]:
sns.set_context('poster')

In [None]:
dfzoo = pd.read_csv('/epyc/users/ecbellm/ZTF_Boyajian/ztf-dippers-classifications.csv')

In [None]:
len(dfzoo)

In [None]:
dfzoo.head()

In [None]:
wnew = dfzoo['created_at'].str.startswith('2021')

In [None]:
np.sum(wnew)

In [None]:
# Keep only the rows selected by `wnew` (created_at starting with '2021').
# Take an explicit copy: new columns are assigned onto `dfzoo` further down,
# and assigning onto a filtered slice triggers pandas' SettingWithCopyWarning.
dfzoo = dfzoo[wnew].copy()

This is one classification per row, but it includes both v1 and v2 scanning.

In [None]:
dfzoo.iloc[0]['annotations']

We only have follow-up questions for the yes and maybe classifications, and some of the early yes/maybe classifications don't have any follow-up answers at all.

In [None]:
# Number of classifications submitted by each user, in ascending order.
user_groups = dfzoo[['classification_id', 'user_name']].groupby('user_name')
user_groups.agg(len).sort_values(by='classification_id')

In [None]:
def get_flat_outside_window(df):
    """Return the ``value`` of the first annotation for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``annotations`` column of JSON strings, each decoding to
        a list of dicts that carry a ``'value'`` key.

    Returns
    -------
    pandas.Series
        ``annotations[0]['value']`` per row, indexed like ``df``; ``None``
        where the decoded annotation list is empty.

    Notes
    -----
    Presumably index 0 is the "flat outside the window?" question, going by
    the function name -- confirm against the workflow definition.
    """
    # Read from the `df` argument; previously the global `dfzoo` was used and
    # the argument was silently ignored.
    annotations = df['annotations'].apply(json.loads)
    return annotations.apply(lambda x: x[0]['value'] if len(x) > 0 else None)

def get_dip_both_bands(df):
    """Return the ``value`` of the second annotation for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``annotations`` column of JSON strings, each decoding to
        a list of dicts that carry a ``'value'`` key.

    Returns
    -------
    pandas.Series
        ``annotations[1]['value']`` per row, indexed like ``df``; ``None``
        where the decoded list has fewer than two entries.

    Notes
    -----
    Presumably index 1 is the "dip in both bands?" follow-up question, going
    by the function name -- confirm against the workflow definition.
    """
    # Read from the `df` argument; previously the global `dfzoo` was used and
    # the argument was silently ignored.
    annotations = df['annotations'].apply(json.loads)
    return annotations.apply(lambda x: x[1]['value'] if len(x) > 1 else None)

def get_asymmetric(df):
    """Return the ``value`` of the third annotation for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``annotations`` column of JSON strings, each decoding to
        a list of dicts that carry a ``'value'`` key.

    Returns
    -------
    pandas.Series
        ``annotations[2]['value']`` per row, indexed like ``df``; ``None``
        where the decoded list has fewer than three entries.

    Notes
    -----
    Presumably index 2 is the "is the dip asymmetric?" follow-up question,
    going by the function name -- confirm against the workflow definition.
    """
    # Read from the `df` argument; previously the global `dfzoo` was used and
    # the argument was silently ignored.
    annotations = df['annotations'].apply(json.loads)
    # The guard must cover index 2: the previous `len(x) > 1` check let
    # two-element lists through and raised IndexError on `x[2]`.
    return annotations.apply(lambda x: x[2]['value'] if len(x) > 2 else None)

def get_multiple_dips(df):
    """Return the ``value`` of the fourth annotation for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``annotations`` column of JSON strings, each decoding to
        a list of dicts that carry a ``'value'`` key.

    Returns
    -------
    pandas.Series
        ``annotations[3]['value']`` per row, indexed like ``df``; ``None``
        where the decoded list has fewer than four entries.

    Notes
    -----
    Presumably index 3 is the "multiple dips?" follow-up question, going by
    the function name -- confirm against the workflow definition.
    """
    # Read from the `df` argument; previously the global `dfzoo` was used and
    # the argument was silently ignored.
    annotations = df['annotations'].apply(json.loads)
    # The guard must cover index 3: the previous `len(x) > 1` check let
    # two- and three-element lists through and raised IndexError on `x[3]`.
    return annotations.apply(lambda x: x[3]['value'] if len(x) > 3 else None)

def get_ps1_id(df):
    """Return the ``ps1_id`` stored in each row's ``subject_data``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``subject_data`` column of JSON strings, each decoding to
        a dict whose first value is itself a dict with a ``'ps1_id'`` key.

    Returns
    -------
    pandas.Series
        The ``ps1_id`` of the first entry in each row's decoded
        ``subject_data``, indexed like ``df``.
    """
    # Read from the `df` argument; previously the global `dfzoo` was used and
    # the argument was silently ignored.
    subjects = df['subject_data'].apply(json.loads)
    # Only the first entry of the decoded dict is consulted.
    return subjects.apply(lambda x: list(x.values())[0]['ps1_id'])

In [None]:
# Add one column per extracted annotation answer, plus the ps1_id, by running
# each extractor over the classifications table in turn.
column_extractors = {
    'flat_outside_window': get_flat_outside_window,
    'dip_both_bands': get_dip_both_bands,
    'asymmetric': get_asymmetric,
    'multiple_dips': get_multiple_dips,
    'ps1_id': get_ps1_id,
}
for column_name, extract in column_extractors.items():
    dfzoo[column_name] = extract(dfzoo)

In [None]:
dfzoo['flat_outside_window'].value_counts()

In [None]:
# Count the classifications for every (ps1_id, flat_outside_window answer)
# pair, then tabulate how often each count occurs.
votes_per_answer = dfzoo.groupby(['ps1_id', 'flat_outside_window']).agg(len)['user_name']
votes_per_answer.value_counts()

In [None]:
dfzoo['dip_both_bands'].value_counts()

In [None]:
dfzoo['asymmetric'].value_counts()

In [None]:
dfzoo['multiple_dips'].value_counts()

## Platinum sample: at least two votes w/ flat outside, multiple bands, asymmetric


In [None]:
# Per-classification mask: all three answers are 'Yes'.
# NOTE(review): the section heading says "at least two votes", but this mask
# is evaluated per classification; no vote-count threshold is applied here.
wplatinum = (
    dfzoo['flat_outside_window'].eq('Yes')
    & dfzoo['dip_both_bands'].eq('Yes')
    & dfzoo['asymmetric'].eq('Yes')
)

In [None]:
np.sum(wplatinum)

In [None]:
dfzoo.loc[wplatinum,'ps1_id'].value_counts()

In [None]:
image_dir = '/astro/users/keatonb/ZTF_Boyajian/Candidate_plots/'

In [None]:
# Display the PNG of every object with at least one platinum classification,
# preceded by its ps1_id and the number of such classifications.
# `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
for ps1_id, n_votes in dfzoo.loc[wplatinum, 'ps1_id'].value_counts().items():
    print(ps1_id, n_votes)
    display(Image(f'{image_dir}/{ps1_id}.png'))

## Gold sample: flat outside, could be single band, asymmetric 

In [None]:
# Per-classification mask: flat outside the window, asymmetric, and the
# both-bands question answered "Not enough information to tell due to sampling."
# (An earlier variant that also accepted dip_both_bands == 'Yes' is disabled.)
wgold = (
    dfzoo['flat_outside_window'].eq('Yes')
    & dfzoo['dip_both_bands'].eq('Not enough information to tell due to sampling.')
    & dfzoo['asymmetric'].eq('Yes')
)

In [None]:
np.sum(wgold)

In [None]:
# Display the PNG of every object with at least one gold classification,
# preceded by its ps1_id and the number of such classifications.
# `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
for ps1_id, n_votes in dfzoo.loc[wgold, 'ps1_id'].value_counts().items():
    print(ps1_id, n_votes)
    display(Image(f'{image_dir}/{ps1_id}.png'))

### Lithium (or Palladium) sample: out-of-window variability that looks like dips

In [None]:
# Per-classification mask: not flat outside the window, but the other
# excursions were judged to look like dips; seen in both bands; asymmetric.
wlithium = (
    dfzoo['flat_outside_window'].eq('No, but the other excursions look like dips')
    & dfzoo['dip_both_bands'].eq('Yes')
    & dfzoo['asymmetric'].eq('Yes')
)

In [None]:
np.sum(wlithium)

In [None]:
# Display the PNG of every object with at least one lithium classification,
# preceded by its ps1_id and the number of such classifications.
# `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
for ps1_id, n_votes in dfzoo.loc[wlithium, 'ps1_id'].value_counts().items():
    print(ps1_id, n_votes)
    display(Image(f'{image_dir}/{ps1_id}.png'))

In [None]:
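# Old v1 code, put on ice for now.
# NOTE: the cells below use the masks `wyes`, `wmaybe` and `wwell_sampled` and a
# `classification` column, none of which are created by the cells above, so
# they will not run from a fresh kernel as-is.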

In [None]:
# Horizontal bar chart of the number of rows per `classification` value.
# NOTE(review): no cell above creates a `classification` column on `dfzoo`;
# presumably it came from the v1 workflow -- confirm before re-enabling.
plt.figure(figsize=(12,8))
sns.countplot(data=dfzoo,y='classification')
# Saving is disabled; `dataset` is not defined in the cells above.
#plt.savefig(f'fig/{dataset}/classification_summary.png', bbox_inches='tight')

In [None]:
# Horizontal bar chart of the number of rows per user among the rows selected by `wyes`.
# NOTE(review): `wyes` is not defined in the cells above -- presumably a
# boolean mask of "yes" classifications from the v1 analysis; confirm.
plt.figure(figsize=(12,8))
sns.countplot(data=dfzoo.loc[wyes],y='user_name')

In [None]:
# Look up the ps1_id at position 935 of the per-object group index.
# NOTE(review): 935 is a hard-coded position with no explanation; it only
# points at the intended object for the exact data it was found on -- confirm.
dfzoo.groupby('ps1_id').agg(len)['classification_id'].index[935]

In [None]:
# Show who classified one specific (hard-coded) object, and how.
wmax_id = dfzoo['ps1_id'].eq(172521268464107338)
dfzoo.loc[wmax_id, ['user_name', 'classification']]

In [None]:
!ls

In [None]:
# Rebinds `image_dir`, which an earlier cell set to a different directory.
# Re-running the earlier display loops after this cell will read images from
# this location instead.
image_dir = '/epyc/users/kyboone/ztf_boyajian/zooniverse/'

In [None]:
display(Image(f'{image_dir}/172521268464107338.png'))

In [None]:
# Select objects with more than one classification that passes both masks.
# NOTE(review): `wyes` and `wwell_sampled` are not defined in the cells above.
wbest = wyes & wwell_sampled 
# Number of selected classifications per object (ps1_id).
count_yes = dfzoo[['ps1_id','classification']].loc[wbest].groupby('ps1_id').agg(len)
# Keep objects with at least two such classifications.
wtwoyes = count_yes['classification'] > 1
best_ids = count_yes.loc[wtwoyes].index.values


In [None]:
len(best_ids)

In [None]:
print(best_ids)

In [None]:
!mkdir -p gold_sample_figures

In [None]:
!pwd

In [None]:
# Show each selected object's PNG and copy it into gold_sample_figures/.
for idi in best_ids:
    source_png = f'{image_dir}/{idi}.png'
    display(Image(source_png))
    shutil.copyfile(source_png, f'gold_sample_figures/{idi}.png')

### Silver sample: at least one yes; number of yeses + maybes >= 2, no sampling check

In [None]:
# At least one yes: number of `wyes` classifications per object.
# (Every group produced by the groupby already has at least one row, so the
# `>= 1` filter only makes the requirement explicit.)
count_yes = dfzoo[['ps1_id','classification']].loc[wyes].groupby('ps1_id').agg(len)
count_yes = count_yes.loc[count_yes['classification'] >= 1]

# Any number of maybes: number of `wmaybe` classifications per object.
count_maybe = dfzoo[['ps1_id','classification']].loc[wmaybe].groupby('ps1_id').agg(len)

# Left join so that only objects with at least one yes are kept. The previous
# outer join also kept objects that had maybes but no yes, contradicting the
# "at least one yes" requirement of the silver sample. Objects without any
# maybe get NaN in `classification_maybe`.
count_yes_and_maybe = count_yes.join(count_maybe,lsuffix='_yes',rsuffix='_maybe',how='left')

In [None]:
count_yes_and_maybe.sum(axis=1) 

In [None]:
# Objects whose yes and maybe counts add up to at least two
# (missing counts are skipped by the row-wise sum).
n_yes_or_maybe = count_yes_and_maybe.sum(axis=1)
wokay = n_yes_or_maybe >= 2
okay_ids = count_yes_and_maybe[wokay].index.values

In [None]:
# Drop the ids that are already in `best_ids` so the two samples do not overlap.
# np.setdiff1d returns the sorted, unique values of okay_ids that are not in best_ids.
okay_ids = np.setdiff1d(okay_ids,best_ids)

In [None]:
len(okay_ids)

In [None]:
print(okay_ids)

In [None]:
!mkdir -p silver_sample_figures

In [None]:
# Show each selected object's PNG and copy it into silver_sample_figures/.
for idi in okay_ids:
    source_png = f'{image_dir}/{idi}.png'
    display(Image(source_png))
    shutil.copyfile(source_png, f'silver_sample_figures/{idi}.png')