## Zooniverse Analysis

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import numpy as np
import seaborn as sns
from IPython.display import Image, display, HTML


In [None]:
# Use seaborn's 'poster' plotting context for every figure drawn below.
sns.set_context(context='poster')

In [None]:
# Load the Zooniverse classification export from its (hard-coded) CSV path.
dfzoo = pd.read_csv(
    filepath_or_buffer='/epyc/users/ecbellm/ZTF_Boyajian/ztf-dippers-classifications.csv',
)

In [None]:
# Number of rows in the export.
dfzoo.shape[0]

In [None]:
# Preview the first five rows.
dfzoo.iloc[:5]

Each row of this table is one classification.

In [None]:
# Raw 'annotations' value of the first row.
dfzoo['annotations'].iloc[0]

We only have follow-up questions for the yes and maybe classifications, and some of the early yes/maybe classifications don't have a follow-up at all.

In [None]:
# Raw 'annotations' value of the row labelled 2444.
# NOTE(review): presumably picked as an example that includes a follow-up answer -- confirm.
dfzoo.loc[2444, 'annotations']

In [None]:
# Count classifications per user, listed from fewest to most.
(
    dfzoo[['classification_id', 'user_name']]
    .groupby('user_name')
    .agg(len)
    .sort_values(by='classification_id')
)

In [None]:
def get_classification(df):
    """Return the first-task answer of every classification in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``annotations`` column of JSON strings. Each string
        must decode to a non-empty list whose first element is a mapping
        with a ``'value'`` key.

    Returns
    -------
    pandas.Series
        ``annotations[0]['value']`` for each row, on the index of ``df``.
    """
    # Read from the ``df`` argument rather than the notebook-global ``dfzoo``
    # so the function works on any frame (subsets, reloaded exports, tests).
    return df['annotations'].apply(json.loads).apply(
        lambda tasks: tasks[0]['value']
    )

def get_well_sampled(df):
    """Return the second-task (follow-up) answer of every classification.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``annotations`` column of JSON strings, each decoding
        to a list of mappings with a ``'value'`` key.

    Returns
    -------
    pandas.Series
        ``annotations[1]['value']`` for rows whose annotation list has more
        than one entry, and ``None`` for rows without a follow-up answer.
    """
    # Read from the ``df`` argument rather than the notebook-global ``dfzoo``
    # so the function works on any frame (subsets, reloaded exports, tests).
    return df['annotations'].apply(json.loads).apply(
        lambda tasks: tasks[1]['value'] if len(tasks) > 1 else None
    )

def get_ps1_id(df):
    """Return the ``ps1_id`` stored in each row's subject metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``subject_data`` column of JSON strings. Each string
        must decode to a non-empty mapping whose first value is itself a
        mapping with a ``'ps1_id'`` key.

    Returns
    -------
    pandas.Series
        The ``ps1_id`` of the first subject entry for each row.
    """
    # Read from the ``df`` argument rather than the notebook-global ``dfzoo``
    # so the function works on any frame (subsets, reloaded exports, tests).
    return df['subject_data'].apply(json.loads).apply(
        lambda subjects: list(subjects.values())[0]['ps1_id']
    )

In [None]:
# Add one derived column per extractor, in the same order as before.
for column, extract in (
    ('classification', get_classification),
    ('well_sampled', get_well_sampled),
    ('ps1_id', get_ps1_id),
):
    dfzoo[column] = extract(dfzoo)

In [None]:
# Boolean row masks over dfzoo, one per vote category.
wyes, wmaybe = (
    dfzoo['classification'] == 'Yes',
    dfzoo['classification'] == 'Maybe',
)
wyesmaybe = wyes | wmaybe
# Everything that is neither 'Yes' nor 'Maybe'.
wno = ~wyesmaybe

In [None]:
# Follow-up answers for the rows voted 'Yes'.
dfzoo['well_sampled'][wyes]

In [None]:
# Total number of 'Yes' votes (True entries in the mask).
wyes.sum()

In [None]:
# Horizontal bar chart: number of classifications per answer.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfzoo, y='classification', ax=ax)
#plt.savefig(f'fig/{dataset}/classification_summary.png', bbox_inches='tight')

In [None]:
# Horizontal bar chart: number of 'Yes' votes cast by each user.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfzoo[wyes], y='user_name', ax=ax)

In [None]:
# ps1_id found at position 935 of the per-subject classification counts.
# NOTE(review): 935 is a hard-coded position; presumably read off by eye as
# the most-classified subject -- confirm, or derive it from the counts.
dfzoo.groupby('ps1_id')['classification_id'].agg(len).index[935]

In [None]:
# Who voted what on subject 172521268464107338.
wmax_id = dfzoo['ps1_id'] == 172521268464107338
dfzoo[wmax_id][['user_name', 'classification']]

In [None]:
# IPython shell escape: list the contents of the current working directory.
!ls

In [None]:
# Directory holding one '<ps1_id>.png' image per subject.
# The path already ends in '/', so the f-strings below yield a double slash.
image_dir = "/epyc/users/kyboone/ztf_boyajian/zooniverse/"

In [None]:
# Show the image for subject 172521268464107338.
example_png = f'{image_dir}/172521268464107338.png'
display(Image(example_png))

## Top tier: at least two yes votes
(still to do: check for "well-sampled" value, if we think it's needed)

In [None]:
# Top tier: subjects that received more than one 'Yes' vote.
wbest = wyes  # & (dfzoo['user_name'] != 'kboone')
# One row per ps1_id; the 'classification' column holds its 'Yes' vote count.
count_yes = (
    dfzoo.loc[wbest, ['ps1_id', 'classification']]
    .groupby('ps1_id')
    .agg(len)
)
wtwoyes = count_yes['classification'] > 1
best_ids = count_yes[wtwoyes].index.values


In [None]:
# Number of top-tier subjects.
best_ids.shape[0]

In [None]:
# Print the ps1_id of every top-tier subject.
print(best_ids)

In [None]:
# Show the image of every top-tier subject, in best_ids order.
for png_path in (f'{image_dir}/{idi}.png' for idi in best_ids):
    display(Image(png_path))

### Second tier: exactly one yes (two or more yes votes are covered above), at least one maybe

In [None]:
# Second tier: subjects with exactly one 'Yes' vote and at least one 'Maybe'.
votes = dfzoo[['ps1_id', 'classification']]

# Per-subject 'Yes' counts, restricted to subjects with a single 'Yes'.
count_yes = votes[wyes].groupby('ps1_id').agg(len)
count_yes = count_yes[count_yes['classification'] == 1]

# Per-subject 'Maybe' counts (only subjects with at least one 'Maybe' appear).
count_maybe = votes[wmaybe].groupby('ps1_id').agg(len)

# The inner join keeps only subjects present in both tables.
count_yes_and_maybe = count_yes.join(
    count_maybe,
    how='inner',
    lsuffix='_yes',
    rsuffix='_maybe',
)

In [None]:
# ps1_ids of the second-tier subjects, as a NumPy array.
okay_index = count_yes_and_maybe.index
okay_ids = okay_index.values

In [None]:
# Number of second-tier subjects.
okay_ids.shape[0]

In [None]:
# Print the ps1_id of every second-tier subject.
print(okay_ids)

In [None]:
# Show the image of every second-tier subject, in okay_ids order.
okay_pngs = [f'{image_dir}/{idi}.png' for idi in okay_ids]
for png_path in okay_pngs:
    display(Image(png_path))