In [1]:
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from astropy.table import Table

In [2]:

# https://github.com/zooniverse/decals/blob/master/decals/a_download_decals/get_catalogs/selection_cuts.py#L28
def apply_selection_cuts(input_catalog, snap_tolerance=1e-3):
    """
    Select only galaxies with petrotheta > 3 and not within 1e-3 of bad measurement snap value
    Args:
        catalog (astropy.Table): Galaxy catalog including NSA information
        snap_tolerance (float): Minimum deviation from bad measurement snap value allowed
    Returns:
        (astropy.Table) catalog of galaxies matching selection criteria above
    """

    # Galaxies should be sufficiently extended across the sky
    petrotheta_above_3 = input_catalog['petrotheta'] > 3

    # NSA catalog’s petrotheta calculation sometimes fails to a ‘default’ value
    # Any galaxies with petrotheta within 1e-3 of the snap_to value likely has the wrong size.
    bad_petrotheta_value = 27.653702  # this 'magic' value can be confirmed by looking at petrotheta histograms
    snap_lower_limit = bad_petrotheta_value - snap_tolerance
    snap_upper_limit = bad_petrotheta_value + snap_tolerance

    above_snap_lower_limit = input_catalog['petrotheta'] > snap_lower_limit
    below_snap_upper_limit = input_catalog['petrotheta'] < snap_upper_limit
    within_snap_window = above_snap_lower_limit & below_snap_upper_limit

    selected_catalog = input_catalog[petrotheta_above_3 & ~ within_snap_window]
    return selected_catalog


In [3]:
dr2_s = Table.read('/media/walml/beta/galaxy_zoo/decals/dr1_dr2/subjects/decals_dr1_and_dr2_with_subj_id.csv').to_pandas()
# dr2_s = dr2_s[['dr8objid', 'sdss_id']]

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [4]:
dr2_c = pd.read_csv('/media/walml/beta/galaxy_zoo/decals/gzreduction_ouroborous/working_dir/votes/dr2_aggregated_votes.csv')

In [5]:
dr2_c_and_s = pd.merge(dr2_c, dr2_s, on='subject_id', how='inner')  # may drop some unclassified subjects, but all classifications must match
assert len(dr2_c) == len(dr2_c_and_s)

# which could be uploaded?

In [6]:
to_upload = Table.read('/media/walml/beta/galaxy_zoo/decals/catalogs/dr5_nsa1_0_0_to_upload.fits').to_pandas()  # same table, now with png_ready/fits_ready/fits_filled cols added
to_upload['iauname'] = to_upload['iauname'].str.decode('utf8')
len(to_upload)

343128

In [7]:
to_upload['nsa_version'] = 'v1_0_0'
to_upload['redshift'] = to_upload['z']

In [8]:
# should be uploaded if not in DR1/2 and fits_filled

In [9]:
to_upload['in_dr2'] = to_upload['iauname'].isin(dr2_c_and_s['iauname'])
to_upload['in_dr2'].sum()

89220

(-89221, 0, -0.0)

In [13]:
should_be_uploaded = to_upload[(to_upload['fits_filled'] & ~to_upload['in_dr2'])]  # important not!
len(should_be_uploaded)

220801

In [14]:
should_be_uploaded['png_loc'] = should_be_uploaded['png_loc'].apply(lambda x: x.decode('utf8'))
should_be_uploaded['relative_png_loc'] = should_be_uploaded['png_loc'].apply(lambda x: x.replace('/Volumes/EXTERNAL/decals/png/dr5/', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
with_cuts = apply_selection_cuts(should_be_uploaded)
assert len(with_cuts) == len(should_be_uploaded)  # I have already applied these cuts in the creation of to_upload.fits

In [13]:
should_be_uploaded.to_csv('should_be_uploaded.csv', index=False)

## what's been uploaded?

In [14]:

# latest_subjects = pd.read_csv('/home/walml/repos/gz-panoptes-reduction/data/latest_subjects_export.csv')
latest_subjects = pd.read_csv('/media/walml/beta/misc_downloads/galaxy-zoo-subjects.csv')
len(latest_subjects)

  interactivity=interactivity, compiler=compiler, result=result)


788491

In [15]:
latest_subjects = latest_subjects[latest_subjects['workflow_id'].isin([6122.0, 10582.0, 10581.0])]  # decals public workflows
# latest_subjects = latest_subjects[latest_subjects['workflow_id'].isin([10582.0])]  # decals enhanced
latest_subjects = latest_subjects[~latest_subjects['subject_set_id'].isin([77652])]  # remove eagle subjects
len(latest_subjects)

436559

In [16]:
latest_subjects

Unnamed: 0,subject_id,project_id,workflow_id,subject_set_id,metadata,locations,classifications_count,retired_at,retirement_reason,created_at,updated_at
2250,20283538,5733,6122.0,19832,"{""!ra"":116.4483378146343,""!dec"":25.86208586745...","{""0"":""https://panoptes-uploads.zooniverse.org/...",40,2018-03-20 22:25:30 UTC,classification_count,2018-03-15 18:17:16 UTC,2018-03-15 18:17:16 UTC
2252,20283539,5733,6122.0,19832,"{""!ra"":323.31862288433615,""!dec"":11.6900743271...","{""0"":""https://panoptes-uploads.zooniverse.org/...",40,2018-03-19 19:51:28 UTC,classification_count,2018-03-15 18:17:16 UTC,2018-03-15 18:17:16 UTC
2254,20283540,5733,6122.0,19832,"{""!ra"":321.47404187434915,""!dec"":0.41605876776...","{""0"":""https://panoptes-uploads.zooniverse.org/...",40,2018-03-20 15:39:15 UTC,classification_count,2018-03-15 18:17:16 UTC,2018-03-15 18:17:16 UTC
2256,20283541,5733,6122.0,19832,"{""!ra"":337.0527522979068,""!dec"":0.543208463333...","{""0"":""https://panoptes-uploads.zooniverse.org/...",40,2018-03-20 21:13:41 UTC,classification_count,2018-03-15 18:17:16 UTC,2018-03-15 18:17:16 UTC
2258,20283542,5733,6122.0,19832,"{""!ra"":119.21033791857249,""!dec"":28.3758651318...","{""0"":""https://panoptes-uploads.zooniverse.org/...",40,2018-03-20 16:02:39 UTC,classification_count,2018-03-15 18:17:16 UTC,2018-03-15 18:17:16 UTC
...,...,...,...,...,...,...,...,...,...,...,...
772820,47217984,5733,10582.0,85299,"{""!ra"":170.01910635775005,""!dec"":23.9484406442...","{""0"":""https://panoptes-uploads.zooniverse.org/...",12,,,2020-06-22 02:24:25 UTC,2020-06-22 02:24:25 UTC
772821,47217985,5733,10582.0,85299,"{""!ra"":215.2050027217024,""!dec"":22.31331097960...","{""0"":""https://panoptes-uploads.zooniverse.org/...",5,,,2020-06-22 02:24:26 UTC,2020-06-22 02:24:26 UTC
772822,47217986,5733,10582.0,85299,"{""!ra"":233.3496977796491,""!dec"":18.84022338841...","{""0"":""https://panoptes-uploads.zooniverse.org/...",6,,,2020-06-22 02:24:27 UTC,2020-06-22 02:24:27 UTC
772823,47217987,5733,10582.0,85299,"{""!ra"":131.76375574276742,""!dec"":15.2477306104...","{""0"":""https://panoptes-uploads.zooniverse.org/...",9,,,2020-06-22 02:24:30 UTC,2020-06-22 02:24:30 UTC


In [17]:
latest_subjects['metadata'] = latest_subjects['metadata'].apply(json.loads)

In [18]:
def get_iauname(metadata):
    possible_keys = ['iauname', '!iauname', 'IAUNAME', '!IAUNAME']
    for key in possible_keys:
        if key in metadata.keys():
            return metadata[key]
    return np.nan

In [19]:
latest_subjects['iauname'] = latest_subjects['metadata'].apply(get_iauname)

In [20]:
pd.isna(latest_subjects['iauname']).sum()

0

In [21]:
missing_from_subjects = should_be_uploaded[~should_be_uploaded['iauname'].isin(latest_subjects['iauname'])]  # should be uploaded, but is not in subjects export
len(missing_from_subjects)  # exported right after classifications

11456

In [22]:
missing_from_subjects

Unnamed: 0,iauname,nsa_id,ra,dec,petrotheta,petroth50,petroth90,z,fits_loc,png_loc,fits_ready,fits_filled,png_ready,nsa_version,redshift,in_dr2,relative_png_loc
20896,J215202.24+010834.5,27953,328.009357,1.142945,5.566817,2.379937,6.013899,0.115884,b'/Volumes/EXTERNAL/decals/fits/dr5/J215/J2152...,/Volumes/EXTERNAL/decals/png/dr5/J215/J215202....,True,True,True,v1_0_0,0.115884,False,J215/J215202.24+010834.5.png
22651,J224938.18+002508.5,30060,342.409052,0.419040,6.951218,3.434981,7.767619,0.101616,b'/Volumes/EXTERNAL/decals/fits/dr5/J224/J2249...,/Volumes/EXTERNAL/decals/png/dr5/J224/J224938....,True,True,True,v1_0_0,0.101616,False,J224/J224938.18+002508.5.png
22659,J225759.79-005519.6,30069,344.499138,-0.922141,5.798562,2.364125,7.086283,0.045398,b'/Volumes/EXTERNAL/decals/fits/dr5/J225/J2257...,/Volumes/EXTERNAL/decals/png/dr5/J225/J225759....,True,True,True,v1_0_0,0.045398,False,J225/J225759.79-005519.6.png
22662,J225804.46-002744.2,30072,344.518524,-0.462451,18.527157,8.654810,27.184856,0.016207,b'/Volumes/EXTERNAL/decals/fits/dr5/J225/J2258...,/Volumes/EXTERNAL/decals/png/dr5/J225/J225804....,True,True,True,v1_0_0,0.016207,False,J225/J225804.46-002744.2.png
22680,J225521.47-005651.9,30093,343.839469,-0.947811,12.797614,5.273067,17.658121,0.109974,b'/Volumes/EXTERNAL/decals/fits/dr5/J225/J2255...,/Volumes/EXTERNAL/decals/png/dr5/J225/J225521....,True,True,True,v1_0_0,0.109974,False,J225/J225521.47-005651.9.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343036,J135947.14+014021.8,698357,209.946212,1.672686,6.752739,3.951229,11.879787,0.139400,b'/Volumes/EXTERNAL/decals/fits/dr5/J135/J1359...,/Volumes/EXTERNAL/decals/png/dr5/J135/J135947....,True,True,True,v1_0_0,0.139400,False,J135/J135947.14+014021.8.png
343060,J142413.43+014628.7,698407,216.059344,1.780008,57.861927,21.499472,52.799202,0.111200,b'/Volumes/EXTERNAL/decals/fits/dr5/J142/J1424...,/Volumes/EXTERNAL/decals/png/dr5/J142/J142413....,True,True,True,v1_0_0,0.111200,False,J142/J142413.43+014628.7.png
343073,J142431.49-003142.4,698428,216.131200,-0.528175,5.513321,2.764185,5.407866,0.072400,b'/Volumes/EXTERNAL/decals/fits/dr5/J142/J1424...,/Volumes/EXTERNAL/decals/png/dr5/J142/J142431....,True,True,True,v1_0_0,0.072400,False,J142/J142431.49-003142.4.png
343107,J143619.83-010748.1,698522,219.083613,-1.130237,18.740425,7.763114,25.235826,0.105971,b'/Volumes/EXTERNAL/decals/fits/dr5/J143/J1436...,/Volumes/EXTERNAL/decals/png/dr5/J143/J143619....,True,True,True,v1_0_0,0.105971,False,J143/J143619.83-010748.1.png


These iauname galaxies should have been uploaded, but are not in the subject export. Could be not uploaded, could since have been somehow deactivated.

In [23]:
latest_classifications = pd.read_csv('~/Downloads/classifications_final.csv')  # only if at least 1 classification
missing_from_classifications = should_be_uploaded[~should_be_uploaded['iauname'].isin(latest_classifications['iauname'])]  # should be uploaded, but has not been classified
len(missing_from_classifications)  # these have all not recieved 1 classification
# all 9k need exactly 5 classifications

9573

Galaxies to upload must
- Not have recieved any classifications
- Not be in the inactive subject set (d, is deleted so doesn't show, all good)
- Not be in the active subject set (a) as they might just not have been classified yet

In [24]:
no_classifications = set(missing_from_classifications['iauname'])

In [25]:
# latest_subjects['subject_set_id'].value_counts()
in_missing_subject_set_a = set(latest_subjects.query('subject_set_id == "85299"')['iauname'])

In [26]:
iaunames_to_upload = (no_classifications - in_missing_subject_set_a)
len(iaunames_to_upload)

9074

In [27]:
will_upload = should_be_uploaded[should_be_uploaded['iauname'].isin(iaunames_to_upload)]
assert len(iaunames_to_upload) == len(will_upload)

In [28]:
!pwd

/home/walml/repos/zoobot/notebooks/catalogs


In [29]:
will_upload.to_csv('missing_galaxies_no_classifications_not_85299.csv', index=False)

In [30]:
assert False

AssertionError: 

These iauname galaxies should have been uploaded, but have not recieved any classifications. Could be not uploaded, could be new, could be awaiting activation.

In [None]:
# pd.isna(latest_classifications['iauname']).sum()  # 33 have no iauname, Tobias to review

In [None]:
# so we need 9k*5 for these galaxies not classified, plus enough to finish should_be_uploaded

In [None]:
latest_subjects

In [None]:
latest_subjects.duplicated(subset=['subject_id', 'subject_set_id']).mean()

In [None]:
latest_subjects.duplicated(subset=['subject_id', 'subject_set_id', 'workflow_id']).mean()

In [None]:
# making sure the random subjects have retirement=5 if they are in should_be_uploaded and are not duplicates

In [None]:
random = latest_subjects.query('subject_set_id == "74905"')
random['iauname'].value_counts().value_counts()  # apparently they are ALL duplicated exactly twice??
# random_metadata = list(latest_subjects.query('subject_set_id == "74905"')['metadata'])

In [None]:
random['subject_id'].value_counts()

In [None]:
random_metadata[0]

In [None]:
retirement_limits = [g.get('#retirement_limit', None) for g in random_metadata]

In [None]:
pd.value_counts(retirement_limits)

In [None]:
not_duplicates = pd.read_csv('/home/walml/repos/zoobot/notebooks/debugging/not_duplicates.csv')['subject_id']

In [None]:
retirement_limits = [g.get('#retirement_limit', None) for g in random_metadata if g['subject_id'] not in not_duplicates]
pd.value_counts(retirement_limits)

In [None]:
latest_subjects['subject_set_id'].value_counts()  # there's no 74909 as it isn't active! only includes active subject sets in the export. 74909 is not going to reactivate though, conside fully classified

In [None]:
unretired = latest_subjects[pd.isna(latest_subjects['retired_at'])]
unretired['subject_set_id'].value_counts()  #  concerning - two in three random galaxies are not retired? maybe caesar needs a refresh to apply the new rule, don't want to use 200k classifications just for final retirements

In [None]:
unretired

In [None]:
32497877

In [None]:
random_or_missing_a_subjects = latest_subjects[latest_subjects['subject_set_id'].isin([74905, 85299])]
len(random_or_missing_a_subjects)

In [None]:
set(should_be_uploaded['iauname']) - set(latest_classifications['iauname']) - set(random_or_missing_a_subjects['iauname'])

# separately...

What about galaxies which have <5 classifications but should actually not be uploaded, or never finish? I should turn them off

In [None]:
wrongly_classified = latest_classifications[~latest_classifications['iauname'].isin(should_be_uploaded['iauname'])]

In [None]:
wrongly_classified

In [None]:
wrongly_classified['smooth-or-featured_total-votes'].hist(bins=40)

In [None]:
wrongly_classified['smooth-or-featured_total-votes'].hist(bins=10, range=(0, 10))

In [None]:
wrongly_classified