In [1]:
# Load the autoreload extension and reload imported modules before each cell
# runs, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
import logging
import argparse
import glob
import json

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
import matplotlib.gridspec as gridspec
import seaborn as sns
from PIL import Image

import pandas as pd

from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, mean_squared_error, mean_absolute_error
import tensorflow as tf

import seaborn as sns
sns.set_context('notebook')

from shared_astro_utils import astropy_utils, matching_utils
from zoobot.estimators import make_predictions, bayesian_estimator_funcs
from zoobot.tfrecord import read_tfrecord
from zoobot.uncertainty import discrete_coverage
from zoobot.estimators import input_utils, losses, dirichlet_stats
from zoobot.tfrecord import catalog_to_tfrecord
from zoobot.active_learning import metrics, simulated_metrics, acquisition_utils, check_uncertainty, simulation_timeline, run_estimator_config
from zoobot.active_learning import acquisition_utils
from zoobot import label_metadata


In [3]:
# Work from the zoobot repository root so relative file paths resolve from there.
# The location can be overridden with the ZOOBOT_REPO_DIR environment variable;
# the default is the original hard-coded checkout, so existing runs are unchanged.
os.chdir(os.environ.get('ZOOBOT_REPO_DIR', '/home/walml/repos/zoobot'))

In [4]:
# Build the schema for the DECaLS decision tree from the project's label metadata.
version = 'decals'
label_cols = label_metadata.decals_label_cols
questions = label_metadata.decals_questions

schema = losses.Schema(label_cols, questions, version=version)
# Show the parsed questions as a quick sanity check.
schema.questions

{smooth-or-featured, indices 0 to 2, asked after None: (0, 2), disk-edge-on, indices 3 to 4, asked after smooth-or-featured_featured-or-disk, index 1: (3, 4), has-spiral-arms, indices 5 to 6, asked after disk-edge-on_no, index 4: (5, 6), bar, indices 7 to 9, asked after disk-edge-on_no, index 4: (7, 9), bulge-size, indices 10 to 14, asked after disk-edge-on_no, index 4: (10, 14), how-rounded, indices 15 to 17, asked after smooth-or-featured_smooth, index 0: (15, 17), edge-on-bulge, indices 18 to 20, asked after disk-edge-on_yes, index 3: (18, 20), spiral-winding, indices 21 to 23, asked after has-spiral-arms_yes, index 5: (21, 23), spiral-arm-count, indices 24 to 29, asked after has-spiral-arms_yes, index 5: (24, 29), merging, indices 30 to 33, asked after None: (30, 33)}


[smooth-or-featured, indices 0 to 2, asked after None,
 disk-edge-on, indices 3 to 4, asked after smooth-or-featured_featured-or-disk, index 1,
 has-spiral-arms, indices 5 to 6, asked after disk-edge-on_no, index 4,
 bar, indices 7 to 9, asked after disk-edge-on_no, index 4,
 bulge-size, indices 10 to 14, asked after disk-edge-on_no, index 4,
 how-rounded, indices 15 to 17, asked after smooth-or-featured_smooth, index 0,
 edge-on-bulge, indices 18 to 20, asked after disk-edge-on_yes, index 3,
 spiral-winding, indices 21 to 23, asked after has-spiral-arms_yes, index 5,
 spiral-arm-count, indices 24 to 29, asked after has-spiral-arms_yes, index 5,
 merging, indices 30 to 33, asked after None]

In [5]:
# Volunteer catalogs.
# NOTE(review): the DR1+DR2 frame is read from a file named only 'dr2_...' - confirm
# that this file really holds both releases.
dr1_dr2_vols_loc = 'dr2_volunteer_catalog.parquet'
dr1_dr2_vols = pd.read_parquet(dr1_dr2_vols_loc)

dr5_vols_loc = 'dr5_volunteer_catalog.parquet'
dr5_vols = pd.read_parquet(dr5_vols_loc)

# ML prediction catalogs, one per data release.
dr1_ml_loc = 'latest_ml_catalog_dr1_only.parquet'
dr1_ml = pd.read_parquet(dr1_ml_loc)

dr2_ml_loc = 'latest_ml_catalog_dr2_only.parquet'
dr2_ml = pd.read_parquet(dr2_ml_loc)

dr5_ml_loc = 'latest_ml_catalog_dr5_only.parquet'
dr5_ml = pd.read_parquet(dr5_ml_loc)


In [6]:
# The data release to use for galaxies classified in both DR1 and DR2 was
# already chosen earlier (DR2 over DR1). Stick with that choice by reading the
# release back from the image filename.
def _is_dr1_image(jpeg_loc):
    """Return True when the image path contains the DR1 image folder."""
    return 'dr1/standard' in jpeg_loc

dr1_dr2_vols['dr1'] = dr1_dr2_vols['jpeg_loc'].map(_is_dr1_image)

In [7]:
# Split the combined volunteer catalog on the boolean 'dr1' flag:
# flagged rows are DR1, everything else is treated as DR2.
is_dr1 = dr1_dr2_vols['dr1']
dr1_vols = dr1_dr2_vols[is_dr1]
dr2_vols = dr1_dr2_vols[~is_dr1]

In [8]:
# For each release, keep only the ML predictions whose galaxy (by iauname)
# appears in that release's volunteer catalog.
dr1_has_vols = dr1_ml['iauname'].isin(dr1_vols['iauname'])
dr2_has_vols = dr2_ml['iauname'].isin(dr2_vols['iauname'])
dr1_ml_use = dr1_ml[dr1_has_vols]
dr2_ml_use = dr2_ml[dr2_has_vols]

n_dr1, n_dr2, n_vols = len(dr1_ml_use), len(dr2_ml_use), len(dr1_dr2_vols)
print(n_dr1, n_dr2, n_vols)
# The two selections together must account for every row of the volunteer catalog.
assert n_dr1 + n_dr2 == n_vols

32017 60943 92960


In [9]:
# NOTE: it is not asserted here that every DR5 prediction has DR5 volunteer votes.

# Names of all galaxies with volunteer votes in any release.
iaunames_any_vols = set(dr5_vols['iauname']) | set(dr1_dr2_vols['iauname'])
print(len(iaunames_any_vols))

# Keep DR5 predictions for any galaxy that volunteers classified in any release.
has_any_vols = dr5_ml['iauname'].isin(iaunames_any_vols)
dr5_ml_use = dr5_ml[has_any_vols]
print(len(dr5_ml_use))

# Use the DR5 prediction when there is one; fall back to the DR1/DR2
# prediction only for galaxies without a DR5 prediction.
dr5_names = dr5_ml_use['iauname']
dr1_ml_use = dr1_ml_use[~dr1_ml_use['iauname'].isin(dr5_names)]
dr2_ml_use = dr2_ml_use[~dr2_ml_use['iauname'].isin(dr5_names)]
print(len(dr1_ml_use), len(dr2_ml_use), len(dr1_dr2_vols))

# The combined total is expected to be roughly 314k galaxies (printed, not asserted).
n_total = len(dr1_ml_use) + len(dr2_ml_use) + len(dr5_ml_use)
print(n_total)

313789
309067
3852 870 92960
313789


In [10]:
# Stack the per-release selections into one catalog: DR5 first, then DR2, then DR1.
# Each frame keeps its own row index, so the combined index is not unique.
catalogs_to_stack = [dr5_ml_use, dr2_ml_use, dr1_ml_use]
df = pd.concat(catalogs_to_stack)

In [11]:
# Each galaxy must appear exactly once in the merged catalog.
assert not df['iauname'].duplicated().any()

In [12]:
# Replace the absolute 'png_loc' column with a 'file_loc' column that has the
# storage prefix stripped. The prefix is matched as a literal string.
storage_prefix = '/data/phys-zooniverse/chri5177/galaxy_zoo/decals/'
png_locs = df.pop('png_loc')  # removes the column from df and returns it
df['file_loc'] = png_locs.str.replace(storage_prefix, '', regex=False)

In [13]:
# Display the new column to check that the storage prefix was stripped.
df['file_loc']

0                     dr5/png/J223/J223253.27-005423.9.png
1                     dr5/png/J223/J223445.65-010456.2.png
2                     dr5/png/J223/J223515.21-003519.5.png
3                     dr5/png/J223/J223402.99+001117.3.png
4                     dr5/png/J223/J223710.17-005700.4.png
                               ...                        
32384    dr1_dr2/png/dr1/standard/J001337.43+000452.1_s...
32395    dr1_dr2/png/dr1/standard/J223508.28+011340.5_s...
32400    dr1_dr2/png/dr1/standard/J213415.99+004744.7_s...
32414    dr1_dr2/png/dr1/standard/J001256.27+005121.3_s...
32425    dr1_dr2/png/dr1/standard/J234157.85+001630.4_s...
Name: file_loc, Length: 313789, dtype: object

In [14]:
# List the column names of the merged catalog before it is written out.
df.columns.values

array(['iauname', 'smooth-or-featured_smooth_concentration',
       'smooth-or-featured_featured-or-disk_concentration',
       'smooth-or-featured_artifact_concentration',
       'disk-edge-on_yes_concentration', 'disk-edge-on_no_concentration',
       'has-spiral-arms_yes_concentration',
       'has-spiral-arms_no_concentration', 'bar_strong_concentration',
       'bar_weak_concentration', 'bar_no_concentration',
       'bulge-size_dominant_concentration',
       'bulge-size_large_concentration',
       'bulge-size_moderate_concentration',
       'bulge-size_small_concentration', 'bulge-size_none_concentration',
       'how-rounded_round_concentration',
       'how-rounded_in-between_concentration',
       'how-rounded_cigar-shaped_concentration',
       'edge-on-bulge_boxy_concentration',
       'edge-on-bulge_none_concentration',
       'edge-on-bulge_rounded_concentration',
       'spiral-winding_tight_concentration',
       'spiral-winding_medium_concentration',
       'spiral-wi

In [15]:
# Save the merged catalog in two formats.
df.to_parquet('final_ml_catalog.parquet')
# Fix: this previously called to_parquet, which wrote Parquet bytes into a file
# named '.csv'. Write real CSV instead. The index is omitted because, after
# concatenation, it only repeats row numbers from the per-release catalogs.
df.to_csv('final_ml_catalog.csv', index=False)