In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Configure PyTorch's CUDA allocator through its environment variable. This is
# done here, in its own cell, ahead of the `import torch` further down.
# NOTE(review): presumably chosen to reduce memory fragmentation — confirm
# against the PyTorch CUDA memory-management notes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
# importing libraries

import glob
import sys
from zipfile import ZipFile 
import concurrent.futures
import gc
from time import time
import cv2

sys.path.insert(0,'../src/')

import PIL as pil

import pandas as pd
import numpy as np
np.random.seed(42)
import random

import matplotlib.pyplot as plt

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm
import urllib.request


In [4]:
from data_processing import *
import cvt as cvt
from labeling_system import *

---

In [5]:
# Table linking each galaxy `objid` to the `asset_id` of its image.
reference_images = pd.read_csv('../input/filename_mapping.csv')

# Classification catalogue: one row per galaxy with its `gz2class` label and
# the per-task vote columns (counts, fractions, debiased fractions, flags).
main_catalogue = pd.read_csv('../input/gz2_classes.csv')

In [6]:
# Preview the objid -> asset_id mapping.
reference_images.head()

Unnamed: 0,objid,sample,asset_id
0,587722981736120347,original,1
1,587722981736579107,original,2
2,587722981741363294,original,3
3,587722981741363323,original,4
4,587722981741559888,original,5


In [7]:
# Preview the raw classification catalogue.
main_catalogue.head()

Unnamed: 0,specobjid,dr8objid,dr7objid,ra,dec,rastring,decstring,sample,gz2class,total_classifications,...,t11_arms_number_a36_more_than_4_fraction,t11_arms_number_a36_more_than_4_weighted_fraction,t11_arms_number_a36_more_than_4_debiased,t11_arms_number_a36_more_than_4_flag,t11_arms_number_a37_cant_tell_count,t11_arms_number_a37_cant_tell_weight,t11_arms_number_a37_cant_tell_fraction,t11_arms_number_a37_cant_tell_weighted_fraction,t11_arms_number_a37_cant_tell_debiased,t11_arms_number_a37_cant_tell_flag
0,1.802675e+18,,588017703996096547,160.9904,11.70379,10:43:57.70,+11:42:13.6,original,SBb?t,44,...,0.225,0.225,0.225,0,10,10.0,0.25,0.25,0.25,0
1,1.992984e+18,,587738569780428805,192.41083,15.164207,12:49:38.60,+15:09:51.1,original,Ser,45,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0
2,1.489569e+18,,587735695913320507,210.8022,54.348953,14:03:12.53,+54:20:56.2,original,Sc+t,46,...,0.651,0.651,0.651,0,3,3.0,0.07,0.07,0.07,0
3,2.924084e+18,1.237668e+18,587742775634624545,185.30342,18.382704,12:21:12.82,+18:22:57.7,original,SBc(r),45,...,0.071,0.071,0.071,0,6,6.0,0.429,0.429,0.429,0
4,1.387165e+18,1.237658e+18,587732769983889439,187.36679,8.749928,12:29:28.03,+08:44:59.7,extra,Ser,49,...,0.0,0.0,0.0,0,1,1.0,1.0,1.0,1.0,0


In [None]:
# Remove, in place, every catalogue row whose gz2class is exactly 'A'
# (presumably the star/artifact class — confirm), then show the remaining
# (rows, columns) shape.
main_catalogue.drop(
    index=main_catalogue.loc[main_catalogue['gz2class'] == 'A'].index,
    inplace=True,
)
main_catalogue.shape

(243253, 233)

In [9]:
# Attach each galaxy's image asset_id by matching the catalogue's `dr7objid`
# against the mapping's `objid`. The left join keeps every catalogue row; the
# redundant `objid` key is then dropped, rows are ordered by asset_id and the
# index is renumbered from 0.
main_catalogue = (
    pd.merge(
        main_catalogue,
        reference_images[['objid', 'asset_id']],
        how='left',
        left_on='dr7objid',
        right_on='objid',
    )
    .drop(columns='objid')
    .sort_values('asset_id')
    .reset_index(drop=True)
)

main_catalogue.head()

Unnamed: 0,specobjid,dr8objid,dr7objid,ra,dec,rastring,decstring,sample,gz2class,total_classifications,...,t11_arms_number_a36_more_than_4_weighted_fraction,t11_arms_number_a36_more_than_4_debiased,t11_arms_number_a36_more_than_4_flag,t11_arms_number_a37_cant_tell_count,t11_arms_number_a37_cant_tell_weight,t11_arms_number_a37_cant_tell_fraction,t11_arms_number_a37_cant_tell_weighted_fraction,t11_arms_number_a37_cant_tell_debiased,t11_arms_number_a37_cant_tell_flag,asset_id
0,3.231894e+17,1.237649e+18,587722981741363294,182.92526,-1.092357,12:11:42.06,-01:05:32.5,original,Ei,52,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,3
1,3.231899e+17,1.237649e+18,587722981741363323,182.97011,-1.219537,12:11:52.83,-01:13:10.3,original,Sc,30,...,0.0,0.0,0,1,1.0,1.0,1.0,1.0,0,4
2,3.23169e+17,1.237649e+18,587722981741559888,183.4381,-1.238414,12:13:45.14,-01:14:18.3,original,Er,53,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,5
3,3.231718e+17,1.237649e+18,587722981741625481,183.474,-1.231429,12:13:53.76,-01:13:53.1,original,Er,37,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,6
4,3.231679e+17,1.237649e+18,587722981741625484,183.47778,-1.084604,12:13:54.67,-01:05:04.6,original,Ei,45,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,7


In [10]:
# RUN 1
# =======
# E, S, SB, Se
# --------------

def run1_soft_labels(row):
    """Return the normalised run-1 soft label ``[E, S, SB, Se]`` for one row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must be indexable by the debiased vote-fraction column names used
        below (tasks t01, t02, t03 and t04).

    Returns
    -------
    numpy.ndarray
        Four weights in the order E, S, SB, Se, divided by their sum.
        When the four raw scores add up to zero the label falls back to
        ``[1, 0, 0, 0]`` (treated as elliptical).
    """
    # E: "smooth" answer of task 1.
    smooth = row["t01_smooth_or_features_a01_smooth_debiased"]

    # S: features/disk (task 1) x no bar (task 3) x spiral (task 4).
    unbarred_spiral = (
        row["t01_smooth_or_features_a02_features_or_disk_debiased"]
        * row["t03_bar_a07_no_bar_debiased"]
        * row["t04_spiral_a08_spiral_debiased"]
    )

    # SB: features/disk (task 1) x bar (task 3) x spiral (task 4).
    barred_spiral = (
        row["t01_smooth_or_features_a02_features_or_disk_debiased"]
        * row["t03_bar_a06_bar_debiased"]
        * row["t04_spiral_a08_spiral_debiased"]
    )

    # Se: "yes" answer of the edge-on question (task 2).
    edge_on = row["t02_edgeon_a04_yes_debiased"]

    norm = smooth + unbarred_spiral + barred_spiral + edge_on
    if norm != 0:
        return np.array([smooth, unbarred_spiral, barred_spiral, edge_on]) / norm

    # No usable votes at all: put all mass on the elliptical class.
    return np.array([1.0, 0.0, 0.0, 0.0])

In [11]:
# Map every galaxy's integer asset_id to its run-1 soft label vector.
soft_label_dict = dict(
    (int(record["asset_id"]), run1_soft_labels(record))
    for _index, record in main_catalogue.iterrows()
)

In [28]:
# RUN 2
# ========

def run2_soft_labels(row):
    """Return the normalised run-2 soft label over six shape sub-classes.

    The components, in order, are the debiased vote fractions for

    * E sub-classes (task 7, roundedness): completely round (r),
      in between (i), cigar shaped (c);
    * Se sub-classes (task 9, bulge shape): rounded (r), boxy (b),
      no bulge (n);

    divided by their sum. The SB / S sub-classes (a, b, c, d) planned for
    this run are not computed by this function.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must be indexable by the six ``t07_*`` / ``t09_*`` debiased column
        names used below.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(6,)``. When all six fractions add up to zero the
        label falls back to all mass on the first class, so the result has
        the same length as in the normal case.
    """
    # E sub-classes (task 7).
    pr = row['t07_rounded_a16_completely_round_debiased']
    pi = row['t07_rounded_a17_in_between_debiased']
    pc = row['t07_rounded_a18_cigar_shaped_debiased']

    # Se sub-classes (task 9).
    pSer = row['t09_bulge_shape_a25_rounded_debiased']
    pSeb = row['t09_bulge_shape_a26_boxy_debiased']
    pSen = row['t09_bulge_shape_a27_no_bulge_debiased']

    # Normalize
    total = pr + pi + pc + pSer + pSeb + pSen
    if total == 0:
        # Fallback: assume (round) elliptical. It must have six entries, like
        # the regular return value; a four-entry vector here would give
        # labels of inconsistent length.
        return np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])

    return np.array([pr, pi, pc, pSer, pSeb, pSen]) / total

In [29]:
# NOTE(review): a stray bare name (`fefgefefef`) used to be the only content of
# this cell; evaluating it raised NameError and stopped "Restart & Run All"
# here. It has been removed so the notebook can run top to bottom.

In [30]:
# Show the first ten rows of the merged catalogue.
main_catalogue.head(10)

Unnamed: 0,specobjid,dr8objid,dr7objid,ra,dec,rastring,decstring,sample,gz2class,total_classifications,...,t11_arms_number_a36_more_than_4_weighted_fraction,t11_arms_number_a36_more_than_4_debiased,t11_arms_number_a36_more_than_4_flag,t11_arms_number_a37_cant_tell_count,t11_arms_number_a37_cant_tell_weight,t11_arms_number_a37_cant_tell_fraction,t11_arms_number_a37_cant_tell_weighted_fraction,t11_arms_number_a37_cant_tell_debiased,t11_arms_number_a37_cant_tell_flag,asset_id
0,3.231894e+17,1.237649e+18,587722981741363294,182.92526,-1.092357,12:11:42.06,-01:05:32.5,original,Ei,52,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,3
1,3.231899e+17,1.237649e+18,587722981741363323,182.97011,-1.219537,12:11:52.83,-01:13:10.3,original,Sc,30,...,0.0,0.0,0,1,1.0,1.0,1.0,1.0,0,4
2,3.23169e+17,1.237649e+18,587722981741559888,183.4381,-1.238414,12:13:45.14,-01:14:18.3,original,Er,53,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,5
3,3.231718e+17,1.237649e+18,587722981741625481,183.474,-1.231429,12:13:53.76,-01:13:53.1,original,Er,37,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,6
4,3.231679e+17,1.237649e+18,587722981741625484,183.47778,-1.084604,12:13:54.67,-01:05:04.6,original,Ei,45,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,7
5,3.231674e+17,1.237649e+18,587722981741625520,183.5262,-1.164011,12:14:06.29,-01:09:50.4,original,Ei,43,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,8
6,3.231685e+17,1.237649e+18,587722981741625545,183.56682,-1.135895,12:14:16.04,-01:08:09.2,original,Er,40,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,9
7,3.231558e+17,1.237649e+18,587722981741756545,183.82603,-1.109849,12:15:18.25,-01:06:35.5,original,Er,40,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,11
8,3.231446e+17,1.237649e+18,587722981741756579,183.9019,-1.173982,12:15:36.46,-01:10:26.3,original,Sc,44,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,12
9,3.23146e+17,1.237649e+18,587722981741822057,183.96939,-1.203995,12:15:52.65,-01:12:14.4,original,Ec,42,...,0.0,0.0,0,1,1.0,1.0,1.0,1.0,0,13


In [31]:
# Look up the soft label stored for asset_id 3.
# NOTE(review): the recorded output has six entries, whereas run1_soft_labels
# produces four; presumably soft_label_dict was rebuilt with run2_soft_labels
# in a cell that is no longer in the notebook — confirm and restore that cell.
soft_label_dict[3]

array([0.16192611, 0.32109753, 0.        , 0.51697636, 0.        ,
       0.        ])

In [32]:
from scipy.stats import entropy

def get_label_entropy(soft_label):
    """Return the Shannon entropy of a soft label, measured in bits.

    Parameters
    ----------
    soft_label : array-like
        Class weights; ``scipy.stats.entropy`` normalises them to sum to 1.

    Returns
    -------
    float
        0 when all mass sits on one class, ``log2(k)`` for a uniform label
        over ``k`` classes.
    """
    bits = entropy(pk=soft_label, base=2)
    return bits

In [33]:
# Entropy, in bits, of the soft label stored for asset_id 3.
get_label_entropy(soft_label_dict[3])

1.4436394524210754

In [None]:
# Count how many soft labels are ambiguous, i.e. have an entropy above
# 1.5 bits, and print that count.
c = 0
for i in soft_label_dict:
    label = soft_label_dict[i]
    if get_label_entropy(label) > 1.5:
        # The `if` needs a body: without this increment the cell is a
        # SyntaxError and the counter could never leave 0.
        c += 1

print(c)

45580


In [42]:
# Remove, in place, the catalogue row(s) whose asset_id equals 3.
# Only the DataFrame is changed; soft_label_dict is not touched by this cell.
main_catalogue.drop(
    index=main_catalogue.loc[main_catalogue['asset_id'] == 3].index,
    inplace=True,
)

In [43]:
# Preview the catalogue again; the row with asset_id 3 is no longer listed
# and the index now starts at 1.
main_catalogue.head()

Unnamed: 0,specobjid,dr8objid,dr7objid,ra,dec,rastring,decstring,sample,gz2class,total_classifications,...,t11_arms_number_a36_more_than_4_weighted_fraction,t11_arms_number_a36_more_than_4_debiased,t11_arms_number_a36_more_than_4_flag,t11_arms_number_a37_cant_tell_count,t11_arms_number_a37_cant_tell_weight,t11_arms_number_a37_cant_tell_fraction,t11_arms_number_a37_cant_tell_weighted_fraction,t11_arms_number_a37_cant_tell_debiased,t11_arms_number_a37_cant_tell_flag,asset_id
1,3.231899e+17,1.237649e+18,587722981741363323,182.97011,-1.219537,12:11:52.83,-01:13:10.3,original,Sc,30,...,0.0,0.0,0,1,1.0,1.0,1.0,1.0,0,4
2,3.23169e+17,1.237649e+18,587722981741559888,183.4381,-1.238414,12:13:45.14,-01:14:18.3,original,Er,53,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,5
3,3.231718e+17,1.237649e+18,587722981741625481,183.474,-1.231429,12:13:53.76,-01:13:53.1,original,Er,37,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,6
4,3.231679e+17,1.237649e+18,587722981741625484,183.47778,-1.084604,12:13:54.67,-01:05:04.6,original,Ei,45,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,7
5,3.231674e+17,1.237649e+18,587722981741625520,183.5262,-1.164011,12:14:06.29,-01:09:50.4,original,Ei,43,...,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,8


In [48]:
# First five asset_ids (keys) held in soft_label_dict, in insertion order.
list(soft_label_dict)[:5]

[3, 4, 5, 6, 7]