In [2]:
import os
import csv

In [3]:
# Catalog CSV read row by row with csv.DictReader, and the directory that
# receives one '<id>.txt' tag file per catalog row.
catalog_filename = './gz2_hart16.csv'
output_directory = './tags/'

In [4]:
# CSV whose OBJID and asset_id columns are read to build the id -> asset_id lookup.
mapping_filename = './gz2_filename_mapping.csv'

##### Follow the question tree
###### Refer to galaxyzoo2-hart16-question tree.pdf
###### Task numbers in this code correspond to the catalog file, which starts from task 01; they do not correspond to the numbering in the question tree, which starts from task 00.

In [None]:
column_name_to_index = {}


def _first_max_label(row, choices):
    """Pick the label of the answer with the highest weighted fraction.

    `choices` is an ordered sequence of (column_name, label) pairs. Every
    column is parsed with float(); the label of the first column equal to the
    maximum is returned and the last pair acts as the fallback, which mirrors
    an if/elif/else chain (ties go to the earliest answer).
    """
    fractions = [float(row[column]) for column, _ in choices]
    top = max(fractions)
    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == top:
            return label
    return choices[-1][1]


# Create the destination up front so the first write cannot fail on a missing directory.
os.makedirs(output_directory, exist_ok=True)

# Read gz2 catalog and process each row
# newline='' is what the csv module expects for files it reads.
with open(catalog_filename, 'r', newline='') as file:
    reader = csv.DictReader(file)
    # fieldnames is None for an empty file; fall back to no columns.
    for index, column_name in enumerate(reader.fieldnames or []):
        column_name_to_index[column_name] = index

    for row in reader:
        # Reset every optional label for each row. Without this, branches that
        # are not taken either leave the name undefined (NameError when the
        # labels are written) or leak the previous row's value into this row.
        label_t02 = label_t03 = label_t04 = label_t05 = None
        label_t07 = label_t09 = label_t10 = label_t11 = None

        # Task 01 (Task 00 in the question tree): smooth, features/disk, or star/artifact
        label_t01 = _first_max_label(row, (
            ('t01_smooth_or_features_a01_smooth_weighted_fraction', 'smooth'),
            ('t01_smooth_or_features_a02_features_or_disk_weighted_fraction', 'features or disk-shaped'),
            ('t01_smooth_or_features_a03_star_or_artifact_weighted_fraction', 'star or artifact'),
        ))

        if label_t01 == 'smooth':
            # If smooth, go to Task 07: how rounded
            label_t07 = _first_max_label(row, (
                ('t07_rounded_a16_completely_round_weighted_fraction', 'completely round galaxy'),
                ('t07_rounded_a17_in_between_weighted_fraction', 'in-between round galaxy'),
                ('t07_rounded_a18_cigar_shaped_weighted_fraction', 'cigar shaped galaxy'),
            ))

        elif label_t01 == 'features or disk-shaped':
            # Task 02: Check if it's an edge-on disk
            t02_yes = float(row['t02_edgeon_a04_yes_weighted_fraction'])
            t02_no = float(row['t02_edgeon_a05_no_weighted_fraction'])

            if t02_yes > t02_no:
                label_t02 = 'edge-on galaxy'

                # Task 09: Bulge shape ("no bulge" produces no label)
                label_t09 = _first_max_label(row, (
                    ('t09_bulge_shape_a25_rounded_weighted_fraction', 'with rounded edge-on bulge'),
                    ('t09_bulge_shape_a26_boxy_weighted_fraction', 'with boxy edge-on bulge'),
                    ('t09_bulge_shape_a27_no_bulge_weighted_fraction', None),
                ))
            else:
                # Task 03: Bar structure (a tie counts as "bar", as in the original comparison)
                label_t03 = _first_max_label(row, (
                    ('t03_bar_a06_bar_weighted_fraction', 'bar-shaped structure in the center of galaxy'),
                    ('t03_bar_a07_no_bar_weighted_fraction', None),
                ))

                # Task 04: Check for spiral arm patterns
                t04_spiral = float(row['t04_spiral_a08_spiral_weighted_fraction'])
                t04_no_spiral = float(row['t04_spiral_a09_no_spiral_weighted_fraction'])
                if t04_spiral > t04_no_spiral:
                    label_t04 = 'spiral galaxy'

                    # Task 10: Spiral arms winding
                    label_t10 = _first_max_label(row, (
                        ('t10_arms_winding_a28_tight_weighted_fraction', 'tightly wound spiral arms'),
                        ('t10_arms_winding_a29_medium_weighted_fraction', 'medium wound spiral arms'),
                        ('t10_arms_winding_a30_loose_weighted_fraction', 'loosely wound spiral arms'),
                    ))

                    # Task 11: Number of spiral arms
                    label_t11 = _first_max_label(row, (
                        ('t11_arms_number_a31_1_weighted_fraction', '1 spiral arm'),
                        ('t11_arms_number_a32_2_weighted_fraction', '2 spiral arms'),
                        ('t11_arms_number_a33_3_weighted_fraction', '3 spiral arms'),
                        ('t11_arms_number_a34_4_weighted_fraction', '4 spiral arms'),
                        ('t11_arms_number_a36_more_than_4_weighted_fraction', 'more than 4 spiral arms'),
                    ))
                # Non-spiral rows keep label_t04/t10/t11 as None from the reset
                # above (the old `a, b, c = None` unpacking raised TypeError).

                # Task 05: Central bulge prominence ("no bulge" produces no label)
                label_t05 = _first_max_label(row, (
                    ('t05_bulge_prominence_a10_no_bulge_weighted_fraction', None),
                    ('t05_bulge_prominence_a11_just_noticeable_weighted_fraction', 'just noticeable bulge in center'),
                    ('t05_bulge_prominence_a12_obvious_weighted_fraction', 'obvious bulge in center'),
                    ('t05_bulge_prominence_a13_dominant_weighted_fraction', 'dominant bulge in center'),
                ))

        # else: star or artifact, no further questions apply

        # Task 08 odd feature
        # NOTE(review): label_t08 is computed for every row but is not part of
        # the labels written below — confirm whether it should be emitted.
        label_t08 = _first_max_label(row, (
            ('t08_odd_feature_a19_ring_weighted_fraction', 'odd feature is a ring'),
            ('t08_odd_feature_a20_lens_or_arc_weighted_fraction', 'odd feature is a lens or arc'),
            ('t08_odd_feature_a21_disturbed_weighted_fraction', 'odd feature is a disturbed galaxy'),
            ('t08_odd_feature_a22_irregular_weighted_fraction', 'odd feature is an irregular galaxy'),
            ('t08_odd_feature_a23_other_weighted_fraction', 'something else odd feature'),
            ('t08_odd_feature_a24_merger_weighted_fraction', 'a merger'),
            ('t08_odd_feature_a38_dust_lane_weighted_fraction', 'dust lane'),
        ))

        # Write labels to file: comma-separated, skipping questions that gave no label
        dr7objid = row['dr7objid']
        output_filename = os.path.join(output_directory, f'{dr7objid}.txt')
        labels = [label_t01, label_t02, label_t03, label_t04, label_t05, label_t07, label_t09, label_t10, label_t11]
        with open(output_filename, 'w') as output_file:
            output_file.write(", ".join(label for label in labels if label is not None))

print("Successfully completed")


In [10]:
def task_01(row):
    """Task 01: label the row as smooth, features/disk, or star/artifact.

    Parses the three Task 01 weighted-fraction columns of `row` and returns
    the label of the largest one. On a tie the earlier answer wins; the
    star/artifact label is the fallback.
    """
    choices = (
        ('t01_smooth_or_features_a01_smooth_weighted_fraction', 'smooth'),
        ('t01_smooth_or_features_a02_features_or_disk_weighted_fraction', 'features or disk-shaped'),
        ('t01_smooth_or_features_a03_star_or_artifact_weighted_fraction', 'star or artifact'),
    )
    fractions = [float(row[column]) for column, _ in choices]
    highest = max(fractions)

    # Only the non-final answers are compared; the final one is the default.
    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == highest:
            return label
    return choices[-1][1]

def task_02(row):
    """Task 02: return 'edge-on galaxy' when "yes" strictly outweighs "no".

    Returns None otherwise (including on a tie).
    """
    edge_on_fraction = float(row['t02_edgeon_a04_yes_weighted_fraction'])
    not_edge_on_fraction = float(row['t02_edgeon_a05_no_weighted_fraction'])
    return 'edge-on galaxy' if edge_on_fraction > not_edge_on_fraction else None
    
def task_03(row):
    """Task 03: return the bar label when "bar" strictly outweighs "no bar".

    Returns None otherwise (including on a tie).
    """
    bar_fraction = float(row['t03_bar_a06_bar_weighted_fraction'])
    no_bar_fraction = float(row['t03_bar_a07_no_bar_weighted_fraction'])
    return 'bar-shaped structure in the center of galaxy' if bar_fraction > no_bar_fraction else None

def task_04(row):
    """Task 04: return 'spiral galaxy' when "spiral" strictly outweighs "no spiral".

    Returns None otherwise (including on a tie).
    """
    spiral_fraction = float(row['t04_spiral_a08_spiral_weighted_fraction'])
    no_spiral_fraction = float(row['t04_spiral_a09_no_spiral_weighted_fraction'])
    return 'spiral galaxy' if spiral_fraction > no_spiral_fraction else None

def task_05(row):
    """Task 05: describe the bulge prominence with the highest weighted fraction.

    Returns None when "no bulge" is (or ties for) the largest answer;
    otherwise the label of the first answer equal to the maximum, with the
    "dominant" label as the fallback.
    """
    choices = (
        ('t05_bulge_prominence_a10_no_bulge_weighted_fraction', None),
        ('t05_bulge_prominence_a11_just_noticeable_weighted_fraction', 'just noticeable bulge prominence'),
        ('t05_bulge_prominence_a12_obvious_weighted_fraction', 'obvious bulge prominence'),
        ('t05_bulge_prominence_a13_dominant_weighted_fraction', 'dominant bulge prominence'),
    )
    fractions = [float(row[column]) for column, _ in choices]
    highest = max(fractions)

    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == highest:
            return label
    return choices[-1][1]
    
def task_06(row):
    """Task 06: return 'something odd' when "yes" strictly outweighs "no".

    Returns None otherwise (including on a tie).
    """
    odd_fraction = float(row['t06_odd_a14_yes_weighted_fraction'])
    not_odd_fraction = float(row['t06_odd_a15_no_weighted_fraction'])
    return 'something odd' if odd_fraction > not_odd_fraction else None

def task_07(row):
    """Task 07: describe how rounded the galaxy is.

    Returns the label of the first Task 07 answer whose weighted fraction
    equals the maximum; the cigar-shaped label is the fallback.
    """
    choices = (
        ('t07_rounded_a16_completely_round_weighted_fraction', 'completely round galaxy'),
        ('t07_rounded_a17_in_between_weighted_fraction', 'in-between round galaxy'),
        ('t07_rounded_a18_cigar_shaped_weighted_fraction', 'cigar shaped galaxy'),
    )
    fractions = [float(row[column]) for column, _ in choices]
    highest = max(fractions)

    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == highest:
            return label
    return choices[-1][1]

def task_08(row):
    """Task 08: name the odd feature with the highest weighted fraction.

    Returns the label of the first Task 08 answer equal to the maximum, in
    column order; 'dust lane' is the fallback.
    """
    choices = (
        ('t08_odd_feature_a19_ring_weighted_fraction', 'odd feature is a ring'),
        ('t08_odd_feature_a20_lens_or_arc_weighted_fraction', 'odd feature is a lens or arc'),
        ('t08_odd_feature_a21_disturbed_weighted_fraction', 'odd feature is a disturbed galaxy'),
        ('t08_odd_feature_a22_irregular_weighted_fraction', 'odd feature is an irregular galaxy'),
        ('t08_odd_feature_a23_other_weighted_fraction', 'something else odd feature'),
        ('t08_odd_feature_a24_merger_weighted_fraction', 'a merger'),
        ('t08_odd_feature_a38_dust_lane_weighted_fraction', 'dust lane'),
    )
    fractions = [float(row[column]) for column, _ in choices]
    highest = max(fractions)

    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == highest:
            return label
    return choices[-1][1]


def task_09(row):
    """Task 09: describe the bulge shape.

    Returns the rounded or boxy label when that answer is the first to equal
    the maximum weighted fraction; returns None when "no bulge" wins.
    """
    choices = (
        ('t09_bulge_shape_a25_rounded_weighted_fraction', 'with rounded edge-on bulge'),
        ('t09_bulge_shape_a26_boxy_weighted_fraction', 'with boxy edge-on bulge'),
        ('t09_bulge_shape_a27_no_bulge_weighted_fraction', None),
    )
    fractions = [float(row[column]) for column, _ in choices]
    highest = max(fractions)

    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == highest:
            return label
    return choices[-1][1]

def task_10(row):
    """Task 10: describe how tightly the spiral arms are wound.

    Returns the label of the first Task 10 answer whose weighted fraction
    equals the maximum; the loosely-wound label is the fallback.
    """
    choices = (
        ('t10_arms_winding_a28_tight_weighted_fraction', 'tightly wound spiral arms'),
        ('t10_arms_winding_a29_medium_weighted_fraction', 'medium wound spiral arms'),
        ('t10_arms_winding_a30_loose_weighted_fraction', 'loosely wound spiral arms'),
    )
    fractions = [float(row[column]) for column, _ in choices]
    highest = max(fractions)

    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == highest:
            return label
    return choices[-1][1]

def task_11(row):
    """Task 11: report the number of spiral arms.

    Returns the label of the first Task 11 answer (1, 2, 3, 4 arms, in that
    order) whose weighted fraction equals the maximum; the "more than 4"
    label is the fallback.
    """
    choices = (
        ('t11_arms_number_a31_1_weighted_fraction', '1 spiral arm'),
        ('t11_arms_number_a32_2_weighted_fraction', '2 spiral arms'),
        ('t11_arms_number_a33_3_weighted_fraction', '3 spiral arms'),
        ('t11_arms_number_a34_4_weighted_fraction', '4 spiral arms'),
        ('t11_arms_number_a36_more_than_4_weighted_fraction', 'more than 4 spiral arms'),
    )
    fractions = [float(row[column]) for column, _ in choices]
    highest = max(fractions)

    for fraction, (_, label) in zip(fractions, choices[:-1]):
        if fraction == highest:
            return label
    return choices[-1][1]


In [11]:
# Read the filename-mapping CSV and build a dictionary from each row's OBJID
# value to its asset_id value. If an OBJID appears more than once, the last
# row wins.

asset_id_mapping = {}

# newline='' is what the csv module expects for files it reads, so that
# newlines embedded in quoted fields are interpreted correctly.
with open(mapping_filename, 'r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        asset_id_mapping[row['OBJID']] = row['asset_id']

In [18]:
def write_file(row, label_t01, 
                    label_t07, label_t02, 
                    label_t09, label_t03, label_t04, 
                    label_t05, label_t10, label_t11,
                    label_t06, label_t08):
    """Write one row's labels to '<asset_id>.txt' inside output_directory.

    The asset id is looked up in asset_id_mapping using the row's 'dr7objid'
    value. Labels are written on a single line, comma-separated, in the order
    of the tiers in the question tree; labels that are None (or empty) are
    left out.

    If the row's dr7objid has no entry in asset_id_mapping, a warning is
    issued and nothing is written. Previously such rows were written to
    'None.txt', each one overwriting the last.

    Returns None.
    """
    dr7objid = row['dr7objid']
    asset_id = asset_id_mapping.get(dr7objid)
    if asset_id is None:
        # Local import: keeps this cell self-contained without touching the imports cell.
        import warnings
        warnings.warn(f"No asset_id mapping for dr7objid {dr7objid}; tag file not written")
        return

    # follow the order of tiers in question tree
    labels = [label_t01,
              label_t07, label_t02,
              label_t09, label_t03, label_t04,
              label_t05, label_t10, label_t11,
              label_t06, label_t08]

    output_filename = os.path.join(output_directory, f'{asset_id}.txt')
    with open(output_filename, 'w') as output_file:
        output_file.write(", ".join(filter(None, labels)))

In [19]:
# Initialize row counter
row_counter = 0

# Create the destination up front so the first write cannot fail on a missing directory.
os.makedirs(output_directory, exist_ok=True)

# main process: walk the question tree for every catalog row and write its tags
# newline='' is what the csv module expects for files it reads.
with open(catalog_filename, 'r', newline='') as file:
    reader = csv.DictReader(file)

    for row in reader:
        row_counter += 1 # Increment counter

        # Reset every label so questions that are not reached for this row
        # contribute nothing (and nothing leaks in from the previous row).
        label_t01 = label_t02 = label_t03 = label_t04 = label_t05 = label_t06 = None
        label_t07 = label_t08 = label_t09 = label_t10 = label_t11 = None

        # Task 01: smooth or feature disk
        label_t01 = task_01(row)

        if label_t01 == 'smooth':
            # how round - complete, in-between, cigar
            label_t07 = task_07(row)

        elif label_t01 == 'features or disk-shaped':
            # edge-on or not
            label_t02 = task_02(row)

            if label_t02 == 'edge-on galaxy':
                # bulge shape
                label_t09 = task_09(row)
            else:
                # bar in center or not
                label_t03 = task_03(row)
                # spiral galaxy or not
                label_t04 = task_04(row)
                if label_t04 == 'spiral galaxy':
                    # arm winding
                    label_t10 = task_10(row)
                    # spiral arms number
                    label_t11 = task_11(row)

                # bulge prominence
                label_t05 = task_05(row)

        # Both galaxy branches end with the "anything odd" question;
        # stars/artifacts skip it.
        if label_t01 != 'star or artifact':
            # anything odd or not
            label_t06 = task_06(row)
            if label_t06 == 'something odd':
                # detailed odd feature
                label_t08 = task_08(row)

        # Write labels to file exactly once per row. The earlier version also
        # wrote inside some branches and then wrote the same content again here.
        write_file(row, label_t01, label_t07, label_t02, label_t09, label_t03, label_t04,
                   label_t05, label_t10, label_t11, label_t06, label_t08)

print(f"Total rows processed: {row_counter}")
print("Processing completed successfully!")

Total rows processed: 239695
Processing completed successfully!


In [20]:
# Count the entries in the tags directory.
# NOTE(review): hard-coded absolute path; presumably the same location as output_directory — confirm.
print(len(os.listdir("/data_150T/home/mcr9196/galaxyzoo/tags")))

239695


In [5]:
# Count the entries in the images directory.
# NOTE(review): hard-coded absolute path; consider deriving it from a configurable data directory.
print(len(os.listdir("/data_150T/home/mcr9196/galaxyzoo/images")))

243437
