In [53]:
from pathlib import Path
import logging
import pandas as pd

from db import DB

In [2]:
# Route every record at DEBUG level and above to gzoo.log, with each line
# prefixed by its timestamp.
logging.basicConfig(
    level=logging.DEBUG,
    filename='gzoo.log',
    format='%(asctime)s %(message)s',
)

In [32]:
# Instantiate the project's DB helper (imported from db.py) and read its
# settings; `pg` is the handle the later cells use to run SQL.
pg = DB()
params = pg.read_params()  # dict of connection settings and data locations

In [33]:
# Display the loaded settings for inspection, masking the password so the
# credential is not written into the notebook's saved cell output.
{key: ('***' if key == 'password' else value) for key, value in params.items()}

{'dbname': 'gzoo',
 'host': '192.168.1.151',
 'username': 'python',
 'password': '***',
 'dataroot': '/home/colin/data/munch1tb/zoobot_data',
 'mapping': 'gz2_catalog/zoo2MainSpecz_fields.txt',
 'datafile': 'gz2_catalog/zoo2MainSpecz.csv',
 'sdssdr7': 'gzimg/sdssdr7',
 'sdsspng': 'gzimg/sdsspng',
 'decalsdr5': 'gz_decals_dr5_png'}

In [35]:
# Resolve the image directories; each configured sub-path is taken relative
# to the data root.
dataroot = Path(params['dataroot'])
jpg_path = dataroot.joinpath(params['sdssdr7'])       # SDSS DR7 images
png_path = dataroot.joinpath(params['sdsspng'])       # SDSS PNG images
decals_path = dataroot.joinpath(params['decalsdr5'])  # DECaLS DR5 PNG images

In [40]:
def _sql_literal(value):
    """Render ``value`` as a single-quoted SQL string literal."""
    # Doubling embedded single quotes is the standard SQL escape; without it a
    # file name or path containing an apostrophe breaks (or injects into) the
    # statement. NOTE(review): assumes the server treats backslashes literally
    # (PostgreSQL's default standard_conforming_strings=on) -- confirm.
    return "'" + str(value).replace("'", "''") + "'"


def insert_rec(id_str, dr7id, path, size, survey, filetype):
    """Insert one image record into the ``img`` table.

    Rows that conflict with an existing record are silently ignored
    (``on conflict do nothing``), so re-running a load is harmless.

    Args:
        id_str: Image identifier, stored as text.
        dr7id: Integer id, or ``None`` to store SQL ``NULL``.
        path: Location of the image file (``str`` or ``Path``).
        size: Integer size value stored alongside the image.
        survey: Survey label, stored as text.
        filetype: File extension / type label, stored as text.

    Raises:
        ValueError, TypeError: If ``dr7id`` (when not ``None``) or ``size``
            cannot be converted to ``int``.
    """
    # Numeric fields are forced through int() so that nothing but a number can
    # ever be spliced into the statement unquoted.
    dr7id_sql = 'NULL' if dr7id is None else int(dr7id)
    size_sql = int(size)

    # NOTE(review): bound parameters would be preferable to building the
    # statement as text, but run_admin()'s signature is not visible here.
    sql = f"""
        insert into img
        (id_str, dr7id, path, size, survey, filetype)
        values ({_sql_literal(id_str)}, {dr7id_sql}, {_sql_literal(path)}, {size_sql}, {_sql_literal(survey)}, {_sql_literal(filetype)})
        on conflict do nothing
        """
    pg.run_admin(sql)

In [29]:
def gz2(path, stem, limit=None, size=424):
    """Register every ``*.<stem>`` file under ``path`` as an SDSSDR7 image.

    The directory tree is walked recursively. Each file's stem (its name
    without the extension) is used both as the text id and, converted to
    ``int``, as the dr7id passed to ``insert_rec``.

    Args:
        path: ``pathlib.Path`` of the directory tree to scan.
        stem: File extension to match, without the dot (e.g. ``'jpg'``); it is
            also stored as the record's file type.
        limit: Optional maximum number of records to insert, for trial runs.
            ``None`` (the default) processes every matching file.
        size: Size value stored with each record.
    """
    count = 0
    for f in path.rglob(f'*.{stem}'):
        # An explicit, optional cap replaces the hard-coded debugging break
        # that previously stopped every run after six files.
        if limit is not None and count >= limit:
            break

        name = f.stem
        try:
            dr7id = int(name)
        except ValueError:
            # One stray file should not abort a long-running load.
            logging.warning(f'gzoo.img, skipping {f}: file name is not an integer id')
            continue

        insert_rec(name, dr7id, f, size, 'SDSSDR7', stem)
        count += 1
        if count % 100000 == 0:
            logging.info(f'gzoo.img, {count} records inserted')

    logging.info(f'gzoo.img, finished, {count} records inserted')

In [30]:
# Register the *.jpg files found under the SDSS DR7 image directory.
gz2(jpg_path, 'jpg')

In [44]:
def decals(path, stem, size=424):
    """Register every ``*.<stem>`` file under ``path`` as a DECaLS_5 image.

    The directory tree is walked recursively. Each file's stem (its name
    without the extension) becomes the text id; no dr7id is supplied, so
    ``insert_rec`` stores NULL for it.

    Args:
        path: ``pathlib.Path`` of the directory tree to scan.
        stem: File extension to match, without the dot (e.g. ``'png'``); it is
            also stored as the record's file type.
        size: Size value stored with each record.
    """
    count = 0
    for f in path.rglob(f'*.{stem}'):
        insert_rec(f.stem, None, f, size, 'DECaLS_5', stem)
        count += 1
        # Periodic progress marker for long loads.
        if count % 100000 == 0:
            logging.info(f'gzoo.img, {count} records inserted')

    logging.info(f'gzoo.img, finished, {count} records inserted')

In [45]:
# Register the *.png files found under the DECaLS DR5 image directory.
decals(decals_path, 'png')

In [57]:
# Pair each PNG row of img with the smooth / features-or-disk vote counts of
# the gz2data row whose dr7objid equals the image's dr7id.
# NOTE(review): the cells visible in this notebook only register 'jpg' files
# with a dr7id; the DECaLS PNGs are stored with a NULL dr7id and cannot match
# this join. The query therefore seems to rely on PNG rows loaded elsewhere
# (presumably gz2(png_path, 'png')) -- confirm.
sql = """
SELECT  
    id_str, 
    path as file_loc, 
    t01_smooth_or_features_a01_smooth_count,
    t01_smooth_or_features_a02_features_or_disk_count
FROM 
    gz2data g, img
WHERE 
    filetype = 'png' 
AND 
    g.dr7objid = img.dr7id
"""

# Run the query through the project's DB helper and keep the returned rows.
data = pg.run_select(sql)

In [58]:
# Column names for the exported table, one per selected column and in the
# same order as the SELECT list.
headers = [
    'id_str',
    'file_loc',
    'smooth-or-featured_smooth',
    'smooth-or-featured_featured-or-disk',
]

In [69]:
# Build the labelled table from the query rows, dropping trailing whitespace
# from the ids (presumably padding from a fixed-width column -- confirm).
df = (
    pd.DataFrame(data, columns=headers)
    .assign(id_str=lambda frame: frame['id_str'].str.rstrip())
)

# Write the table out without the pandas row index.
df.to_csv('gz2_partial_pairs.csv', index=False)