### Run upon export from spreadsheet

In [None]:
import os

from astroquery.mast import Catalogs
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 50)

def _load_tces(tces_file, corrections_file):
    """Load the TCE table and overlay manually corrected ephemerides.

    Args:
        tces_file: Path to the TCE CSV. Must contain a ``tic_id`` column and
            an ``Unnamed: 0`` column (a previously written index).
        corrections_file: Path to the corrections CSV with columns ``tic``,
            ``epo``, ``per``, ``dur`` and ``dep``.

    Returns:
        DataFrame indexed by ``tic_id`` holding only the feature columns
        that are exported downstream.
    """
    tce_table = pd.read_csv(tces_file, header=0, low_memory=False).set_index('tic_id')
    tce_table = tce_table.drop(columns=['Unnamed: 0'])
    # NOTE(review): presumably converts hours to days -- confirm against the source table.
    tce_table['Duration'] /= 24.0

    corrections_table = pd.read_csv(corrections_file, header=0, low_memory=False)
    corrections_table['tic_id'] = corrections_table['tic']
    corrections_table = corrections_table.set_index('tic_id')
    # Same scaling as the TCE durations above.
    corrections_table['dur'] /= 24.0
    # NOTE(review): presumably converts a fractional depth to ppm -- confirm.
    corrections_table['dep'] *= 1e6

    joined_table = tce_table.join(corrections_table, on='tic_id', how='left')

    # Wherever a correction exists it replaces the original TCE value.
    overrides = (
        ('Epoc', 'epo'),
        ('Duration', 'dur'),
        ('Period', 'per'),
        ('Transit_Depth', 'dep'),
    )
    for target, source in overrides:
        joined_table[target] = np.where(
            joined_table[source].isna(), joined_table[target], joined_table[source])

    joined_table = joined_table.reset_index()[[
        'tic_id', 'RA', 'Dec', 'Tmag', 'Epoc', 'Period', 'Duration',
        'Transit_Depth', 'Sectors', 'star_rad', 'star_mass', 'teff',
        'logg', 'SN', 'Qingress'
    ]]
    return joined_table.set_index('tic_id')


def _load_labels(labels_file, disps, users):
    """Load the vetting spreadsheet export and turn labels into ``disp_*`` counts.

    Each label is read as a two-character string; every character selects
    one ``disp_<char>`` column. When ``Final`` is set, the two selected
    columns are set to 1 and individual votes are ignored. Otherwise each
    non-empty user vote increments its two selected columns by 1.

    Args:
        labels_file: Path to the labels CSV. Must contain the columns
            ``261262721`` (dropped), ``TIC ID``, ``Final``, ``Split`` and
            one column per entry in ``users``.
        disps: Disposition characters for which ``disp_*`` columns are created.
        users: Names of the per-user vote columns.

    Returns:
        DataFrame indexed by ``tic_id`` with ``Split`` and the ``disp_*`` columns.
    """
    labels_table = pd.read_csv(labels_file, header=0, low_memory=False)
    labels_table = labels_table.drop(columns=['261262721'])
    labels_table['tic_id'] = labels_table['TIC ID']

    for d in disps:
        labels_table[f'disp_{d}'] = 0

    def set_labels(row):
        present = row.notna()
        if present['Final']:
            row[f'disp_{row["Final"][0]}'] = 1
            row[f'disp_{row["Final"][1]}'] = 1
        else:
            for user in users:
                if present[user] and row[user]:
                    row[f'disp_{row[user][0]}'] += 1
                    row[f'disp_{row[user][1]}'] += 1
        return row

    labels_table = labels_table.apply(set_labels, axis=1)
    labels_table = labels_table[['tic_id', 'Split'] + [f'disp_{d}' for d in disps]]
    return labels_table.set_index('tic_id')


def _load_tois(toi_file):
    """Load the TOI/EB label table and assign a reproducible random split.

    Re-seeds NumPy's global random generator with 1117 so that the split is
    the same on every call: draws below 80 go to ``train``, draws of 90 and
    above go to ``test``, the rest go to ``val``.

    Args:
        toi_file: Path to the CSV; must contain a ``tic_id`` column.

    Returns:
        DataFrame indexed by ``tic_id`` with an added ``Split`` column and
        without any ``rand`` column.
    """
    toi = pd.read_csv(toi_file, header=0, low_memory=False).set_index('tic_id')

    np.random.seed(1117)
    rand = np.random.randint(0, 100, [len(toi)])

    # Any 'rand' column stored in the file is superseded by the seeded draw above.
    toi = toi.drop(columns=['rand'], errors='ignore')
    toi['Split'] = np.where(rand < 80, 'train', np.where(rand >= 90, 'test', 'val'))
    return toi


def generate_data(include_toi):
    """Build the train/val/test tables of labeled TCEs.

    Reads the TCE table, applies ephemeris corrections, inner-joins the
    curated vetting labels and optionally adds TOI/EB entries whose TIC is
    not already in the curated set. Rows where the ``disp_*`` columns do not
    sum to a positive value are dropped. Summary counts are printed.

    Args:
        include_toi: If true, add the TOI/EB table to the curated rows.

    Returns:
        Tuple ``(t_train, t_val, t_test, all_table)`` of DataFrames indexed
        by ``tic_id``, all without the ``Split`` column.
    """
    tces_file = '../mnt/tess/labels/tce_bls_to_y3.csv'
    labels_file = '../mnt/tess/labels/labels_vetting_v1.csv'
    corrections_file = '../mnt/tess/labels/e_label_ephemerides.csv'
    toi_file = '../mnt/tess/labels/tce_toi_vetting_p+eb.csv'

    disps = ['e', 'p', 'n', 'b', 't', 'u']
    users = ['ch', 'et', 'md', 'as', 'mk']

    joined_table = _load_tces(tces_file, corrections_file)
    labels_table = _load_labels(labels_file, disps, users)
    joined_table = joined_table.join(labels_table, on='tic_id', how='inner')

    # Loaded unconditionally so the global RNG is seeded the same way in both modes.
    toi = _load_tois(toi_file)

    if include_toi:
        # Trust the curated labels. This might help us to detect any inconsistencies in TOI.
        toi = toi[~toi.index.isin(joined_table.index.values)]
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        joined_table = pd.concat([joined_table, toi])

    print(f'Total entries: {len(joined_table)}')
    joined_table = joined_table[
        sum(joined_table[f'disp_{d}'] for d in disps) > 0
    ]
    print(f'Total labeled entries: {len(joined_table)}')

    all_table = joined_table

    t_train = joined_table[joined_table['Split'] == 'train']
    t_val = joined_table[joined_table['Split'] == 'val']
    t_test = joined_table[joined_table['Split'] == 'test']
    print(f'Split sizes. Train: {len(t_train)}; Valid: {len(t_val)}; Test: {len(t_test)}')
    print(f'Duplicate TICs: {len(all_table.index.values) - len(set(all_table.index.values))}')

    t_train, t_val, t_test, all_table = (
        table.drop(columns=['Split'])
        for table in (t_train, t_val, t_test, all_table)
    )

    return t_train, t_val, t_test, all_table


# Export the splits that include the TOI/EB entries, plus the combined table.
t_train, t_val, t_test, all_table = generate_data(True)
for out_path, table in (
    ('../mnt/tess/astronet/tces-vetting-v7-toi-train.csv', t_train),
    ('../mnt/tess/astronet/tces-vetting-v7-toi-val.csv', t_val),
    ('../mnt/tess/astronet/tces-vetting-v7-toi-test.csv', t_test),
    ('../mnt/tess/astronet/tces-vetting-all.csv', all_table),
):
    table.to_csv(out_path)

# Export the splits built from the curated labels only; the combined table
# returned by this call is discarded, so all_table keeps the TOI-inclusive one.
t_train, t_val, t_test, _ = generate_data(False)
for out_path, table in (
    ('../mnt/tess/astronet/tces-vetting-v7-train.csv', t_train),
    ('../mnt/tess/astronet/tces-vetting-v7-val.csv', t_val),
    ('../mnt/tess/astronet/tces-vetting-v7-test.csv', t_test),
):
    table.to_csv(out_path)


In [None]:
# Show 30 randomly sampled rows of the combined table for a quick visual check.
all_table.sample(30)

### Run once

In [None]:
import numpy as np
import pandas as pd

def clean_tois():
    """Build the combined TOI + eclipsing-binary label table and write it to CSV.

    Reads three files under ``../mnt/tess/labels/``:

    * ``toi_p.csv``: rows with ``Period`` below 99999 are kept and labeled
      with ``disp_p = 1``.
    * ``ebs_ephemerides.csv``: columns ``tic``, ``epo``, ``per``, ``dur``
      and ``dep``. Stellar and position columns are left empty for these rows.
    * ``additionalebs.csv`` (no header): column 0 is the TIC id and column 3
      the subtype, one of ``b``, ``t`` or ``u``. Only ephemerides with a
      matching row here are kept; they get ``disp_e = 1`` plus the matching
      subtype flag.

    The concatenated table is written to ``tce_toi_vetting_p+eb.csv`` in the
    same directory and the row counts are printed.

    Raises:
        ValueError: If a joined row has a subtype other than ``b``, ``t`` or
            ``u``. The first offending row is passed as the error argument.
    """
    columns = [
        'tic_id',
        'RA',
        'Dec',
        'Tmag',
        'Epoc',
        'Period',
        'Duration',
        'Transit_Depth',
        'star_rad',
        'star_mass',
        'teff',
        'logg',
        'disp_e',
        'disp_p',
        'disp_n',
        'disp_b',
        'disp_t',
        'disp_u',
    ]

    toi = pd.read_csv('../mnt/tess/labels/toi_p.csv', header=0, low_memory=False)

    # Copy so the label columns are added to an independent frame, not to a
    # slice of the frame that was read.
    toi = toi[toi['Period'] < 99999].copy()

    toi['disp_p'] = 1
    for d in ('e', 'b', 't', 'u', 'n'):
        toi[f'disp_{d}'] = 0

    toi = toi[columns].copy()
    # Unseeded draw, one integer in [0, 100) per row.
    toi['rand'] = np.random.randint(0, 100, len(toi))

    ebs = pd.read_csv('../mnt/tess/labels/ebs_ephemerides.csv', header=0, low_memory=False)
    # NOTE(review): presumably hours -> days and fraction -> ppm; confirm units.
    ebs['dur'] /= 24.0
    ebs['dep'] *= 1e6

    ebs['tic_id'] = ebs['tic']
    ebs['Epoc'] = ebs['epo']
    ebs['Period'] = ebs['per']
    ebs['Duration'] = ebs['dur']
    ebs['Transit_Depth'] = ebs['dep']
    # These values are not available in the ephemerides file.
    for missing in ('RA', 'Dec', 'Tmag', 'star_rad', 'star_mass', 'teff', 'logg'):
        ebs[missing] = None

    eb_labels = pd.read_csv('../mnt/tess/labels/additionalebs.csv', header=None, low_memory=False)
    # Name the label index 'tic_id' so the joined index keeps that name and
    # reset_index() restores a 'tic_id' column directly.
    eb_labels = eb_labels.set_index(0).rename_axis('tic_id')
    ebs = ebs.set_index('tic_id').join(eb_labels, how='inner').reset_index()

    subtype = ebs[3]
    unknown = ebs[~subtype.isin(['b', 't', 'u'])]
    if len(unknown) > 0:
        raise ValueError(unknown.iloc[0])

    ebs['disp_p'] = 0
    ebs['disp_e'] = 1
    ebs['disp_n'] = 0
    for s in ('b', 't', 'u'):
        ebs[f'disp_{s}'] = (subtype == s).astype(int)

    ebs = ebs[columns].drop_duplicates()
    ebs['rand'] = np.random.randint(0, 100, len(ebs))

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    toi_pe = pd.concat([toi, ebs])
    toi_pe.to_csv('../mnt/tess/labels/tce_toi_vetting_p+eb.csv')

    print('TOIs:', len(toi), 'EBs:', len(ebs), 'All:', len(toi_pe))

# Regenerate ../mnt/tess/labels/tce_toi_vetting_p+eb.csv from the TOI and EB inputs.
clean_tois()