In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightkurve as lk
import time
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# Lock for thread-safe printing: worker threads hold it while writing to
# stdout so that their progress lines do not interleave.
print_lock = Lock()


def _sector_record(tic, sector_length, st_tmag, st_rad, st_mass, st_teff):
    """Assemble the result row returned for one star, whatever the outcome."""
    return {
        'tic': tic,
        'sector_length': sector_length,
        'st_tmag': st_tmag,
        'st_rad': st_rad,
        'st_mass': st_mass,
        'st_teff': st_teff,
    }


def get_lightcurve_info(row_data):
    """Get sector info for a single TIC - designed for parallel processing.

    Searches for TESS light curves authored by TESS-SPOC with
    ``cadence="long"`` and reports how many entries the search returned
    (presumably one entry per observed sector - confirm against lightkurve).

    Parameters
    ----------
    row_data : tuple
        ``(i, tic_id, st_tmag, st_rad, st_mass, st_teff)``. ``i`` is used only
        in the progress message, ``tic_id`` must be convertible with
        ``int()``, and the last four values are copied to the result as-is.

    Returns
    -------
    dict
        Keys ``'tic'`` (formatted as ``"TIC <id>"``), ``'sector_length'`` and
        the four pass-through parameters. ``'sector_length'`` is the number of
        search results, or ``np.nan`` when the search is empty or fails.

    Notes
    -----
    Any exception raised by the search is caught, printed and turned into a
    NaN row, so a single failing star does not abort the caller's batch. A
    NaN therefore does not distinguish "no data" from "query failed".
    """
    i, tic_id, st_tmag, st_rad, st_mass, st_teff = row_data
    tic = f"TIC {int(tic_id)}"

    try:
        search_result = lk.search_lightcurve(tic, mission="TESS", author="TESS-SPOC", cadence="long")
        # The search hands back a result object even when nothing matched, so
        # emptiness has to be detected by length rather than by `is None`.
        n_found = 0 if search_result is None else len(search_result)
    except Exception as e:
        with print_lock:
            print(f"Error processing {tic}: {e}")
        return _sector_record(tic, np.nan, st_tmag, st_rad, st_mass, st_teff)

    if n_found == 0:
        with print_lock:
            print(f'{i} tic {tic}: No data found')
        return _sector_record(tic, np.nan, st_tmag, st_rad, st_mass, st_teff)

    sector_length = n_found
    with print_lock:
        print(f'{i} tic {tic}: sector length {sector_length}, tmag {st_tmag}, rad {st_rad}, mass {st_mass}, teff {st_teff}')
    return _sector_record(tic, sector_length, st_tmag, st_rad, st_mass, st_teff)

# Read the matched CTL sample and keep only rows with tmag < 13 and
# 3000 < teff < 7000 (both temperature bounds are exclusive).
star_params = pd.read_csv("matched_CTL_sample_with_properties_10x.csv")
bright_enough = star_params['tmag'] < 13
teff_in_range = (star_params['teff'] > 3000) & (star_params['teff'] < 7000)
star_params = star_params[bright_enough & teff_in_range]
# To test on a subset, slice here, e.g. star_params = star_params[2900000:2930000]
print(f"Original rows: {len(star_params)}")

# Collapse repeated TIC IDs onto their first row, then renumber the index from 0.
star_params_unique = star_params.drop_duplicates(subset=['tic_id'], keep='first')
star_params_unique = star_params_unique.reset_index(drop=True)
n_duplicates = len(star_params) - len(star_params_unique)
print(f"After deduplication: {len(star_params_unique)}")
print(f"Duplicates removed: {n_duplicates}\n")

# Resume support: TICs already listed in the output CSV are not queried again.
output_file = "CTL_10x_with_sectors.csv"
have_previous_run = os.path.exists(output_file)
existing_results = pd.read_csv(output_file) if have_previous_run else pd.DataFrame()
# The 'tic' column holds the formatted "TIC <id>" labels written by earlier runs.
processed_tics = set(existing_results['tic'].values) if have_previous_run else set()
if have_previous_run:
    print(f"Found {len(processed_tics)} already processed TICs\n")

# Record the wall-clock start; later progress and timing figures are relative to it.
start_time = time.time()
print(f"Starting processing at {time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total stars to process: {len(star_params_unique)}\n")

# Build one argument tuple per star that still needs a query.
row_data_list = []
for i, row in star_params_unique.iterrows():
    if f"TIC {int(row['tic_id'])}" in processed_tics:
        continue  # already present in the output file
    row_data_list.append(
        (i, row['tic_id'], row['tmag'], row['radius'], row['mst'], row['teff'])
    )

n_skipped = len(star_params_unique) - len(row_data_list)
print(f"Skipping {n_skipped} already processed TICs")
print(f"Processing {len(row_data_list)} new TICs\n")

# Fan the per-star queries out over a thread pool. The author's guidance is
# roughly 10-20 workers for I/O-bound tasks.
max_workers = 10  # tune to your system and network bandwidth
results_list = []

print(f"Using {max_workers} parallel workers\n")

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Queue every pending star up front, remembering which input each future came from.
    future_to_row = {}
    for row_data in row_data_list:
        future_to_row[executor.submit(get_lightcurve_info, row_data)] = row_data

    # Harvest results in completion order, which need not match submission order.
    completed = 0
    for future in as_completed(future_to_row):
        result = future.result()
        results_list.append(result)
        completed += 1

        if completed % 10 != 0:
            continue

        # Every 10th completion: report progress and an ETA extrapolated from
        # the average throughput since start_time.
        elapsed = time.time() - start_time
        rate = completed / elapsed
        remaining = len(row_data_list) - completed
        eta = remaining / rate if rate > 0 else 0
        with print_lock:
            print(f"\nProgress: {completed}/{len(row_data_list)} ({100*completed/len(row_data_list):.1f}%) - ETA: {eta/60:.1f} min\n")

# Persist: append this run's rows to the previously loaded results and
# rewrite the output file. Nothing is written when the run produced no rows.
if not results_list:
    CTL_sectors = existing_results
    print("\nNo new results to save")
else:
    new_rows = pd.DataFrame(results_list)
    CTL_sectors = pd.concat([existing_results, new_rows], ignore_index=True)
    CTL_sectors.to_csv(output_file, index=False)
    print(f"\nSaved {len(results_list)} new results ({len(CTL_sectors)} total) to {output_file}")

# Final report: total wall-clock time since start_time and, when any stars
# were queued, the per-star average and throughput.
end_time = time.time()
elapsed_time = end_time - start_time
banner = '=' * 60
print(f"\n{banner}")
print("Processing complete!")
print(f"Total time elapsed: {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
if row_data_list:
    print(f"Average time per star: {elapsed_time/len(row_data_list):.2f} seconds")
    print(f"Processing rate: {len(row_data_list)/elapsed_time:.2f} stars/second")
print(f"Finished at {time.strftime('%Y-%m-%d %H:%M:%S')}")
print(banner)