# YSO Chord Diagram Project
## Phase 1: Exploring Correlations in Young Stellar Objects

This notebook uses **Cachai** to visualize relationships between YSO properties through chord diagrams.

**Objectives:**
- Load and clean YSO survey data from papers A, B, and C
- Compute correlation matrices for key observables
- Generate chord diagrams showing variable inter-relationships
- Analyze clustering patterns and physical connections


## 🧾 Section 1: Imports and Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cachai.chplot as chp
import cachai.utilities as chu
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set visualization defaults: matplotlib style sheet plus seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Status messages for the notebook log.
print("✓ All imports successful")
# Plain string literal: the message has no placeholders, so no f-prefix is needed.
print("Cachai version information loaded")

## 📊 Section 2: Load and Explore Data

In [None]:
# Import utility functions from the project directory.
# NOTE(review): YSO_DIR is machine-specific; change it when running elsewhere.
import sys
YSO_DIR = Path('/Users/marcus/Desktop/YSO')
# Re-running this cell must not stack duplicate entries on sys.path.
if str(YSO_DIR) not in sys.path:
    sys.path.insert(0, str(YSO_DIR))
from yso_utils import parse_mrt_file, get_summary_statistics, categorize_variability

# Load papers
# NOTE(review): the message mentions three papers, but only Paper B is
# loaded in this cell.
print("Loading YSO data from three papers...\n")

# Passed on as a plain string, exactly as before.
paper_b_file = str(YSO_DIR / 'apjsadc397t2_mrt.txt')
df_b = parse_mrt_file(paper_b_file)

print(f"Paper B (apjsadc397t2_mrt.txt): {len(df_b)} sources")
print(f"  Declination range: {df_b['DEdeg'].min():.1f}° to {df_b['DEdeg'].max():.1f}°")
print(f"  YSO Classes: {df_b['YSO_CLASS'].nunique()}")
print(f"  Light Curve Types: {df_b['LCType'].nunique()}")

### Data Cleaning and Summary Statistics

In [None]:
def _print_distribution(heading, counts, total):
    """Print ``heading`` followed by one 'label: count (percent)' line per entry.

    ``counts`` is any object with ``.items()`` yielding (label, count) pairs;
    ``total`` is the denominator for the percentages. A zero total prints
    0.0% instead of raising ZeroDivisionError.
    """
    print(f"\n{heading}")
    for label, count in counts.items():
        pct = 100 * count / total if total else 0.0
        print(f"  {label}: {count:4d} ({pct:5.1f}%)")


# Generate summary statistics
stats = get_summary_statistics(df_b)

print("\n" + "="*70)
print("SUMMARY STATISTICS - Paper B")
print("="*70)
print(f"Total objects: {stats['total_objects']}")

_print_distribution("YSO Class Distribution:", stats['yso_classes'], stats['total_objects'])
_print_distribution("Light Curve Types:", stats['lc_types'], stats['total_objects'])

print("\nBrightness (WISE W2 band):")
print(f"  Mean magnitude: {stats['mean_w2_mag']:.2f} ± {stats['std_w2_mag']:.2f} mag")

print("\nVariability (Magnitude amplitude):")
print(f"  Mean Δmag: {stats['mean_variability']:.3f} ± {stats['std_variability']:.3f} mag")

# Add variability category; the new column is reused by later cells.
df_b['Variability'] = categorize_variability(df_b, 'delW2mag')
_print_distribution("Variability Categories:", df_b['Variability'].value_counts(), len(df_b))

### First Look at Data

In [None]:
# Columns shown in the preview table
preview_columns = ['Objname', 'RAdeg', 'DEdeg', 'YSO_CLASS', 'W2magMean', 'delW2mag', 'LCType']

# Render the first five rows as a notebook table
print("Sample of data (first 5 sources):\n")
preview = df_b[preview_columns].head()
display(preview)

# List the dtype pandas assigned to every column
print("\nData types:")
print(df_b.dtypes)

## 🔗 Section 3: Correlation Analysis

In [None]:
from itertools import combinations

from yso_utils import compute_correlation_matrix

# Select numeric columns for correlation
numeric_cols = ['W2magMean', 'sig_W2Flux', 'delW2mag', 'Period', 'slope', 'r_value', 'FLP_LSP_BOOT']

# Compute correlation matrix
corr_matrix = compute_correlation_matrix(df_b, numeric_cols)

print("Correlation Matrix (Variability Metrics):")
print("="*70)
display(corr_matrix.round(3))

# Find strongest correlations.
# corr_pairs holds (column_a, column_b, |r|, r) tuples for every pair above
# the diagonal; the summary cell at the end of the notebook reads it again.
print("\n\nStrongest Correlations (excluding diagonal):")
print("-"*70)
columns = corr_matrix.columns
corr_pairs = []
for i, j in combinations(range(len(columns)), 2):
    corr_val = corr_matrix.iloc[i, j]
    # A NaN coefficient cannot be ranked: comparisons against NaN are always
    # False, which would leave the sort order below undefined.
    if pd.isna(corr_val):
        continue
    corr_pairs.append((columns[i], columns[j], abs(corr_val), corr_val))

# Rank by absolute strength, strongest first.
corr_pairs.sort(key=lambda pair: pair[2], reverse=True)
for col1, col2, abs_corr, corr_val in corr_pairs[:8]:
    print(f"  {col1:15s} ↔ {col2:15s}: {corr_val:7.3f} (|r|={abs_corr:.3f})")

### Visualize Correlation Matrix

In [None]:
# Heatmap of correlations: annotated cells, diverging colour map centred on 0.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix: YSO Variability Metrics', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
# Save before plt.show() so the written file contains the rendered figure.
plt.savefig('/Users/marcus/Desktop/YSO/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Correlation heatmap saved")

## 📈 Section 4: Chord Diagrams - Relationships Between Variables

### Chord Diagram 1: Correlation Structure

In [None]:
def improve_chord_labels(ax, n_labels):
    """Push chord-diagram labels outward and keep them upright.

    Defined in this cell because it is the first one that calls the helper;
    without it a top-to-bottom run stops here with a NameError. A later cell
    defines a helper of the same name, and running it simply rebinds the name.

    Every text artist in ``ax.texts`` is moved 15% further from the origin,
    rotated along the radial direction of its position (flipped by 180
    degrees on the left half so it never reads upside down), and restyled
    as bold, 11-point, centre-aligned text.

    ``n_labels`` is accepted for call compatibility and is not used.
    """
    scale_factor = 1.15  # Move text 15% further out
    for text in list(ax.texts):
        x, y = text.get_position()
        text.set_position((x * scale_factor, y * scale_factor))

        # np.arctan2 returns angles in [-180, 180] degrees after conversion.
        angle = np.arctan2(y, x) * 180 / np.pi
        if angle > 90:
            rotation = angle - 180
        elif angle < -90:
            rotation = angle + 180
        else:
            rotation = angle
        text.set_rotation(rotation)

        text.set_fontsize(11)
        text.set_fontweight('bold')
        text.set_ha('center')
        text.set_va('center')


# Create chord diagram from correlation matrix
fig, ax = plt.subplots(figsize=(14, 12))

# Use absolute values for better visualization
corr_abs = corr_matrix.abs()

chp.chord(
    corr_abs,
    ax=ax,
    threshold=0.15,
    chord_alpha=0.6,
    fontsize=10
)

# Improve label positioning
improve_chord_labels(ax, len(corr_abs.columns))

ax.set_title('Correlation Matrix: Variability Metrics',
             fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('/Users/marcus/Desktop/YSO/chord_correlation_metrics.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chord diagram saved: chord_correlation_metrics.png")

### Chord Diagram 2: YSO Class vs Light Curve Type

In [None]:
from yso_utils import normalize_for_chord
import matplotlib.patches as mpatches

def improve_chord_labels(ax, n_labels):
    """Push chord-diagram labels outward and keep them upright.

    Every text artist in ``ax.texts`` is:

    1. moved 15% further from the origin (0, 0),
    2. rotated along the radial direction of its position, flipped by
       180 degrees on the left half of the plot so it never reads
       upside down,
    3. restyled as bold, 11-point, centre-aligned text.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes whose text artists are adjusted in place.
    n_labels : int
        Not used; accepted so existing calls keep working.

    Returns
    -------
    None
    """
    scale_factor = 1.15  # Move text 15% further out

    # Iterate over a snapshot so the loop is unaffected by changes to ax.texts.
    for text in list(ax.texts):
        x, y = text.get_position()
        text.set_position((x * scale_factor, y * scale_factor))

        # Angle of the label position as seen from the origin. np.arctan2
        # returns values in [-pi, pi], i.e. [-180, 180] degrees, so the left
        # half of the circle is |angle| > 90 (it never reaches 270).
        angle = np.arctan2(y, x) * 180 / np.pi
        if angle > 90:
            rotation = angle - 180
        elif angle < -90:
            rotation = angle + 180
        else:
            rotation = angle
        text.set_rotation(rotation)

        # Uniform, more legible styling
        text.set_fontsize(11)
        text.set_fontweight('bold')
        text.set_ha('center')
        text.set_va('center')

# Create contingency table: rows are YSO classes, columns are light curve types.
contingency_lc = pd.crosstab(df_b['YSO_CLASS'], df_b['LCType'])
print("Contingency Table: YSO Class vs Light Curve Type")
print("="*70)
display(contingency_lc)

# Node labels: row categories first, then column categories, each prefixed
# with the name of the variable it belongs to.
labels_combined = (
    [f"YSO_CLASS:{yso_class}" for yso_class in contingency_lc.index] +
    [f"LCType:{lc_type}" for lc_type in contingency_lc.columns]
)

# Normalize and create chord diagram
chord_matrix = normalize_for_chord(contingency_lc)

fig, ax = plt.subplots(figsize=(14, 12))
chp.chord(
    chord_matrix,
    names=labels_combined,
    ax=ax,
    threshold=0.01,
    chord_alpha=0.5,
    fontsize=10
)

# Improve label positioning
improve_chord_labels(ax, len(labels_combined))

ax.set_title('YSO Class vs Light Curve Type',
             fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('/Users/marcus/Desktop/YSO/chord_yso_class_vs_lightcurve.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chord diagram saved: chord_yso_class_vs_lightcurve.png")

### Chord Diagram 3: YSO Class vs Variability Category

In [None]:
# Create contingency table for YSO Class vs Variability.
# The 'Variability' column is added to df_b by the summary-statistics cell.
contingency_var = pd.crosstab(df_b['YSO_CLASS'], df_b['Variability'])
print("Contingency Table: YSO Class vs Variability")
print("="*70)
display(contingency_var)

# Node labels: row categories first, then column categories.
labels_var = (
    [f"YSO_CLASS:{yso_class}" for yso_class in contingency_var.index] +
    [f"Variability:{var_cat}" for var_cat in contingency_var.columns]
)

# Normalize and create chord diagram
chord_matrix_var = normalize_for_chord(contingency_var)

fig, ax = plt.subplots(figsize=(14, 12))
chp.chord(
    chord_matrix_var,
    names=labels_var,
    ax=ax,
    threshold=0.01,
    chord_alpha=0.5,
    fontsize=10
)

# Improve label positioning
improve_chord_labels(ax, len(labels_var))

ax.set_title('YSO Class vs Variability',
             fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('/Users/marcus/Desktop/YSO/chord_yso_class_vs_variability.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chord diagram saved: chord_yso_class_vs_variability.png")

### Chord Diagram 4: Light Curve Type vs Variability

In [None]:
# Create contingency table for Light Curve Type vs Variability.
# The 'Variability' column is added to df_b by the summary-statistics cell.
contingency_lc_var = pd.crosstab(df_b['LCType'], df_b['Variability'])
print("Contingency Table: Light Curve Type vs Variability")
print("="*70)
display(contingency_lc_var)

# Node labels: row categories first, then column categories.
labels_lc_var = (
    [f"LCType:{lc_type}" for lc_type in contingency_lc_var.index] +
    [f"Variability:{var_cat}" for var_cat in contingency_lc_var.columns]
)

# Normalize and create chord diagram
chord_matrix_lc_var = normalize_for_chord(contingency_lc_var)

fig, ax = plt.subplots(figsize=(14, 12))
chp.chord(
    chord_matrix_lc_var,
    names=labels_lc_var,
    ax=ax,
    threshold=0.01,
    chord_alpha=0.5,
    fontsize=10
)

# Improve label positioning
improve_chord_labels(ax, len(labels_lc_var))

ax.set_title('Light Curve Type vs Variability',
             fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('/Users/marcus/Desktop/YSO/chord_lightcurve_vs_variability.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chord diagram saved: chord_lightcurve_vs_variability.png")

## 💾 Section 5: Save Culled Tables

In [None]:
# Create output directory; parents=True also creates any missing parent
# folders instead of raising FileNotFoundError.
output_dir = Path('/Users/marcus/Desktop/YSO/culled_tables')
output_dir.mkdir(parents=True, exist_ok=True)

# Save full cleaned dataset: rows lacking a class or a light curve type are dropped.
df_b_clean = df_b.dropna(subset=['YSO_CLASS', 'LCType'])
df_b_clean.to_csv(output_dir / 'PaperB_Full_Cleaned.csv', index=False)
print(f"✓ Saved PaperB_Full_Cleaned.csv ({len(df_b_clean)} sources)")

# One CSV per category value: first split by light curve type, then by YSO
# class. Each entry pairs the grouping column with its file-name prefix.
# NOTE(review): category values go into file names unchanged; confirm that
# none of them contains a path separator.
for column, prefix in (('LCType', 'PaperB_'), ('YSO_CLASS', 'PaperB_Class_')):
    for value in df_b_clean[column].unique():
        df_subset = df_b_clean[df_b_clean[column] == value]
        filename = f"{prefix}{value}.csv"
        df_subset.to_csv(output_dir / filename, index=False)
        print(f"✓ Saved {filename} ({len(df_subset)} sources)")

print("\n✓ All culled tables saved to:", output_dir)

## 📋 Section 6: Summary and Findings

In [None]:
# Final text report. It relies on df_b (including its 'Variability' column)
# and corr_pairs, both produced by earlier cells.
total_sources = len(df_b)

print("\n" + "="*80)
print("PHASE 1 ANALYSIS SUMMARY")
print("="*80)
print("\nDATA OVERVIEW")
print("-"*80)
print("  Paper B (Variability Study):")
print(f"  - Total objects: {total_sources} sources")
print(f"  - YSO Classes: {df_b['YSO_CLASS'].nunique()}")
print(f"  - Light Curve Types: {df_b['LCType'].nunique()}")

print("\nKEY CORRELATIONS IDENTIFIED")
print("-"*80)
print("  Top correlation pairs:")
for col1, col2, abs_corr, corr_val in corr_pairs[:5]:
    print(f"    {col1} <-> {col2}: {corr_val:.3f}")

print("\nVARIABILITY INSIGHTS")
print("-"*80)
print("  Magnitude Amplitude Distribution:")
# NOTE(review): assumes categorize_variability labels rows exactly
# 'Low' / 'Medium' / 'High'; any other label is counted as 0 here.
for var_cat in ['Low', 'Medium', 'High']:
    count = (df_b['Variability'] == var_cat).sum()
    # An empty table reports 0.0% instead of raising ZeroDivisionError.
    pct = 100 * count / total_sources if total_sources else 0.0
    print(f"    {var_cat:6s}: {count:5d} ({pct:5.1f}%)")

print("\n  Average variability by YSO class:")
var_by_class = df_b.groupby('YSO_CLASS')['delW2mag'].mean().sort_values(ascending=False)
for yso_class, var in var_by_class.items():
    # str() keeps the padded 's' format valid for non-string class labels.
    print(f"    {str(yso_class):8s}: {var:.3f} mag")

print("\nOUTPUTS GENERATED")
print("-"*80)
print("  Figures (PNG):")
# These names match the files written by the plotting cells.
print("    - correlation_heatmap.png")
print("    - chord_correlation_metrics.png")
print("    - chord_yso_class_vs_lightcurve.png")
print("    - chord_yso_class_vs_variability.png")
print("    - chord_lightcurve_vs_variability.png")
print("\n  Data (CSV):")
print("    - Culled tables saved to /Users/marcus/Desktop/YSO/culled_tables/")
print("\n" + "="*80)

## 📧 Files Ready for Submission

**Main Figure for Professor (Recommended):**
- `chord_correlation_metrics.png` - Core variability relationships

**Supporting Figures:**
- `correlation_heatmap.png` - Quantitative correlation matrix
- `chord_yso_class_vs_lightcurve.png` - Classification relationships
- `chord_yso_class_vs_variability.png` - Variability by class
- `chord_lightcurve_vs_variability.png` - Curve morphology relationships

**Data Products:**
- Culled CSV tables organized by YSO class and light curve type
- Ready for spectroscopy analysis in Phase 2
