In [None]:
import copy
import h5py
import itertools
import numpy as np
import os
import pandas as pd
import scipy
import scipy.interpolate
import tqdm

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Currently need to call this to get matplotlib selected style to load...
# (the empty plot must come before the style.use call below, so keep this order)
plt.plot()
# NOTE(review): absolute path under one user's home directory; presumably this
# style file must exist on the machine running the notebook -- confirm.
matplotlib.style.use( '/Users/zhafen/repos/clean-bold/clean-bold-mnras.mplstyle' )
import palettable
import matplotlib.patheffects as path_effects

In [None]:
import yt
import trident
import unyt as u

In [None]:
import kalepy as kale
import verdict

In [None]:
import helpers

# Parameters

In [None]:
# Analysis parameters
Z_sun = 0.014  # Value used to convert stored metallicities to solar units
verbose = False

# Seeded generator so that any random draws are reproducible
seed = 15482
rng = np.random.default_rng( seed=seed )

In [None]:
# Data management parameters

# Inputs
distribution_fp = './data/EAGLE/histogram_galaxies_logM200c-Msun-12.0-12.5_200_seed0_hneutralssh.hdf5'
data_dir = './data/synthetic_data/sample1'
observer_data_dir = './data/synthetic_data_samples/sample1'

# Outputs
# NOTE(review): figure_dir is an absolute path under one user's home directory.
figure_dir = '/Users/zhafen/drafts/cgm_modeling_challenge_paper/figures/sample1'
summary_data_fp = './data/polished_data/summary.h5'

In [None]:
# Create the figure output directory (and parents); no error if it already exists.
os.makedirs( figure_dir, exist_ok=True )

In [None]:
# Ray parameters
redshift = 0.25
n_sightlines = 100
min_clouds_per_sightline, max_clouds_per_sightline = 1, 3

# In km/s
velocity_range = [ -150., 150. ]

# We'll only allow one cloud per line of sight with temperatures greater than this
finite_cloud_max_logT = 5

In [None]:
# Spectra parameters

# Ions passed to trident when adding ion fields. 'N V' is currently commented out.
ions = [
    'H I', 'O I',
    'C II', 'C III',
    'N II', 'N III',
    'Si II', 'Si III', 'Si IV',
#     'N V',
    'O VI',
    'Mg II',
]

# Number-density field names matching `ions`, in the same order.
# The field suffix is the roman-numeral ionization stage minus one ('H I' -> 'H_p0').
_ionization_index = { 'I': 0, 'II': 1, 'III': 2, 'IV': 3, 'V': 4, 'VI': 5 }
fields = [
    '{}_p{}_number_density'.format( element, _ionization_index[stage] )
    for element, stage in ( ion.split() for ion in ions )
]

# Individual transitions, written as '<element> <stage> <wavelength>'.
# NOTE(review): includes 'N V' lines although 'N V' is commented out of `ions`.
plotted_ions = [
    'H I 923', 'H I 926', 'H I 930', 'H I 937',
    'H I 949', 'H I 972', 'H I 1025', 'H I 1215',
    'Si II 1020', 'Si II 1190', 'Si II 1193', 'Si II 1260', 'Si II 1304',
    'Si III 1206',
    'Si IV 1393', 'Si IV 1402',
    'C II 1036', 'C II 1334',
    'C III 977',
    'N II 1083',
    'N III 989',
    'N V 1238', 'N V 1242',
    'O I 1302',
    'O VI 1031', 'O VI 1037',
    'Mg II 2796', 'Mg II 2803',
]

snr = 30

In [None]:
# Plotting parameters
colors = palettable.cartocolors.qualitative.Safe_10.mpl_colors
# Colors are picked from the same Safe_10 palette by position
revised_color = palettable.cartocolors.qualitative.Safe_10.mpl_colors[0]
modeled_color = palettable.cartocolors.qualitative.Safe_10.mpl_colors[1]

# Load Data

## Modeled Data

In [None]:
# Directory scanned below: each sub-directory is treated as one modeled sightline,
# and each file inside it is read as the samples for one component.
modeled_data_dir = './data/modeling_results/sameer_charlton/sample1/pdfs'

In [None]:
# Load into a dictionary:
# modeled[<first three characters of the directory name>][<file name up to the first '.'>]
# is a DataFrame with the columns listed in `modeled_columns`.
modeled_columns = [ 'Prob', 'Likelihood', 'logZ', 'logT', 'lognH' ]
modeled = {}
# Sorted, because os.listdir returns entries in arbitrary order.
for dirname in tqdm.tqdm( sorted( os.listdir( modeled_data_dir ) ) ):

    # Get dirs, skip others (including hidden ones such as '.ipynb_checkpoints')
    current_dir = os.path.join( modeled_data_dir, dirname )
    if dirname.startswith( '.' ) or not os.path.isdir( current_dir ):
        continue

    modeled_dir = {}
    for file in sorted( os.listdir( current_dir ) ):

        # Only regular, non-hidden files hold samples; skip e.g. '.DS_Store' and sub-directories
        fp = os.path.join( current_dir, file )
        if file.startswith( '.' ) or not os.path.isfile( fp ):
            continue

        comp_key = file.split( '.' )[0]

        # Space-separated values without a header row
        df = pd.read_csv( fp, sep=' ', header=None, )
        df.columns = modeled_columns

        modeled_dir[comp_key] = df

    # NOTE(review): directories sharing their first three characters overwrite each other here.
    modeled[dirname[:3]] = modeled_dir

In [None]:
# Collect, per parameter, everything the comparison figure needs for each
# modeled component: x position, raw samples, peak estimate, and its keys.
sl_keys = sorted( modeled )
params = [ 'logZ', 'logT', 'lognH' ]
dx = 0.0  # Horizontal offset between components of the same sightline
dists = {}
for param in params:

    xs, samples, mles, sightlines, components = [], [], [], [], []
    for i_sl, sl in enumerate( tqdm.tqdm( sl_keys ) ):
        for i_comp, ( comp_key, df ) in enumerate( modeled[sl].items() ):

            values = df[param].values

            # Peak of a kernel density estimate of the samples
            centers, pdf = kale.kde.KDE( values ).density()

            xs.append( i_sl + i_comp*dx )
            samples.append( values )
            mles.append( centers[pdf.argmax()] )
            sightlines.append( sl )
            components.append( comp_key )

    dists[param] = {
        'xs': xs,
        'values': samples,
        'mles': mles,
        'sls': sightlines,
        'comps': components,
    }

In [None]:
# Number of modeled components for each sightline, ordered like sl_keys
n_comp_modeled = [ len( modeled[sl] ) for sl in sl_keys ]

## Generated Data

In [None]:
# Load the synthetic sightline data from the configured data directory
# (data_dir), instead of repeating that directory as a hard-coded path.
sls = verdict.Dict.from_hdf5( os.path.join( data_dir, 'sightlines.h5' ), jagged_flag='sl')

In [None]:
# Concatenate each property's per-sightline entries into one array
clouds = { key: np.concatenate( item ) for key, item in sls.items() }

In [None]:
# Used sightlines
# Integer versions of the string keys in sl_keys; used to index into `sls`.
indices = np.array( sl_keys ).astype( int )

In [None]:
# Particular sightlines chosen: print each cloud's properties and store one
# "combined" value per sightline.
# Note that `combined` holds the un-logged values even though its keys start with 'log'.
combined = { key: [] for key in [ 'logZ', 'logT', 'lognH' ] }
for i in indices:

    print( 'Sightline {:03d}'.format( i ) )

    # Stored log10 values are exponentiated; units are attached with unyt
    lengths = sls['Lengths'][i] * u.cm
    velocity = sls['LOS Velocity'][i] * u.km / u.s
    HI_column = 10.**sls['HI Column'][i] * u.cm**-2
    metallicity = 10.**sls['Metallicity'][i] / Z_sun
    temperature = 10.**sls['Temperature'][i] * u.K
    density = 10.**sls['Density'][i] * u.g * u.cm**-3 / u.mp * 0.75

    # One line per cloud
    for j in range( len( density ) ):
        cloud_logs = (
            np.log10( metallicity[j] ),
            np.log10( temperature[j] ),
            np.log10( density[j] ),
        )
        print( '    logZ = {:.3g}, logT = {:.3g}, logn = {:.3g}'.format( *cloud_logs ) )

    # Velocity separation, only reported for two-cloud sightlines
    if len( velocity ) == 2:
        delta_v = np.abs( velocity[1] - velocity[0] )
        print( '    delta_v = {:.3g}'.format( delta_v ) )

    # Length-weighted density; temperature and metallicity weighted by density * length
    den = ( density * lengths ).sum() / lengths.sum()
    temp = ( temperature * density * lengths ).sum() / ( density * lengths ).sum()
    met = ( metallicity * Z_sun * density * lengths ).sum() / ( Z_sun * density * lengths ).sum()
    combined_logs = ( np.log10( met ), np.log10( temp ), np.log10( den ) )
    print( '    Combined, logZ = {:.3g}, logT = {:.3g}, logn = {:.3g}'.format( *combined_logs ) )

    combined['lognH'].append( den )
    combined['logZ'].append( met )
    combined['logT'].append( temp )

# Parameter Estimation Results

In [None]:
# Key of each plotted parameter in the synthetic sightline data
param_mapping = dict( logZ='Metallicity', logT='Temperature', lognH='Density' )

# Axis label for each plotted parameter
x_labels = dict(
    logZ = r'$\log_{10} Z / Z_\odot$',
    logT = r'$\log_{10} T / K$',
    lognH = r'$\log_{10} n_{\rm H} / {\rm cm}^{-3}$',
)

In [None]:
# Comparison figure, one panel per parameter: the source cloud values (black
# points), the modeled sample distributions (violins), and their peak
# estimates (colored points). Counts of source clouds and modeled components
# are written above each sightline.
panel_width = plt.rcParams['figure.figsize'][0]
fig = plt.figure( figsize=( panel_width, panel_width*len( x_labels )/2. ), facecolor='w' )
# ax_main = plt.gca()

# One row per entry of `params`, rather than a hard-coded row count
gs = matplotlib.gridspec.GridSpec( len( params ), 1 )
# gs.update( hspace=0.1)

main_xs = np.arange( len( indices ) )

# Tick labels are the sightline keys without their first character
tick_labels = [ key[1:] for key in sl_keys ]

# Each loop below uses its own index name, so no inner loop rebinds the panel index.
for i_param, param in enumerate( params ):
    ax = fig.add_subplot( gs[i_param,0] )

    # x in data coordinates, y in axes coordinates: used to write text just above the panel
    above_panel = matplotlib.transforms.blended_transform_factory( ax.transData, ax.transAxes )

#     # Combined
#     ax.scatter(
#         main_xs,
#         np.log10( combined[param] ),
#         color = 'none',
#         edgecolor = 'k',
#         s = 200,
#         zorder = 100,
#     )

    # Individual clouds
    for x_sl, ind in enumerate( indices ):

        if param == 'logZ':
            ys = np.log10( 10.**sls[param_mapping[param]][ind] / Z_sun )
        elif param == 'lognH':
            # NOTE(review): np.log10 is applied to a unyt array still expressed in
            # g/cm**3/mp; confirm this yields log10 of the number density in cm**-3.
            ys = np.log10( 10.**sls[param_mapping[param]][ind] * u.g * u.cm**-3 / u.mp * 0.75 )
        else:
            ys = sls[param_mapping[param]][ind]

        xs = np.full( ys.size, x_sl )
        ax.scatter(
            xs,
            ys,
            color = 'k',
            zorder = 90,
            edgecolor = 'k',
            linewidth = 0.2,
            s = 25,
        )

        # Number of source clouds, just left of the sightline position
        ax.annotate(
            text = '{}'.format( ys.size ),
            xy = (x_sl, 1),
            xycoords = above_panel,
            xytext = ( -1, 2.5 ),
            textcoords = 'offset points',
            va = 'bottom',
            ha = 'right',
            fontsize = 'small',
            fontweight = 'bold',
        )

    # Annotate number of modeled components
    ax.annotate(
        text = r'$n_{\rm comp}:$',
        xy = ( 0, 1 ),
        xycoords = 'axes fraction',
        xytext = ( 0, 2.5 ),
        textcoords = 'offset points',
        fontsize = 'small',
        va = 'bottom',
        ha = 'right',
        fontweight = 'bold',
    )
    for x_sl, n_comp in enumerate( n_comp_modeled ):
        # Number of modeled components, just right of the sightline position
        ax.annotate(
            text = '{}'.format( n_comp ),
            xy = (x_sl, 1),
            xycoords = above_panel,
            xytext = ( 1, 2.5 ),
            textcoords = 'offset points',
            va = 'bottom',
            ha = 'left',
            fontsize = 'small',
            color = modeled_color,
            fontweight = 'bold',
        )

    # Violin plot
    violins = ax.violinplot(
        dists[param]['values'],
        dists[param]['xs'],
        showextrema = False,
        widths = 0.75,
#         showmeans = True,
#         showmedians = True,
#         quantiles = [ [ 0.01135, 0.5, 1. - 0.01135 ], ] * len( dists[param]['xs'] )
    )
    for poly in violins['bodies']:
        poly.set_alpha( 0.5 )
        poly.set_color( modeled_color, )

    # Plot MLEs
    ax.scatter(
        dists[param]['xs'],
        dists[param]['mles'],
        color = modeled_color,
        s = 20,
        edgecolor = 'w',
        linewidth = 0.2,
        zorder = 100,
    )

    ax.set_ylabel( x_labels[param] )

    # Set the ticks on this panel explicitly instead of relying on the current axes
    ax.set_xticks( main_xs )
    ax.set_xticklabels( tick_labels )

savefile = os.path.join( figure_dir, 'comparison.pdf' )
print( 'Saving at {}'.format( savefile ) )
fig.savefig( savefile, bbox_inches='tight' )

# Compile Summary Data

In [None]:
# Container filled by the cells below with 'estimated' and 'source' entries,
# then stored under the 'sample1' key of the summary file.
summary = verdict.Dict()

In [None]:
# For estimated data: store the peak estimate and the percentiles of the samples
# for every parameter, sightline and component.
for dist_key, data_for_param in dists.items():

    # Summary keys drop the leading 'log', e.g. 'logZ' -> 'Z'
    param_key = dist_key[3:] if dist_key[:3] == 'log' else dist_key

    for i, mle in enumerate( data_for_param['mles'] ):

        sl = data_for_param['sls'][i]
        comp = data_for_param['comps'][i]

        summary.setitem( 'estimated', mle, 'maximum likelihood estimate', param_key, sl, comp )

        # Percentile pairs p and 100-p; for p = 50 both refer to the median
        percentiles = {}
        for p in [ 1, 5, 16, 25, 50 ]:
            lower = np.nanpercentile( data_for_param['values'][i], p )
            percentiles[helpers.percentile_str_fn(p/100.)] = lower
            upper = np.nanpercentile( data_for_param['values'][i], 100-p )
            percentiles[helpers.percentile_str_fn(1. - p/100.)] = upper
        summary.setitem( 'estimated', percentiles, 'posterior percentiles', param_key, sl, comp, )

In [None]:
# Properties stored as log10 values, which are exponentiated before use
synthetic_data_is_logscale = [ 'Density', 'Temperature', 'Metallicity', 'HI Column' ]

# Factor each (un-logged) property is multiplied by when building the summary.
# NOTE(review): 0.75 in the density factor is presumably a hydrogen mass fraction -- confirm.
synthetic_data_units  = {
    'Density': u.g * u.cm**-3 / u.mp * 0.75,
    'Temperature': u.K,
    'Metallicity': 1 / Z_sun,  # Plain number, no unit object
    'HI Column': u.cm**-2,
    'LOS Velocity': u.km / u.s,
    'Lengths': u.cm,
    'PDF Value': 1.,  # Left unchanged
}

In [None]:
# For source data: store every synthetic property of each used sightline
# under the short key given by helpers.key_given_property.
for long_param_key in sls.keys():

    for sl in sl_keys:

        param_key = helpers.key_given_property[long_param_key]

        values = sls[long_param_key][int(sl)]

        # Undo the log10 storage where applicable
        if long_param_key in synthetic_data_is_logscale:
            values = 10.**values

        # Multiply out-of-place: for properties that are not log-scaled, `values`
        # is the object stored in `sls`, and `values *= ...` could modify it in place.
        values = values * synthetic_data_units[long_param_key]

        # Properties listed in helpers.logscale_props are stored as log10
        if param_key in helpers.logscale_props:
            values = np.log10( values )

        summary.setitem( 'source', values, param_key, sl )

In [None]:
# Read the summary file (create_nonexistent=True is passed for the case where
# it does not exist yet), replace this sample's entry, and write it back.
total_summary = verdict.Dict.from_hdf5(
    summary_data_fp,
    create_nonexistent = True,
)
total_summary['sample1'] = summary
total_summary.to_hdf5( summary_data_fp )

In [None]:
# Deliberately halts a top-to-bottom run at this point: per the message,
# the cells below are not needed.
assert False, 'Below is unnecessary. I will just use Sameers Plots.'

# Plot output spectra + fit

In [None]:
# Objects for use
# Line database that is passed to every spectrum generator created below.
ldb = trident.LineDatabase('lines.txt')

In [None]:
# Add both Mg II lines to the line database
for mg_ii_wavelength in ( 2796, 2803 ):
    ldb.add_line( 'Mg', 'II', mg_ii_wavelength, use_linetools=True )

In [None]:
# Make sure we have the lower part of this Si IV doublet
# NOTE(review): added as 1394 here, while plotted_ions lists 'Si IV 1393';
# presumably the same line -- confirm that the later lookup matches it.
ldb.add_line( 'Si', 'IV', 1394, use_linetools=True)

In [None]:
# One spectrum generator per COS grating, both using the shared line database
sg_cos, sg_cos_160 = [
    trident.SpectrumGenerator( instrument, line_database=ldb )
    for instrument in ( 'COS-G130M', 'COS-G160M' )
]

In [None]:
# Spectrum Generator for Mg II from ground
# Covers +/- 30 (in the wavelength units of the line database) around the
# redshifted Mg II 2796 line.
# NOTE(review): the LSF kernel file is 'avg_COS.txt' although the comment above
# says "from ground" -- confirm this is intended.
mg_ii_line = ldb.select_lines( 'Mg', 'II', 2796 )[0]
lambda_mg = mg_ii_line.wavelength * ( 1. + redshift )
mg_lsf_kernel = os.path.join( trident.path, 'data', 'lsf_kernels', 'avg_COS.txt' )
sg_mg = trident.SpectrumGenerator(
    lambda_min = lambda_mg - 30.,
    lambda_max = lambda_mg + 30.,
    dlambda = 0.01,
    lsf_kernel = mg_lsf_kernel,
    line_database = ldb,
)

In [None]:
# Spectrum generators and the matching tags used in spectrum file names,
# kept as two parallel lists.
sg_by_tag = [
    ( '_G130', sg_cos ),
    ( '_G160', sg_cos_160 ),
    ( '_MgII', sg_mg ),
]
sgs = [ generator for _tag, generator in sg_by_tag ]
spectrum_sg_tags = [ tag for tag, _generator in sg_by_tag ]

In [None]:
# Selections used by the cells below:
# i -> sightline, j -> spectrum generator, k -> entry of plotted_ions
i = j = k = 0

In [None]:
# Integer index and string key of the selected sightline
ind_sl, sl_key = indices[i], sl_keys[i]

In [None]:
# Selected spectrum generator and its file-name tag
sg, spectrum_tag = sgs[j], spectrum_sg_tags[j]

In [None]:
# Load the stored spectrum for the selected sightline and generator.
# File names combine the generator tag with the zero-padded sightline index.
spectrum_file = 'spectrum{}_sl{:04d}.h5'.format(
    spectrum_tag,
    ind_sl,
)
spectrum_fp = os.path.join(
    observer_data_dir,
    spectrum_file,
)
spectrum = verdict.Dict.from_hdf5( spectrum_fp )

In [None]:
# Transition to plot, chosen from plotted_ions by the index k
plotted_ion = plotted_ions[k]

In [None]:
# Look the transition up in the line database; exactly one match is required
selected_ions = ldb.parse_subset( plotted_ion )
n_selected = len( selected_ions )
assert n_selected == 1, 'Too many or too few possible ions.'
selected_ion = selected_ions[0]

In [None]:
# Convert the spectrum's wavelengths to velocities relative to the selected line.
# z_p_1 is the wavelength divided by ( 1 + redshift ) and by the line's wavelength;
# the velocity is c * ( z_p_1**2 - 1 ) / ( z_p_1**2 + 1 ), in km/s.
z_p_1 = spectrum['wavelength'] / ( 1. + redshift ) / selected_ion.wavelength
z_p_1_sq = z_p_1**2.
vs_spec = ( u.c * ( z_p_1_sq - 1. ) / ( z_p_1_sq + 1. ) ).to( 'km/s' )

In [None]:
# in_range = ( vs_spec > velocity_range[0] * u.km / u.s ) & ( vs_spec < velocity_range[1] * u.km / u.s )

In [None]:
# Flux versus velocity for the selected transition
fig, ax = plt.subplots()

ax.plot( vs_spec, spectrum['flux'], color='k' )

# Label the panel with the transition, in the lower-left corner
ax.annotate(
    text = plotted_ion,
    xy = ( 0, 0 ),
    xycoords = 'axes fraction',
    xytext = ( 5, 5 ),
    textcoords = 'offset points',
    ha = 'left',
    va = 'bottom',
)

ax.set(
    xlim = velocity_range,
    ylim = ( 0, 1.1 ),
    xlabel = 'velocity [km/s]',
    ylabel = 'flux',
)

In [None]:
# Get estimated cloud properties for the selected sightline
# NOTE(review): inner_item / apply are verdict.Dict helpers; presumably this picks
# the sl_key entry for every parameter and converts each result to an array -- confirm.
estimated_mles = summary['estimated']['maximum likelihood estimate']
clouds_estimated = estimated_mles.inner_item( sl_key ).apply( verdict.Dict.array )

# # Sort by T
# sort_inds_estimated = np.argsort( clouds_estimated['T'] )
# clouds_estimated = clouds_estimated.inner_item( sort_inds_estimated )

In [None]:
# Get source cloud properties
# Takes the selected sightline (sl_key) out of the 'source' part of the summary.
clouds_source = summary['source'].inner_item( sl_key )

# # Sort by T
# sort_inds_source = np.argsort( clouds_source['T'] )
# clouds_source = clouds_source.inner_item( sort_inds_source )

In [None]:
# Index into the estimated cloud properties (clouds_estimated) used below
i_cloud = 0

In [None]:
# Match the estimated cloud to the source cloud with the closest 'T' value
temperature_offsets = np.abs( clouds_estimated['T'][i_cloud] - clouds_source['T'] )
i_cloud_source = np.argmin( temperature_offsets )

In [None]:
# Convert the estimated 'nH' value (log10, in cm**-3) into a mass density in g/cm**3.
# The number density is multiplied by the proton mass and divided by 0.75
# (presumably a hydrogen mass fraction -- confirm).
# The value is built directly as a single quantity, without a trailing comma
# that would wrap it in a one-element tuple needing a later `[0]`.
density_estimated = (
    10.**clouds_estimated['nH'][i_cloud] * u.cm**-3. * u.mp / 0.75
).to( 'g/cm**3' )

In [None]:
# Estimated temperature and metallicity, shared by the dataset and the ray below
temperature_estimated = 10.**clouds_estimated['T'][i_cloud] * u.K
metallicity_estimated = 10.**clouds_estimated['Z'][i_cloud] * u.Zsun

# Step 1: a one-zone dataset with the estimated density, temperature and
# metallicity at the chosen redshift. Its width is arbitrarily set to 1 kpc.
test_ds = trident.make_onezone_dataset(
    density = density_estimated,
    temperature = temperature_estimated,
    metallicity = metallicity_estimated,
    domain_width = 1.*u.kpc
)
test_ds.current_redshift = redshift

# Step 2: add the desired ions, using Trident's lookup table based on the
# Haardt-Madau 2012 UV background.
trident.add_ion_fields( test_ds, ions=ions )

# Step 3: dividing the desired HI column density by the dataset's HI number
# density gives the path length to use for the ray.
# Note that the column density is taken from the matched *source* cloud,
# while the other properties are the estimated ones.
HI_column = 10.**clouds_source['NHI'][i_cloud_source] * u.cm**-2.
HI_number_density = test_ds.r[('gas', 'H_p0_number_density')][0]
length = HI_column / HI_number_density

# Step 4: a one-zone ray of that length with the same physical state
ray = trident.make_onezone_ray(
    density = density_estimated,
    temperature = temperature_estimated,
    metallicity = metallicity_estimated,
    length = length,
    redshift = redshift,
)
trident.add_ion_fields( ray, ions=ions )

In [None]:
# Generate the spectrum of the one-zone ray with the selected spectrum generator
sg.make_spectrum( ray )