In [1]:
# Data stuffs
import pickle 
import numpy as np
import pandas as pd
import pathlib 
import os, sys
from moxie.data.utils_ import load_data, standardize, de_standardize, normalize_profiles


# Make it look pretty
from tqdm.notebook import tqdm  

# ML Stuff
from scipy.stats import truncnorm


In [14]:
# Load the three dataset splits. Each split is a 7-tuple ordered as
# (profiles, mps, masks, psis, rmids, ids, uncerts).
train_data, val_data, test_data = load_data(
    dataset_choice='SANDBOX_NO_VARIATIONS',
    file_loc='../../../moxie/data/processed/pedestal_profiles_ML_READY_ak_5052022_uncerts_mask.pickle',
)
train_X, train_y, train_mask, train_radii, train_real_space_radii, train_ids, train_uncert = train_data
val_X, val_y, val_mask, val_radii, val_real_space_radii, val_ids, val_uncert = val_data
test_X, test_y, test_mask, test_radii, test_real_space_radii, test_ids, test_uncert = test_data

# ELM timings, looked up by pulse number further down (elm_dict[number]).
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open('../../data/raw/new_elm_timings_catch.pickle', 'rb') as file:
    JET_ELM_TIMINGS = pickle.load(file)

# Names of the machine parameters; presumably the column order of the mps arrays -- TODO confirm.
machine_param_order = [
    'Q95', 'RGEO', 'CR0', 'VOLM', 'TRIU', 'TRIL', 'ELON',
    'POHM', 'IPLA', 'BVAC', 'NBI', 'ICRH', 'ELER',
]

def unique(sequence):
    """Return the items of ``sequence`` without duplicates, in first-seen order.

    Items must be hashable, since membership is tracked with a set.
    """
    already_seen = set()
    ordered_items = []
    for item in sequence:
        if item in already_seen:
            continue
        already_seen.add(item)
        ordered_items.append(item)
    return ordered_items

def _group_slice_indexes_by_pulse(slice_ids):
    """Group slice positions by the pulse number encoded in each slice id.

    Each id is a string whose part before the first '/' is the integer pulse number.

    Returns
    =======
    pulse_order: List[int]
        Pulse number of every slice, in slice order.
    pulses_ordered_set: List[int]
        Distinct pulse numbers in first-seen order.
    pulse_idxs: List[List[int]]
        For each distinct pulse, the positions of its slices in ``slice_ids``.
    """
    pulse_order = [int(slice_id.split('/')[0]) for slice_id in slice_ids]
    # Single pass over the slices instead of rescanning the whole set once per pulse.
    indexes_by_pulse = {}
    for index, pulse in enumerate(pulse_order):
        indexes_by_pulse.setdefault(pulse, []).append(index)
    pulses_ordered_set = unique(pulse_order)
    pulse_idxs = [indexes_by_pulse[pulse] for pulse in pulses_ordered_set]
    return pulse_order, pulses_ordered_set, pulse_idxs


train_pulse_order, train_pulses_ordered_set, train_pulse_idxs = _group_slice_indexes_by_pulse(train_ids)
val_pulse_order, val_pulses_ordered_set, val_pulse_idxs = _group_slice_indexes_by_pulse(val_ids)
test_pulse_order, test_pulses_ordered_set, test_pulse_idxs = _group_slice_indexes_by_pulse(test_ids)

In [22]:

def calculate_elm_percent(time_last_elm, time_next_elm, hrts_time): 
    """Return how far ``hrts_time`` lies between two ELMs, as a fraction.

    0 corresponds to ``time_last_elm`` and 1 to ``time_next_elm``.
    """
    elapsed_since_last_elm = hrts_time - time_last_elm
    elm_cycle_length = time_next_elm - time_last_elm
    return elapsed_since_last_elm / elm_cycle_length

def find_elm_timings_for_set(dataset, pulse_indexes, pulse_numbers, elm_dict): 
    """
    This will return np array of elm percentages per slice. 
    i.e., it is of length num_slice in the set

    Parameters
    ==========

    dataset: 7-tuple
        Only the sixth entry (the slice ids) is used; ids are strings of the
        form '<pulse>/<time>' where <time> parses as a float.
    pulse_indexes: List[List[int]]
        For each pulse, the positions of its slices in the set.
    pulse_numbers: List[int]
        Pulse numbers, in the same order as ``pulse_indexes``.
    elm_dict: Mapping
        ELM times per pulse number.

    Returns
    =======
    np.ndarray with one entry per visited slice, in the order the pulses are
    visited. The entry is NaN when the pulse has no entry in ``elm_dict`` or the
    slice time is not bracketed by an earlier and a later ELM.
    """
    _, _, _, _, _, setids, _ = dataset
    setids = np.array(setids)
    elm_percentages = []
    lost_slices = 0
    iterator = tqdm(zip(pulse_indexes, pulse_numbers), total=min(len(pulse_indexes), len(pulse_numbers)))
    print('doing elm timings')
    for indexes, number in iterator:
        iterator.set_description(str(number))
        try: 
            pulse_elm_timings_frass = np.array(elm_dict[number])
        except KeyError:
            # No ELM timings for this pulse: every slice of it is lost.
            pulse_elm_percentages = [np.nan]*len(indexes)
            lost_slices += len(indexes)
        else:
            pulse_elm_percentages = []
            hrts_timings = [float(time_slice.split('/')[1]) for time_slice in setids[indexes]]
            for time in hrts_timings: 
                diff = pulse_elm_timings_frass - time
                try:
                    # NOTE(review): taking [-1] / [0] assumes the ELM times are sorted ascending -- confirm.
                    time_last_elm = pulse_elm_timings_frass[diff < 0][-1]
                    time_next_elm = pulse_elm_timings_frass[diff > 0][0]
                except IndexError:
                    # The slice lies before the first or after the last ELM.
                    slice_elm_percent = np.nan
                    lost_slices += 1
                else: 
                    # Argument order must match calculate_elm_percent(time_last_elm, time_next_elm, hrts_time);
                    # passing them swapped yields 1 - fraction instead of the fraction.
                    slice_elm_percent = calculate_elm_percent(time_last_elm, time_next_elm, time)
                pulse_elm_percentages.append(slice_elm_percent)
        elm_percentages.extend(pulse_elm_percentages)
    print('Slices outside of ELM Windows found by Frassinetti: ', lost_slices)
    
    return np.array(elm_percentages)



def make_new_set(dataset, pulse_indexes, pulse_numbers, elm_dict): 
    """
    Now we go about kicking out all of the slices that fell outside the elm window range given by T17-05. 
    
    Parameters
    ==========
    
    dataset: 7-tuple
        Passed through unchanged to find_elm_timings_for_set.
    pulse_indexes: List[List[int]]
        For each pulse, a list of indexes of the profiles/mps/rmids/etc corresponding to that pulse 
        This needs to be rewritten in terms of what is available for ELM timings 
    pulse_numbers: List[int]
        List of pulse numbers in the order of the pulse indexes
    elm_dict: Mapping
        ELM times per pulse number.

    Returns
    =======
    new_indexes: List[List[int]]
        Same layout as ``pulse_indexes``, keeping only the indexes whose ELM
        percentage is not NaN. ``pulse_indexes`` itself is left untouched.
    set_elm_percentages: np.ndarray
        ELM percentage per slice as returned by find_elm_timings_for_set
        (NaN entries included).
    """
    set_elm_percentages = find_elm_timings_for_set(dataset, pulse_indexes, pulse_numbers, elm_dict)

    # Shallow copy: entries are replaced below, never mutated in place.
    new_indexes = list(pulse_indexes)

    iterator = tqdm(zip(pulse_indexes, pulse_numbers), total=min(len(pulse_indexes), len(pulse_numbers)))
    for n, (indexes, number) in enumerate(iterator):
        iterator.set_description(str(number))
        # dtype=int keeps an empty index list usable as an index array.
        indexes = np.asarray(indexes, dtype=int)
        has_elm_percentage = ~np.isnan(set_elm_percentages[indexes])
        new_indexes[n] = indexes[has_elm_percentage].tolist()

    return new_indexes, set_elm_percentages
    
    


# Filter every split down to the slices that have an ELM percentage.
train_new_pulse_ids, train_elm_percentages = make_new_set(
    dataset=train_data,
    pulse_indexes=train_pulse_idxs,
    pulse_numbers=train_pulses_ordered_set,
    elm_dict=JET_ELM_TIMINGS,
)
val_new_pulse_ids, val_elm_percentages = make_new_set(
    dataset=val_data,
    pulse_indexes=val_pulse_idxs,
    pulse_numbers=val_pulses_ordered_set,
    elm_dict=JET_ELM_TIMINGS,
)
test_new_pulse_ids, test_elm_percentages = make_new_set(
    dataset=test_data,
    pulse_indexes=test_pulse_idxs,
    pulse_numbers=test_pulses_ordered_set,
    elm_dict=JET_ELM_TIMINGS,
)



0it [00:00, ?it/s]

doing elm timings
Slices outside of ELM Windows found by Frassinetti:  4518


0it [00:00, ?it/s]

0it [00:00, ?it/s]

doing elm timings
Slices outside of ELM Windows found by Frassinetti:  1241


0it [00:00, ?it/s]

0it [00:00, ?it/s]

doing elm timings
Slices outside of ELM Windows found by Frassinetti:  561


0it [00:00, ?it/s]

In [26]:
# (number of training slices without an ELM percentage, total number of training slices)
np.isnan(train_elm_percentages).sum(), len(train_elm_percentages)

(4518, 22885)

In [6]:
# Bundle, per split, the filtered per-pulse indexes and the per-slice ELM percentages, then save them.
elm_timing_dict = {
    split_name: {'pulse_idx': split_pulse_idx, 'elm_percentages': split_elm_percentages}
    for split_name, split_pulse_idx, split_elm_percentages in (
        ('train', train_new_pulse_ids, train_elm_percentages),
        ('val', val_new_pulse_ids, val_elm_percentages),
        ('test', test_new_pulse_ids, test_elm_percentages),
    )
}
with open('../../data/processed/new_elm_timings_idxs.pickle', 'wb') as file:
    pickle.dump(elm_timing_dict, file)

# for p in range(5, 100):
#     assert len(train_new_pulse_ids[p]) == len(train_pulse_idxs[p]), p

Collect the neseps for each slice in the train/val/test sets, then store and save them as a smaller subset.

```
    dict_supervised = {'train': {'mps': np.array of shape (N, 13), 'neseps': np.array of shape (N,)}, 
                        'val': {'mps': np.array of shape (N, 13), 'neseps': np.array of shape (N,)}, 
                        'test': {'mps': np.array of shape (N, 13), 'neseps': np.array of shape (N,)}}
```

In [29]:
def calculate_neseps_in_set(set_ids, set_profiles, set_uncerts, set_masks, set_mps, set_timings, set_shot_numbers_by_slice): 
    """
    Compute the separatrix density (nesep) of every slice, pulse by pulse.

    Parameters
    ==========

    set_ids: List[List[int]]
        For each pulse, the positions of its slices in the set.
    set_profiles, set_uncerts, set_masks, set_mps:
        Per-slice arrays; they are indexed with each entry of ``set_ids``, so
        they must support integer-list indexing (e.g. numpy arrays).
    set_timings:
        Unused; kept so existing calls keep working.
    set_shot_numbers_by_slice:
        Per-slice labels; the label of a pulse's first slice is shown on the
        progress bar.

    Returns
    =======
    nesep_means_by_pulse: np.ndarray
        Nesep of every slice, concatenated in pulse order.
    neseps_means: np.ndarray
        Mean nesep per pulse.
    mps_means: np.ndarray
        Mean over axis 0 of the pulse's ``set_mps`` rows, per pulse.
    """
    # Converted once, not on every iteration.
    shot_numbers_by_slice = np.array(set_shot_numbers_by_slice)

    nesep_means_by_pulse = []
    neseps_means = []
    mps_means = []
    iterator = tqdm(range(len(set_ids)))
    for k in iterator: 
        slice_loc_in_set = set_ids[k]
        pulse_number = shot_numbers_by_slice[slice_loc_in_set][0]

        # set_timings is deliberately not indexed here: the result was never used and
        # indexing a plain Python list with a list of positions raises TypeError.
        sample_profiles = set_profiles[slice_loc_in_set]
        sample_uncerts = set_uncerts[slice_loc_in_set]
        sample_masks = set_masks[slice_loc_in_set]
        sample_mps = set_mps[slice_loc_in_set]

        pulse_neseps = calculate_nesep_for_pulse(sample_profiles, sample_uncerts, sample_masks, lbound_n = 0, ubound_n = 0.5e21, lbound_t=0, ubound_t=2000)
        nesep_means_by_pulse.extend(pulse_neseps)
        
        neseps_means.append(pulse_neseps.mean())
        mps_means.append(sample_mps.mean(0))
        iterator.set_description_str(str(pulse_number))
        # No `break` here: every pulse must be processed, not just the first one.
        
    return np.array(nesep_means_by_pulse), np.array(neseps_means), np.array(mps_means)


def _truncnorm_quantile_grid(means, scales, lower, upper, q_low, q_high, num=10000):
    """For each (mean, scale) pair, return ``num`` evenly spaced values between the
    ``q_low`` and ``q_high`` quantiles of a normal truncated to [lower, upper]."""
    return np.array([
        np.linspace(
            truncnorm.ppf(q_low, (lower - mu) / scale, (upper - mu) / scale, mu, scale),
            truncnorm.ppf(q_high, (lower - mu) / scale, (upper - mu) / scale, mu, scale),
            num,
        )
        for mu, scale in zip(means, scales)
    ])


def calculate_nesep_for_pulse(both_profiles, both_uncertanties, integer_masks, lbound_n, ubound_n, lbound_t, ubound_t, conditional_prediction=False):
    """
    This will return the neseps predicted for each time slice in the pulse. 

    Parameters
    ==========

    both_profiles, both_uncertanties:
        Per-slice arrays indexed as ``slice[0, :]`` (density) and ``slice[1, :]``
        (temperature). The uncertainties are passed to ``truncnorm`` as the scale.
    integer_masks:
        Per-slice masks; points with a mask value > 0 are used.
    lbound_n, ubound_n, lbound_t, ubound_t:
        Truncation bounds for the density and temperature distributions.
    conditional_prediction: bool
        If True, every point is used and every uncertainty is replaced by 200.

    Returns
    =======
    np.ndarray with one nesep per slice; NaN for a slice in which no sampled
    temperature falls strictly between 90 and 110.
    """
    pulse_neseps = np.zeros(len(both_profiles))
    if conditional_prediction: 
        integer_masks = np.ones_like(integer_masks, dtype=bool)
        both_uncertanties = np.full_like(both_uncertanties, 200, dtype=float)
    
    for n, (profiles_slice, uncertainties_slice, mask_slice) in enumerate(zip(both_profiles, both_uncertanties, integer_masks)):
        bool_mask = mask_slice > 0
        slice_ne, slice_te = profiles_slice[0, :][bool_mask], profiles_slice[1, :][bool_mask]
        slice_ne_uncert, slice_te_uncert = uncertainties_slice[0, :][bool_mask], uncertainties_slice[1, :][bool_mask]

        tes_gaussians = _truncnorm_quantile_grid(slice_te, slice_te_uncert, lbound_t, ubound_t, 0.0001, 0.9999)
        # Sampled temperatures strictly inside the 90-110 window mark the separatrix.
        separatrix_loc = np.logical_and(tes_gaussians > 90, tes_gaussians < 110)

        # NOTE(review): the density grid spans the 0.1-0.9 quantiles while the temperature grid
        # spans 0.0001-0.9999; the two grids are paired position by position -- confirm this is intended.
        nes_gaussians = _truncnorm_quantile_grid(slice_ne, slice_ne_uncert, lbound_n, ubound_n, 0.1, 0.9)

        nes_in_separatrix = nes_gaussians[separatrix_loc]
        if len(nes_in_separatrix) == 0: 
            # Nothing selected: report the slice and store NaN explicitly rather than
            # letting np.mean warn about an empty selection.
            print(slice_ne, slice_te, slice_ne_uncert, slice_te_uncert)
            pulse_neseps[n] = np.nan
            continue
        pulse_neseps[n] = np.mean(nes_in_separatrix)
    
    return pulse_neseps
                    

In [30]:
# Per-slice neseps, per-pulse mean neseps and per-pulse mean machine parameters for every split.
train_neseps, train_neseps_mean, train_mps_mean = calculate_neseps_in_set(
    set_ids=train_pulse_idxs,
    set_profiles=train_X,
    set_uncerts=train_uncert,
    set_masks=train_mask,
    set_mps=train_y,
    set_timings=train_pulse_order,
    set_shot_numbers_by_slice=train_ids,
)
val_neseps, val_neseps_mean, val_mps_mean = calculate_neseps_in_set(
    set_ids=val_pulse_idxs,
    set_profiles=val_X,
    set_uncerts=val_uncert,
    set_masks=val_mask,
    set_mps=val_y,
    set_timings=val_pulse_order,
    set_shot_numbers_by_slice=val_ids,
)
test_neseps, test_neseps_mean, test_mps_mean = calculate_neseps_in_set(
    set_ids=test_pulse_idxs,
    set_profiles=test_X,
    set_uncerts=test_uncert,
    set_masks=test_mask,
    set_mps=test_y,
    set_timings=test_pulse_order,
    set_shot_numbers_by_slice=test_ids,
)

  0%|          | 0/533 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/77 [00:00<?, ?it/s]

In [35]:
# Sanity check: one nesep per slice, one mean nesep / mean mps row per pulse.
print(f"{len(train_neseps)} {len(train_y)}")
print(f"{len(train_neseps_mean)} {train_mps_mean.shape}")

22885 22885
533 (533, 13)


In [36]:
# Per split: slice-level mps/neseps plus their pulse-level means; 'MP_ORDER' names the machine parameters.
supervised_dict = {
    split_name: {'mps': split_mps, 'neseps': split_neseps, 'mps_pulse': split_mps_pulse, 'neseps_pulse': split_neseps_pulse}
    for split_name, split_mps, split_neseps, split_mps_pulse, split_neseps_pulse in (
        ('train', train_y, train_neseps, train_mps_mean, train_neseps_mean),
        ('val', val_y, val_neseps, val_mps_mean, val_neseps_mean),
        ('test', test_y, test_neseps, test_mps_mean, test_neseps_mean),
    )
}
supervised_dict['MP_ORDER'] = machine_param_order

In [37]:
# NOTE(review): absolute, user-specific path; the notebook only runs as-is on that machine.
PERSONAL_DATA_DIR_PROC = '/home/kitadam/ENR_Sven/moxie/data/processed/'
# with open(PERSONAL_DATA_DIR_PROC + 'supervised_set.pickle', 'wb') as file:
#     pickle.dump(supervised_dict, file)
# Reload the saved set; this replaces the supervised_dict built in the notebook.
with open(os.path.join(PERSONAL_DATA_DIR_PROC, 'supervised_set.pickle'), 'rb') as file:
    supervised_dict = pickle.load(file)

In [None]:
# Display the reloaded dictionary (NOTE(review): this dumps whole arrays; consider showing only keys/shapes).
supervised_dict