# Update PSI Dataset

Goal: include time-evolving parameters in the dataset.


## Requirements of Dataset 

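- The test set should contain some pulses that are never seen during training.
- The remaining pulses are split between the training, validation and test sets.
- Pulses and time windows come from the JET pedestal database (JET PDB; flat-top H-mode).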


In [11]:
import pandas as pd
import h5py
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
import torch

In [6]:
# Load the JET pedestal database and keep only the columns used to select
# pulses and their time windows.
jet_pdb = pd.read_csv('/home/adam/ENR_Sven/moxie_copy/data/raw/pedestal-database.csv')
important_info = jet_pdb[[
    'shot',
    't1',
    't2',
    'neseparatrixfromexpdata10^19(m^-3)',
    'error_neseparatrixfromexpdata10^19(m^-3)',
    'neseparatrixfromfit10^19(m^-3)',
    'error_neseparatrixfromfit10^19(m^-3)',
    'FLAG:HRTSdatavalidated',
]]

# Keep rows where 'neseparatrixfromexpdata10^19(m^-3)' is neither 0.0 nor -1.0,
# the shot number is at least 79000 and 'FLAG:HRTSdatavalidated' is positive.
final_pulse_list = important_info[
    (important_info['neseparatrixfromexpdata10^19(m^-3)'] != 0.0)
    & (important_info['neseparatrixfromexpdata10^19(m^-3)'] != -1.0)
    & (important_info['shot'] >= 79000)
    & (important_info['FLAG:HRTSdatavalidated'] > 0)
]


In [3]:
def get_window_profile_times(profiles, profiles_times, t1, t2):
    """Select the profiles whose time stamps lie strictly inside (t1, t2).

    Returns a tuple ``(times, profiles)``: the entries of ``profiles_times``
    with ``t1 < time < t2`` and the entries of ``profiles`` picked with the
    same boolean mask along the first axis.
    """
    after_start = profiles_times > t1
    before_end = profiles_times < t2
    in_window = np.logical_and(after_start, before_end)

    windowed_times = profiles_times[in_window]
    windowed_profiles = profiles[in_window]
    return windowed_times, windowed_profiles

def average_machine_with_times(wind_times, mp_values, mp_times, delta_T=0.05002594*8):
    """Average a signal over a trailing window ending at each requested time.

    For every ``t`` in ``wind_times`` the samples of ``mp_values`` whose time
    stamp satisfies ``t - delta_T < mp_times < t`` (both bounds exclusive) are
    averaged with ``np.mean``.

    Parameters
    ----------
    wind_times : sequence of float
        Times at which an averaged value is wanted.
    mp_values, mp_times : np.ndarray
        Signal samples and their time stamps; indexed with the same boolean
        mask, so they are expected to have matching length.
    delta_T : float, optional
        Length of the trailing averaging window, in the units of the time
        arrays. The default ``0.05002594*8`` is the value that used to be
        hard-coded (presumably eight profile sampling periods -- TODO confirm).

    Returns
    -------
    np.ndarray
        One value per entry of ``wind_times``. If ``mp_values`` is empty every
        entry is ``0.0``. A window that contains no samples yields NaN (the
        mean of an empty selection), which callers must handle.
    """
    # A signal with no samples at all is reported as zero at every time.
    if len(mp_values) == 0:
        return np.zeros(len(wind_times))

    window_means = []
    for t_end in wind_times:
        in_window = np.logical_and(mp_times < t_end, mp_times > t_end - delta_T)
        window_means.append(np.mean(mp_values[in_window]))
    return np.array(window_means)

def _pad_profiles_to_width(profiles, width):
    """Zero-pad 2-D ``profiles`` at the end of axis 1 up to ``width`` columns."""
    n_cols = profiles.shape[1]
    if n_cols == width:
        return profiles
    if n_cols > width:
        raise ValueError(
            'profiles have {} points, more than the requested width {}'.format(n_cols, width)
        )
    return np.pad(profiles, ((0, 0), (0, width - n_cols)), 'constant')


def get_ne_and_te_profiles(raw_shot, t1, t2, n_points=63):
    """Return the NE and TE profiles of a shot measured strictly inside (t1, t2).

    Parameters
    ----------
    raw_shot : mapping
        Object indexable with ``'profiles/NE'``, ``'profiles/TE'`` and
        ``'profiles/time'``, each supporting ``[:]`` to read an array.
        NE and TE are treated as 2-D arrays whose first axis is selected
        with a mask built from the time array.
    t1, t2 : float
        Exclusive bounds of the time window.
    n_points : int, optional
        Number of points per profile in the output. Narrower profiles are
        zero-padded at the end; the default 63 is the previously hard-coded
        width.

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``profiles_ne`` and ``profiles_te``, each with ``n_points`` columns.
        When no time stamp falls inside the window, both have zero rows
        (the width is read from the array shape, so this no longer raises
        IndexError).

    Raises
    ------
    ValueError
        If the stored profiles have more than ``n_points`` points.
    """
    sample_ne = raw_shot['profiles/NE'][:]
    sample_te = raw_shot['profiles/TE'][:]
    sample_time = raw_shot['profiles/time'][:]

    profiles_idx = np.logical_and(sample_time > t1, sample_time < t2)
    profiles_ne = _pad_profiles_to_width(sample_ne[profiles_idx], n_points)
    profiles_te = _pad_profiles_to_width(sample_te[profiles_idx], n_points)

    return profiles_ne, profiles_te

def combine_ne_te(prof_ne, prof_te):
    """Stack the density and temperature profiles along a new second axis.

    For two inputs of shape ``(num_slices, n_points)`` the result has shape
    ``(num_slices, 2, n_points)``: index 0 of axis 1 holds ``prof_ne`` and
    index 1 holds ``prof_te``.
    """
    return np.stack((prof_ne, prof_te), axis=1)

def sample_input(mp_loc, key, t1, t2, window_times):
    """Sample the signal stored under ``key`` at each time in ``window_times``.

    Reads ``mp_loc[key]['values']`` and ``mp_loc[key]['time']`` and passes
    them to ``average_machine_with_times``. ``t1`` and ``t2`` are accepted
    but not used. Returns an array with one value per entry of
    ``window_times``.
    """
    signal = mp_loc[key]
    signal_values = signal['values'][:]
    signal_times = signal['time'][:]

    averaged = average_machine_with_times(window_times, signal_values, signal_times)
    # Exactly one averaged value is expected per requested time.
    assert len(window_times) == len(averaged)
    return np.array(averaged)

def sample_slices(slices, y, random_state=None):
    """Randomly split paired slices and targets into train, test and validation sets.

    The test set gets ``int(0.2 * n)`` samples and the validation set
    ``int(0.1 * n)``; if either would be empty, both are set to one sample.
    Everything else goes to the training set.

    Parameters
    ----------
    slices, y : sequences of equal length
        Inputs and their targets; split together so pairs stay aligned.
    random_state : int, RandomState instance or None, optional
        Forwarded to both ``train_test_split`` calls. ``None`` (the default)
        keeps the previous non-reproducible behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val,
        train_idx, val_idx, test_idx)``. All three index lists refer to
        positions in the original ``slices``. Previously ``train_idx`` and
        ``val_idx`` were positions inside the intermediate training subset,
        which made them inconsistent with ``test_idx``.

    Raises
    ------
    ValueError
        If fewer than three samples are given, since each of the three sets
        needs at least one sample.
    """
    num_windows = len(slices)
    if num_windows < 3:
        raise ValueError(
            'need at least 3 slices to build train/test/validation sets, got {}'.format(num_windows)
        )

    test_size = int(0.2*num_windows)
    val_size = int(0.1*num_windows)

    if val_size == 0 or test_size == 0:
        val_size = 1
        test_size = 1

    # First carve out the test set, tracking the original positions.
    X_rest, X_test, y_rest, y_test, rest_idx, test_idx = train_test_split(
        slices, y, range(num_windows), test_size=test_size, random_state=random_state)
    # Split the remainder, carrying the original positions along so the
    # returned indices stay valid for the caller's arrays.
    X_train, X_val, y_train, y_val, train_idx, val_idx = train_test_split(
        X_rest, y_rest, rest_idx, test_size=val_size, random_state=random_state)

    return X_train, X_test, X_val, y_train, y_test, y_val, train_idx, val_idx, test_idx

def plot_profile(single_profile):
    """Draw ``single_profile`` as a line plot in a new figure and show it."""
    figure = plt.figure()
    axes = figure.gca()
    axes.plot(single_profile)
    plt.show()

In [41]:
# Accumulators for the density-only dataset ("str" suffix): X holds NE
# profiles, y holds the sampled machine parameters.
X_train_str, y_train_str = [], []
X_val_str, y_val_str = [], []
X_test_str, y_test_str = [], []

# Accumulators for the combined dataset ("both" suffix): X holds stacked
# NE + TE profiles, y holds the sampled machine parameters.
X_train_both, y_train_both = [], []
X_val_both, y_val_both = [], []
X_test_both, y_test_both = [], []

radius_train_str = []
radius_val_str = []
radius_test_str = []
radius_train_both = []
radius_val_both = []
radius_test_both = []
i = 0
pulse_list = []
# Machine-parameter keys mapped to plot labels. Raw strings keep the LaTeX
# backslashes literal without relying on invalid escape sequences; the
# resulting string values are unchanged.
label_dict = {
    'BT': r'$B_T$ [T]',
    'CR0': r'a [m]',
    'ELER': r'$\Gamma \; (10^{22}$ e/s)',
    'ELON': r'$\kappa$ [-]',
    'POHM': r'$P_{OHM}$ [MW]',
    'P_ICRH': r'$P_{ICRH}$ [MW]',
    'P_NBI': r'$P_{NBI}$ [MW]',
    'Q95': r'$q_{95}$ [-]',
    'RGEO': r'$R_{geo}$ [m]',
    'TRIL': r'$\delta_L$',
    'TRIU': r'$\delta_U$',
    'VOLM': r'$V_P$ [m$^{-3}$]',
    'XIP': r'$I_P$ [MA]',
}
with h5py.File('/home/adam/ENR_Sven/moxie_copy/data/processed/profile_database_v1_psi22.hdf5', 'r') as f:
    mp_keys = list(label_dict)
    # Column of the sampled parameter matrix that holds P_ICRH.
    icrh_col = mp_keys.index('P_ICRH')
    print(mp_keys)
    for index, row in final_pulse_list.iterrows():
        shot, t1, t2 = str(int(row['shot'])), row['t1'], row['t2']
        # This shot is skipped explicitly.
        if shot == '79499':
            continue
        nesep, dnesep = row['neseparatrixfromexpdata10^19(m^-3)'], row['error_neseparatrixfromexpdata10^19(m^-3)']
        pulse_sample = f[shot]
        # One entry per database row, so a shot with several time windows
        # appears several times.
        pulse_list.append(shot)

        sample_mp = pulse_sample['machine_parameters']
        sample_prof = pulse_sample['profiles']
        prof_times = sample_prof['time'][:]
        profiles = sample_prof['NE'][:]
        radii = sample_prof['radius'][:]

        # Time stamps of the profile slices strictly inside (t1, t2).
        window_times, windowed_profiles = get_window_profile_times(profiles, prof_times, t1, t2)

        # Get profiles
        profiles_ne, profiles_te = get_ne_and_te_profiles(pulse_sample, t1, t2)

        # Per-slice statistics, computed before any value is clipped.
        mean, std = np.mean(profiles_te, 1), np.std(profiles_te, 1)

        # Zero every TE point above mean + 3*std of its own slice. The loop
        # covers all slices; it used to stop one short, leaving the last
        # slice of each window unclipped.
        for k in range(len(profiles_te)):
            profiles_te[k][profiles_te[k] > mean[k] + 3*std[k]] = 0.0
            # Post-clipping check: plot the slice if anything is still above
            # the threshold.
            if (profiles_te[k] > mean[k] + 3*std[k]).any():
                plt.plot(profiles_te[k])
                plt.hlines([mean[k], mean[k] + 3*std[k]], 0, 63, colors=['black', 'grey'])
        plt.show()

        combined = combine_ne_te(profiles_ne, profiles_te)
        assert len(profiles_ne) == len(window_times)

        # Get machine parameters: one row per slice, one column per key.
        sampled_machine_params = np.array([sample_input(sample_mp, key, t1, t2, window_times) for key in mp_keys]).T
        assert len(sampled_machine_params) == len(profiles_ne)

        # NaNs are only tolerated when the P_ICRH column contains some.
        # NOTE(review): nan_to_num then replaces NaNs in every column, not
        # just P_ICRH -- confirm this is intended.
        if np.isnan(sampled_machine_params).any():
            if np.isnan(sampled_machine_params[:, icrh_col]).any():
                np.nan_to_num(sampled_machine_params, copy=False)
        assert not np.isnan(sampled_machine_params).any()

        # The density-only and combined datasets are split independently.
        ne_train, ne_test, ne_valid, ne_y_train, ne_y_test, ne_y_val, train_idx, val_idx, test_idx = sample_slices(profiles_ne, sampled_machine_params)

        both_train, both_test, both_valid, both_y_train, both_y_test, both_y_val, both_train_idx, both_val_idx, both_test_idx = sample_slices(combined, sampled_machine_params)

        X_train_str.extend(ne_train)
        X_val_str.extend(ne_valid)
        X_test_str.extend(ne_test)
        y_train_str.extend(ne_y_train)
        y_val_str.extend(ne_y_val)
        y_test_str.extend(ne_y_test)

        X_train_both.extend(both_train)
        X_val_both.extend(both_valid)
        X_test_both.extend(both_test)
        y_train_both.extend(both_y_train)
        y_val_both.extend(both_y_val)
        y_test_both.extend(both_y_test)

        # NOTE(review): `radii` is the shot's whole radius array (not limited
        # to the time window, not indexed per slice) and is repeated once per
        # sample -- confirm whether a per-slice radius was intended.
        radius_train_str.extend([radii] * len(ne_train))
        radius_val_str.extend([radii] * len(ne_valid))
        radius_test_str.extend([radii] * len(ne_test))
        radius_train_both.extend([radii] * len(both_train))
        radius_val_both.extend([radii] * len(both_valid))
        radius_test_both.extend([radii] * len(both_test))

['BT', 'CR0', 'ELER', 'ELON', 'POHM', 'P_ICRH', 'P_NBI', 'Q95', 'RGEO', 'TRIL', 'TRIU', 'VOLM', 'XIP']


In [37]:
# Print the number of density-only training inputs and targets side by side,
# followed by the list of processed shots.
n_train_inputs = len(X_train_str)
n_train_targets = len(y_train_str)
print(n_train_inputs, n_train_targets)
print(pulse_list)

59465 59465
['79192', '79192', '79193', '79193', '79195', '79195', '79197', '79198', '79138', '79498', '79501', '79503', '79505', '79507', '79508', '79512', '79513', '79515', '79517', '79518', '79523', '79523', '79572', '79573', '79573', '79573', '79582', '79584', '79585', '79587', '79588', '79589', '79622', '79623', '79625', '79625', '79626', '79626', '79628', '79628', '79629', '79629', '79630', '79630', '79630', '79631', '79633', '79634', '79635', '79636', '79638', '79638', '79639', '79641', '79641', '79641', '79641', '79642', '79642', '79643', '79643', '79643', '79647', '79647', '79651', '79651', '79651', '79653', '79653', '79653', '79654', '79654', '79656', '79656', '79656', '79657', '79657', '79657', '79661', '79664', '79666', '79667', '79668', '79669', '79672', '79673', '79675', '79676', '79676', '79679', '79682', '79684', '79685', '79687', '79688', '79690', '79691', '79692', '79696', '79697', '79697', '79698', '79717', '79717', '79718', '79719', '79720', '79722', '79723', '79725

In [42]:
# Replace the combined density + temperature dataset stored under
# processed_datasets/PSI22 in the HDF5 file.
with h5py.File('/home/adam/ENR_Sven/moxie_copy/data/processed/profile_database_v1_psi22.hdf5', 'r+') as f:
    grp_datasets = f['processed_datasets']
    grp_psi = grp_datasets['PSI22']

    # Remove a previous version of the group only if it exists: an
    # unconditional `del` raises KeyError when it has not been written yet.
    #del grp_psi['density_revised']
    if 'density_and_temperature_revised' in grp_psi:
        del grp_psi['density_and_temperature_revised']

    #grp_strohman = grp_psi.create_group("density_revised")
    grp_both = grp_psi.create_group("density_and_temperature_revised")

    # --- Disabled: density-only dataset and its metadata ---
    # grp_str_train = grp_strohman.create_group('train')
    # grp_str_test = grp_strohman.create_group('test')
    # grp_str_val = grp_strohman.create_group('valid')

    # meta_group = grp_strohman.create_group("meta")

    # pulse_list = np.array(list(set([int(key) for key in pulse_list])))
    # key_list = np.array([s.encode('utf-8') for s in mp_keys])
    # meta_pulse = meta_group.create_dataset('pulse_list', data=pulse_list)
    # meta_y_atrr = meta_group.create_dataset('y_column_names', data=key_list)

    # dset_X_train_str = grp_str_train.create_dataset("X", data=X_train_str)
    # dset_y_train_str = grp_str_train.create_dataset("y", data=y_train_str)
    # dset_r_train_str = grp_str_train.create_dataset("radii", data=radius_train_str)

    # dset_X_test_str = grp_str_test.create_dataset("X", data=X_test_str)
    # dset_y_test_str = grp_str_test.create_dataset("y", data=y_test_str)
    # dset_r_test_str = grp_str_test.create_dataset("radii", data=radius_test_str)

    # dset_X_val_str = grp_str_val.create_dataset("X", data=X_val_str)
    # dset_y_val_str = grp_str_val.create_dataset("y", data=y_val_str)
    # dset_r_val_str = grp_str_val.create_dataset("radii", data=radius_val_str)

    # One sub-group per split, each holding the inputs "X" and targets "y".
    both_splits = (
        ('train', X_train_both, y_train_both),
        ('test', X_test_both, y_test_both),
        ('valid', X_val_both, y_val_both),
    )
    for split_name, X_split, y_split in both_splits:
        grp_split = grp_both.create_group(split_name)
        grp_split.create_dataset("X", data=X_split)
        grp_split.create_dataset("y", data=y_split)
        # Disabled: the radii lists (radius_train_both, radius_test_both,
        # radius_val_both) are not written.
        # grp_split.create_dataset("radii", data=...)