In [31]:
import pickle 
import numpy as np
import pandas as pd
import h5py
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw per-pulse dictionaries that the following cells convert to HDF5.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# open pickle files that come from a trusted source.
with open('all_shots_with_T_raw_p3.pickle', 'rb') as file:
    pulse_dicts = pickle.load(file)

# Kept for reference only (not executed): how to re-save the dictionaries.
#     with open('all_shots_with_T_raw_p3.pickle', 'wb') as file:
#         pickle.dump(pulse_dicts, file, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
# Inputs listed here are stored with the shared 'EFIT_T' time base when the
# HDF5 file is built; every other input carries its own 'time' array.
input_cols_efit = [
    'Q95', 'RGEO', 'CR0',
    'VOLM', 'TRIU', 'TRIL',
    'XIP', 'ELON', 'POHM',
]

# Remaining input columns, grouped by source.
# NOTE(review): grouping inferred from the variable names only - confirm.
input_cols_scal = ['BT']
input_cols_gash = ['ELER']
input_cols_nbi = ['P_NBI']
input_cols_icrh = ['P_ICRH']

# Key under which the shared time base of the EFIT inputs is stored.
col_time_efit = ['EFIT_T']

In [5]:
# Convert the per-pulse dictionaries into one HDF5 file laid out as
#   <pulse_id>/machine_parameters/<input_id>/{values,time}
#   <pulse_id>/profiles/<out_id>
# Mode 'w' truncates any existing file. The context manager guarantees the
# file is closed even if an unexpected (non-ValueError) exception escapes.
with h5py.File('../processed/profile_database.hdf5', 'w') as f:
    for pulse_id, pulse_data in pulse_dicts.items():
        try:
            pulse_grp = f.create_group(pulse_id)

            input_grp = pulse_grp.create_group('machine_parameters')
            profile_grp = pulse_grp.create_group('profiles')

            pulse_input_data = pulse_data['inputs']
            pulse_output_data = pulse_data['outputs']

            for input_id, input_data in pulse_input_data.items():
                # 'EFIT_T' is only a time base: it is attached to each EFIT
                # input below instead of getting a group of its own.
                # NOTE(review): 'CRO' (letter O) differs from the 'CR0' (zero)
                # listed in input_cols_efit - confirm which key is meant here.
                if input_id == 'EFIT_T' or input_id == 'CRO':
                    continue
                individual_grp = input_grp.create_group(input_id)

                if input_id not in input_cols_efit:
                    # Non-EFIT inputs carry their own values and time arrays.
                    input_data, input_time = input_data['values'], input_data['time']
                else:
                    # EFIT inputs all share the pulse-wide EFIT time base.
                    input_time = pulse_input_data['EFIT_T']

                individual_grp.create_dataset("values", data=np.array(input_data))
                individual_grp.create_dataset("time", data=np.array(input_time))

            for out_id, output_data in pulse_output_data.items():
                profile_grp.create_dataset(out_id, data=output_data)

        except ValueError as e:
            # Best effort: report the failing pulse and carry on with the next
            # one. Anything written for this pulse before the error stays in
            # the file.
            print(f"{pulse_id}: {e}")

In [None]:
def printname(name):
    """Identity helper: hand back ``name`` exactly as it was received."""
    result = name
    return result

# Print the layout (names only) of the first top-level entry in the HDF5 file:
# its sub-groups, their members and - where a member is itself a group - the
# names inside it.
with h5py.File('../processed/profile_database.hdf5', 'r') as file:
    for name in file:
        print(name)
        pulse_group = file[name]
        for subgroup in pulse_group:
            print(subgroup)
            for index in pulse_group[subgroup]:
                print(index)
                node = pulse_group[subgroup][index]
                # Only groups have named children. Iterating over a dataset
                # would walk (and print) its data rows instead of names.
                if isinstance(node, h5py.Group):
                    for field in node:
                        print(field)
        # One entry is enough to see the structure.
        break

Now we can create the PSI database: the train/validation/test datasets written under `processed_datasets/PSI22` in the HDF5 file.

In [40]:
# Machine parameters whose values are sampled from the stored EFIT inputs.
EFIT = [
    'Q95', 'RGEO', 'CR0',
    'VOLM', 'TRIU', 'TRIL',
    'XIP', 'ELON', 'POHM',
]
# Column order of the target vector `y`. 'NE' is not read from the file:
# sample_input fills it with the separatrix density passed in by the caller.
ALL_KEYS = [*EFIT, 'BT', 'ELER', 'P_NBI', 'P_ICRH', 'NE']
def sample_nesep(mean_val, std_val):
    """Draw one value from a normal distribution.

    Uses numpy's global random state, so results are only repeatable when the
    caller seeds it (``np.random.seed``).

    Parameters
    ----------
    mean_val : float
        Centre of the distribution.
    std_val : float
        Standard deviation of the distribution.

    Returns
    -------
    float
        The sampled value.
    """
    draw = np.random.normal(loc=mean_val, scale=std_val)
    return draw

def sample_input(input_grp, key, t1, t2, nesep):
    """Return the value of one machine parameter for the window (t1, t2).

    Parameters
    ----------
    input_grp : mapping
        Object indexable by ``'<key>/time'`` and ``'<key>/values'``, each
        returning something sliceable with ``[:]`` into a numpy array (for
        example the 'machine_parameters' group of a pulse).
    key : str
        Name of the parameter. The special key ``'NE'`` is not looked up in
        ``input_grp``; ``nesep`` is returned as-is instead.
    t1, t2 : float
        Exclusive lower and upper bounds of the time window.
    nesep : float
        Value handed back when ``key == 'NE'``.

    Returns
    -------
    float
        Mean of the values whose time stamp lies strictly between ``t1`` and
        ``t2``; NaN when no sample falls inside the window.
    """
    if key == 'NE':
        return nesep

    sample_time = input_grp[key + '/time'][:]
    sample_values = input_grp[key + '/values'][:]

    # Strict inequalities: samples exactly at t1 or t2 are excluded.
    time_idx = np.logical_and(sample_time > t1, sample_time < t2)
    if not time_idx.any():
        # np.mean of an empty selection also yields NaN, but only after a
        # RuntimeWarning; make the "no data in window" result explicit.
        return np.nan

    return np.mean(sample_values[time_idx])

def _pad_profiles_to_width(profiles, n_points):
    """Zero-pad the columns of a 2-D profile array on the right up to n_points."""
    width = profiles.shape[1]
    if width == n_points:
        return profiles
    if width > n_points:
        raise ValueError(
            f"profiles have {width} points, more than the requested width {n_points}"
        )
    return np.pad(profiles, ((0, 0), (0, n_points - width)), 'constant')


def get_ne_and_te_profiles(raw_shot, t1, t2, n_points=63):
    """Select the NE and TE profile slices recorded inside the window (t1, t2).

    Parameters
    ----------
    raw_shot : mapping
        Object indexable by 'profiles/NE', 'profiles/TE', 'profiles/time' and
        'profiles/radius', each returning something sliceable with ``[:]``
        into a numpy array (for example the HDF5 group of one pulse).
        NE and TE are expected to be 2-D with one row per time stamp.
    t1, t2 : float
        Exclusive lower and upper bounds of the time window.
    n_points : int, optional
        Number of columns every returned profile is zero-padded to
        (default 63, the width previously hard-coded here).

    Returns
    -------
    profiles_ne, profiles_te : numpy.ndarray
        One row per selected slice and ``n_points`` columns. When no slice
        falls inside the window both arrays have zero rows.
    sample_radius : numpy.ndarray
        Contents of 'profiles/radius', returned unchanged (neither windowed
        nor padded).

    Raises
    ------
    ValueError
        If the stored profiles are wider than ``n_points``.
    """
    sample_ne = raw_shot['profiles/NE'][:]
    sample_te = raw_shot['profiles/TE'][:]
    sample_time = raw_shot['profiles/time'][:]
    sample_radius = raw_shot['profiles/radius'][:]

    # Strict inequalities: slices exactly at t1 or t2 are excluded.
    profiles_idx = np.logical_and(sample_time > t1, sample_time < t2)

    # The width is read from the array shape rather than from the first row,
    # so an empty selection no longer raises IndexError.
    profiles_ne = _pad_profiles_to_width(sample_ne[profiles_idx], n_points)
    profiles_te = _pad_profiles_to_width(sample_te[profiles_idx], n_points)

    return profiles_ne, profiles_te, sample_radius

def combine_ne_te(prof_ne, prof_te):
    """Stack the NE and TE profiles along a new axis at position 1.

    For two equally shaped inputs of shape (num_slices, width) the result has
    shape (num_slices, 2, width): index 0 on the new axis holds ``prof_ne``,
    index 1 holds ``prof_te``.
    """
    return np.stack((prof_ne, prof_te), axis=1)

def sample_slices(slices, y, random_state=None):
    """Randomly split the slices of one pulse into train, test and validation sets.

    About 20% of the slices go to the test set and 10% to the validation set.
    If either share rounds down to zero, both are set to exactly one slice.

    Parameters
    ----------
    slices : array-like
        Samples to split (one entry per time slice).
    y : array-like
        Targets, same length as ``slices``.
    random_state : int, RandomState instance or None, optional
        Forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the previous, unseeded behaviour.

    Returns
    -------
    X_train, X_test, X_val, y_train, y_test, y_val, train_idx, val_idx, test_idx
        The three splits of ``slices`` and ``y``, followed by the positions
        each split's rows had in the original ``slices``.

    Notes
    -----
    ``train_test_split`` raises ``ValueError`` when a split would leave the
    training part empty, which happens for very short inputs.
    """
    num_windows = len(slices)

    test_size = int(0.2 * num_windows)
    val_size = int(0.1 * num_windows)

    if val_size == 0 or test_size == 0:
        val_size = 1
        test_size = 1

    # First split off the test set, tracking original positions alongside.
    X_rest, X_test, y_rest, y_test, rest_idx, test_idx = train_test_split(
        slices, y, list(range(num_windows)),
        test_size=test_size, random_state=random_state,
    )
    # Then split the remainder into train and validation. The original
    # positions (rest_idx) are carried through so that train_idx and val_idx
    # refer to `slices`, not to the intermediate subset.
    X_train, X_val, y_train, y_val, train_idx, val_idx = train_test_split(
        X_rest, y_rest, rest_idx,
        test_size=val_size, random_state=random_state,
    )

    return X_train, X_test, X_val, y_train, y_test, y_val, train_idx, val_idx, test_idx

In [27]:
# Load the pedestal database and keep only the columns needed below.
jet_pdb = pd.read_csv('pedestal-database.csv')
important_info = jet_pdb[[
    'shot',
    't1',
    't2',
    'neseparatrixfromexpdata10^19(m^-3)',
    'error_neseparatrixfromexpdata10^19(m^-3)',
    'neseparatrixfromfit10^19(m^-3)',
    'error_neseparatrixfromfit10^19(m^-3)',
    'FLAG:HRTSdatavalidated',
]]

# Row filter: the experimental separatrix density must be neither 0.0 nor
# -1.0, the shot number at least 79000, and the HRTS validation flag positive.
nesep_exp = important_info['neseparatrixfromexpdata10^19(m^-3)']
nesep_not_zero = nesep_exp != 0.0
nesep_not_minus_one = nesep_exp != -1.0
shot_recent_enough = important_info['shot'] >= 79000
hrts_validated = important_info['FLAG:HRTSdatavalidated'] > 0

final_pulse_list = important_info[
    nesep_not_zero & nesep_not_minus_one & shot_recent_enough & hrts_validated
]


In [None]:
# Build the PSI22 train/valid/test datasets from the per-pulse groups and
# store them in the same HDF5 file under
#   processed_datasets/PSI22/density/{train,test,valid}/{X,y,radii}
#   processed_datasets/PSI22/density_and_temperature/{train,test,valid}/{X,y,radii}
#   processed_datasets/PSI22/meta/{pulse_list,y_column_names}
# The cell can be re-run: existing groups are reused and existing datasets
# are replaced.


def _replace_dataset(group, name, data):
    """Write ``data`` to ``group[name]``, dropping a dataset left by an earlier run."""
    if name in group:
        del group[name]
    return group.create_dataset(name, data=data)


# "str" lists hold density-only (NE) samples, "both" lists hold stacked
# NE + TE samples. `y` is the machine-parameter vector in ALL_KEYS order.
X_train_str, y_train_str = [], []
X_val_str, y_val_str = [], []
X_test_str, y_test_str = [], []

X_train_both, y_train_both = [], []
X_val_both, y_val_both = [], []
X_test_both, y_test_both = [], []

radius_train_str = []
radius_val_str = []
radius_test_str = []
radius_train_both = []
radius_val_both = []
radius_test_both = []

pulse_list = []
with h5py.File('../processed/profile_database.hdf5', 'r+') as f:
    # require_group opens the group if present and creates it otherwise, so
    # every group is handled independently of the others.
    grp_datasets = f.require_group("processed_datasets")
    grp_psi = grp_datasets.require_group("PSI22")

    grp_strohman = grp_psi.require_group("density")
    grp_both = grp_psi.require_group("density_and_temperature")

    grp_str_train = grp_strohman.require_group('train')
    grp_str_test = grp_strohman.require_group('test')
    grp_str_val = grp_strohman.require_group('valid')

    grp_both_train = grp_both.require_group('train')
    grp_both_test = grp_both.require_group('test')
    grp_both_val = grp_both.require_group('valid')

    # loop through JET PDB
    for _, row in final_pulse_list.iterrows():
        shot, t1, t2 = str(int(row['shot'])), row['t1'], row['t2']
        nesep = row['neseparatrixfromexpdata10^19(m^-3)']
        raw_shot = f[shot]
        pulse_list.append(shot)

        profiles_ne, profiles_te, radii = get_ne_and_te_profiles(raw_shot, t1, t2)
        combined = combine_ne_te(profiles_ne, profiles_te)

        # The machine parameters are window averages, identical for every
        # slice of this entry: compute the row once and repeat it per slice.
        machine_grp = raw_shot['machine_parameters']
        param_row = np.array([sample_input(machine_grp, key, t1, t2, nesep) for key in ALL_KEYS])
        other_params = np.tile(param_row, (len(profiles_ne), 1))

        # The two calls draw independent random splits.
        ne_train, ne_test, ne_valid, ne_y_train, ne_y_test, ne_y_val, train_idx, val_idx, test_idx = sample_slices(profiles_ne, other_params)

        both_train, both_test, both_valid, both_y_train, both_y_test, both_y_val, both_train_idx, both_val_idx, both_test_idx = sample_slices(combined, other_params)

        X_train_str.extend(ne_train)
        X_val_str.extend(ne_valid)
        X_test_str.extend(ne_test)
        y_train_str.extend(ne_y_train)
        y_val_str.extend(ne_y_val)
        y_test_str.extend(ne_y_test)

        X_train_both.extend(both_train)
        X_val_both.extend(both_valid)
        X_test_both.extend(both_test)
        y_train_both.extend(both_y_train)
        y_val_both.extend(both_y_val)
        y_test_both.extend(both_y_test)

        # One copy of the pulse's radius array per stored sample.
        radius_train_str.extend([radii] * len(ne_train))
        radius_val_str.extend([radii] * len(ne_valid))
        radius_test_str.extend([radii] * len(ne_test))
        radius_train_both.extend([radii] * len(both_train))
        radius_val_both.extend([radii] * len(both_valid))
        radius_test_both.extend([radii] * len(both_test))

    meta_group = grp_psi.require_group("meta")
    # Unique pulse numbers, sorted so the stored order is deterministic.
    pulse_list = np.array(sorted({int(key) for key in pulse_list}))
    # Column names are stored as bytes.
    key_list = np.array([s.encode('utf-8') for s in ALL_KEYS])
    _replace_dataset(meta_group, 'pulse_list', pulse_list)
    _replace_dataset(meta_group, 'y_column_names', key_list)

    print(np.vstack(X_train_str).shape, np.vstack(y_train_str).shape)
    print(np.vstack(X_val_str).shape, np.vstack(y_val_str).shape)
    print(np.vstack(X_test_str).shape, np.vstack(y_test_str).shape)

    print(np.stack(X_train_both).shape, np.stack(y_train_both).shape)
    print(np.stack(X_val_both).shape, np.stack(y_val_both).shape)
    print(np.stack(X_test_both).shape, np.stack(y_test_both).shape)

    # (target group, X, y, radii) for every split of both dataset variants.
    split_outputs = (
        (grp_str_train, X_train_str, y_train_str, radius_train_str),
        (grp_str_test, X_test_str, y_test_str, radius_test_str),
        (grp_str_val, X_val_str, y_val_str, radius_val_str),
        (grp_both_train, X_train_both, y_train_both, radius_train_both),
        (grp_both_test, X_test_both, y_test_both, radius_test_both),
        (grp_both_val, X_val_both, y_val_both, radius_val_both),
    )
    for split_grp, X_split, y_split, radii_split in split_outputs:
        _replace_dataset(split_grp, "X", X_split)
        _replace_dataset(split_grp, "y", y_split)
        _replace_dataset(split_grp, "radii", radii_split)