In [3]:
import pickle 
import numpy as np
import pandas as pd
import h5py
from sklearn.model_selection import train_test_split

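- Gather pulses, t1, t2 and nesep (experimental and from fit, each with its error) from the JET pedestal database
    - Drop an entry if its experimental nesep is 0 or -1, if its shot number is below 79000, or if its HRTS data are not flagged as validated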

In [None]:
# Read the pedestal database and keep only the columns used in this notebook.
jet_pdb = pd.read_csv('pedestal-database.csv')
important_info = jet_pdb[[
    'shot',
    't1',
    't2',
    'neseparatrixfromexpdata10^19(m^-3)',
    'error_neseparatrixfromexpdata10^19(m^-3)',
    'neseparatrixfromfit10^19(m^-3)',
    'error_neseparatrixfromfit10^19(m^-3)',
    'FLAG:HRTSdatavalidated',
]]

# Build the row filter one condition at a time so each criterion is readable.
nesep_exp = important_info['neseparatrixfromexpdata10^19(m^-3)']
has_nesep = (nesep_exp != 0.0) & (nesep_exp != -1.0)  # 0.0 and -1.0 are rejected
recent_shot = important_info['shot'] >= 79000
hrts_validated = important_info['FLAG:HRTSdatavalidated'] > 0

final_pulse_list = important_info[has_nesep & recent_shot & hrts_validated]

# final_pulse_list.describe()

- Load the raw profile data for every stored shot

In [None]:
# Load the per-shot dictionaries (looked up later by shot number as a string).
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('all_shots_with_T.pickle', 'rb') as pickle_fh:
    pulse_dicts = pickle.load(pickle_fh)

Create HDF5 file with the dataset groups (Name of group): 

1. (strohman) Density profiles in the time window 
2. (density_and_temperature) Density and Temperature profiles

Each group has `train`, `valid` and `test` subgroups, and each subgroup holds the following datasets:
- X: profiles
- y: corresponding machine parameters for time window

In [None]:
# Open the output file in 'w' mode (h5py creates it, truncating any existing
# file) and lay out one <group>/<split> subgroup per dataset flavour and split.
# The handle `f` stays open until it is closed explicitly further down.
f = h5py.File('../processed/pedestal_profile_dataset_v3.hdf5', 'w')

grp_strohman = f.create_group("strohman")
grp_both = f.create_group("density_and_temperature")

# Density-only dataset: train / test / valid subgroups.
grp_str_train, grp_str_test, grp_str_val = [
    grp_strohman.create_group(split) for split in ('train', 'test', 'valid')
]

# Density + temperature dataset: train / test / valid subgroups.
grp_both_train, grp_both_test, grp_both_val = [
    grp_both.create_group(split) for split in ('train', 'test', 'valid')
]

### Strohman Gathering

- For each shot in pedestal db
    - Gather density profiles within time windows of entry
    - Split them randomly into roughly 70% train, 10% valid and 20% test
    - For each input variable in list, get average of the value for given time window
    - Get nesep and put it also into y
    - put into y
    
### Extended Gathering
- Same as above, but take temperature as well, and stack density and temperature into a 2D input for each time slice

In [None]:
# Parameters that sample_input() reads directly from the inputs dictionary and
# averages against the shared 'EFIT_T' time base.
EFIT = [
    'Q95', 'RGEO', 'CRO',
    'VOLM', 'TRIU', 'TRIL',
    'XIP', 'ELON', 'POHM',
]

# Full, ordered list of parameters gathered for every time window; this is
# also the column order of the `y` arrays built from it.
ALL_KEYS = [*EFIT, 'BT', 'ELER', 'P_NBI', 'P_ICRH', 'NE']
def sample_nesep(mean_val, std_val):
    """Draw one value from a normal distribution.

    `mean_val` is passed as the distribution's mean (loc) and `std_val` as its
    standard deviation (scale). The draw uses NumPy's global random state.
    """
    draw = np.random.normal(loc=mean_val, scale=std_val)
    return draw

def sample_input(input_dict, key, t1, t2, nesep):
    """Return the value of parameter `key` for the time window (t1, t2).

    The lookup depends on `key`, checked in this order:

    * `key` in EFIT: mean of ``input_dict[key]`` over the samples whose
      ``input_dict['EFIT_T']`` time lies strictly between `t1` and `t2`.
      Returns -1 if ``input_dict[key]`` is empty.
    * `key` == 'NE': `nesep` is returned unchanged (no time trace is read).
    * anything else: ``input_dict[key]`` must provide 'time' and 'values';
      the mean of the values strictly inside the window is returned.

    Both the time base and the values are converted to NumPy arrays, so plain
    Python lists are accepted as well as arrays.

    Note: if no sample falls inside the window, the mean is taken over an
    empty selection and the result is NaN (NumPy emits a RuntimeWarning).
    """
    if key in EFIT:
        if len(input_dict[key]) == 0:
            # Signal not available for this shot: sentinel value.
            return -1
        times = np.array(input_dict['EFIT_T'])
        values = np.array(input_dict[key])
    elif key == 'NE':
        return nesep
    else:
        times = np.array(input_dict[key]['time'])
        values = np.array(input_dict[key]['values'])

    # Strict inequalities: samples exactly at t1 or t2 are excluded.
    in_window = np.logical_and(times > t1, times < t2)
    return np.mean(values[in_window])

def _pad_profiles(profiles, profile_len):
    """Zero-pad the second axis of `profiles` on the right up to `profile_len`."""
    width = profiles.shape[1]
    if width > profile_len:
        raise ValueError(
            'profiles have {} points, more than the requested length {}'.format(width, profile_len)
        )
    if width < profile_len:
        profiles = np.pad(profiles, ((0, 0), (0, profile_len - width)), 'constant')
    return profiles


def get_ne_and_te_profiles(raw_shot, t1, t2, profile_len=63):
    """Select the NE and TE profiles recorded strictly between t1 and t2.

    Parameters
    ----------
    raw_shot : mapping with an 'outputs' entry holding 'NE', 'TE' and 'time'.
        'NE' and 'TE' are indexed with a boolean mask built from 'time', so
        they are assumed to be 2-D arrays with one row per time sample
        (TODO confirm against the pickle layout).
    t1, t2 : window bounds; samples exactly at t1 or t2 are excluded.
    profile_len : number of points each returned profile must have
        (default 63). Shorter profiles are zero-padded on the right.

    Returns
    -------
    (profiles_ne, profiles_te) : two arrays whose second axis has length
        `profile_len`. When no sample lies in the window, both have zero rows.

    Raises
    ------
    ValueError
        If the stored profiles have more than `profile_len` points.
    """
    outputs = raw_shot['outputs']
    sample_time = outputs['time']
    profiles_idx = np.logical_and(sample_time > t1, sample_time < t2)

    profiles_ne = _pad_profiles(outputs['NE'][profiles_idx], profile_len)
    profiles_te = _pad_profiles(outputs['TE'][profiles_idx], profile_len)

    # Returns two numpy arrays of the same shape
    return profiles_ne, profiles_te

def combine_ne_te(prof_ne, prof_te):
    """Stack the density and temperature profiles along a new axis 1.

    For two inputs of shape (num_slices, n_points) the result has shape
    (num_slices, 2, n_points): index 0 on axis 1 is `prof_ne`, index 1 is
    `prof_te`.
    """
    channels = (prof_ne, prof_te)
    return np.stack(channels, axis=1)

def sample_slices(slices, y, random_state=None):
    """Randomly split the slices of one time window into train/test/valid.

    About 20% of the slices go to test and 10% to valid (sizes are truncated
    to integers). If either size would be 0, both are set to 1 so that every
    subset receives at least one slice. The remainder is the training set.

    Parameters
    ----------
    slices : sequence of profile slices (X).
    y : sequence of targets, aligned with `slices`.
    random_state : forwarded to both `train_test_split` calls. The default
        None keeps the previous behaviour (unseeded split); pass an int for a
        repeatable split.

    Returns
    -------
    X_train, X_test, X_val, y_train, y_test, y_val
        Note the order: test comes before valid.

    Raises
    ------
    ValueError
        If there are fewer than 3 slices, since each subset needs at least one.
    """
    num_windows = len(slices)
    if num_windows < 3:
        raise ValueError(
            'need at least 3 slices for a train/valid/test split, got {}'.format(num_windows)
        )

    test_size = int(0.2 * num_windows)
    val_size = int(0.1 * num_windows)

    if val_size == 0 or test_size == 0:
        val_size = 1
        test_size = 1

    # First carve out the test set, then take the validation set from the rest.
    X_train, X_test, y_train, y_test = train_test_split(
        slices, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state)

    return X_train, X_test, X_val, y_train, y_test, y_val

In [None]:
# Seed NumPy's global random state so that the random train/valid/test
# assignment (train_test_split is called without a random_state and therefore
# draws from the global state) is repeatable from run to run.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# A three-way split needs at least one slice for each of train, valid and test.
MIN_SLICES_PER_WINDOW = 3

pulses = pulse_dicts.keys()

# Accumulators for the density-only ("strohman") dataset.
X_train_str, y_train_str = [], []
X_val_str, y_val_str = [], []
X_test_str, y_test_str = [], []

# Accumulators for the density + temperature dataset.
X_train_both, y_train_both = [], []
X_val_both, y_val_both = [], []
X_test_both, y_test_both = [], []


pulse_list = []
for index, row in final_pulse_list.iterrows():
    shot, t1, t2 = str(int(row['shot'])), row['t1'], row['t2']
    nesep = row['neseparatrixfromexpdata10^19(m^-3)']
    if shot not in pulses:
        print('Shot {} not stored in adams db, you should probably go get it'.format(shot))
        continue
    raw_shot = pulse_dicts[shot]

    profiles_ne, profiles_te = get_ne_and_te_profiles(raw_shot, t1, t2)
    num_slices = len(profiles_ne)
    if num_slices < MIN_SLICES_PER_WINDOW:
        # Too few slices to populate train, valid and test; skip the window
        # rather than aborting the whole gathering run.
        print('Shot {} has only {} profile(s) between {} and {}, skipping'.format(shot, num_slices, t1, t2))
        continue
    pulse_list.append(shot)

    combined = combine_ne_te(profiles_ne, profiles_te)

    # The parameters depend only on the window, not on the individual slice,
    # so compute the row once and repeat it for every slice.
    window_params = [sample_input(raw_shot['inputs'], key, t1, t2, nesep) for key in ALL_KEYS]
    other_params = np.array([window_params] * num_slices)

    # The two datasets are split independently, so a given slice may land in
    # different subsets of the two groups.
    ne_train, ne_test, ne_valid, ne_y_train, ne_y_test, ne_y_val = sample_slices(profiles_ne, other_params)
    both_train, both_test, both_valid, both_y_train, both_y_test, both_y_val = sample_slices(combined, other_params)

    X_train_str.extend(ne_train)
    X_val_str.extend(ne_valid)
    X_test_str.extend(ne_test)
    y_train_str.extend(ne_y_train)
    y_val_str.extend(ne_y_val)
    y_test_str.extend(ne_y_test)

    X_train_both.extend(both_train)
    X_val_both.extend(both_valid)
    X_test_both.extend(both_test)
    y_train_both.extend(both_y_train)
    y_val_both.extend(both_y_val)
    y_test_both.extend(both_y_test)


# Report the final (num_samples, ...) shapes of every split.
print(np.vstack(X_train_str).shape, np.vstack(y_train_str).shape)
print(np.vstack(X_val_str).shape, np.vstack(y_val_str).shape)
print(np.vstack(X_test_str).shape, np.vstack(y_test_str).shape)

print(np.stack(X_train_both).shape, np.stack(y_train_both).shape)
print(np.stack(X_val_both).shape, np.stack(y_val_both).shape)
print(np.stack(X_test_both).shape, np.stack(y_test_both).shape)

Add the pulses to the respective groups, and close the HDF5 file. 

In [None]:
# Write the density-only ("strohman") splits: X holds the profiles, y the
# matching parameter rows.
dset_X_train_str = grp_str_train.create_dataset("X", data=X_train_str)
dset_y_train_str = grp_str_train.create_dataset("y", data=y_train_str)

dset_X_test_str = grp_str_test.create_dataset("X", data=X_test_str)
dset_y_test_str = grp_str_test.create_dataset("y", data=y_test_str)

dset_X_val_str = grp_str_val.create_dataset("X", data=X_val_str)
dset_y_val_str = grp_str_val.create_dataset("y", data=y_val_str)


# Write the density + temperature splits.
dset_X_train_both = grp_both_train.create_dataset("X", data=X_train_both)
dset_y_train_both = grp_both_train.create_dataset("y", data=y_train_both)

dset_X_test_both = grp_both_test.create_dataset("X", data=X_test_both)
dset_y_test_both = grp_both_test.create_dataset("y", data=y_test_both)

# These handles get their own *_both names so they no longer overwrite the
# density-only validation handles defined above.
dset_X_val_both = grp_both_val.create_dataset("X", data=X_val_both)
dset_y_val_both = grp_both_val.create_dataset("y", data=y_val_both)

In [None]:
# Store bookkeeping information next to the data: which shots contributed and
# what each column of the `y` datasets means.
meta_group = f.create_group("meta")

# A shot is appended to pulse_list once per database entry, so de-duplicate;
# sorting gives the stored list a stable order. pulse_list itself is left
# unchanged.
pulse_set = np.array(sorted({int(key) for key in pulse_list}))

# Column names are stored as UTF-8 encoded byte strings.
key_list = np.array([s.encode('utf-8') for s in ALL_KEYS])

meta_pulse = meta_group.create_dataset('pulse_list', data=pulse_set)
meta_y_atrr = meta_group.create_dataset('y_column_names', data=key_list)

In [None]:
# Close the HDF5 file handle `f` that the earlier cells have been writing to.
f.close()

In [4]:
# Sanity check: re-open the finished file read-only and load the training
# split of the density + temperature group; `[:]` reads each dataset in full.
with h5py.File('../processed/pedestal_profile_dataset_v3.hdf5', 'r') as file:
    group = file['density_and_temperature']
    train_split = group['train']
    X_train = train_split['X'][:]
    y_train = train_split['y'][:]

In [9]:
# X_train.shape, y_train.shape

((59528, 2, 63), (59528, 14))