In [3]:
import pickle 
import numpy as np
import pandas as pd
import h5py

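- Gather pulses, t1, t2, nesep (exp & from fit) from the JET pedestal database
    - If the experimental nesep is 0 or -1, drop the entry
    - Keep only shots >= 79000 whose HRTS data is flagged as validated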

In [18]:
# Read the pedestal database and keep only the columns needed downstream.
jet_pdb = pd.read_csv('pedestal-database.csv')

nesep_exp_col = 'neseparatrixfromexpdata10^19(m^-3)'
nesep_fit_col = 'neseparatrixfromfit10^19(m^-3)'
hrts_flag_col = 'FLAG:HRTSdatavalidated'

important_info = jet_pdb[[
    'shot',
    't1',
    't2',
    nesep_exp_col,
    'error_' + nesep_exp_col,
    nesep_fit_col,
    'error_' + nesep_fit_col,
    hrts_flag_col,
]]

# Row selection: experimental nesep must be neither 0.0 nor -1.0
# (presumably placeholder values for "no measurement" -- TODO confirm),
# the shot number must be at least 79000, and the HRTS validation flag
# must be positive.
nesep_exp_values = important_info[nesep_exp_col]
keep_row = (
    (nesep_exp_values != 0.0)
    & (nesep_exp_values != -1.0)
    & (important_info['shot'] >= 79000)
    & (important_info[hrts_flag_col] > 0)
)
final_pulse_list = important_info[keep_row]

# final_pulse_list.describe()

- Import the raw profile data

In [5]:
# Load the raw per-shot data. The resulting mapping is later indexed by the
# shot number as a string, and each entry is read through its 'outputs' key.
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only open pickle files that come from a trusted source.
with open('all_shots_with_T.pickle', 'rb') as file:
    pulse_dicts = pickle.load(file)

Create HDF5 file with the dataset groups (Name of group): 

1. (strohman) Nesep the same for each profile in the time window 
2. (sampled) Nesep sampled from gaussian distribution with std given by database 

For each group we have the following datasets:
- X: profiles
- y: corresponding nesep value
- shots: shot number of each profile

In [52]:
# Create the output HDF5 file (mode 'w' truncates an existing file) and one
# group per labelling scheme. The handle `f` stays open after this cell; it is
# closed by the f.close() call in the cell that writes the datasets.
f = h5py.File('./pedestal_profile_for_nesep.hdf5', 'w')
grp_strohman = f.create_group("strohman")  # one constant nesep label per time window
grp_sampled = f.create_group("sampled")  # nesep labels sampled from a Gaussian

### Strohman Gathering

- For each shot in pedestal db
    - Gather profiles within time windows of entry
    - Add them to X 
    - Add same number of nesep values from jetpdb to y as there are profiles within window 
    
### Sampling Gathering

- For each shot in pedestal db
    - Gather profiles within time windows of entry
    - Add them to X 
    - Get nesep ± std from the JET pedestal database
    - Create a Gaussian distribution with that mean and standard deviation
    - Add as many nesep values sampled from this Gaussian distribution to y as there are profiles within the window

In [49]:
def sample_nesep(mean_val, std_val, rng=None):
    """Draw a single nesep value from a normal distribution.

    Parameters
    ----------
    mean_val : float
        Centre of the distribution.
    std_val : float
        Standard deviation of the distribution. NumPy raises ``ValueError``
        for a negative value.
    rng : optional
        Object exposing ``normal(loc, scale)``, e.g. a
        ``numpy.random.Generator`` created with a fixed seed, so that the
        sampled labels can be reproduced. When omitted, the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    float
        One sample from N(mean_val, std_val**2).
    """
    if rng is None:
        return np.random.normal(mean_val, std_val)
    return rng.normal(mean_val, std_val)

In [50]:
# Build the training arrays: for every database entry, collect the density
# profiles measured strictly inside its (t1, t2) window and attach one nesep
# label per profile -- once as the constant database value (y) and once as a
# draw from a normal distribution around it (y_sampled).

# Every stored profile is zero-padded on the right up to this many points.
# A profile longer than this makes np.pad raise ValueError (negative pad width).
N_PROFILE_POINTS = 63

pulses = pulse_dicts.keys()
X = []
y = []
y_sampled = []
pulse_list = []
i = 0  # number of database entries that contributed at least one profile
for index, row in final_pulse_list.iterrows():
    shot, t1, t2 = str(int(row['shot'])), row['t1'], row['t2']
    nesep, dnesep = row['neseparatrixfromexpdata10^19(m^-3)'], row['error_neseparatrixfromexpdata10^19(m^-3)']
    if shot not in pulses:
        print('Shot {} not stored in adams db, you should probably go get it'.format(shot))
        continue
    raw_shot = pulse_dicts[shot]['outputs']
    sample_ne = raw_shot['NE']
    sample_time = raw_shot['time']
    # Boolean mask over the time axis; both window bounds are exclusive.
    profiles_idx = np.logical_and(sample_time > t1, sample_time < t2)
    profiles = sample_ne[profiles_idx]
    n_profiles = len(profiles)
    if n_profiles == 0:
        # Nothing was measured inside this window. Indexing profiles[0] below
        # would raise IndexError, so skip the entry instead of aborting the run.
        print('Shot {} has no profiles between t1={} and t2={}, skipping'.format(shot, t1, t2))
        continue
    n_points = len(profiles[0])
    if n_points != N_PROFILE_POINTS:
        # Pad only the second axis, on the right, with zeros.
        profiles = np.pad(profiles, ((0, 0), (0, N_PROFILE_POINTS - n_points)), 'constant')
    X.extend(profiles)
    y.extend([nesep] * n_profiles)
    # One independent draw per profile.
    y_sampled.extend([sample_nesep(nesep, dnesep) for _ in range(n_profiles)])
    pulse_list.extend([shot] * n_profiles)
    i += 1
# Stack the per-profile rows into one 2-D array; the scalar labels become
# column vectors with one row per profile.
X = np.vstack(X)
y = np.vstack(y)
y_sampled = np.vstack(y_sampled)


Add the pulses to the respective groups, and close the HDF5 file. 

In [53]:
# Write X / y / shots into both groups and close the file.
#
# h5py cannot store NumPy unicode ('U') arrays, which is what a list of
# Python 3 str turns into, so the shot numbers are encoded as fixed-length
# byte strings first. Readers get bytes back and can call .astype(str).
shots_as_bytes = np.array(pulse_list, dtype='S')

# try/finally guarantees the file handle is released even when one of the
# writes fails; otherwise the half-written file would stay open in the kernel.
try:
    dset_X_str = grp_strohman.create_dataset("X", data=X)
    dset_y_str = grp_strohman.create_dataset("y", data=y)
    dset_shots = grp_strohman.create_dataset("shots", data=shots_as_bytes)

    # The same names are rebound for the second group; X and shots are
    # identical, only the labels differ (sampled instead of constant).
    dset_X_str = grp_sampled.create_dataset("X", data=X)
    dset_y_str = grp_sampled.create_dataset("y", data=y_sampled)
    dset_shots = grp_sampled.create_dataset("shots", data=shots_as_bytes)
finally:
    f.close()