# Generate SPOCK training data

In [1]:
import spock
import random
import numpy as np
import rebound
import pandas as pd
from spock import simsetup
from spock import FeatureClassifier
import sys
from multiprocessing import Pool
from features import init_model, getFeat

  import pkg_resources


The initial conditions are stored as snapshots of a simulation archive, so we must first set the data path and load the labels for the corresponding systems.

In [2]:
# Directory holding the cleaned dataset. It was generated from the original
# SPOCK initial-conditions data and is stored as a simulation archive.
datapath = '../dataset/resonant/'
# Label table that sits next to the archive in the same directory.
labels = pd.read_csv(f'{datapath}sim_labels.csv')

We can now generate the set of system indices based on the labels

In [3]:
# One integer index per row of the labels table.
systemNum = range(len(labels))

We can note the column names of the features that will be generated (the feature generators themselves were imported from `features` above).

In [4]:
# Column names for the formatted feature table, one per value in a flattened
# feature row (the last entry is the trailing flag appended after the
# feature values).
col = [
    'EMcrossnear',
    'EMfracstdnear',
    'EPstdnear',
    'MMRstrengthnear',
    'EMcrossfar',
    'EMfracstdfar',
    'EPstdfar',
    'MMRstrengthfar',
    'MEGNO',
    'MEGNOstd',
    'Tsec',
    'InitialStable',
]

We can then establish a helper function that flattens the return value of `generate_features` into a single row, so that the features generated for the different systems (one per snapshot) can be collected into a table.

In [5]:
def getList(features):
    """Flatten a ``generate_features`` return value into one flat list.

    ``features[0][0]`` is expected to be a mapping of feature values and
    ``features[1]`` a trailing flag; the result is the mapping's values in
    their iteration order followed by that flag.
    """
    feature_maps = features[0]
    flag = features[1]
    row = list(feature_maps[0].values())
    row.append(flag)
    return row

We can now map getFeat over the different system indices; this will create each simulation and generate the SPOCK features. Before doing so, we check the classifier and the feature generation on a single snapshot.

In [6]:
# Sanity check: load snapshot 0 of the cleaned archive and display the
# classifier's prediction for that single simulation.
sim = rebound.Simulation("../dataset/resonant/clean_initial_conditions.bin", snapshot=0)
model = FeatureClassifier()
model.predict_stable(sim)



0.54436356

In [7]:
# Same check with a list of simulations: pass snapshot 0 twice and display
# the predictions returned for the list.
sim = rebound.Simulation("../dataset/resonant/clean_initial_conditions.bin", snapshot=0)
model = FeatureClassifier()
model.predict_stable([sim, sim])

array([0.54436356, 0.54436356], dtype=float32)

In [6]:
# Display the raw return value of generate_features for snapshot 0; this is
# the structure that getList flattens into a row.
sim = rebound.Simulation("../dataset/resonant/clean_initial_conditions.bin", snapshot=0)
model = FeatureClassifier()
model.generate_features(sim)



([OrderedDict([('EMcrossnear', 0.06023365324093462),
               ('EMfracstdnear', 0.029447188962535807),
               ('EPstdnear', 0.001994998547310404),
               ('MMRstrengthnear', 0.49804017851441507),
               ('EMcrossfar', 0.5040626705572052),
               ('EMfracstdfar', 0.0029002068051170965),
               ('EPstdfar', 0.0008157705748634473),
               ('MMRstrengthfar', nan),
               ('MEGNO', 1.9966848679613491),
               ('MEGNOstd', 0.00245037404332733),
               ('Tsec', 28431.3555941486)])],
 True)

In [7]:
%%time
with Pool(initializer=init_model) as pool:
    features = pool.map(getFeat,systemNum[:100])
    pool.close()
    pool.join()
#formats the data correctly
formattedFeat = pd.DataFrame(np.array(list(map(getList,features))), columns = col)


  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources
  import pkg_resources


CPU times: user 16.3 ms, sys: 88.8 ms, total: 105 ms
Wall time: 35.3 s


We can then join the generated features with the corresponding labels

In [8]:
# Index-aligned join: keep the rows of formattedFeat and attach the label
# columns to them.
dataset = formattedFeat.join(labels)

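We can then save the new training data spreadsheet. Before overwriting it, we load the existing reference file and check that the regenerated features match it.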

In [9]:
# Load the reference table from refdata.csv, using its first column as the
# index, and preview the first five rows.
df = pd.read_csv('refdata.csv', index_col=0)
df.head(5)

Unnamed: 0.1,EMcrossnear,EMfracstdnear,EPstdnear,MMRstrengthnear,EMcrossfar,EMfracstdfar,EPstdfar,MMRstrengthfar,MEGNO,MEGNOstd,Tsec,InitialStable,Unnamed: 0,runstring,instability_time,shadow_instability_time,Stable
0,0.060234,0.029447,0.001995,0.49804,0.504063,0.0029,0.000816,,1.996685,0.00245,28431.355594,1.0,0,0000000.bin,1545872.0,3063700.0,False
1,0.080547,0.016559,0.000112,0.452015,0.240504,0.006736,0.001621,0.008674,2.019031,0.010215,3904.311701,1.0,1,0000001.bin,999000000.0,999000000.0,True
2,0.12966,0.028844,0.003181,1.001973,1.001981,0.001402,0.003742,0.010415,1.995858,0.003027,70073.695993,1.0,2,0000002.bin,999000000.0,999000000.0,True
3,0.406112,0.036478,0.00239,0.321071,0.427768,0.036219,0.008587,0.012958,2.005184,0.000787,18313.684674,1.0,3,0000003.bin,2287671.0,8392234.0,False
4,0.059897,0.028009,0.001623,0.341814,0.257596,0.053197,0.001287,0.034789,2.057699,0.020469,4110.203941,1.0,4,0000004.bin,966893.1,338035.0,False


In [10]:
# Preview the first five rows of the regenerated dataset for comparison
# with the reference table.
dataset.head(5)

Unnamed: 0.1,EMcrossnear,EMfracstdnear,EPstdnear,MMRstrengthnear,EMcrossfar,EMfracstdfar,EPstdfar,MMRstrengthfar,MEGNO,MEGNOstd,Tsec,InitialStable,Unnamed: 0,runstring,instability_time,shadow_instability_time,Stable
0,0.060234,0.029447,0.001995,0.49804,0.504063,0.0029,0.000816,,1.996685,0.00245,28431.355594,1.0,0,0000000.bin,1545872.0,3063700.0,False
1,0.080547,0.016559,0.000112,0.452015,0.240504,0.006736,0.001621,0.008674,2.019031,0.010215,3904.311701,1.0,1,0000001.bin,999000000.0,999000000.0,True
2,0.12966,0.028844,0.003181,1.001973,1.001981,0.001402,0.003742,0.010415,1.995858,0.003027,70073.695993,1.0,2,0000002.bin,999000000.0,999000000.0,True
3,0.406112,0.036478,0.00239,0.321071,0.427768,0.036219,0.008587,0.012958,2.005184,0.000787,18313.684674,1.0,3,0000003.bin,2287671.0,8392234.0,False
4,0.059897,0.028009,0.001623,0.341814,0.257596,0.053197,0.001287,0.034789,2.057699,0.020469,4110.203941,1.0,4,0000004.bin,966893.1,338035.0,False


In [11]:
# For every column of the reference table, print the summed difference
# between the regenerated dataset and the reference, divided by the number
# of rows in the regenerated column.
#
# The loop variable is deliberately not named `col`: that name already holds
# the list of feature column names used to build `formattedFeat`, and
# rebinding it here would break a re-run of the feature-generation cell.
for column in df.columns:
    try:
        mean_diff = (dataset[column] - df[column]).sum() / dataset[column].shape[0]
    except (KeyError, TypeError):
        # KeyError: the column is missing from `dataset`.
        # TypeError: the column cannot be subtracted (e.g. string or boolean
        # columns). Such columns are skipped, as before.
        continue
    print(column, mean_diff)

EMcrossnear 3.608224830031759e-17
EMfracstdnear 4.825133348429489e-17
EPstdnear 4.3287042787026886e-17
MMRstrengthnear 2.654126918244515e-17
EMcrossfar 2.6090241078691177e-17
EMfracstdfar 4.236194728335363e-17
EPstdfar 4.742612010576186e-17
MMRstrengthfar 2.860775852320252e-17
MEGNO -9.325873406851315e-17
MEGNOstd 4.82971410260824e-17
Tsec 2.1600499167107045e-13
InitialStable 0.0
Unnamed: 0 0.0
instability_time 0.0
shadow_instability_time 0.0


In [12]:
# Uncomment to write the regenerated dataset to refdata.csv:
#dataset.to_csv('refdata.csv')