## Load Raw Data into Train/Val/Test sets and Save

Takes the NORAD ID lists from the `0_preproc` step and saves the train/validation/holdout dataframes as pickle files (`train.pkl`, `test.pkl` and `secret_test.pkl`).

Loads the raw CSV data from, and saves the resulting dataframes to, the shared data directory at `/mistorage/mads/data`.

Two versions are saved:

* `0_full` includes the 2 TLE Lines and OBJECT_NAME
* `0_min` excludes those 3 fields

In [1]:
import os
import sys

# Make the project-local `load_data` module importable; the relative path
# '../models/model_0' is added to the module search path.
sys.path.append('../models/model_0')

# NOTE(review): the wildcard import presumably supplies `load_norads`,
# `load_data` and `write_data` used below — consider importing them by name.
from load_data import *

In [2]:
# Some environments only define the upper-case MY_HOME_PATH; copy it into the
# lower-case `my_home_path` variable when that one is not already set.
# NOTE(review): presumably the load_data helpers read `my_home_path` — confirm.
if 'MY_HOME_PATH' in os.environ:
    os.environ.setdefault('my_home_path', os.environ['MY_HOME_PATH'])

In [3]:
# Fetch the NORAD ID lists for the three requested splits.
# NOTE(review): the split is requested as 'validate', but the files written
# later are named train/test/secret_test — it looks like 'validate' ends up
# under the key 'test'; confirm against load_data.
norad_lists = load_norads(['train','validate','secret_test'])

In [4]:
# Load the raw records for every split into `df_dict`, a dict of DataFrames
# keyed by split name; `%time` reports the CPU and wall time of the call.
# NOTE(review): the recorded run shows ~2 min wall time for this step, so the
# '4min' estimate may be outdated or may also cover the save step — confirm.
%time df_dict = load_data(norad_lists, use_all_data=True, debug=True, multiproc=True)  # Takes about 4min

Loading files from path: /mistorage/mads/data/gp_history


100%|██████████| 1484/1484 [01:33<00:00, 15.82it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

Finished loading.


100%|██████████| 3/3 [00:31<00:00, 10.64s/it]

Finished assembling.
CPU times: user 56.7 s, sys: 40.2 s, total: 1min 36s
Wall time: 2min 6s





In [5]:
# Save the full DataFrames, one .pkl file per split, under the `0_full` directory.
%time write_data(df_dict, use_all_data=True, debug=True, threaded=True, path='/mistorage/mads/data/0_full')

Saving files to path: /mistorage/mads/data/0_full
Writing raw data for to: /mistorage/mads/data/0_full/train.pkl
Writing raw data for to: /mistorage/mads/data/0_full/test.pkl
Writing raw data for to: /mistorage/mads/data/0_full/secret_test.pkl
Finished saving /mistorage/mads/data/0_full/secret_test.pkl
Finished saving /mistorage/mads/data/0_full/test.pkl
Finished saving /mistorage/mads/data/0_full/train.pkl
CPU times: user 1min 2s, sys: 24.7 s, total: 1min 27s
Wall time: 2min 23s


In [6]:
# Columns kept in the `0_min` export, listed in the order they are written.
min_columns = [
    'NORAD_CAT_ID',
    'MEAN_MOTION_DOT',
    'MEAN_MOTION_DDOT',
    'BSTAR',
    'INCLINATION',
    'RA_OF_ASC_NODE',
    'ECCENTRICITY',
    'ARG_OF_PERICENTER',
    'MEAN_ANOMALY',
    'MEAN_MOTION',
    'REV_AT_EPOCH',
    'EPOCH',
    'GP_ID',
]

In [7]:
# Reduce every split's DataFrame to just the `min_columns` subset.
df_dict_min = {
    split_name: split_df[min_columns]
    for split_name, split_df in df_dict.items()
}

In [8]:
# Save the column-reduced DataFrames, one .pkl file per split, under the `0_min` directory.
%time write_data(df_dict_min, use_all_data=True, debug=True, threaded=True, path='/mistorage/mads/data/0_min')

Saving files to path: /mistorage/mads/data/0_min
Writing raw data for to: /mistorage/mads/data/0_min/train.pkl
Writing raw data for to: /mistorage/mads/data/0_min/test.pkl
Writing raw data for to: /mistorage/mads/data/0_min/secret_test.pkl
Finished saving /mistorage/mads/data/0_min/test.pkl
Finished saving /mistorage/mads/data/0_min/secret_test.pkl
Finished saving /mistorage/mads/data/0_min/train.pkl
CPU times: user 380 ms, sys: 6.38 s, total: 6.76 s
Wall time: 32.7 s


In [9]:
# Sanity check: for each minimal split, show its name, first rows and schema.
for k, df in df_dict_min.items():
    print(k)
    display(df.head())
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only added a stray "None" after every summary.
    df.info()

train


Unnamed: 0,NORAD_CAT_ID,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,BSTAR,INCLINATION,RA_OF_ASC_NODE,ECCENTRICITY,ARG_OF_PERICENTER,MEAN_ANOMALY,MEAN_MOTION,REV_AT_EPOCH,EPOCH,GP_ID
0,18549,1.801e-05,0.0,0.002592,62.2415,180.1561,0.070489,265.6761,86.2771,12.852684,58561,2004-04-27 14:18:48.216960,2
1,18727,-2e-08,0.0,0.0001,73.36,345.6887,0.008815,270.3999,88.6911,12.642166,75486,2004-04-27 15:59:40.727904,3
2,19027,1.28e-05,0.0,0.001076,83.0239,250.9465,0.008493,184.3222,175.7249,13.856401,95359,2004-04-27 19:45:13.686048,5
3,19128,1.32e-06,0.0,0.000166,70.9841,207.483,0.020756,161.3777,199.5075,13.715209,79821,2004-04-27 15:43:11.393472,6
4,19242,2.28e-06,0.0,0.000739,90.146,192.1834,0.002746,300.4617,59.3655,12.992417,47996,2004-04-27 03:43:04.015775,8


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55239839 entries, 0 to 55239838
Data columns (total 13 columns):
 #   Column             Dtype         
---  ------             -----         
 0   NORAD_CAT_ID       int64         
 1   MEAN_MOTION_DOT    float64       
 2   MEAN_MOTION_DDOT   float64       
 3   BSTAR              float64       
 4   INCLINATION        float64       
 5   RA_OF_ASC_NODE     float64       
 6   ECCENTRICITY       float64       
 7   ARG_OF_PERICENTER  float64       
 8   MEAN_ANOMALY       float64       
 9   MEAN_MOTION        float64       
 10  REV_AT_EPOCH       int64         
 11  EPOCH              datetime64[ns]
 12  GP_ID              int64         
dtypes: datetime64[ns](1), float64(9), int64(3)
memory usage: 5.4 GB


None

test


Unnamed: 0,NORAD_CAT_ID,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,BSTAR,INCLINATION,RA_OF_ASC_NODE,ECCENTRICITY,ARG_OF_PERICENTER,MEAN_ANOMALY,MEAN_MOTION,REV_AT_EPOCH,EPOCH,GP_ID
0,20885,6.5e-07,0.0,8.9e-05,99.3729,81.3942,0.006732,199.1482,160.7142,13.893167,68717,2004-04-27 21:07:53.026176,15
1,7128,2.47e-06,0.0,0.001707,101.6272,161.0821,0.00075,68.1146,292.0727,12.588996,39367,2004-04-27 07:00:03.611808,96
2,4756,-2e-08,0.0,7.2e-05,100.1459,250.4248,0.002824,61.4128,298.9856,13.47829,67283,2004-04-27 20:26:42.674784,115
3,22806,2.7e-07,0.0,6.6e-05,71.022,149.07,0.020216,334.1032,25.005,13.72446,53153,2004-04-27 13:43:02.783135,143
4,4990,1.347e-05,0.0,0.00121,99.9881,221.672,0.007042,152.232,208.262,13.850642,69572,2004-04-27 13:54:08.461440,159


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10734112 entries, 0 to 10734111
Data columns (total 13 columns):
 #   Column             Dtype         
---  ------             -----         
 0   NORAD_CAT_ID       int64         
 1   MEAN_MOTION_DOT    float64       
 2   MEAN_MOTION_DDOT   float64       
 3   BSTAR              float64       
 4   INCLINATION        float64       
 5   RA_OF_ASC_NODE     float64       
 6   ECCENTRICITY       float64       
 7   ARG_OF_PERICENTER  float64       
 8   MEAN_ANOMALY       float64       
 9   MEAN_MOTION        float64       
 10  REV_AT_EPOCH       int64         
 11  EPOCH              datetime64[ns]
 12  GP_ID              int64         
dtypes: datetime64[ns](1), float64(9), int64(3)
memory usage: 1.0 GB


None

secret_test


Unnamed: 0,NORAD_CAT_ID,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,BSTAR,INCLINATION,RA_OF_ASC_NODE,ECCENTRICITY,ARG_OF_PERICENTER,MEAN_ANOMALY,MEAN_MOTION,REV_AT_EPOCH,EPOCH,GP_ID
0,21338,1e-07,0.0,0.0001,99.5644,80.0179,0.006139,49.0571,311.5842,13.341155,63044,2004-04-27 21:24:23.006879,18
1,21361,-8e-08,0.0,0.0001,99.6308,192.2625,0.07709,161.7899,201.2653,11.952818,56269,2004-04-27 15:20:47.475167,20
2,21724,-5e-08,0.0,0.0001,73.9774,233.3877,0.020674,277.6905,80.0701,12.200313,56051,2004-04-27 19:07:28.175232,24
3,17107,7.17e-06,0.0,0.000601,65.8236,279.7706,0.003446,220.1776,139.6759,13.94153,93639,2004-04-27 20:18:20.642400,60
4,26441,1.697e-05,0.0,0.000481,98.2801,179.4539,0.008288,69.1648,291.8396,14.44737,21050,2004-04-27 15:18:01.303775,61


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10645034 entries, 0 to 10645033
Data columns (total 13 columns):
 #   Column             Dtype         
---  ------             -----         
 0   NORAD_CAT_ID       int64         
 1   MEAN_MOTION_DOT    float64       
 2   MEAN_MOTION_DDOT   float64       
 3   BSTAR              float64       
 4   INCLINATION        float64       
 5   RA_OF_ASC_NODE     float64       
 6   ECCENTRICITY       float64       
 7   ARG_OF_PERICENTER  float64       
 8   MEAN_ANOMALY       float64       
 9   MEAN_MOTION        float64       
 10  REV_AT_EPOCH       int64         
 11  EPOCH              datetime64[ns]
 12  GP_ID              int64         
dtypes: datetime64[ns](1), float64(9), int64(3)
memory usage: 1.0 GB


None