In [44]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
import clean_data
import random

# Columns selected from the raw frames before normalization.
# The first group (element-set columns plus NORAD_CAT_ID and EPOCH) used to be
# the entire list; the second group adds the sunspot and temperature columns.
init_cols = (
    [
        'BSTAR',
        'INCLINATION',
        'RA_OF_ASC_NODE',
        'ECCENTRICITY',
        'ARG_OF_PERICENTER',
        'MEAN_ANOMALY',
        'MEAN_MOTION',
        'NORAD_CAT_ID',
        'EPOCH',
    ]
    + [
        'SUNSPOTS_1D',
        'SUNSPOTS_3D',
        'SUNSPOTS_7D',
        'AIR_MONTH_AVG_TEMP',
        'WATER_MONTH_AVG_TEMP',
    ]
)


def load_raw(name=None):
    """Load the raw 3-minute train and/or test frames from pickle.

    Files are read from ``$GP_HIST_PATH/../3_min/<split>.pkl``.

    Parameters
    ----------
    name : {None, 'train', 'test'}, optional
        Split to load. Any falsy value (the default ``None``) loads both.

    Returns
    -------
    dict
        Maps split name to the object returned by ``pd.read_pickle``. Holds
        'train' then 'test' when ``name`` is falsy, otherwise only the
        requested split.

    Raises
    ------
    ValueError
        If ``name`` is truthy but is neither 'train' nor 'test'. Previously
        this case fell through and returned ``None``, which only failed later
        in the caller.
    KeyError
        If the ``GP_HIST_PATH`` environment variable is not set.
    """
    known_splits = ('train', 'test')

    # Validate before touching the environment or the disk so a typo fails fast.
    if not name:
        wanted = known_splits
    elif name in known_splits:
        wanted = (name,)
    else:
        raise ValueError(f"name must be None, 'train' or 'test', got {name!r}")

    # NOTE: read_pickle can execute arbitrary code; only point this at trusted files.
    base_dir = os.environ['GP_HIST_PATH'] + '/../3_min/'
    return {split: pd.read_pickle(base_dir + split + '.pkl') for split in wanted}

def truncate_data(df, perc):
    """Filter rows to the accepted value ranges and a leading subset of NORAD IDs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain INCLINATION, RA_OF_ASC_NODE, ECCENTRICITY,
        ARG_OF_PERICENTER, MEAN_ANOMALY, MEAN_MOTION, EPOCH and NORAD_CAT_ID.
    perc : float
        Fraction of the unique NORAD_CAT_ID values to keep. The count is
        ``int(n_unique * perc)`` and the IDs are taken from the front of
        ``unique()`` as computed on the unfiltered frame.

    Returns
    -------
    pandas.DataFrame
        Rows passing every condition, with a fresh 0..n-1 index.
    """
    # IDs are selected from the input frame, before any range filtering.
    all_ids = df.NORAD_CAT_ID.unique()
    norad_count = int(len(all_ids) * perc)
    kept_ids = all_ids[:norad_count]

    # Series.between includes both endpoints by default.
    bounds = {
        'INCLINATION': (0, 180),
        'RA_OF_ASC_NODE': (0, 360),
        'ECCENTRICITY': (0, 0.25),
        'ARG_OF_PERICENTER': (0, 360),
        'MEAN_ANOMALY': (0, 360),
        'MEAN_MOTION': (11.25, 17),
    }

    keep = (df.EPOCH >= '1990-01-01') & df.NORAD_CAT_ID.isin(kept_ids)
    for column, (low, high) in bounds.items():
        keep = keep & df[column].between(low, high)

    return df[keep].reset_index(drop=True)

def create_save(perc, name=None):
    """Build model inputs/labels from the raw data and pickle them under ``data/``.

    For each split returned by ``load_raw(name)`` the frame is truncated with
    ``truncate_data``, reduced to ``init_cols``, passed through the
    ``clean_data`` helpers (normalize_all_columns, create_index_map, build_xy,
    normalize_epoch_diff with ``drop_epoch=False``) and written to
    ``data/x_<split>1.pkl`` / ``data/y_<split>1.pkl``.

    Parameters
    ----------
    perc : float
        Forwarded to ``truncate_data``.
    name : {None, 'train', 'test'}, optional
        Forwarded to ``load_raw`` to choose which split(s) to process.

    Returns
    -------
    list
        ``[X, y]`` for every processed split, in the iteration order of the
        dict returned by ``load_raw`` (so ``[X_train, y_train, X_test, y_test]``
        when both splits are loaded).
    """
    # The build below takes on the order of an hour (see timing notes), so make
    # sure the output directory exists before starting instead of failing at
    # the final to_pickle call.
    os.makedirs('data', exist_ok=True)

    print('>>> Loading raw data')
    df_in = load_raw(name)

    df_out = []
    # `split` rather than `name`, so the function argument is not shadowed.
    for split, df in df_in.items():
        print(f'>>> Truncating {split} data...')
        df = truncate_data(df, perc) # 14.5s
        df = df[init_cols].reset_index(drop=True)  # 4s

        print(f'>>> Normalizing {split} data...')
        df = clean_data.normalize_all_columns(df) # 53.4s

        print(f'>>> Building {split} index map...')
        idx_map = clean_data.create_index_map(df, threaded=True, batch_size=50) # 3min 29s

        print(f'>>> Building {split} inputs and labels')
        X, y = clean_data.build_xy(df, idx_map) # 59min 41s
        X = clean_data.normalize_epoch_diff(X, drop_epoch=False) # 19s

        print(f'>>> Saving {split} data')
        X.to_pickle(f'data/x_{split}1.pkl')
        y.to_pickle(f'data/y_{split}1.pkl')
        df_out.extend([X, y])

    return df_out

def load(perc, force_update=False):
    """Return the train/test inputs and labels, from cache or freshly built.

    Parameters
    ----------
    perc : float
        Only used when rebuilding; forwarded to ``create_save``.
    force_update : bool, optional
        When true, rebuild everything through ``create_save(perc)`` and return
        its result. Otherwise read the four cached pickles under ``data/``.

    Returns
    -------
    X_train, y_train, X_test, y_test
        As a tuple when read from cache; as whatever ``create_save`` returns
        when rebuilding.
    """
    if force_update:
        return create_save(perc)

    print('>>> Loading data')
    cache_files = (
        'data/x_train1.pkl',
        'data/y_train1.pkl',
        'data/x_test1.pkl',
        'data/y_test1.pkl',
    )
    return tuple(pd.read_pickle(path) for path in cache_files)
        

# Run configuration for this notebook.
perc = 0.05           # passed to load(); used by create_save/truncate_data when rebuilding
force_update = False  # False: read cached pickles from data/; True: rebuild via create_save

X_train, y_train, X_test, y_test = load(perc, force_update=force_update)
print('>>> Complete')

>>> Loading raw data
>>> Truncating train data...
>>> Normalizing train data...
>>> Building train index map...


  0%|          | 0/614 [00:00<?, ?it/s]

>>> Building train inputs and labels
>>> Saving train data
>>> Truncating test data...
>>> Normalizing test data...
>>> Building test index map...


  0%|          | 0/135 [00:00<?, ?it/s]

>>> Building test inputs and labels
>>> Saving test data
>>> Complete


In [3]:
# Load both raw splits: with no argument load_raw returns a dict keyed by 'train' and 'test'.
df_in = load_raw()

In [39]:
# Run the preprocessing steps on the first 1000 raw training rows only, so the
# intermediate result can be inspected quickly.
train_df = (
    df_in['train']
    .iloc[:1000]
    .pipe(truncate_data, perc)[init_cols]
    .reset_index(drop=True)
    .pipe(clean_data.normalize_all_columns)
)
train_df.head()

Unnamed: 0,BSTAR,INCLINATION,RA_OF_ASC_NODE,ECCENTRICITY,ARG_OF_PERICENTER,MEAN_ANOMALY,MEAN_MOTION,NORAD_CAT_ID,EPOCH,SUNSPOTS_1D,...,month_sin,month_cos,hour_sin,hour_cos,minute_sin,minute_cos,second_sin,second_cos,ms_sin,ms_cos
0,0.002592,0.345786,0.500434,0.281957,0.737989,0.239659,0.278728,18549,2004-04-27 14:18:48.216960,39,...,1.0,6.123234000000001e-17,-0.5,-0.866025,0.951057,-0.309017,-0.951057,0.309017,0.978529,0.206109
1,0.0001,0.407556,0.960246,0.035261,0.751111,0.246364,0.242116,18727,2004-04-27 15:59:40.727904,39,...,1.0,6.123234000000001e-17,-0.707107,-0.707107,-0.104528,0.9945219,-0.866025,-0.5,-0.990378,-0.138388
2,0.001076,0.461244,0.697074,0.033974,0.512006,0.488125,0.453287,19027,2004-04-27 19:45:13.686048,39,...,1.0,6.123234000000001e-17,-0.965926,0.258819,-1.0,-1.83697e-16,0.978148,0.207912,-0.92035,-0.391096
3,0.000166,0.394356,0.576342,0.083025,0.448271,0.554187,0.428732,19128,2004-04-27 15:43:11.393472,39,...,1.0,6.123234000000001e-17,-0.707107,-0.707107,-0.978148,-0.2079117,0.913545,0.406737,0.620465,-0.784234
4,0.000739,0.500811,0.533843,0.010984,0.834616,0.164904,0.303029,19242,2004-04-27 03:43:04.015775,39,...,1.0,6.123234000000001e-17,0.707107,0.707107,-0.978148,-0.2079117,0.406737,0.913545,0.098955,0.995092


In [41]:
# Build the index map for the small sample frame from the previous cell
# (same threaded/batch_size settings that create_save uses).
# NOTE(review): the structure of idx_map is defined in clean_data and is not visible here.
idx_map = clean_data.create_index_map(train_df, threaded=True, batch_size=50)

  0%|          | 0/48 [00:00<?, ?it/s]

In [43]:
# Call build_xy with debug=True on the sample; the output captured below lists
# x_idx / y_idx and two position->column-name mappings (0-24 and 25-49).
# NOTE(review): presumably those are the input-row and label-row columns - confirm in clean_data.
clean_data.build_xy(train_df, idx_map, debug=True)
x_idx=[0,1,2,3,4,5,6,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,39,40,41,42,43,44,45,46,47,48,49,8,33],
y_idx=[26,27,28,29,30,31],

{0: 'BSTAR',
 1: 'INCLINATION',
 2: 'RA_OF_ASC_NODE',
 3: 'ECCENTRICITY',
 4: 'ARG_OF_PERICENTER',
 5: 'MEAN_ANOMALY',
 6: 'MEAN_MOTION',
 7: 'NORAD_CAT_ID',
 8: 'EPOCH',
 9: 'SUNSPOTS_1D',
 10: 'SUNSPOTS_3D',
 11: 'SUNSPOTS_7D',
 12: 'AIR_MONTH_AVG_TEMP',
 13: 'WATER_MONTH_AVG_TEMP',
 14: 'year',
 15: 'month_sin',
 16: 'month_cos',
 17: 'hour_sin',
 18: 'hour_cos',
 19: 'minute_sin',
 20: 'minute_cos',
 21: 'second_sin',
 22: 'second_cos',
 23: 'ms_sin',
 24: 'ms_cos'}

{25: 'BSTAR',
 26: 'INCLINATION',
 27: 'RA_OF_ASC_NODE',
 28: 'ECCENTRICITY',
 29: 'ARG_OF_PERICENTER',
 30: 'MEAN_ANOMALY',
 31: 'MEAN_MOTION',
 32: 'NORAD_CAT_ID',
 33: 'EPOCH',
 34: 'SUNSPOTS_1D',
 35: 'SUNSPOTS_3D',
 36: 'SUNSPOTS_7D',
 37: 'AIR_MONTH_AVG_TEMP',
 38: 'WATER_MONTH_AVG_TEMP',
 39: 'year',
 40: 'month_sin',
 41: 'month_cos',
 42: 'hour_sin',
 43: 'hour_cos',
 44: 'minute_sin',
 45: 'minute_cos',
 46: 'second_sin',
 47: 'second_cos',
 48: 'ms_sin',
 49: 'ms_cos'}

In [7]:
# Load the training pickles from the 1_min and 3_min directories to compare them.
df1, df2 = (
    pd.read_pickle(os.environ['GP_HIST_PATH'] + rel_path)
    for rel_path in ('/../1_min/train.pkl', '/../3_min/train.pkl')
)

In [8]:
# Row counts of the 1_min frame, then the 3_min frame.
display(len(df1), len(df2))

55239839

48769810

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
import clean_data

# Columns selected from the raw training frame before normalization:
# the element-set columns followed by the two identifying columns.
init_cols = (
    [
        'BSTAR',
        'INCLINATION',
        'RA_OF_ASC_NODE',
        'ECCENTRICITY',
        'ARG_OF_PERICENTER',
        'MEAN_ANOMALY',
        'MEAN_MOTION',
    ]
    + ['NORAD_CAT_ID', 'EPOCH']
)

def load_all():
    """Read and return the training frame from ``../model_0/data/train.pkl``.

    Only the training pickle is loaded; the matching test pickle is not read.
    """
    return pd.read_pickle('../model_0/data/train.pkl')

# Load the raw training frame (load_all reads ../model_0/data/train.pkl).
train_df = load_all()
# Disabled: clean_all is not defined anywhere in this notebook, and load_all no
# longer returns a test frame.
#train_df,test_df = clean_all(train_df, test_df)

In [2]:
def truncate_data(df, perc=None):
    """Keep only rows whose values fall inside the accepted ranges.

    This definition replaces any earlier ``truncate_data(df, perc)`` in the
    kernel, so it accepts the same optional ``perc`` argument to stay
    call-compatible with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain INCLINATION, RA_OF_ASC_NODE, ECCENTRICITY,
        ARG_OF_PERICENTER, MEAN_ANOMALY, MEAN_MOTION and EPOCH (plus
        NORAD_CAT_ID when ``perc`` is given).
    perc : float, optional
        When given, additionally keep only rows whose NORAD_CAT_ID is among
        the first ``int(n_unique * perc)`` unique IDs of the unfiltered frame.
        The default ``None`` applies no ID restriction (the original
        one-argument behaviour).

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    # Series.between includes both endpoints by default.
    keep = (
        df['INCLINATION'].between(0, 180)
        & df['RA_OF_ASC_NODE'].between(0, 360)
        & df['ECCENTRICITY'].between(0, 0.25)
        & df['ARG_OF_PERICENTER'].between(0, 360)
        & df['MEAN_ANOMALY'].between(0, 360)
        & df['MEAN_MOTION'].between(11.25, 17)
        & (df.EPOCH >= '1990-01-01')
    )

    if perc is not None:
        # IDs are chosen from the input frame, before range filtering.
        norad_ids = df.NORAD_CAT_ID.unique()
        norad_count = int(len(norad_ids) * perc)
        keep = keep & df.NORAD_CAT_ID.isin(norad_ids[:norad_count])

    return df[keep].reset_index(drop=True)

In [3]:
# Full training-set build: truncate -> select init_cols -> normalize ->
# index map -> inputs/labels -> pickle. Each step is timed with %time; the
# trailing comments record the wall time observed on an earlier run.
# Note: train_df is rebound at every step, so this cell is not safe to re-run
# without first re-running the cell that loads train_df.

# Disabled single-satellite filter (NORAD_CAT_ID 14631), kept for debugging.
#train_df = train_df[train_df['NORAD_CAT_ID'] == 14631].reset_index(drop=True)

print('>>> Truncating data...')
%time train_df = truncate_data(train_df) # 14.5s
%time train_df = train_df[init_cols]  # 4s

print('>>> Normalizing data...')
%time train_df = clean_data.normalize_all_columns(train_df) # 53.4s

print('>>> Building index map...')
%time idx_map = clean_data.create_index_map(train_df, threaded=True, batch_size=50) # 3min 29s

print('>>> Building inputs and labels')
%time X_train,y_train = clean_data.build_xy(train_df, idx_map) # 59min 41s
# NOTE(review): drop_epoch=True here, while create_save() calls the same helper
# with drop_epoch=False - confirm which one the cached files should reflect.
%time X_train = clean_data.normalize_epoch_diff(X_train, drop_epoch=True) # 19s

print('>>> Saving data')
# NOTE(review): these are the same paths create_save() writes for the train
# split, so running this cell overwrites that cache. Assumes data/ already exists.
X_train.to_pickle('data/x_train1.pkl')
y_train.to_pickle('data/y_train1.pkl')

>>> Truncating data...
CPU times: user 10.3 s, sys: 4.19 s, total: 14.5 s
Wall time: 14.5 s
CPU times: user 2.84 s, sys: 1.3 s, total: 4.14 s
Wall time: 4.13 s
>>> Normalizing data...
CPU times: user 47.8 s, sys: 5.54 s, total: 53.3 s
Wall time: 53.3 s
>>> Building index map...


12800it [02:44, 77.67it/s]                            


CPU times: user 27min 42s, sys: 16min 5s, total: 43min 47s
Wall time: 3min 29s
>>> Building inputs and labels
CPU times: user 48min 26s, sys: 11min 21s, total: 59min 47s
Wall time: 59min 41s
CPU times: user 10.7 s, sys: 8.31 s, total: 19 s
Wall time: 19 s
