In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import pickle
import clean_data
import random
import train


# Columns kept from the raw frame before normalisation (see the
# `[init_cols]` selections below); list order is the resulting column order.
init_cols = [
    'BSTAR',
    'INCLINATION',
    'RA_OF_ASC_NODE',
    'ECCENTRICITY',
    'ARG_OF_PERICENTER',
    'MEAN_ANOMALY',
    'MEAN_MOTION',
    'NORAD_CAT_ID',
    'EPOCH',
    'SUNSPOTS_1D',
    'SUNSPOTS_3D',
    'SUNSPOTS_7D',
    'AIR_MONTH_AVG_TEMP',
    'WATER_MONTH_AVG_TEMP',
]

def load_raw(name=None):
    '''
    Load the pickled raw split(s) found under $GP_HIST_PATH/../3_min/.

    Parameters
    ----------
    name : str or None
        'train' or 'test' loads only that split.  Any falsy value (the
        default None, or '') loads both splits.

    Returns
    -------
    dict
        Maps split name to the unpickled object.  When both splits are
        loaded the keys are inserted as 'train' then 'test'.

    Raises
    ------
    KeyError
        If the GP_HIST_PATH environment variable is not set.
    ValueError
        If name is truthy but is neither 'train' nor 'test'.
    '''
    splits = ('train', 'test')

    def _read(split):
        # SECURITY: unpickling can execute arbitrary code; only point
        # GP_HIST_PATH at files you trust.
        # Timing notes from the original cell: train "25.7s", test "5".
        return pd.read_pickle(os.environ['GP_HIST_PATH'] + f'/../3_min/{split}.pkl')

    if not name:
        return {split: _read(split) for split in splits}
    if name not in splits:
        # Previously this fell through and returned None, which only surfaced
        # later as an opaque TypeError when the caller subscripted the result.
        raise ValueError(f"name must be one of {splits} or None, got {name!r}")
    return {name: _read(name)}
# Only the training split is needed here, so skip loading the test pickle.
train_df = load_raw('train')['train']

# Keep just the init_cols columns and renumber the rows 0..n-1.
# (timing note from the original cell: 4s)
print('>>> Truncating data...')
train_df = train_df.loc[:, init_cols].reset_index(drop=True)

# Normalisation is delegated to the project-local clean_data module.
# (timing note from the original cell: 53.4s)
print('>>> Normalizing data...')
train_df = clean_data.normalize_all_columns(train_df)

>>> Truncating data...
>>> Normalizing data...


In [8]:
# Load the stored index map for the 'train' split, then keep only its first
# 1000 entries so the cells below work on a small sample.
idx_map = clean_data.load_index_map(name='train', path='data')
idx_map = idx_map[:1000]

In [9]:
# Rows selected by the first / second entry of every index pair, renumbered
# so the two frames line up position by position.
first_rows = train_df.iloc[idx_map[:, 0]].reset_index(drop=True)
second_rows = train_df.iloc[idx_map[:, 1]].reset_index(drop=True)

# Index-on-index join: one output row per pair, with pandas' default
# '_x' / '_y' suffixes applied to the overlapping column names.
df = first_rows.merge(second_rows, left_index=True, right_index=True)

In [10]:
# Positional column indexes into the merged frame `df`.
# NOTE(review): these are the same lists build_xy uses as its defaults.
x_idx = [
    0, 1, 2, 3, 4, 5, 6,
    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    8, 33,
]
y_idx = [26, 27, 28, 29, 30, 31]

# Translate positions to labels first, then select by label.
X = df[df.columns[x_idx].tolist()]
y = df[df.columns[y_idx].tolist()]

In [16]:
# Drop the trailing '_x' from X's column names and the trailing '_y' from y's
# (the default suffixes pd.merge adds to overlapping column names).
# Fix: the y comprehension used to iterate X.columns, which handed y the
# wrong names and a list of the wrong length.
X.columns = [col[:-2] if col.endswith('_x') else col for col in X.columns]
y.columns = [col[:-2] if col.endswith('_y') else col for col in y.columns]

Unnamed: 0,INCLINATION_y,RA_OF_ASC_NODE_y,ECCENTRICITY_y,ARG_OF_PERICENTER_y,MEAN_ANOMALY_y,MEAN_MOTION_y
0,0.345787,0.795126,0.2821,0.8029,0.17651,0.183388
1,0.345651,0.601665,0.293362,0.188529,0.832906,0.185565
2,0.345729,0.013095,0.314878,0.327711,0.695316,0.170153
3,0.345819,0.095622,0.281552,0.775454,0.202845,0.183247
4,0.345667,0.452867,0.305143,0.198696,0.824191,0.178082


In [3]:
# Width of a single row of train_df; every pair occupies two such rows
# laid side by side.
w = train_df.shape[1]  # len(df.columns)

# Zero-filled buffer: one row per index pair, 2*w columns.
n_pairs = len(idx_map)
combined = np.zeros((n_pairs, 2 * w))

In [None]:
# Fill the left half of each row from the rows picked by the first index of
# every pair, and the right half from the rows picked by the second index.
# NOTE(review): the recorded output for this cell is `KeyError: 'key'`, so it
# did not run cleanly; the cause is not visible here. build_xy does the same
# pairing via df.to_numpy() and np.concatenate instead -- TODO confirm whether
# this cell can be dropped.
combined[:,:w] = train_df.iloc[idx_map[:,0]]
combined[:,w:w*2] = train_df.iloc[idx_map[:,1]]

KeyError: 'key'

In [5]:
def build_xy(df, idx_pairs,
             x_idx=[0,1,2,3,4,5,6,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,39,40,41,42,43,44,45,46,47,48,49,8,33],
             y_idx=[26,27,28,29,30,31],
             debug=False):
    '''
    Builds an X (inputs e.g. X_train) and y (labels e.g. y_train) dataframes
    by using the idx_pairs.  For example, idx_pairs of [[0,1]] will return a
    single row which contains the values from df.iloc[0] and df.iloc[1] concat
    and then split according to the x_idx and y_idx indexes into two df.

    Column positions 0..len(df.columns)-1 address the first row of a pair and
    positions len(df.columns)..2*len(df.columns)-1 address the second row.

    Parameters
    ----------
    df : Dataframe
        Contains all the data to be trained on

    idx_pairs : list or ndarray
        Contains list of lists (or a 2-D array) where each entry is a pair of
        row positions for df

    x_idx : list
        Contains the column indexes that represent the X values.
        Default: [0,1,2,3,4,5,6,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,
                  39,40,41,42,43,44,45,46,47,48,49,
                  8,33]

    y_idx : list
        Contains the column indexes that represent the y values
        Default: [26,27,28,29,30,31]

    debug : bool
        Display the column indexes only.

    Returns
    -------
    DataFrame
        Contains the input values X.  Every column except 'EPOCH' and
        'EPOCH_y' is passed through pd.to_numeric.

    DataFrame
        Contains the label values y, passed through pd.to_numeric.

    When debug is True the two index -> column-name maps are displayed and
    None is returned instead.
    '''

    def _names_for(positions):
        # Positions wrap modulo the frame width.  A name that was already
        # emitted for this output gets a '_y' suffix so both stay addressable.
        width = len(df.columns)
        names = []
        for pos in positions:
            base = df.columns[pos % width]
            names.append(base + '_y' if base in names else base)
        return names

    if debug:
        # `display` is the IPython builtin available inside the notebook.
        display ({i:c for i,c in enumerate(df.columns)})
        display ({i+len(df.columns):c for i,c in enumerate(df.columns)})
        return None

    X_columns = _names_for(x_idx)
    y_columns = _names_for(y_idx)

    # Accept the documented list-of-lists as well as an ndarray; a plain list
    # does not support the [:, 0] indexing used below.
    pairs = np.asarray(idx_pairs)

    # Convert the frame once instead of once per half.
    values = df.to_numpy()
    combined = np.concatenate([values[pairs[:, 0]],
                               values[pairs[:, 1]]], axis=1)

    X = pd.DataFrame(combined[:, x_idx], columns=X_columns)
    y = pd.DataFrame(combined[:, y_idx], columns=y_columns).apply(pd.to_numeric)

    # The EPOCH columns are left as-is; everything else is coerced to numeric.
    num_cols = list(set(X.columns).difference({'EPOCH','EPOCH_y'}))
    X[num_cols] = X[num_cols].apply(pd.to_numeric)

    return X,y

In [None]:


def build(df, name='train'):
    '''
    Truncate, normalise and index one data split.

    Parameters
    ----------
    df : DataFrame
        Raw frame; must contain every column listed in init_cols.

    name : str
        Label of the split.  It is shown in the progress messages and passed
        as `name` to the clean_data index-map helpers.  Default: 'train'
        (the body previously read an undefined `name`, which raised
        NameError on every call).

    Notes
    -----
    The X/y construction at the end of the body is still commented out.
    '''
    print(f'>>> Truncating {name} data...')
    df = df[init_cols].reset_index(drop=True)  # 4s

    print(f'>>> Normalizing {name} data...')
    df = clean_data.normalize_all_columns(df) # 53.4s

    print(f'>>> Building {name} index map...')
    try:
        idx_map = clean_data.load_index_map(name=name, path='data')
    except Exception:
        # Best-effort fallback: any failure to load a stored map triggers a
        # rebuild.  `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate during the slow rebuild.
        idx_map = clean_data.create_index_map(df, write=True, name=name, path='data') # 3min 29s

#     print(f'>>> Building {name} inputs and labels')
#     X,y = clean_data.build_xy(df, idx_map) # 59min 41s
#     X = clean_data.normalize_epoch_diff(X, drop_epoch=False) # 19s