# Time-Series Reconstruction
After verifying the Kaggle community's selection of important features, I will proceed to reconstruct the time-series dataset. Reconstruction will be based on this public kernel: https://www.kaggle.com/johnfarrell/breaking-lb-fresh-start-with-lag-selection

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import pdb
import os
import h5py
import pickle

from sklearn.metrics import mean_squared_error


# Function for loading h5py file
def load_h5py(fname):
    '''
    Read the complete 'data' dataset stored in the HDF5 file `fname`.

    The file is opened read-only and closed again before returning, so the
    returned value is the in-memory result of slicing the dataset with [:].
    '''
    with h5py.File(fname, 'r') as h5_file:
        dataset = h5_file['data']
        contents = dataset[:]
    return contents
# Function for loading pickle file
def load_pickle(fname):
    '''
    Deserialize and return the object stored in the pickle file `fname`.

    The file is opened in binary mode and closed before the object is returned.
    '''
    with open(fname, 'rb') as stream:
        payload = pickle.load(stream)
    return payload


# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Changes into ../../data/compressed/, reads the train/test HDF5 arrays, the
    five id/index/target pickles and the feature-name pickle, then changes into
    ../../scripts/time_series/.

    debug: when True the 'debug_'-prefixed files are read, otherwise the
           'full_'-prefixed ones.

    Returns the tuple
        (fnames, train, id_train, train_idx, target, test, id_test, test_idx).

    If anything fails while loading, the working directory that was active on
    entry is restored before the error is re-raised, so the cell can simply be
    re-run instead of resolving relative paths from inside the data directory.
    '''
    pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
    # File-name prefix and the wording used in the progress message
    prefix = 'debug' if debug else 'full'
    label = 'debug' if debug else 'original'

    start_dir = os.getcwd()
    os.chdir('../../data/compressed/')
    try:
        print(os.getcwd())
        print('Loading %s train and test datasets...' % label)
        # h5py files
        train = load_h5py('%s_train.h5' % prefix)
        test = load_h5py('%s_test.h5' % prefix)
        # pickle files
        id_train, train_idx, target, id_test, test_idx = [load_pickle('%s_%s' % (prefix, f)) for f in pkl_files]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
    except BaseException:
        # Cleanup only: undo the chdir (also on KeyboardInterrupt), then re-raise
        os.chdir(start_dir)
        raise
    os.chdir('../../scripts/time_series/')
    print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx


# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    '''
    Wrap the raw arrays returned by get_input in labelled pandas dataframes.

    debug: forwarded to get_input to choose between debug and full files.

    Returns (fnames, train_df, test_df). Both frames use the loaded index
    arrays as index and fnames as feature columns; an 'ID' column is appended
    to each, and train_df additionally gets a 'target' column.
    '''
    (fnames, train_arr, train_ids, train_index, train_target,
     test_arr, test_ids, test_index) = get_input(debug)

    # Feature columns first; identifier (and target for train) are appended last
    train_df = pd.DataFrame(train_arr, index=train_index, columns=fnames)
    train_df['ID'] = train_ids
    train_df['target'] = train_target

    test_df = pd.DataFrame(test_arr, index=test_index, columns=fnames)
    test_df['ID'] = test_ids

    train_shape_msg = '\nShape of training dataframe: {} Rows, {} Columns'
    test_shape_msg = 'Shape of test dataframe: {} Rows, {} Columns'
    print(train_shape_msg.format(*train_df.shape))
    print(test_shape_msg.format(*test_df.shape))
    return fnames, train_df, test_df

In [None]:
# Function for getting predictions with certain lag assumption
def get_leak(df, cols, extras, lag=0):
    '''
    Look up a leaked value for every row of `df` by matching shifted column windows.

    For `cols` and for every feature list in `extras`, two windows are built:
    a leading window without the last (lag + 2) columns and a trailing window
    without the first (lag + 2) columns. Each row gets a leading key and a
    trailing key (tuples of the window values). A row whose leading key equals
    another row's trailing key receives that other row's value of cols[lag].
    Keys that occur more than once on either side are dropped before matching,
    so each row gets at most one candidate.

    df:     dataframe containing all columns named in `cols` and `extras`
    cols:   ordered sequence of column names (any sequence; copied to a list)
    extras: iterable of additional ordered column-name sequences
    lag:    non-negative lag assumption; lag + 2 must be smaller than len(cols)

    Returns a Series named 'pred' indexed like `df`, holding the matched value
    or 0 where no match was found.
    '''
    shift = lag + 2
    # Copy to plain lists: slicing a tuple/array would break the += and df[...] below
    cols = list(cols)
    head_cols = cols[:-shift]
    tail_cols = cols[shift:]
    for ext in extras:
        ext = list(ext)
        head_cols += ext[:-shift]
        tail_cols += ext[shift:]

    # All columns except last two + lag into tuple
    heads = df[head_cols].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    # All columns except first two + lag into tuple
    tails = df[tail_cols].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    tails['pred'] = df[cols[lag]]

    # Remove duplicate keys so that join operation will work
    unique_tails = tails[~tails.duplicated(subset=['key'], keep=False)]
    unique_heads = heads[~heads.duplicated(subset=['key'], keep=False)]
    matches = unique_tails.merge(unique_heads, on='key', how='inner')

    # Keys in `matches` are unique, so the left merge keeps one row per input
    # row, in input order
    looked_up = heads.merge(matches, on='key', how='left')

    # merge() discards the original index; put it back so that assigning the
    # result to a column of a frame derived from `df` aligns row by row
    return pd.Series(looked_up['pred'].fillna(0).values, index=df.index, name='pred')

# Function for rewriting leaky dataset UP TO best leak value
def rewrite_compiled_leak(leak_df, lag):
    '''
    Rebuild the 'compiled_leak' column from the first `lag` leak columns.

    The column is reset to 0 and then filled from leaked_target_0,
    leaked_target_1, ..., leaked_target_<lag - 1> in that order; a row only
    takes a value while its compiled_leak is still 0, so lower lags win.

    leak_df is modified in place and also returned.
    '''
    leak_df['compiled_leak'] = 0
    source_cols = ['leaked_target_%s' % str(step) for step in range(lag)]
    for source in source_cols:
        # Only rows that no earlier lag has filled yet
        unfilled = leak_df['compiled_leak'].eq(0)
        leak_df.loc[unfilled, 'compiled_leak'] = leak_df.loc[unfilled, source]
    return leak_df

In [None]:
# Function for loading singh sets
def load_singh(train):
    '''
    Load every pattern CSV from ./pattern_singh/ and attach the train targets.

    Each CSV is read with its first column as index; that index is used
    positionally to pick values from train.target, which are inserted as the
    first column 'target'. For the file pattern_1166666.66.csv the column
    '8.50E+43' is renamed to '850027e38'.

    Files are processed in sorted name order so the result is reproducible
    (os.listdir order is arbitrary), and only names ending in '.csv' are read.

    train: object exposing `.target.values` (e.g. the training dataframe)

    Returns (singh_sets, singh_cols): the list of loaded dataframes and, per
    dataframe, the list of its column names except 'target' and 'value_count'.
    '''
    exclude = ('target', 'value_count')
    set_loc = './pattern_singh/'
    renamed_file = 'pattern_1166666.66.csv'

    singh_sets = []
    singh_cols = []
    for fname in sorted(os.listdir(set_loc)):
        if not fname.endswith('.csv'):
            continue
        tmp_df = pd.read_csv(set_loc + fname, index_col=0)
        tmp_df.insert(0, 'target', train.target.values[tmp_df.index.values])
        if fname == renamed_file:
            tmp_df.rename(columns={'8.50E+43': '850027e38'}, inplace=True)
        singh_sets.append(tmp_df)
        singh_cols.append([c for c in tmp_df.columns.values if c not in exclude])
    return singh_sets, singh_cols

# Function for loading Aaron test sets v0
def load_aaron_v0(count=10):
    '''
    Return the first `count` entries of the object pickled in ./aaron_test_v0.pickle.

    count: number of leading entries to keep (default 10).
    '''
    return load_pickle('./aaron_test_v0.pickle')[:count]

In [None]:
# Main Script
# Release the frames left over from a previous run of this cell before reloading
try:
    del fnames, train, test
    print('Clearing loaded dataframes from memory...\n')
except NameError:
    # Nothing (or not everything) was loaded yet, e.g. on the first run
    pass
fnames, train, test = get_dataframes(debug=False)

In [None]:
# Load extra_sets
# a_count is the number of leading Aaron feature sets to use; it is also
# written into the submission file name further down
a_count = 55
extra_features = load_aaron_v0(count=a_count)
# extra_sets, extra_features = load_singh(train)

# Load important features
cols = load_pickle('./important.pickle')

In [None]:
# Format target
# y holds the train targets in log1p space; the test frame gets the mean of y
# as a constant placeholder 'target' column
y = np.log1p(train['target'].values)
log_mean = np.mean(y)
test['target'] = log_mean

In [None]:
# Leak compilation for training set
# Bookkeeping columns carried next to the per-lag leaked_target_* columns;
# also used by compiled_leak_result_test
extra_cols = ['compiled_leak', 'nonzero_mean']

# Function for compiling leak results over many lag values
def compiled_leak_result():
    '''
    Compile leaked targets for the training set over all lags and score each step.

    Reads the module-level names train, cols, fnames, extra_features,
    extra_cols, y and log_mean.

    For every lag i in range(len(cols) - 2) a column leaked_target_<i> is
    computed with get_leak, and rows whose compiled_leak is still 0 take the
    value of that column.

    Returns (train_leak, result):
        train_leak: frame with ID, target, the columns in cols, one
                    leaked_target_<i> column per lag, compiled_leak and
                    nonzero_mean
        result:     dict of per-lag lists with keys 'score', 'leaky_count'
                    and 'leaky_correct'
    '''
    # Define number of lag values to consider
    max_nlags = len(cols)-2
    # Define leaky train set
    train_leak = train[['ID', 'target'] + list(cols)]
    # Initialize compiled_leak as zeros
    train_leak['compiled_leak'] = 0
    # Per row: expm1 of the mean of log1p over the nonzero feature values
    # (the mean of an empty selection is NaN, so rows without nonzero features get NaN)
    train_leak['nonzero_mean'] = train[fnames].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)
    # Initialize empty lists
    scores = []
    leaky_value_counts = []
    leaky_value_corrects = []
    leaky_cols = []
    
    for i in range(max_nlags):
        c = 'leaked_target_%s'%str(i)
        
        print '\nProcessing Lag:', i
        # Get predictions for current lag and store in new column
        train_leak[c] = get_leak(train, list(cols), extra_features, i)
        
        # Update leaky_cols with latest lag label
        leaky_cols.append(c)
        # Get "grounding" by joining with original training dataset
        # (leak and bookkeeping columns are re-attached to train by ID, then a fixed column order is selected)
        train_leak = train.join(train_leak.set_index('ID')[leaky_cols + extra_cols], 
                                on='ID', how='left')[['ID', 'target'] + list(cols) + leaky_cols + extra_cols]
        # Iteratively fill in compiled_leak values for increasing lag
        # (only rows that no earlier lag has filled; the mask is reused for scoring below)
        zeroleak = train_leak['compiled_leak'] == 0
        train_leak.loc[zeroleak, 'compiled_leak'] = train_leak.loc[zeroleak, c]
        
        # Number of leaky values found so far
        leaky_value_counts.append(np.sum(train_leak['compiled_leak']>0))
        # Number of correct discovered leaky values
        _correct_counts = np.sum(train_leak['compiled_leak']==train_leak['target'])
        # Percentage of correct discovered leaky values
        # (stored as a fraction of the leaky values found so far)
        leaky_value_corrects.append(1.0*_correct_counts/leaky_value_counts[-1])
        
        print 'Number of leak values found in train:', leaky_value_counts[-1]
        print 'Percentage of correct leak values in train:', leaky_value_corrects[-1]
        
        # Find score of current compilation iteration
        # NOTE: zeroleak was computed before this lag's values were written into
        # compiled_leak, so every row that was still empty before lag i (including
        # the rows lag i just filled) is replaced by nonzero_mean here. scores[i]
        # therefore measures the leak compiled from lags < i only, which matches
        # the range(lag) loop in rewrite_compiled_leak.
        tmp = train_leak.copy()  # Temporary dataframe
        tmp.loc[zeroleak, 'compiled_leak'] = tmp.loc[zeroleak, 'nonzero_mean']
        # RMSE in log1p space; NaN predictions fall back to the mean log-target
        scores.append(np.sqrt(mean_squared_error(y, np.log1p(tmp['compiled_leak']).fillna(log_mean))))
        
        print 'Score (filled with nonzero mean):', scores[-1]
    
    # End of iterations
    result = dict(score=scores,
                  leaky_count = leaky_value_counts,
                  leaky_correct = leaky_value_corrects)
    
    return train_leak, result

In [None]:
# Get leaked training data and result
# train_leak: training frame with one leaked_target_<lag> column per lag plus compiled_leak / nonzero_mean
# result: dict of per-lag lists ('score', 'leaky_count', 'leaky_correct')
train_leak, result = compiled_leak_result()

In [None]:
# Format results
# One row per lag, one column per metric; the transposed view is displayed
result = pd.DataFrame(result)
result.transpose()

In [None]:
# Find best score and lag value
best_score = np.min(result['score'])
# Take the position of the minimum from the underlying array: np.argmin on a
# Series has returned the index label in some pandas versions and the position
# in others. A plain int keeps the derived file names stable.
best_lag = int(np.argmin(result['score'].values))
print('Best score: %s' % best_score)
print('Best lag value: %s' % best_lag)

In [None]:
# Rewrite leaky training set in terms of best lag
# (leaky_cols lists every leaked_target_* column; it is not passed to the call below)
leaky_cols = [c for c in train_leak.columns if 'leaked_target_' in c]
# compiled_leak is rebuilt from leaked_target_0 .. leaked_target_<best_lag - 1>
train_leak = rewrite_compiled_leak(train_leak, best_lag)

In [None]:
# Save train leak
# The best lag is part of the file name; the frame index is not written (ID is a regular column)
# NOTE(review): assumes the ./stats/ directory already exists - confirm
train_leak_name = './stats/train_leak_%s.csv'%best_lag
train_leak.to_csv(train_leak_name, index=False)

In [None]:
# Leak compilation for test set
# Function for compiling leaky values for test set
def compiled_leak_result_test():
    max_nlags = len(cols)-2
    
    test_leak = test[['ID', 'target'] + list(cols)]
    test_leak['compiled_leak'] = 0
    test_leak['nonzero_mean'] = test[fnames].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)
    
    leaky_value_counts = []
    leaky_cols = []
    
    for i in range(max_nlags):
        c = 'leaked_target_%s'%str(i)
        
        print '\nProcessing Lag:', i
        test_leak[c] = get_leak(test, list(cols), extra_features, i)
        leaky_cols.append(c)
        
        test_leak = test.join(test_leak.set_index('ID')[leaky_cols + extra_cols], 
                              on='ID', how='left')[['ID', 'target'] + list(cols) + leaky_cols + extra_cols]
        zeroleak = test_leak['compiled_leak']==0
        test_leak.loc[zeroleak, 'compiled_leak'] = test_leak.loc[zeroleak, c]
        leaky_value_counts.append(np.sum(test_leak['compiled_leak']>0))
        
        print 'Number of leaky values found in test:', leaky_value_counts[-1]
        
    # End iterations
    result = dict(leaky_count = leaky_value_counts)
    
    return test_leak, result

In [None]:
# Get leaked test data and result
# test_leak: test frame with one leaked_target_<lag> column per lag plus compiled_leak / nonzero_mean
# test_result: dict with the per-lag list 'leaky_count'
test_leak, test_result = compiled_leak_result_test()

In [None]:
# Format test results
# One row per lag with the running leak count; the transposed view is displayed
test_result = pd.DataFrame(test_result)
test_result.transpose()

In [None]:
# Rewrite leaky test set in terms of best lag
# Uses the best lag found on the training set; compiled_leak is rebuilt from
# leaked_target_0 .. leaked_target_<best_lag - 1>
test_leak = rewrite_compiled_leak(test_leak, best_lag)

In [None]:
# Save test leak
# The best lag is part of the file name; the frame index is not written (ID is a regular column)
# NOTE(review): assumes the ./stats/ directory already exists - confirm
test_leak_name = './stats/test_leak_%s.csv'%best_lag
test_leak.to_csv(test_leak_name, index=False)

In [None]:
# Number of test rows whose compiled_leak is nonzero after the rewrite
np.count_nonzero(test_leak.compiled_leak)

### Make Submission:

In [None]:
# Set to True to write a submission file from the compiled test leak
submit_flag = False
if submit_flag:
    # Replace zeros in compiled_leak field
    no_leak = test_leak['compiled_leak'] == 0
    test_leak.loc[no_leak, 'compiled_leak'] = test_leak.loc[no_leak, 'nonzero_mean']
    # nonzero_mean is NaN for rows without any nonzero feature; fall back to the
    # mean train target (log_mean converted back from log1p space), mirroring the
    # fillna(log_mean) used when scoring the training set
    test_leak['compiled_leak'] = test_leak['compiled_leak'].fillna(np.expm1(log_mean))

    submit_name = '../../submissions/recon_a%s_lag%s_submit.csv'%(a_count, best_lag)
    # Make and save submission
    sub = pd.DataFrame()
    sub['ID'] = test['ID']
    sub['target'] = test_leak['compiled_leak']
    sub.to_csv(submit_name, index=False)