# Various Exploration

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pdb
import os
import h5py
import pickle

from sklearn.metrics import mean_squared_error


# Function for loading h5py file
def load_h5py(fname):
    '''
    Read the whole 'data' dataset out of an HDF5 file.

    fname: path of the HDF5 file; it is opened read-only.

    Returns the result of slicing the 'data' dataset with [:], taken
    while the file is still open.
    '''
    with h5py.File(fname, 'r') as h5_file:
        dataset = h5_file['data']
        contents = dataset[:]
    return contents
# Function for loading pickle file
def load_pickle(fname):
    '''
    Unpickle and return the single object stored in `fname`.

    NOTE: pickle executes code embedded in the file while loading, so
    only point this at files you produced yourself.
    '''
    with open(fname, 'rb') as pkl_file:
        payload = pickle.load(pkl_file)
    return payload


# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Changes the working directory to ../../data/compressed/, loads the
    train/test matrices (h5py) plus their companion pickles and the
    feature names, then changes to ../../scripts/time_series/. The second
    directory change also happens when loading fails, so a failed call
    does not leave the process inside the data directory.

    debug: when True the 'debug_*' files are read, otherwise the 'full_*'
           files.

    Returns (fnames, train, id_train, train_idx, target, test, id_test,
    test_idx).
    '''
    pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
    os.chdir('../../data/compressed/')
    try:
        print(os.getcwd())
        if debug:
            prefix = 'debug'
            print('Loading debug train and test datasets...')
        else:
            prefix = 'full'
            print('Loading original train and test datasets...')
        # h5py files
        train = load_h5py('%s_train.h5' % prefix)
        test = load_h5py('%s_test.h5' % prefix)
        # pickle files (same order as pkl_files)
        id_train, train_idx, target, id_test, test_idx = [
            load_pickle('%s_%s' % (prefix, f)) for f in pkl_files]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
    finally:
        # Always leave the data directory again: every other path in this
        # notebook is relative, so staying here after an error would break
        # later cells as well as a re-run of this function.
        os.chdir('../../scripts/time_series/')
        print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx


# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    '''
    Load the datasets through get_input and wrap them in dataframes.

    debug: forwarded unchanged to get_input.

    Returns (fnames, train_df, test_df). Both frames are built from the
    loaded matrices with the loaded row indexes and feature names as
    labels and get an extra 'ID' column; the training frame additionally
    gets a 'target' column.
    '''
    # Load data
    loaded = get_input(debug)
    fnames, train, id_train, train_idx, target = loaded[:5]
    test, id_test, test_idx = loaded[5:]

    # Training frame: features, then ID and target
    train_df = pd.DataFrame(data=train, index=train_idx, columns=fnames)
    train_df['ID'] = id_train
    train_df['target'] = target

    # Test frame: features, then ID
    test_df = pd.DataFrame(data=test, index=test_idx, columns=fnames)
    test_df['ID'] = id_test

    # Report the resulting shapes
    shape_reports = (
        ('\nShape of training dataframe: {} Rows, {} Columns', train_df),
        ('Shape of test dataframe: {} Rows, {} Columns', test_df),
    )
    for template, frame in shape_reports:
        print(template.format(*frame.shape))
    return fnames, train_df, test_df

In [None]:
# Function for loading Aaron test sets v0
def load_aaron_v0(count=10):
    '''
    Load the pickled object stored in ./aaron_test_v0.pickle and return
    its first `count` entries (plain [:count] slice).
    '''
    return load_pickle('./aaron_test_v0.pickle')[:count]

# Function for loading leaks
def load_leaks(leak_val):
    '''
    Read the 'compiled_leak' column of the train and test leak files.

    leak_val: value substituted into the file names
              ./stats/train_leak_<leak_val>.csv and
              ./stats/test_leak_<leak_val>.csv.

    Returns (train_leak, test_leak), one column each.
    '''
    leak_dir = './stats/'
    file_templates = ('train_leak_%s.csv', 'test_leak_%s.csv')

    # Train file first, then test file
    train_leak, test_leak = [
        pd.read_csv(leak_dir + template % leak_val)['compiled_leak']
        for template in file_templates
    ]
    return train_leak, test_leak

## Main Script:

In [None]:
# Drop datasets left over from an earlier run of this cell before reloading
try:
    del fnames, train, test
    print('Clearing loaded dataframes from memory...\n')
except NameError:
    # Nothing was loaded yet (first run); only this case is expected here
    pass
fnames, train, test = get_dataframes(debug=False)

# Load leaks
leak_val = 38
print('\nLoading train and test leaks...')
train_leak, test_leak = load_leaks(leak_val)
print('Nonzero elements in train: {}'.format(np.count_nonzero(train_leak)))
print('Nonzero elements in test: {}'.format(np.count_nonzero(test_leak)))

# Load important features
print('\nLoading important (Giba) features...\n')
cols = load_pickle('./important.pickle')

# Load extra_sets
print('Loading extra features...\n')
extra_features = load_aaron_v0(count=55)

# Load unmatched test indexes
print('Loading unmatched test indexes...\n')
public_test_idx = load_pickle('../model_data/unmatched_public_idxs.pickle')

In [None]:
# Boolean masks over the leak columns: True where the leak value is nonzero
trn_has_leak = (train_leak != 0).values
tst_has_leak = (test_leak != 0).values

# Positions with a leak value ("right") and without one ("wrong")
trn_right_idx = np.nonzero(trn_has_leak)[0]
tst_right_idx = np.nonzero(tst_has_leak)[0]

trn_wrong_idx = np.nonzero(~trn_has_leak)[0]
tst_wrong_idx = np.nonzero(~tst_has_leak)[0]

# NOTE(review): the arrays above are positions, but .loc selects by label;
# this assumes both frames are indexed 0..n-1 - TODO confirm.
matched_train, matched_test = train.loc[trn_right_idx], test.loc[tst_right_idx]
unmatched_train, unmatched_test = train.loc[trn_wrong_idx], test.loc[tst_wrong_idx]

public_unmatched_test = test.loc[public_test_idx]

# Check accuracy of matches
train_leaks_agree = np.array_equal(matched_train['target'], train_leak[trn_right_idx])
if train_leaks_agree:
    print('Train targets match train leaks!')

# Setting targets and log targets
# (the leak value stands in as the target for matched test rows)
matched_test['target'] = test_leak[tst_right_idx]

for labelled in (matched_train, matched_test):
    labelled['log_target'] = np.log1p(labelled['target'])


# Format data
def _flat_log_nonzero(frame):
    '''Flatten the feature columns of `frame`, apply log1p and drop the zeros.'''
    flat = np.log1p(frame[fnames].values.reshape(-1))
    return flat[flat != 0]


flat_tst_public = _flat_log_nonzero(public_unmatched_test)

# Train / test leak samples' feature values without zeros
flat_trn_leak = _flat_log_nonzero(matched_train)
flat_tst_leak = _flat_log_nonzero(matched_test)

# Train / test missed samples' feature values without zeros
flat_trn_miss = _flat_log_nonzero(unmatched_train)
flat_tst_miss = _flat_log_nonzero(unmatched_test)

# Value range of the matched feature data
all_data = np.concatenate([flat_trn_leak, flat_tst_leak])
all_data_min, all_data_max = np.min(all_data), np.max(all_data)

# Value range of the matched log targets
all_targets = np.concatenate([matched_train['log_target'].values, matched_test['log_target'].values])
all_targ_min, all_targ_max = np.min(all_targets), np.max(all_targets)

In [None]:
def data_scaler(set_x, set_y):
    '''
    Repeat the shorter of two samples until it is at least as long as
    the longer one, so their histograms have comparable mass.

    set_x, set_y: sized sequences accepted by np.concatenate.

    Returns (set_x, set_y) with the shorter one replaced by
    ceil(longer / shorter) concatenated copies of itself; the longer one
    is returned as passed in. On equal lengths set_x is the one replaced
    (by a single copy). If either input is empty both are returned
    unchanged, because nothing can be scaled by repetition.
    '''
    xlen = len(set_x)
    ylen = len(set_y)

    if xlen == 0 or ylen == 0:
        return set_x, set_y

    if xlen <= ylen:
        # Integer ceiling division. np.ceil(ylen/xlen) is not enough here:
        # with Python 2 integers '/' floors before ceil ever sees the value.
        repeats = -(-ylen // xlen)
        return np.concatenate([set_x] * repeats), set_y

    repeats = -(-xlen // ylen)
    return set_x, np.concatenate([set_y] * repeats)

In [None]:
# Log-target distributions of matched train vs. matched test rows
trn, tst = data_scaler(matched_train['log_target'], matched_test['log_target'])

# Widen the bin range by 10% of each bound's magnitude, outwards on both
# ends. Multiplying the minimum by 1.1 instead would, for a positive
# minimum, start the bins above the smallest values and plt.hist would
# leave those values out.
bins = np.linspace(all_targ_min - 0.1*abs(all_targ_min),
                   all_targ_max + 0.1*abs(all_targ_max), 100)
plt.figure(figsize=(12, 7))
plt.hist(trn, bins, alpha=0.5, label='train')
plt.hist(tst, bins, alpha=0.5, label='test')
plt.legend(loc='upper right')
plt.title('Matched Train and Test Target Overlaps')
plt.show()

In [None]:
# Feature-value distributions of matched vs. unmatched train rows
match, unmatch = data_scaler(flat_trn_leak, flat_trn_miss)

# The plotted arrays are feature values, so the bins follow the feature
# value range (all_data_*) rather than the target range. The range is
# widened by 10% of each bound's magnitude, outwards on both ends;
# multiplying a positive minimum by 1.1 would start the bins above the
# smallest values and plt.hist would leave those values out.
bins = np.linspace(all_data_min - 0.1*abs(all_data_min),
                   all_data_max + 0.1*abs(all_data_max), 100)
plt.figure(figsize=(12, 7))
plt.hist(match, bins, alpha=0.5, label='train match')
plt.hist(unmatch, bins, alpha=0.5, label='train unmatch')
plt.legend(loc='upper right')
plt.title('Matched Train and Unmatched Train Data Overlaps')
plt.show()

In [None]:
# Feature-value distributions of unmatched train vs. unmatched test rows
trn, tst = data_scaler(flat_trn_miss, flat_tst_miss)

# The plotted arrays are feature values, so the bins follow the feature
# value range (all_data_*) rather than the target range. The range is
# widened by 10% of each bound's magnitude, outwards on both ends;
# multiplying a positive minimum by 1.1 would start the bins above the
# smallest values and plt.hist would leave those values out.
bins = np.linspace(all_data_min - 0.1*abs(all_data_min),
                   all_data_max + 0.1*abs(all_data_max), 100)
plt.figure(figsize=(12, 7))
plt.hist(trn, bins, alpha=0.5, label='train unmatch')
plt.hist(tst, bins, alpha=0.5, label='test unmatch')
plt.legend(loc='upper right')
plt.title('Unmatched Train and Test Data Overlaps')
plt.show()

In [None]:
# Feature-value distributions of matched train vs. matched test rows
trn, tst = data_scaler(flat_trn_leak, flat_tst_leak)

# Widen the bin range by 10% of each bound's magnitude, outwards on both
# ends. Multiplying the minimum by 1.1 instead would, for a positive
# minimum, start the bins above the smallest values and plt.hist would
# leave those values out.
bins = np.linspace(all_data_min - 0.1*abs(all_data_min),
                   all_data_max + 0.1*abs(all_data_max), 100)
plt.figure(figsize=(12, 7))
plt.hist(trn, bins, alpha=0.5, label='train matched')
plt.hist(tst, bins, alpha=0.5, label='test matched')
plt.legend(loc='upper right')
plt.title('Test to Train Leak Overlaps')
plt.show()

In [None]:
# Feature-value distributions of matched vs. all unmatched test rows
leak, miss = data_scaler(flat_tst_leak, flat_tst_miss)

# Widen the bin range by 10% of each bound's magnitude, outwards on both
# ends. Multiplying the minimum by 1.1 instead would, for a positive
# minimum, start the bins above the smallest values and plt.hist would
# leave those values out.
bins = np.linspace(all_data_min - 0.1*abs(all_data_min),
                   all_data_max + 0.1*abs(all_data_max), 100)
plt.figure(figsize=(12, 7))
plt.hist(leak, bins, alpha=0.5, label='test matched')
plt.hist(miss, bins, alpha=0.5, label='all test unmatched')
plt.legend(loc='upper right')
plt.title('Test Data Overlaps')
plt.show()

In [None]:
# Feature-value distributions of matched test rows vs. the public unmatched rows
leak, public = data_scaler(flat_tst_leak, flat_tst_public)

# Widen the bin range by 10% of each bound's magnitude, outwards on both
# ends. Multiplying the minimum by 1.1 instead would, for a positive
# minimum, start the bins above the smallest values and plt.hist would
# leave those values out.
bins = np.linspace(all_data_min - 0.1*abs(all_data_min),
                   all_data_max + 0.1*abs(all_data_max), 100)
plt.figure(figsize=(12, 7))
plt.hist(leak, bins, alpha=0.5, label='test matched')
plt.hist(public, bins, alpha=0.5, label='public test unmatched')
plt.legend(loc='upper right')
plt.title('Test Data (Leak and Public) Overlaps')
plt.show()

In [None]:
# Find mistakes made in the training leak
target = train['target']
target_log = np.log1p(target)

# Positions where the train leak is zero vs. positions where it differs
# from the true target; if both sets coincide, every nonzero leak is correct.
# NOTE(review): comparing two Series requires identical labels - assumes
# train and train_leak share the same index; TODO confirm.
zero_leak_trn = np.where(train_leak==0)[0]
wrong_trn_idx = np.where(train_leak!=target)[0]
print('Zero leak idx equivalent to wrong train idx?: {}'.format(
    np.array_equal(zero_leak_trn, wrong_trn_idx)))
print('Number of zero-value leaks in training leaks: {}'.format(len(zero_leak_trn)))

zero_leak_tst = np.where(test_leak==0)[0]
print('\nNumber of zero-value leaks in test leaks: {}'.format(len(zero_leak_tst)))

In [None]:
# Shared matcher behind two_get_leak / one_get_leak
def _get_shifted_leak(df, cols, extras, lag, offset):
    '''
    Match rows whose leading column values equal another row's trailing
    column values, `lag + offset` columns apart.

    df:     dataframe containing every column named in cols / extras.
    cols:   ordered column names; cols[lag] supplies the prediction.
    extras: iterable of further ordered column-name lists that are
            trimmed the same way and appended to the key columns.
    lag:    additional shift on top of `offset`.
    offset: base shift (2 for the two-lag, 1 for the one-lag assumption).

    Returns (pred, matches), both with one entry per row of df in df's
    row order but labelled 0..n-1 (they come out of a merge):
      pred    - cols[lag] of the matching row, 0 where nothing matched;
      matches - (own index, matching row's index) tuples, with NaN as
                second item where nothing matched.
    Only keys that occur exactly once on each side are matched.
    '''
    shift = lag + offset
    cols_1 = list(cols[:shift * -1])
    cols_2 = list(cols[shift:])
    for ext in extras:
        cols_1 += ext[:shift * -1]
        cols_2 += ext[shift:]

    # All columns except the last `shift` ones into a tuple
    d1 = df[cols_1].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    d1['index_one'] = d1.index.values
    # All columns except the first `shift` ones into a tuple
    d2 = df[cols_2].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    d2['index_two'] = d2.index.values
    d2['pred'] = df[cols[lag]]

    # Remove duplicate keys for accurate matching
    d3 = d2[~d2.duplicated(subset=['key'], keep=False)]
    d4 = d1[~d1.duplicated(subset=['key'], keep=False)]
    d5 = d3.merge(d4, on='key', how='inner')
    d5.drop(labels='index_one', axis=1, inplace=True)

    # Left merge keeps one output row per input row, matched or not
    d6 = d1.merge(d5, on='key', how='left')
    d6['matches'] = d6.apply(lambda x: (x['index_one'], x['index_two']), axis=1)

    return d6['pred'].fillna(0), d6['matches']


# Function for matching samples to predictions with 2-lag assumption
def two_get_leak(df, cols, extras, lag=0):
    '''
    Match rows shifted by lag + 2 columns against each other.

    Returns (pred, matches): per row the matched prediction (0 if none)
    and the (own index, matched index) tuple.
    '''
    return _get_shifted_leak(df, cols, extras, lag, 2)


# Function for matching samples with 1-lag assumption
def one_get_leak(df, cols, extras, lag=0):
    '''
    Match rows shifted by lag + 1 columns against each other.

    Returns (pred, matches): per row the matched prediction (0 if none)
    and the (own index, matched index) tuple.
    '''
    return _get_shifted_leak(df, cols, extras, lag, 1)


# Function for storing row indexes while making Giba-like matches (2-off predictions)
def compile_leak_index(data, f, cols, extras, pair=False):
    '''
    Run the lag matching for every lag and collect the per-lag results.

    data:   dataframe with an 'ID' column, the columns in `f`, `cols`
            and `extras`, and optionally a 'target' column.
    f:      feature columns used for the per-row 'nonzero_mean'.
    cols:   ordered columns handed to the matcher; len(cols) - 2 lags
            are processed.
    extras: additional ordered column lists handed to the matcher.
    pair:   True -> one_get_leak (one-lag assumption),
            False -> two_get_leak (two-lag assumption).

    Returns a dataframe with 'ID' (and 'target' when `data` has it), a
    'leaked_target_<i>' / 'leaked_index_<i>' column pair per lag, and
    'compiled_leak', 'compiled_idx' (both initialised to 0) plus
    'nonzero_mean'.
    '''
    extra_cols = ['compiled_leak', 'compiled_idx', 'nonzero_mean']
    max_nlags = len(cols)-2
    # 'target' is only carried along when the input actually has it
    base_cols = ['ID', 'target'] if 'target' in data.columns else ['ID']

    # Copy so the columns added below are not written into a slice of `data`
    train_leak = data[['ID']].copy()
    train_leak['compiled_leak'] = 0
    train_leak['compiled_idx'] = 0
    # expm1 of the mean of log1p over each row's nonzero feature values
    train_leak['nonzero_mean'] = data[f].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)

    leaky_cols = []

    if pair:
        print('\nMatching with one-lag assumption:')
        matcher = one_get_leak
    else:
        print('\nMatching with two-lag assumption:')
        matcher = two_get_leak

    for i in range(max_nlags):
        c = 'leaked_target_%s'%i
        n = 'leaked_index_%s'%i

        print('Processing Lag: {}'.format(i))
        # Get predictions for current lag and store in new column
        train_leak[c], train_leak[n] = matcher(data, list(cols), extras, i)
        leaky_cols.append(c)
        leaky_cols.append(n)
        # Get "grounding" by joining with the dataset that was passed in.
        # Joining against `data` (not a module-level frame) keeps the
        # result tied to the rows this call is actually working on.
        train_leak = data.join(train_leak.set_index('ID')[leaky_cols + extra_cols],
                               on='ID', how='left')[base_cols + leaky_cols + extra_cols]

    return train_leak

In [None]:
# Find leak values and leak indexes
# The lag matching is run twice over the training frame: once under the
# two-lag assumption (pair=False) and once under the one-lag assumption
# (pair=True).
master_two_leak = compile_leak_index(train, fnames, cols, extra_features, pair=False)
master_one_leak = compile_leak_index(train, fnames, cols, extra_features, pair=True)

In [None]:
def leak_compiler(data, leak_val):
    '''
    Collapse the per-lag columns into 'compiled_leak' / 'compiled_idx'.

    data:     dataframe holding 'compiled_idx' plus, for every lag i in
              range(leak_val), 'leaked_target_<i>' and 'leaked_index_<i>'.
              It is modified in place.
    leak_val: number of lags to fold in, lowest lag first.

    For each row the first nonzero 'leaked_target_<i>' becomes
    'compiled_leak', and 'compiled_idx' is taken from the same lag. Rows
    that never get a nonzero leak end with compiled_leak == 0 and the
    index entry of the last lag processed.

    Returns `data` (the same object that was passed in).
    '''
    data['compiled_leak'] = 0
    for i in range(leak_val):
        c = 'leaked_target_%s'%i
        t = 'leaked_index_%s'%i
        # Rows still waiting for a leak value. The same mask drives both
        # columns: testing compiled_idx == 0 separately stops working as
        # soon as the column holds non-zero objects (e.g. index tuples),
        # which would freeze it at the first lag.
        unresolved = data['compiled_leak']==0
        data['compiled_leak'] = data[c].where(unresolved, data['compiled_leak'])
        data['compiled_idx'] = data[t].where(unresolved, data['compiled_idx'])
    return data

In [None]:
# Reformat leak data
# Fold the first `leak_val` lags of each result into the compiled columns.
# leak_compiler writes into the frame it is given and returns that same
# frame, so two_leak / one_leak are the master_* objects, not copies.
two_leak = leak_compiler(master_two_leak, leak_val)
one_leak = leak_compiler(master_one_leak, leak_val)

In [None]:
# Find tails
# Split the (coord_1, coord_2) index pairs into two columns and keep only
# the pairs that have a second coordinate.
coords_two = two_leak['compiled_idx'].values.tolist()
tail_df = pd.DataFrame(data=coords_two, columns=['coord_1', 'coord_2'])
tail_df = tail_df.loc[tail_df['coord_2'].notnull()]
# Values that occur on both sides of some pair
tail_intersect = np.intersect1d(tail_df['coord_1'], tail_df['coord_2'])

# Keep the coord_2 values that are not part of the intersection. This has
# to be a membership test: `!=` would compare the two arrays position by
# position, and they generally differ in length.
coord_2_values = tail_df['coord_2'].values
tail_idx = coord_2_values[~np.isin(coord_2_values, tail_intersect)]

In [None]:
# Nonzero counts across rows
# (axis=1: one count per row over the feature columns, train then test)
nzr_count_train, nzr_count_test = (
    np.count_nonzero(frame[fnames], axis=1) for frame in (train, test)
)

In [None]:
# Function for finding a unique value if it exists
def row_checker(x):
    '''
    Count the occurrences of the only distinct nonzero value in `x`.

    x: array-like of numbers.

    Returns how often that value occurs when `x` contains exactly one
    distinct nonzero value, otherwise np.nan (no nonzero value at all,
    or more than one distinct nonzero value).
    '''
    uni_val, uni_cnt = np.unique(x, return_counts=True)
    # Drop the zero entry by value. np.unique sorts its output, so cutting
    # off position 0 would only remove zero when zero is present and is
    # the smallest value.
    nonzero_cnt = uni_cnt[uni_val != 0]
    # Check if there's only 1 unique value
    if nonzero_cnt.shape[0]==1:
        return nonzero_cnt[0]
    return np.nan

### High-Level Metadata:

In [None]:
# Number of nonzero values in features
# (axis=0: one count per feature column of the test set)
f_nonzeros = np.count_nonzero(test[fnames], axis=0)

# Visualize
plt.figure(figsize=(12, 7))
n, bins, patches = plt.hist(f_nonzeros, bins=100, alpha=0.5)
plt.xlabel('Number of Non-Zero Entities')
plt.ylabel('Count')
plt.title('Test Set Nonzero Features')
# Write the figure to disk before showing it
plt.savefig('./images/test_nonzero_count.png')
plt.show()

In [None]:
# Focus on the tail-end
# Keep the features with at least 4000 nonzero entries in the test set
dense_features = fnames[np.where(f_nonzeros>=4000)[0]]
print('Number of dense features: {}'.format(len(dense_features)))
print(dense_features)

In [None]:
# Find densest rows amongst test set
# This step only restricts the test frame to the dense feature columns;
# all rows are kept here.
dense_test = test.loc[:, dense_features]

In [None]:
# Nonzero entries per row (axis=1) within the dense feature columns
r_nonzeros = np.count_nonzero(dense_test, axis=1)

# Visualize
plt.figure(figsize=(12, 7))
n, bins, patches = plt.hist(r_nonzeros, bins=100, alpha=0.5)
plt.xlabel('Number of Non-Zero Entities')
plt.ylabel('Count')
plt.title('Test Set Nonzero Rows')
plt.show()

In [None]:
# Focus on tail-end
# Index labels of the rows with at least 50 nonzero dense features
dense_rows = dense_test.index.values[np.where(r_nonzeros>=50)[0]]
print('Number of dense rows: {}'.format(len(dense_rows)))
print(dense_rows)

In [None]:
# Get dense features and rows in a dataframe
# Label-based selection: dense_rows holds index labels of the test frame,
# dense_features holds column names.
candidate_df = test.loc[dense_rows, dense_features]

#### Finding Unique Values:

In [None]:
def unique_in_dense(data, decimal=True, head_val=10):
    '''
    List the most frequent values found anywhere in `data`.

    data:     dataframe whose cells are flattened into one array.
    decimal:  True  -> every distinct value is considered;
              False -> only values with a nonzero fractional part
                       (value mod 1 != 0) are kept.
    head_val: number of rows to return.

    Returns a dataframe with the columns 'vals' and 'counts', ordered by
    'counts' descending and cut to the first `head_val` rows.
    '''
    flat = data.values.reshape((-1))
    values, counts = np.unique(flat, return_counts=True)
    ranked = pd.DataFrame.from_dict(
        {'vals': values, 'counts': counts}, orient='columns'
    ).sort_values(by='counts', ascending=False)

    if not decimal:
        # Keep the non-integer values only
        has_fraction = np.not_equal(np.mod(ranked['vals'].values, 1), 0)
        ranked = ranked[has_fraction]
    return ranked.head(head_val)

In [None]:
# Look at most common values in the dense test dataframe
# decimal=False restricts the listing to values with a nonzero fractional
# part; the top 5 of those are shown.
unique_in_dense(dense_test, decimal=False, head_val=5)

In [None]:
# Look at most common values in the candidate dataframe
# decimal=True applies no filtering; the 10 most frequent values are shown.
unique_in_dense(candidate_df, decimal=True, head_val=10)