# Looking at Candidate Sets

In [None]:
# Load libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import pdb
import os
import h5py
import pickle


# Function for loading h5py file
def load_h5py(fname):
    '''
    Open the HDF5 file `fname` read-only and return the contents of the
    dataset stored under the key 'data', sliced with [:].

    The file is closed before the function returns.
    '''
    with h5py.File(fname, 'r') as h5_file:
        dataset = h5_file['data']
        contents = dataset[:]
    return contents
# Function for loading pickle file
def load_pickle(fname):
    '''
    Unpickle and return the single object stored in the file `fname`.

    The file is opened in binary mode and closed before returning.
    NOTE: pickle can execute arbitrary code while loading; only use this on
    files you trust.
    '''
    with open(fname, 'rb') as pkl_file:
        payload = pickle.load(pkl_file)
    return payload


# Function for setting up
def get_input(debug=False):
    '''
    Function for loading either debug or full datasets

    Temporarily changes the working directory to ``../../data/compressed/``
    (relative to the current working directory), loads the train/test arrays
    from the ``.h5`` files, their companion pickles and the feature-name
    list, and then restores the working directory it started from.

    Parameters
    ----------
    debug : bool
        If True, load the ``debug_*`` files; otherwise load the ``full_*``
        files. ``feature_names.pickle`` is shared by both modes.

    Returns
    -------
    tuple
        ``(fnames, train, id_train, train_idx, target, test, id_test,
        test_idx)`` in that order.

    Raises
    ------
    Whatever ``os.chdir``, ``load_h5py`` or ``load_pickle`` raise (e.g. when
    a file is missing). The working directory is restored before the
    exception propagates.
    '''
    pkl_files = ['train_id.pickle', 'trainidx.pickle', 'target.pickle', 'test_id.pickle', 'testidx.pickle']
    prefix = 'debug' if debug else 'full'
    label = 'debug' if debug else 'original'

    # Remember the starting directory so it can always be restored. A failed
    # load must not leave the session stranded in the data directory, where
    # the relative path above would no longer resolve on a re-run.
    start_dir = os.getcwd()
    os.chdir('../../data/compressed/')
    try:
        print(os.getcwd())
        print('Loading {} train and test datasets...'.format(label))
        # h5py files
        train = load_h5py('{}_train.h5'.format(prefix))
        test = load_h5py('{}_test.h5'.format(prefix))
        # pickle files
        id_train, train_idx, target, id_test, test_idx = [
            load_pickle('{}_{}'.format(prefix, f)) for f in pkl_files
        ]
        # Load feature names
        fnames = load_pickle('feature_names.pickle')
        # Find shape of loaded datasets
        print('Shape of training dataset: {} Rows, {} Columns'.format(*train.shape))
        print('Shape of test dataset: {} Rows, {} Columns'.format(*test.shape))
    finally:
        os.chdir(start_dir)
        print(os.getcwd())
    return fnames, train, id_train, train_idx, target, test, id_test, test_idx


# Function for getting datasets in dataframe format
def get_dataframes(debug=False):
    '''
    Wrap the arrays returned by get_input(debug) in pandas dataframes.

    Both dataframes use the feature names as columns and the loaded index
    arrays as their index. An 'ID' column is appended to each; the training
    dataframe additionally gets a 'target' column.

    Returns (fnames, train_df, test_df).
    '''
    (fnames, train_arr, train_ids, train_index, train_target,
     test_arr, test_ids, test_index) = get_input(debug)

    # Training frame: features, then ID, then target
    train_df = pd.DataFrame(data=train_arr, index=train_index, columns=fnames)
    train_df['ID'] = train_ids
    train_df['target'] = train_target

    # Test frame: features, then ID (no target available)
    test_df = pd.DataFrame(data=test_arr, index=test_index, columns=fnames)
    test_df['ID'] = test_ids

    reports = (
        ('\nShape of training dataframe: {} Rows, {} Columns', train_df),
        ('Shape of test dataframe: {} Rows, {} Columns', test_df),
    )
    for message, frame in reports:
        print(message.format(*frame.shape))
    return fnames, train_df, test_df

In [None]:
# Function for loading singh sets
def load_singh(train):
    '''
    Load every singh pattern CSV found in ./pattern_singh/.

    Each file whose name ends in '.csv' is read with its first column as the
    index, and a 'target' column is inserted at position 0. Files are
    processed in sorted name order so the returned lists are reproducible
    (os.listdir itself returns entries in arbitrary order).

    Parameters
    ----------
    train : pandas.DataFrame
        Training dataframe providing a ``target`` column.
        NOTE(review): the CSV index values are used as *positions* into
        ``train.target.values``, not as index labels -- confirm this matches
        how the pattern files were generated.

    Returns
    -------
    singh_sets : list of pandas.DataFrame
        One dataframe per CSV file, with 'target' as the first column.
    singh_cols : list of list
        For each dataframe, its column names excluding 'target' and
        'value_count'.
    '''
    exclude = ('target', 'value_count')
    set_loc = './pattern_singh/'

    # Match on the extension only: a substring test would also pick up
    # names such as 'foo.csv.bak'.
    csv_names = sorted(f for f in os.listdir(set_loc) if f.endswith('.csv'))
    target_values = train.target.values

    singh_sets = []
    singh_cols = []
    for csv_name in csv_names:
        tmp_df = pd.read_csv(os.path.join(set_loc, csv_name), index_col=0)
        tmp_df.insert(0, 'target', target_values[tmp_df.index.values])
        if csv_name == 'pattern_1166666.66.csv':
            # This one file carries a column header in scientific notation;
            # map it to the name used for that column elsewhere
            # (presumably mangled on export -- TODO confirm).
            tmp_df.rename(columns={'8.50E+43': '850027e38'}, inplace=True)
        singh_sets.append(tmp_df)
        singh_cols.append([c for c in tmp_df.columns.values if c not in exclude])

    return singh_sets, singh_cols

In [None]:
# Function for loading Aaron test sets v0
def load_aaron_v0(count=10):
    '''
    Unpickle ./aaron_test_v0.pickle and return its first `count` entries
    (10 by default), obtained by slicing the loaded object with [:count].
    '''
    return load_pickle('./aaron_test_v0.pickle')[:count]

In [None]:
# Main Script
# Drop dataframes left over from an earlier run of this cell before loading
# them again.
try:
    del fnames, train, test
    print('Clearing loaded dataframes from memory...\n')
except NameError:
    # Nothing (or not everything) was loaded yet, so there is nothing to
    # clear. Only NameError is expected here; anything else should surface.
    pass
fnames, train, test = get_dataframes(debug=False)

In [None]:
# Load singh sets
singh_sets, singh_cols = load_singh(train)

# Load jia set: first CSV column becomes the index, and the matching target
# values (looked up positionally in train.target) go in as the first column
jia_df = pd.read_csv('./jia_pattern.csv', index_col=0)
jia_df.insert(
    loc=0,
    column='target',
    value=train['target'].values[jia_df.index.values],
)

In [None]:
# Compare jia set to singh sets
print('Number of rows in jia set: {}'.format(jia_df.shape[0]))

# For every singh set, count the index values it shares with the jia set
for s_set in singh_sets:
    shared_rows = np.intersect1d(s_set.index.values, jia_df.index.values)
    print('Number of overlapping rows: {}'.format(len(shared_rows)))