In [34]:
import tensorflow as tf
import numpy as np
import os
import json
import pandas as pd
from matplotlib import pyplot as plt
% matplotlib inline
from sklearn.utils.class_weight import compute_sample_weight

In [14]:
def normalize(df):
    df_norm = (df - df_mean) / (max_feature - df_min)
    return df_norm

In [72]:
def write_tfrecord(path, df,lab=None,weights=False):
    with tf.python_io.TFRecordWriter(path) as writer:
        for i in range(len(df.index)):
            if lab is None:
                example = tf.train.Example(features=tf.train.Features(feature={
                'data': _floats_feature(df.iloc[i].as_matrix())
                }))
            elif not weights:
                example = tf.train.Example(features=tf.train.Features(feature={
                'data': _floats_feature(df.iloc[i].as_matrix()),
                'label': _int64_feature_single(lab.iloc[i].as_matrix()[1])
                }))
            else:
                example = tf.train.Example(features=tf.train.Features(feature={
                'data': _floats_feature(df.iloc[i].as_matrix()),
                'label': _int64_feature_single(lab.iloc[i].as_matrix()[1]),
                'weight':_floats_feature_single(lab.iloc[i].as_matrix()[0])
                }))
            writer.write(example.SerializeToString())
        print('%d records wrote' % i)
        
def _floats_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _int64_feature_single(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _floats_feature_single(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

In [49]:
with open('../data/split.json','r') as f: 
    data = json.load(f)
print '%d/%d' % (len(data['train']), len(data['val']))
train_idx = data['train']
val_idx = data['val']

16747/4642


In [9]:
INPUT_PATH = os.path.join('..', 'data')
TEST_DF = pd.read_csv(os.path.join(INPUT_PATH, 'test_covariates.tsv'), sep='\t')
TRAIN_DF = pd.read_csv(os.path.join(INPUT_PATH, 'train_covariates.tsv'), sep='\t')
TRAIN_ID_DF = pd.read_csv(os.path.join(INPUT_PATH, 'train_experiment_ids.tsv'), sep='\t')
TRAIN_LAB_DF = pd.read_csv(os.path.join(INPUT_PATH, 'train_observed_labels_new.tsv'), sep='\t')

In [11]:
ALL_DF= pd.concat([TRAIN_DF, TEST_DF])

In [13]:
max_feature = ALL_DF.max()
df_mean = ALL_DF.mean()
df_min = ALL_DF.min()
df_norm = normalize(TRAIN_DF)
df_test_norm = normalize(TEST_DF)

In [None]:
for i,t in max_feature[max_feature == 0].iteritems():
    # print i
    if i in df_test_norm.keys():
        del df_test_norm[i]
for i,t in max_feature[max_feature == 0].iteritems():
    print i
    if i in df_norm.keys():
        del df_norm[i]

In [35]:
df_train_norm = df_norm.loc[train_idx]
lab_train = TRAIN_LAB_DF.loc[train_idx]
df_val_norm = df_norm.loc[val_idx]
lab_val = TRAIN_LAB_DF.loc[val_idx]

In [63]:
# compute data weights
lab_t = lab_train.as_matrix()[:,1]
weights = compute_sample_weight(class_weight='balanced', y=lab_t)
lab_train = np.vstack((weights,lab_t)).T
lab_train = pd.DataFrame(lab_train)

In [73]:
write_tfrecord('../data/train_full_norm.tfrecords',df_train_norm,lab_train,weights=True)
write_tfrecord('../data/val_full_norm.tfrecords',df_val_norm,lab_val)
write_tfrecord('../data/test_full_norm.tfrecords',df_test_norm)

16746 records wrote
4641 records wrote
2854 records wrote


In [None]:
np.save('../data/train_data.npy', df_train_norm.as_matrix())
np.save('../data/train_label.npy', lab_train.as_matrix()[:,1])
np.save('../data/val_data.npy', df_val_norm.as_matrix())
np.save('../data/val_label.npy',lab_val.as_matrix()[:,1])

np.save('../data/test_data.npy', df_test_norm.as_matrix())

In [None]:
# produce class counts
lab_t, count_t = np.unique(lab_train, return_counts=True)
lab_v, count_v = np.unique(lab_val, return_counts=True)
np.save('../data/sem_map.npy',{lab_t[i]: count_t[i] for i in range(len(count_t))})
np.save('../data/sem_map_v.npy',{lab_v[i]: count_v[i] for i in range(len(count_v))})