In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

import pathlib

In [2]:
# Notebook-wide run configuration.
#   dbm        - when falsy, every column whose name contains 'dbm' is dropped
#                during data preparation.
#   batch_size - batch size setting (not read by the cells visible here).
#   features   - non-dbm columns to keep for training; 'label' is always kept.
args = dict(
    dbm=True,
    batch_size=128,
    features=['heading', 'lat', 'lon'],
)

In [3]:
def get_dataframe_from_dir(train_data_dir):
    """
    Loads every data file found in `cwd`/../train/<session>/@train_data_dir
    into one pandas DataFrame.

    Assumes the master train folder is located at `cwd`../train. Every
    immediate sub-directory of it whose name contains no '.' is treated as a
    session directory. Session directories that have no @train_data_dir
    sub-directory are skipped, and hidden files (names starting with '.')
    are ignored.

    Directories and files are visited in sorted order, so the row order of
    the result does not depend on the filesystem's listing order.

    Args:
        train_data_dir: name of the sub-directory to read inside every
            session directory (e.g. '1cid_nskip').

    Returns:
        DataFrame with the rows of all files concatenated in visiting order.
        Columns are the union of the files' columns (not sorted); each file's
        own row index is kept as-is.

    Raises:
        FileNotFoundError: if the master train folder does not exist, or if
            no data file was found for @train_data_dir.
    """
    main_train_dir_path = pathlib.Path.cwd().parent / 'train'

    clean_train_files = []
    # iterdir() yields entries in arbitrary order; sort for reproducibility.
    for train_dir in sorted(main_train_dir_path.iterdir()):
        if not train_dir.is_dir() or '.' in train_dir.name:
            continue

        clean_dir = train_dir / str(train_data_dir)
        if not clean_dir.is_dir():
            # This session was never exported in the requested variant.
            continue

        clean_train_files.extend(
            str(entry) for entry in sorted(clean_dir.iterdir())
            if not entry.is_dir() and not entry.name.startswith('.'))

    if not clean_train_files:
        # pd.concat([]) would only report "No objects to concatenate".
        raise FileNotFoundError(
            "no training files found for '{}' under {}".format(
                train_data_dir, main_train_dir_path))

    # setup pandas dataframe
    return pd.concat([pd.read_csv(x) for x in clean_train_files], sort=False)

In [12]:
# Helper that wraps a Pandas DataFrame in a batched tf.data dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Build a batched tf.data.Dataset of (features, label) pairs.

  The caller's frame is left untouched: a copy is made and its 'label' column
  is removed from the copy to serve as the target. The remaining columns are
  passed as a dict keyed by column name.

  Args:
    dataframe: DataFrame that contains a 'label' column.
    shuffle: when truthy, shuffle with a buffer as large as the frame.
    batch_size: number of rows per batch.

  Returns:
    The batched dataset.
  """
  features = dataframe.copy()
  targets = features.pop('label')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [10]:
# Download the heart CSV and split it into train / validation / test.
# test takes 20% of all rows; validation takes 20% of what remains.
# NOTE: `train`, `val` and `test` are reassigned by the sensor-data cell
# further down in this notebook.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)

# Fixed seed so that Restart & Run All reproduces the same split.
SEED = 42
train, test = train_test_split(dataframe, test_size=0.2, random_state=SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

193 train examples
49 validation examples
61 test examples


In [11]:
batch_size = 5  # A small batch size is used for demonstration purposes

# df_to_dataset() pops a column named 'label'. The public heart.csv is
# understood to name its label column 'target' (TODO confirm against the
# downloaded file), which would make these calls raise KeyError on a fresh
# Restart & Run All. rename() ignores keys that are not present, so this is a
# no-op for frames that already carry a 'label' column.
label_alias = {'target': 'label'}
train_ds = df_to_dataset(train.rename(columns=label_alias), batch_size=batch_size)
val_ds = df_to_dataset(val.rename(columns=label_alias), shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test.rename(columns=label_alias), shuffle=False, batch_size=batch_size)

In [13]:
# Load the '1cid_nskip' training files, encode the labels, select the
# configured columns and split into train / validation / test.
df = get_dataframe_from_dir('1cid_nskip')

# Map the string labels to integers (replace() is applied to the whole frame)
# and fill every missing value with 0.
d = {'SIDEWALK' : 0, 'ROAD' : 1}
df = df.replace(d).fillna(0)

print(df.columns)
# include/exclude cid dbm fields
if not args['dbm']:
    dbm_columns = [column for column in df.columns if 'dbm' in column]
    df = df.drop(columns=dbm_columns)

# set of features to be used for training: keep the requested features, any
# remaining dbm column and the label; drop everything else
if 'features' in args:
    f_col = [column for column in df.columns
             if column not in args['features']
             and 'dbm' not in column
             and column != 'label']
    df = df.drop(columns=f_col)
print(df.columns)

# Seed both splits so that Restart & Run All yields the same partitions.
# An optional 'seed' entry in args overrides the default.
split_seed = args.get('seed', 42)
train, test = train_test_split(df, test_size=0.2, random_state=split_seed)
train, val = train_test_split(train, test_size=0.2, random_state=split_seed)

Index(['lat', 'lon', 'acc', 'bear', 'bearAcc', 'magnetometer_x',
       'magnetometer_y', 'magnetometer_z', 'heading', '33_dbm', '25_dbm',
       '181_dbm', '208_dbm', '200_dbm', '246_dbm', '188_dbm', '443_dbm',
       'timestamp', 'intersection_points', 'label', '189_dbm', '459_dbm',
       '195_dbm', '196_dbm', '476_dbm', '131_dbm', '484_dbm', '419_dbm'],
      dtype='object')
Index(['lat', 'lon', 'heading', '33_dbm', '25_dbm', '181_dbm', '208_dbm',
       '200_dbm', '246_dbm', '188_dbm', '443_dbm', 'label', '189_dbm',
       '459_dbm', '195_dbm', '196_dbm', '476_dbm', '131_dbm', '484_dbm',
       '419_dbm'],
      dtype='object')


In [None]:
# Deliberately tiny batches, for demonstration purposes only.
batch_size = 5

# Only the training split is shuffled; validation and test keep their order.
train_ds, val_ds, test_ds = (
    df_to_dataset(split, shuffle=shuffled, batch_size=batch_size)
    for split, shuffled in ((train, True), (val, False), (test, False))
)
