In [11]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

import pathlib

In [18]:
args = {'dbm': True,
        'batch_size': 128,
        'features': ['heading', 'lat', 'lon']}

In [19]:
def get_dataframe_from_dir(train_data_dir):
    """
    Crawls directory specified in @train_data_dir
    Assumes the master train folder is located at `cwd`../train
    """
    clean_train_files = []
    main_train_dir_path = pathlib.Path(
        pathlib.PurePath(pathlib.Path.cwd()).parent, pathlib.Path('train'))
    
    for train_dir in [x for x in main_train_dir_path.iterdir()
                      if x.is_dir() and '.' not in x.name]:
        clean_dir = train_dir / str(train_data_dir)
        for clean_train_file in [x for x in clean_dir.iterdir() if not x.is_dir()]:
            clean_train_files.append(str(clean_train_file))

    # setup pandas dataframe
    df = pd.concat([pd.read_csv(x) for x in clean_train_files], sort=False)

    return df

In [20]:
df = get_dataframe_from_dir('1cid_nskip')
d = {'SIDEWALK' : 0, 'ROAD' : 1}
df = df.replace(d)
df.fillna(0, inplace = True)

print(df.columns)
# include/exclude cid dbm fields
if not args['dbm']:
    dbm_columns = [column for column in df.columns if 'dbm' in column]
    df.drop(columns=dbm_columns, inplace=True)

# set of features to be used for training
if 'features' in args:
    f_col = [column for column in df.columns if column not in args['features'] and 'dbm' not in column and column != 'label']
    df.drop(columns=f_col, inplace=True)
print(df.columns)

Index(['lat', 'lon', 'acc', 'bear', 'bearAcc', 'magnetometer_x',
       'magnetometer_y', 'magnetometer_z', 'heading', '33_dbm', '25_dbm',
       '181_dbm', '208_dbm', '200_dbm', '246_dbm', '188_dbm', '443_dbm',
       'timestamp', 'intersection_points', 'label', '189_dbm', '459_dbm',
       '195_dbm', '196_dbm', '476_dbm', '131_dbm', '484_dbm', '419_dbm'],
      dtype='object')
Index(['lat', 'lon', 'heading', '33_dbm', '25_dbm', '181_dbm', '208_dbm',
       '200_dbm', '246_dbm', '188_dbm', '443_dbm', 'label', '189_dbm',
       '459_dbm', '195_dbm', '196_dbm', '476_dbm', '131_dbm', '484_dbm',
       '419_dbm'],
      dtype='object')


In [21]:
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

print(train)
print(val)

            lat         lon     heading  33_dbm  25_dbm  181_dbm  208_dbm  \
3203  24.788743  120.994212 -168.481770       0       0      -75        0   
3514  24.788199  120.994935   59.674255     -79       0        0        0   
4648  24.788694  120.994200 -163.080180       0       0      -70        0   
3320  24.789251  120.994184  160.660660       0       0      -81        0   
514   24.788120  120.994646   14.574241       0       0      -77        0   
...         ...         ...         ...     ...     ...      ...      ...   
291   24.788777  120.994230 -159.378280       0       0      -74        0   
651   24.788671  120.994959  -29.530804     -84       0        0        0   
735   24.789465  120.994940   30.992434     -74       0        0        0   
913   24.789235  120.995047   41.640860     -86       0        0        0   
661   24.788883  120.995009   77.816080     -83       0        0        0   

      200_dbm  246_dbm  188_dbm  443_dbm  label  189_dbm  459_dbm  195_dbm 

In [22]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('target')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [23]:
# setup data arrays from pandas dataframe
batch_size = args['batch_size']
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
steps_per_epoch = len(train.index) // batch_size

TypeError: df_to_dataset() got an unexpected keyword argument 'df'