# init

In [1]:
import pandas, IPython.core.display

## open file

In [2]:
src = pandas.ExcelFile('data/real/2017/CAM data from iPads/2017 CAM data from iPads.xlsx')

In [3]:
src.sheet_names

['2017 CAM data Erl',
 'schema (WIP reverse engineer)',
 '2017 CAM iPad data Tyler',
 'Combined iPad 2017 CAM data']

## select sheets

In [4]:
sheets_to_process = {sheet_name.split(' ')[-1]: src.parse(sheet_name) for sheet_name in src.sheet_names
    if any([person_name in sheet_name for person_name in ['Erl', 'Tyler']])}

# rename columns

In [5]:
sheets_to_rename = [sheet for sheet in sheets_to_process.values()
                    if any([str(column).find(' ') for column in sheet.columns])]

In [6]:
for sheet_to_rename in sheets_to_rename:
    sheet_to_rename.rename(mapper=lambda x: str(x).split(' ')[0], axis='columns', inplace=True)

# concatenate sheets

In [7]:
df = pandas.concat(sheets_to_process, names=['Sheet','Row'])

# hierarchical index

In [8]:
import collections

In [9]:
class NestedDefaultdict(collections.defaultdict):
    data = None
    name = None
    parent_key = None

    def __init__(self):
        collections.defaultdict.__init__(self, NestedDefaultdict)

In [10]:
def get_column_tree(frame):
    coltree = NestedDefaultdict()
    for column in frame.columns:
        pointer = coltree
        pointer.parent_key = ''
        for word in str(column).split('__'):
            pointer = pointer[word]
            pointer.parent_key = word
        pointer.name = column
        pointer.data = frame[column]
    coltree.default_factory = None
    return coltree

In [21]:
coltree = get_column_tree(df)

## visualize

In [11]:
def visualize_tree(node):
    """Given a dictionary, display a list of keys, and all keys 
    of any contained dictionaries, recursively. Indent to show depth
    of recursion."""

    def visualize_node(node, depth=0):
        for key, value in node.items():
            output.append(f"{'.   ' * depth}{key}"
                          f"{'/' if value.keys() else ''}"
                          f"{' : ' + str(value.data.count()) if value.data is not None else ''}"
                         )
            visualize_node(value, depth + 1)

    output = []
    visualize_node(node)
    return '\n'.join(output)

# define functions: get nodes, column names, DataFrames

In [15]:
def get_child_columns(node):
    return_value = []
    if node.data is not None:
        return_value.append(node)
    for child in node.values():
        return_value.append(child)
    return return_value

In [12]:
def get_descendant_columns(node):
    return_value = []
    if node.data is not None:
        return_value.append(node)
    for child in node.values():
        return_value.extend(get_descendant_columns(child))
    return return_value

In [16]:
def get_child_column_names(node):
    return [column.name for column in get_child_columns(node)]

In [13]:
def get_descendant_column_names(node):
    return [column.name for column in get_descendant_columns(node)]

In [17]:
def get_child_frame(node, frame=None):
    if frame is None:
        return pandas.DataFrame({column.name: column.data for column in get_child_columns(node)})
    return frame[get_child_column_names(node)]

In [14]:
def get_descendant_frame(node, frame=None):
    if frame is None:
        return pandas.DataFrame({column.name: column.data for column in get_descendant_columns(node)})
    return frame[get_descendant_column_names(node)]

# convert_datetime

In [18]:
def convert_datetime(frame):
    output_frame = frame.copy()
    foplt = 'fields__oSets__oPoints__location__timestamp'
    output_frame[foplt] = pandas.to_datetime(frame[foplt], unit='ms')
    for column in ['fields__oSets__date', 'fields__date']:
        output_frame[column] = pandas.to_datetime(output_frame[column], infer_datetime_format=True)
    return output_frame

In [19]:
df = convert_datetime(df)

# records: fields

In [22]:
get_child_frame(coltree['fields']).dropna(how='all')

Unnamed: 0_level_0,Unnamed: 1_level_0,None,fields__crop,fields__date,fields__desc,fields__image,fields__name
Sheet,Row,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Erl,70,,CROPS.WHEAT,2017-08-09 09:24:11.845,Midge susceptible wheat. Awned.,img/wheat.png,Llewellyn wheat 1
Erl,140,,CROPS.WHEAT,2017-08-09 10:01:29.326,Early planted awned midge susceptible,img/wheat.png,Llewellyn
Erl,210,,CROPS.BARLEY,2017-08-09 11:16:15.922,Next to SEF Wheat and near faba beans,img/barley.png,SEF Barley
Erl,350,,CROPS.OATS,2017-08-09 11:17:15.791,Next to SEF peas and a wheat field,img/oats.png,SEF Oats
Erl,490,,CROPS.WHEAT,2017-08-10 14:27:04.887,Kernen cover crop,img/wheat.png,Kernen wheat
Erl,560,,CROPS.WHEAT,2017-08-17 11:10:41.881,Wheat 2 by canola and soybeans but on other si...,img/wheat.png,Outlook wheat
Erl,700,,CROPS.WHEAT,2017-08-17 13:05:51.122,Next to soybeans and canola,img/wheat.png,Outlook wheat 1
Erl,840,,CROPS.WHEAT,2017-08-22 16:01:41.018,,img/wheat.png,Kernan Wheat-1
Tyler,0,,CROPS.WHEAT,2017-07-14 12:30:31.587,,img/wheat.png,Sef wheat
Tyler,350,,CROPS.BARLEY,2017-07-18 10:46:49.163,Right next to SEF wheat,img/barley.png,SEF Barley


# records: sets

In [23]:
get_child_frame(coltree['fields']['oSets']).dropna(how='all')

Unnamed: 0_level_0,Unnamed: 1_level_0,fields__oSets__completeSets,fields__oSets__date,fields__oSets__dateCompare,fields__oSets__desc,fields__oSets__growthStage,None,fields__oSets__obsName,fields__oSets__results,fields__oSets__totalA1,fields__oSets__totalA2,fields__oSets__totalA3,fields__oSets__totalA4,fields__oSets__totalSets
Sheet,Row,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Erl,0,0.0,2017-08-02 13:12:09.542,2017-08-02,,7.0,,Tyler,,,,,,1.0
Erl,70,1.0,2017-08-09 09:25:11.710,2017-08-09,,8.0,,Tyler,RESULTS.5,164.0,0.0,0.0,0.0,1.0
Erl,140,1.0,2017-08-09 10:06:25.480,2017-08-09,,7.0,,Tyler,RESULTS.5,66.0,0.0,0.0,0.0,1.0
Erl,210,2.0,2017-08-09 11:21:01.555,2017-08-09,,9.0,,Stean,RESULTS.1,0.0,0.0,0.0,0.0,2.0
Erl,350,2.0,2017-08-09 11:37:20.862,2017-08-09,,8.0,,Stean,RESULTS.1,5.0,5.0,0.0,0.0,2.0
Erl,490,1.0,2017-08-22 15:42:05.751,2017-08-22,,8.0,,Mikki,RESULTS.5,1.0,0.0,0.0,0.0,1.0
Erl,560,2.0,2017-08-17 11:12:02.820,2017-08-17,,8.0,,Gabrielle,RESULTS.1,169.0,96.0,0.0,0.0,2.0
Erl,700,2.0,2017-08-17 13:06:30.183,2017-08-17,,9.0,,Stean,RESULTS.1,78.0,102.0,0.0,0.0,2.0
Erl,840,1.0,2017-08-22 16:02:50.682,2017-08-22,,8.0,,Mikki,RESULTS.5,187.0,0.0,0.0,0.0,1.0
Tyler,0,0.0,2017-07-14 12:31:24.194,2017-07-14,,6.0,,Tyler,,,,,,1.0


# records: points

In [24]:
get_child_frame(coltree['fields']['oSets']['oPoints']).dropna(how='all')

Unnamed: 0_level_0,Unnamed: 1_level_0,fields__oSets__oPoints__id,None,fields__oSets__oPoints__name
Sheet,Row,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Erl,0,0.0,,Observation Point 1
Erl,14,1.0,,Observation Point 2
Erl,28,2.0,,Observation Point 3
Erl,42,3.0,,Observation Point 4
Erl,56,4.0,,Observation Point 5
Erl,70,0.0,,Observation Point 1
Erl,84,1.0,,Observation Point 2
Erl,98,2.0,,Observation Point 3
Erl,112,3.0,,Observation Point 4
Erl,126,4.0,,Observation Point 5


# records: observations

In [25]:
get_descendant_frame(coltree['fields']['oSets']['oPoints']['observations']).dropna(how='all')

Unnamed: 0_level_0,Unnamed: 1_level_0,fields__oSets__oPoints__observations__a1__number,fields__oSets__oPoints__observations__a2__number,fields__oSets__oPoints__observations__a3__number,fields__oSets__oPoints__observations__anum,fields__oSets__oPoints__observations__complete,fields__oSets__oPoints__observations__disabled,fields__oSets__oPoints__observations__eVnum,fields__oSets__oPoints__observations__enum,fields__oSets__oPoints__observations__id,fields__oSets__oPoints__observations__name,fields__oSets__oPoints__observations__|,fields__oSets__oPoints__observations__|__number
Sheet,Row,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Erl,0,,,,0.0,,,,,0.0,Aphid Observation 1,,
Erl,1,,,,0.0,,,,,1.0,Aphid Observation 2,,
Erl,2,,,,0.0,,,,,2.0,Aphid Observation 3,,
Erl,3,,,,0.0,,,,,3.0,Aphid Observation 4,,
Erl,4,,,,0.0,,,,,4.0,Aphid Observation 5,,
Erl,5,,,,,1.0,,0.0,0.0,5.0,Natural Enemy Observation,e1,
Erl,6,,,,,,,,,,,e2,
Erl,7,,,,,,,,,,,e3,
Erl,8,,,,,,,,,,,e4,
Erl,9,,,,,,,,,,,e5,


In [29]:
df2 = pandas.concat((Out[i] for i in range(22,26)), axis='columns').dropna(how='all')