In [1]:
# The display function isn't always imported by default in some Jupyter implementations. We'll probably use it.
# Import it from the public `IPython.display` module; the `IPython.core.display` path is deprecated.
from IPython.display import display

In [2]:
# The file I'm interested in parsing for cleanup:
# (a relative path, so it is resolved against the kernel's current working directory)
file_path = "./src/real data/2017 CAM data from iPads/2017 CAM data from iPads.xlsx"

In [3]:
# pandas is really good with columnar data, like Excel files - https://pandas.pydata.org
import pandas

# Open the workbook once; individual sheets are parsed from this handle later on.
data_file = pandas.ExcelFile(file_path)
# Last expression of the cell, so the list of sheet names is displayed.
data_file.sheet_names

['2017 CAM data Erl',
 'schema (WIP reverse engineer)',
 '2017 CAM iPad data Tyler',
 'Combined iPad 2017 CAM data']

In [4]:
# Only two of the workbook's sheets are of interest here:
cam_sheet_names = ['2017 CAM data Erl', '2017 CAM iPad data Tyler']

# Parse each of those sheets into a dataframe, keyed by the final word of its sheet name
# (rsplit with maxsplit=1 peels that last word off the right-hand end).
sheets = {
    full_name.rsplit(' ', 1)[-1]: data_file.parse(full_name)
    for full_name in cam_sheet_names
}

In [5]:
# Iterating a dictionary yields its keys, i.e. the short sheet names. Let's see what we've got:
list(sheets)

['Erl', 'Tyler']

In [6]:
# defaultdict is like a regular dictionary, except that looking up a key that doesn't exist yet creates a default value for it instead of raising a KeyError.
from collections import defaultdict

# I don't like the horrendously long column names, but they are grouped logically, so let's take advantage of that!
def split_column_to_dict(sheet, column, column_dictionary=None, separator='__'):
    """ Split a column name like "fields__oSets__oPoints__observations" into a path of keys
    so that related keys are easy to find, ie columns['fields']['oSets']['oPoints']['observations'].
    This produces a tree of column name segments, with actual pandas data at the ends.

    Parameters:
        sheet: anything indexable by column label (``sheet[column]``), e.g. a pandas DataFrame.
        column: the column label to file into the tree; it is converted with ``str()`` before splitting.
        column_dictionary: an existing tree to update in place, or None to start a new one.
            Any mutable mapping works (a plain dict, or a defaultdict whose default_factory
            has been switched off), because missing nodes are created explicitly.
        separator: the string between the segments of the column name. It doubles as the key
            under which each node stores its column data.

    Returns:
        The tree (newly created if none was passed in), with ``sheet[column]`` stored at the
        end of the path described by the column name.
    """

    def nested_dict():
        # A defaultdict whose missing keys become further nested defaultdicts.
        return defaultdict(nested_dict)

    # If a dictionary is provided, just update it. Otherwise, make a blank one.
    if column_dictionary is None:
        column_dictionary = nested_dict()

    # Set a pointer to the root of the tree
    pointer = column_dictionary
    for segment in str(column).split(separator):
        # Create missing nodes explicitly rather than relying on defaultdict's auto-creation,
        # so a caller-supplied tree that doesn't auto-create keys doesn't raise KeyError here.
        if segment not in pointer:
            pointer[segment] = nested_dict()
        # Move the pointer to the deeper location in the tree.
        pointer = pointer[segment]
    # To avoid naming conflicts with pandas magic attributes (such as "number"), the actual data is in a node with a
    # name that can't possibly be a segment in the column name: the separator ('__')
    pointer[separator] = sheet[column]

    # Since `pointer` was actually just pointing to parts of the column dictionary,
    #  it's been updated and is ready for output.
    return column_dictionary


In [7]:
# Now, make a dictionary of column trees, grouped by sheet name.
def _freeze_tree(node):
    """ Switch off key auto-creation on `node` and on every nested defaultdict below it,
    so that looking up a missing key anywhere in the tree raises KeyError. """
    node.default_factory = None
    for child in node.values():
        # Values stored under the separator key are column data, not tree nodes; skip them.
        if isinstance(child, defaultdict):
            _freeze_tree(child)


column_dictionary = {}
separator = '__'
for sheet_name, sheet in sheets.items():
    # Build a new nested column dictionary for this sheet
    new_dict = None
    for column in sheet.columns:
        new_dict = split_column_to_dict(sheet, column, new_dict, separator)
    if new_dict is None:
        # A sheet without columns never reaches split_column_to_dict; give it an empty tree.
        new_dict = {}
    else:
        # Turn off the defaultdict behaviour of creating a key instead of throwing an exception.
        # Every level has to be switched off, not just the root: otherwise a mistyped key deeper
        # in the tree would silently add an empty node instead of raising KeyError.
        _freeze_tree(new_dict)
    column_dictionary[sheet_name] = new_dict


In [8]:
len(column_dictionary)  # How many sheets got a column tree? Named explicitly rather than via `_`, which depends on which cell ran last.

2

In [9]:
# One column tree per sheet, keyed by the short sheet name.
column_dictionary.keys()

dict_keys(['Erl', 'Tyler'])

There should be a list of the first segments of all the column names in that sheet:

In [10]:
# Top level of the 'Erl' tree: the first segment of each column name.
column_dictionary['Erl'].keys()

dict_keys(['fields', 'clients', 'observers'])

Continuing deeper, more segments that share a common prefix:

In [11]:
# Second segments of the 'Erl' column names that start with 'fields'.
column_dictionary['Erl']['fields'].keys()

dict_keys(['client', 'name', 'crop', 'desc', 'image', 'date', 'oSets'])

In [12]:
# Third segments of the 'Erl' column names that start with 'fields__oSets'.
column_dictionary['Erl']['fields']['oSets'].keys()

dict_keys(['date', 'dateCompare', 'growthStage Zadoks', 'desc', 'obsName', 'totalSets', 'completeSets', 'results', 'oPoints', 'totalA1', 'totalA2', 'totalA3', 'totalA4'])

At the bottom of the tree should be a '\_\_' key for the actual data. We can drop all the blanks and see which rows remain for the column here (`fields__oSets__date`):

In [13]:
# Row labels of the 'Erl' sheet where `fields__oSets__date` is not blank
column_dictionary['Erl']['fields']['oSets']['date'][separator].dropna().index.tolist()

[0, 70, 140, 210, 350, 490, 560, 700, 840]

In [14]:
# How many 'oSets' sections of the 'Erl' sheet are there? Count its non-blank `fields__oSets__date` cells.
# Spelled out rather than using `_`, which only works if the previous cell was the last one run.
len(column_dictionary['Erl']['fields']['oSets']['date'][separator].dropna())

9

Good to know. What about the other sheet?

In [15]:
# Same count for the 'Tyler' sheet: the number of non-blank `fields__oSets__date` cells.
len(column_dictionary['Tyler']['fields']['oSets']['date'][separator].dropna())

45

So, 54 sets of observations we'll be processing.

### Helper functions, for peeking into the data

In [16]:
def has_children(node):
    """ Filter `node` down to the entries that have children of their own.

    An entry counts as having children when at least one of its keys is something other than
    the separator string that marks actual data. Returns a dict of the matching entries. """
    with_children = {}
    for parent_key, child in node.items():
        if any(key != separator for key in child.keys()):
            with_children[parent_key] = child
    return with_children


def has_data(node):
    """ Filter `node` down to the entries that carry data.

    An entry counts as having data when one of its keys is the separator string that marks
    actual data. Returns a dict of the matching entries. """
    with_data = {}
    for parent_key, child in node.items():
        if separator in child.keys():
            with_data[parent_key] = child
    return with_data

In [17]:
# Name the sheet explicitly instead of reusing `sheet_name`, a loop variable left over from an
# earlier cell ('Tyler' is the value it holds after a top-to-bottom run).
example_node = column_dictionary['Tyler']['fields']['oSets']['oPoints']['observations']
display(has_children(example_node).keys())
display(has_data(example_node).keys())

dict_keys(['a1', 'a2', 'a3', '|'])

dict_keys(['id', 'name', 'enum', 'eVnum', 'anum', 'disabled', 'complete', '|'])

In [18]:
# Keys of the nodes that have children and also carry data of their own
# (intersecting two key views yields a set):
has_children(example_node).keys() & has_data(example_node).keys()

{'|'}

## Concatenating dataframes made from sheets

_pandas_ has a [`concat`] method for tacking one dataframe onto another. It also has [`append`] and [`merge`].

[`concat`]: https://pandas.pydata.org/pandas-docs/stable/merging.html
[`append`]: https://pandas.pydata.org/pandas-docs/stable/merging.html#concatenating-using-append
[`merge`]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging

It looks like `concat` will suffice, here, as long as the column names are identical. At the moment, they are not. Let's see how close that gets us, though:

In [19]:
# Stack the sheets' rows into one dataframe. ignore_index=True discards each sheet's own row
# labels (and the dict keys) in favour of a fresh 0..n-1 index.
experimental_concat_results = pandas.concat(sheets, ignore_index=True)
# Sorting puts column names that share a prefix next to each other in the listing.
sorted(experimental_concat_results.columns)

['clients__company',
 'clients__displayText',
 'clients__fname',
 'clients__lname',
 'clients__name',
 'fields__client__company',
 'fields__client__displayText',
 'fields__client__fname',
 'fields__client__lname',
 'fields__client__name',
 'fields__crop',
 'fields__date',
 'fields__desc',
 'fields__image',
 'fields__name',
 'fields__oSets__completeSets',
 'fields__oSets__date',
 'fields__oSets__dateCompare',
 'fields__oSets__desc',
 'fields__oSets__growthStage',
 'fields__oSets__growthStage Zadoks',
 'fields__oSets__oPoints__id',
 'fields__oSets__oPoints__location__coords__accuracy',
 'fields__oSets__oPoints__location__coords__altitude',
 'fields__oSets__oPoints__location__coords__altitudeAccuracy',
 'fields__oSets__oPoints__location__coords__heading',
 'fields__oSets__oPoints__location__coords__latitude',
 'fields__oSets__oPoints__location__coords__longitude',
 'fields__oSets__oPoints__location__coords__speed',
 'fields__oSets__oPoints__location__timestamp',
 'fields__oSets__oPoints__

In [20]:
# Total number of rows across all the individual sheets
sum(map(len, sheets.values()))

4690

In [21]:
# Number of rows in the concatenated result, for comparison with the total above
experimental_concat_results.shape[0]

4690

Pretty good, so far. The length is correct, and the columns line up, except for those duplicates having slightly different names.

## Preparing to concatenate multiple sheets

Let's take a look at the columns containing data about observation sets:

In [22]:
# For every sheet, preview the first rows whose oSets date is filled in,
# limited to the columns stored directly under fields -> oSets.
for sheet_name in column_dictionary:
    node_sets = column_dictionary[sheet_name]['fields']['oSets']
    key_column = node_sets['date'][separator]
    # Column labels of the children of the oSets node that carry data
    columns = [child[separator].name for child in has_data(node_sets).values()]
    # notna() selects the rows where the key column is not blank
    display(sheets[sheet_name].loc[key_column.notna(), columns].head(3))

Unnamed: 0,fields__oSets__date,fields__oSets__dateCompare,fields__oSets__growthStage Zadoks,fields__oSets__desc,fields__oSets__obsName,fields__oSets__totalSets,fields__oSets__completeSets,fields__oSets__results,fields__oSets__totalA1,fields__oSets__totalA2,fields__oSets__totalA3,fields__oSets__totalA4
0,2017-08-02T13:12:09.542,2017-08-02,7.0,,Tyler,1.0,0.0,,,,,
70,2017-08-09T09:25:11.710,2017-08-09,8.0,,Tyler,1.0,1.0,RESULTS.5,164.0,0.0,0.0,0.0
140,2017-08-09T10:06:25.480,2017-08-09,7.0,,Tyler,1.0,1.0,RESULTS.5,66.0,0.0,0.0,0.0


Unnamed: 0,fields__oSets__date,fields__oSets__dateCompare,fields__oSets__growthStage,fields__oSets__desc,fields__oSets__obsName,fields__oSets__totalSets,fields__oSets__completeSets,fields__oSets__results,fields__oSets__totalA1,fields__oSets__totalA2,fields__oSets__totalA3,fields__oSets__totalA4
0,2017-07-14T12:31:24.194,2017-07-14,6.0,,Tyler,1.0,0.0,,,,,
70,2017-07-18T10:31:22.263,2017-07-18,6.0,,Tyler,1.0,1.0,RESULTS.5,8.0,0.0,0.0,0.0
140,2017-07-28T13:05:44.673,2017-07-28,8.0,,Mikki,1.0,1.0,RESULTS.5,37.0,0.0,0.0,0.0


Some column names seem to differ. Quick comparison of column names, showing differences:

In [23]:
# Show the sheet names, then the column names of the first sheet that appear in no other sheet.
display(sheets.keys(),
        set.difference(*map(set, (sheet.columns for sheet in sheets.values()))))

dict_keys(['Erl', 'Tyler'])

{'fields__oSets__growthStage Zadoks',
 'fields__oSets__oPoints__observations__a1__number EGA',
 'fields__oSets__oPoints__observations__a2__number BCO',
 'fields__oSets__oPoints__observations__a3__number Greenbug',
 'fields__oSets__oPoints__observations__anum TotalAPhids',
 'fields__oSets__oPoints__observations__eVnum Natural enemy totals'}

In [24]:
# This time we want columns from all sheets that don't have a match (symmetrical difference).
# NOTE: set.symmetric_difference accepts exactly one other set, so this call only works while
# `sheets` holds exactly two sheets.
column_list = sorted(set.symmetric_difference(*[set(sheet.columns) for sheet in sheets.values()]))

In [25]:
# Display entire column, even if cell data is long
pandas.set_option('display.max_colwidth', 0)

# Show the differing column names side-by-side: pair every even-positioned name in the sorted
# list with the odd-positioned name that follows it (a trailing unpaired name is left out).
pandas.DataFrame(list(zip(column_list[0::2], column_list[1::2])))

Unnamed: 0,0,1
0,fields__oSets__growthStage,fields__oSets__growthStage Zadoks
1,fields__oSets__oPoints__observations__a1__number,fields__oSets__oPoints__observations__a1__number EGA
2,fields__oSets__oPoints__observations__a2__number,fields__oSets__oPoints__observations__a2__number BCO
3,fields__oSets__oPoints__observations__a3__number,fields__oSets__oPoints__observations__a3__number Greenbug
4,fields__oSets__oPoints__observations__anum,fields__oSets__oPoints__observations__anum TotalAPhids
5,fields__oSets__oPoints__observations__eVnum,fields__oSets__oPoints__observations__eVnum Natural enemy totals


When we concatenate, we'll have to use the shorter names (column `0` in the table above), since they're more regular. Which sheet uses those, and which sheet has '`fields__oSets__growthStage Zadoks`' instead of '`fields__oSets__growthStage`'?

In [26]:
# Collect the sheets whose columns include the suffixed 'growthStage Zadoks' name.
sheets_with_bad_column_names = {
    name: frame
    for name, frame in sheets.items()
    if 'fields__oSets__growthStage Zadoks' in frame.columns
}
display('Bad:', set(sheets_with_bad_column_names))

'Bad:'

{'Erl'}

In [27]:
# Every sheet that was not flagged as having bad column names
# (subtracting one key view from another yields a set).
display('Good:',
        sheets.keys() - sheets_with_bad_column_names.keys())

'Good:'

{'Tyler'}

Okay, 'Tyler' has the column names we prefer, 'Erl' does not. Duly noted.

How shall we solve the name mismatch? We can either remap the names during the merge, or rename the columns beforehand. After reviewing documentation, I think it seems better to rename the bad columns beforehand.

### Renaming columns

_pandas_ has a [`rename`] method which lets us apply a transform function to the columns (or explicitly map each column through a dictionary).

[`rename`]: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html

Since the bad names contain suffixes beginning with a space character, let's just ignore anything after and including the first space character in a column name.

In [28]:
def first_word(column_name, word_separator=' '):
    """ Return the part of `column_name` that comes before the first `word_separator`
    (a space by default), or the whole name when the separator doesn't occur in it. """
    # A single split is enough: everything after the first separator is discarded anyway.
    leading_word = column_name.split(word_separator, 1)[0]
    return leading_word

In [29]:
from html import escape

from IPython.display import HTML

# Rename the columns of every sheet with bad column names, and build an HTML report of the
# before/after names in which the names that actually change are shown in bold.
report = []  # start a list of strings to display after renaming
for sheet_name, sheet in sheets_with_bad_column_names.items():
    # Sheet and column names come from the spreadsheet, so escape them before embedding them
    # in HTML; characters such as '<' or '&' would otherwise corrupt the report.
    report.append(f"<h2>{escape(sheet_name)}</h2>")  # heading
    report.append('<ol>')  # start an ordered list
    for column_name in sheet.columns:
        after = first_word(column_name)
        boldness = 'bold' if after != column_name else 'normal'
        report.append(f"<li style='font-weight: {boldness}'>{escape(column_name)} &rarr; {escape(after)}</li>")
    report.append('</ol>')  # close the list
    # Rename in place: these are the same dataframe objects that `sheets` holds, so the cleaned
    # names show up there too.
    sheet.rename(mapper=first_word, axis='columns', inplace=True)
display(HTML(''.join(report)))  # join all the strings together and display as HTML