# init

## install openpyxl

pandas uses openpyxl to read and write Excel (`.xlsx`) files. It does not need to be imported here: pandas imports it automatically when needed.

In [1]:
!pip install openpyxl



## import Python modules

In [2]:
import pandas, IPython.core.display

## open file

In [3]:
# Open the workbook once; individual sheets are parsed from this handle later.
workbook_filename = '2017 CAM data from iPads.xlsx'
src = pandas.ExcelFile(workbook_filename)
src.sheet_names

['2017 CAM data Erl',
 'schema (WIP reverse engineer)',
 '2017 CAM iPad data Tyler',
 'Combined iPad 2017 CAM data']

## select sheets

In [4]:
# Keep only the worksheets whose name contains one of the search strings.
search_strings = ['Erl', 'Tyler']
matching_sheet_names = [
    name
    for name in src.sheet_names
    if any(pattern in name for pattern in search_strings)
]
# Parse each selected sheet, keyed by the last space-separated word of its name.
sheets_to_process = {
    name.split(' ')[-1]: src.parse(name)
    for name in matching_sheet_names
}
sheets_to_process.keys()

dict_keys(['Erl', 'Tyler'])

## Define Aphids and Natural Enemies

In [5]:
# Aphid names. The short codes a1..aN are generated from each name's position
# in aphid_name_list, so the order of the tuple matters.
EGA = 'English grain'
BCO = 'bird cherry oat'
greenbug = 'green bug'
aphid_name_list = (EGA, BCO, greenbug)
aphid_name = {
    f'a{position}': species
    for position, species in enumerate(aphid_name_list, start=1)
}
aphid_name

{'a1': 'English grain', 'a2': 'bird cherry oat', 'a3': 'green bug'}

In [6]:
# @todo: input natural enemy names
# Natural enemy names in code order: the n-th entry is mapped to the code 'e<n>'.
natural_enemy_name_list = [
    'aphid_mummies_brown',
    'aphid_mummies_blk',
    'ladybeetle_larvae',
    'ladybeetle_adult',
    'lacewing_larvae',
    'lacewing_adult',
]
natural_enemy_codes = [
    f'e{position}'
    for position in range(1, len(natural_enemy_name_list) + 1)
]
natural_enemy_name = dict(zip(natural_enemy_codes, natural_enemy_name_list))
natural_enemy_name

{'e1': 'aphid_mummies_brown',
 'e2': 'aphid_mummies_blk',
 'e3': 'ladybeetle_larvae',
 'e4': 'ladybeetle_adult',
 'e5': 'lacewing_larvae',
 'e6': 'lacewing_adult'}

# cleanup

## normalize column names

In [216]:
# Keep only the part of each column label before its first space.
#
# Every sheet is normalized unconditionally. The previous version first filtered
# sheets with `any(str(column).find(' ') ...)`, but str.find returns -1 (truthy)
# when there is no space and 0 (falsy) when the space is the first character, so
# the filter did not select "sheets with a space in a column name". Because the
# dict was rebuilt from the filtered sheets only, a sheet failing that test would
# have been silently dropped from sheets_to_process.
#
# The rename leaves labels without a space unchanged (apart from the str()
# conversion), so this cell can be re-run safely.
sheets_to_process = {
    sheet_name: sheet.rename(
        mapper=lambda column: str(column).split(' ')[0],
        axis='columns'
    )
    for sheet_name, sheet in sheets_to_process.items()
}

## concatenate sheets

In [217]:
# Stack the selected worksheets into one frame. The dict keys become the outer
# 'worksheet_name' row-index level; each sheet's own row index becomes 'index'.
row_index_names = ['worksheet_name', 'index']
concat_df = pandas.concat(sheets_to_process, names=row_index_names)

## reindex and rename columns

In [218]:
# Column schema: one tuple per worksheet column to keep, in output order.
# Each tuple is (scope, group, variable, source_field), matching `names` below:
#   scope        - field / set / point / observation
#   group        - 'index' marks the columns later collected into `icols`;
#                  'natural_enemy' marks the name/number pair; '' is everything else
#   variable     - short name used in this notebook
#   source_field - the (normalized) column label as it appears in the worksheets
names = [
     'scope',       'group',         'variable',          'source_field']
columns = (
    ('field',       '',              'client',            'fields__client__displayText'),
    ('field',       'index',         'crop',              'fields__crop'),
    ('field',       'index',         'name',              'fields__name'),
    ('field',       'index',         'desc',              'fields__desc'),
    ('field',       '',              'date',              'fields__date'),
    ('set',         'index',         'date',              'fields__oSets__date'),
    ('set',         '',              'description',       'fields__oSets__desc'),
    ('set',         '',              'growth_stage',      'fields__oSets__growthStage'),
    ('set',         '',              'observer',          'fields__oSets__obsName'),
    ('set',         '',              'complete_sets',     'fields__oSets__completeSets'),
    ('set',         '',              'total_sets',        'fields__oSets__totalSets'),
    ('point',       'index',         'id',                'fields__oSets__oPoints__id'),
    ('observation', 'index',         'id',                'fields__oSets__oPoints__observations__id'),
    ('observation', '',              'complete',          'fields__oSets__oPoints__observations__complete'),
    ('observation', '',              'disabled',          'fields__oSets__oPoints__observations__disabled'),
    ('observation', '',              'natural_enemy_sub', 'fields__oSets__oPoints__observations__enum'),
    ('observation', '',              'aphid_subtotal',    'fields__oSets__oPoints__observations__anum'),
    ('observation', '',              'a1',                'fields__oSets__oPoints__observations__a1__number'),
    ('observation', '',              'a2',                'fields__oSets__oPoints__observations__a2__number'),
    ('observation', '',              'a3',                'fields__oSets__oPoints__observations__a3__number'),
    ('observation', 'natural_enemy', 'name',              'fields__oSets__oPoints__observations__|'),
    ('observation', 'natural_enemy', 'number',            'fields__oSets__oPoints__observations__|__number'),
)
# Note: `columns` is rebound here from the tuple of tuples to a 4-level MultiIndex.
columns = pandas.MultiIndex.from_tuples(columns, names=names)
# Reindex the flat worksheet columns onto the MultiIndex, matching the existing
# labels against the 'source_field' level. Worksheet columns not listed above are
# not carried over into df.
df = concat_df.reindex(
    columns=columns,
    level='source_field')
df.head()

Unnamed: 0_level_0,scope,field,field,field,field,field,set,set,set,set,set,...,observation,observation,observation,observation,observation,observation,observation,observation,observation,observation
Unnamed: 0_level_1,group,Unnamed: 2_level_1,index,index,index,Unnamed: 6_level_1,index,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,...,index,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,natural_enemy,natural_enemy
Unnamed: 0_level_2,variable,client,crop,name,desc,date,date,description,growth_stage,observer,complete_sets,...,id,complete,disabled,natural_enemy_sub,aphid_subtotal,a1,a2,a3,name,number
Unnamed: 0_level_3,source_field,fields__client__displayText,fields__crop,fields__name,fields__desc,fields__date,fields__oSets__date,fields__oSets__desc,fields__oSets__growthStage,fields__oSets__obsName,fields__oSets__completeSets,...,fields__oSets__oPoints__observations__id,fields__oSets__oPoints__observations__complete,fields__oSets__oPoints__observations__disabled,fields__oSets__oPoints__observations__enum,fields__oSets__oPoints__observations__anum,fields__oSets__oPoints__observations__a1__number,fields__oSets__oPoints__observations__a2__number,fields__oSets__oPoints__observations__a3__number,fields__oSets__oPoints__observations__|,fields__oSets__oPoints__observations__|__number
worksheet_name,index,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
Erl,0,,,,,,2017-08-02T13:12:09.542,,7.0,Tyler,0.0,...,0.0,,,,0.0,,,,,
Erl,1,,,,,,,,,,,...,1.0,,,,0.0,,,,,
Erl,2,,,,,,,,,,,...,2.0,,,,0.0,,,,,
Erl,3,,,,,,,,,,,...,3.0,,,,0.0,,,,,
Erl,4,,,,,,,,,,,...,4.0,,,,0.0,,,,,


## convert_datetime

In [219]:
# Locate every column whose 'variable' level equals 'date'.
# get_loc_level returns a 2-tuple: the locator for the matching columns, and the
# matching index with the 'variable' level dropped. Despite its name,
# date_column_slice holds that second item (an index object), not a slice.
date_column_mask, date_column_slice = df.columns.get_loc_level('date', level='variable')

In [220]:
# Parse the date columns into datetimes.
#
# The columns are replaced with `df[cols] = ...` instead of
# `df.loc[:, mask] = ...`: since pandas 2.0 the .loc form first tries to write the
# new values into the existing columns in place, which can leave object-dtype
# columns as object rather than switching them to datetime64. Replacing whole
# columns gives each one the dtype returned by pandas.to_datetime.
date_columns = df.columns[date_column_mask].tolist()
df[date_columns] = df[date_columns].apply(pandas.to_datetime)
df[date_columns].dropna(how='all').head()

Unnamed: 0_level_0,scope,field,set
Unnamed: 0_level_1,group,Unnamed: 2_level_1,index
Unnamed: 0_level_2,variable,date,date
Unnamed: 0_level_3,source_field,fields__date,fields__oSets__date
worksheet_name,index,Unnamed: 2_level_4,Unnamed: 3_level_4
Erl,0,NaT,2017-08-02 13:12:09.542
Erl,70,2017-08-09 09:24:11.845,2017-08-09 09:25:11.710
Erl,140,2017-08-09 10:01:29.326,2017-08-09 10:06:25.480
Erl,210,2017-08-09 11:16:15.922,2017-08-09 11:21:01.555
Erl,350,2017-08-09 11:17:15.791,2017-08-09 11:37:20.862


## Define Common Index Columns

In [222]:
# Collect the full column keys (as tuples) of every column whose 'group' level is
# 'index'. drop_level=False keeps all four levels in each key.
index_group_match = df.columns.get_loc_level('index', level='group', drop_level=False)
index_group_columns = index_group_match[1]
icols = index_group_columns.values.tolist()
icols

[('field', 'index', 'crop', 'fields__crop'),
 ('field', 'index', 'name', 'fields__name'),
 ('field', 'index', 'desc', 'fields__desc'),
 ('set', 'index', 'date', 'fields__oSets__date'),
 ('point', 'index', 'id', 'fields__oSets__oPoints__id'),
 ('observation', 'index', 'id', 'fields__oSets__oPoints__observations__id')]

In [225]:
# Forward-fill the index columns so every row carries the key values of the last
# row above it that had them.
# The fill is done separately per worksheet: a plain ffill() on the concatenated
# frame would carry the last values of one worksheet into the leading blank rows
# of the next one, attributing those rows to the wrong worksheet's keys.
df[icols] = df[icols].groupby(level='worksheet_name').ffill()

In [234]:
# Move the index columns into the row index, appended after the existing
# (worksheet_name, index) levels. The new levels are named by the full column
# tuples in icols.
# Note: this rebinds df, and the icols columns no longer exist as columns
# afterwards, so re-running this cell (or the forward-fill cell above) against
# the resulting df raises a KeyError.
df = df.set_index(icols, append=True)
df.head(21)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,scope,field,field,set,set,set,set,set,observation,observation,observation,observation,observation,observation,observation,observation,observation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,group,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,natural_enemy,natural_enemy
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,variable,client,date,description,growth_stage,observer,complete_sets,total_sets,complete,disabled,natural_enemy_sub,aphid_subtotal,a1,a2,a3,name,number
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,source_field,fields__client__displayText,fields__date,fields__oSets__desc,fields__oSets__growthStage,fields__oSets__obsName,fields__oSets__completeSets,fields__oSets__totalSets,fields__oSets__oPoints__observations__complete,fields__oSets__oPoints__observations__disabled,fields__oSets__oPoints__observations__enum,fields__oSets__oPoints__observations__anum,fields__oSets__oPoints__observations__a1__number,fields__oSets__oPoints__observations__a2__number,fields__oSets__oPoints__observations__a3__number,fields__oSets__oPoints__observations__|,fields__oSets__oPoints__observations__|__number
worksheet_name,index,"(field, index, crop, fields__crop)","(field, index, name, fields__name)","(field, index, desc, fields__desc)","(set, index, date, fields__oSets__date)","(point, index, id, fields__oSets__oPoints__id)","(observation, index, id, fields__oSets__oPoints__observations__id)",Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
Erl,0,,,,2017-08-02 13:12:09.542,0.0,0.0,,NaT,,7.0,Tyler,0.0,1.0,,,,0.0,,,,,
Erl,1,,,,2017-08-02 13:12:09.542,0.0,1.0,,NaT,,,,,,,,,0.0,,,,,
Erl,2,,,,2017-08-02 13:12:09.542,0.0,2.0,,NaT,,,,,,,,,0.0,,,,,
Erl,3,,,,2017-08-02 13:12:09.542,0.0,3.0,,NaT,,,,,,,,,0.0,,,,,
Erl,4,,,,2017-08-02 13:12:09.542,0.0,4.0,,NaT,,,,,,,,,0.0,,,,,
Erl,5,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,1.0,,0.0,,,,,e1,
Erl,6,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,,e2,
Erl,7,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,,e3,
Erl,8,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,,e4,
Erl,9,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,,e5,


## unstack natural enemies

In [308]:
# Build a wide table of natural-enemy counts: one column per enemy code/name.
ne = (
    df
    # keep only the natural_enemy name/number columns, with all column levels
    .xs('natural_enemy', level='group', axis='columns', drop_level=False)
    # keep rows whose row-index level at position 7 equals 5.0 and drop that level;
    # position 7 is the observation id (worksheet_name, index, then the six icols).
    # In the displayed data the enemy names sit on the rows with observation id 5.0.
    .xs(5.0, level=7)
    # append the enemy code (e1, e2, ...) as the innermost row-index level ...
    .set_index(('observation', 'natural_enemy', 'name', 'fields__oSets__oPoints__observations__|'), append=True)
    # ... and pivot that innermost level into the columns
    .unstack()
    # column level 4 is the pivoted enemy code; translate known codes to names
    # (codes missing from natural_enemy_name, e.g. e7..e9, are left as-is)
    .rename(columns=natural_enemy_name, level=4)
    # keep every 9th row.
    # NOTE(review): 9 presumably equals the number of enemy codes per point. The
    # per-row 'index' level is still part of the row index when unstacking, so each
    # enemy row stays on its own row with only its own column populated; taking
    # every 9th row then keeps only the first code's row per point. Verify that
    # counts for the other codes are not being discarded.
    .loc[::9]
)
ne.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,scope,observation,observation,observation,observation,observation,observation,observation,observation,observation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,group,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,variable,number,number,number,number,number,number,number,number,number
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,source_field,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number,fields__oSets__oPoints__observations__|__number
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,"(observation, natural_enemy, name, fields__oSets__oPoints__observations__|)",aphid_mummies_brown,aphid_mummies_blk,ladybeetle_larvae,ladybeetle_adult,lacewing_larvae,lacewing_adult,e7,e8,e9
worksheet_name,index,"(field, index, crop, fields__crop)","(field, index, name, fields__name)","(field, index, desc, fields__desc)","(set, index, date, fields__oSets__date)","(point, index, id, fields__oSets__oPoints__id)",Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
Erl,5,,,,2017-08-02 13:12:09.542,0.0,,,,,,,,,
Erl,19,,,,2017-08-02 13:12:09.542,1.0,,,,,,,,,
Erl,33,,,,2017-08-02 13:12:09.542,2.0,,,,,,,,,
Erl,47,,,,2017-08-02 13:12:09.542,3.0,,,,,,,,,
Erl,61,,,,2017-08-02 13:12:09.542,4.0,,,,,,,,,
Erl,75,CROPS.WHEAT,Llewellyn wheat 1,Midge susceptible wheat. Awned.,2017-08-09 09:25:11.710,0.0,,,,,,,,,
Erl,89,CROPS.WHEAT,Llewellyn wheat 1,Midge susceptible wheat. Awned.,2017-08-09 09:25:11.710,1.0,,,,,,,,,
Erl,103,CROPS.WHEAT,Llewellyn wheat 1,Midge susceptible wheat. Awned.,2017-08-09 09:25:11.710,2.0,,,,,,,,,
Erl,117,CROPS.WHEAT,Llewellyn wheat 1,Midge susceptible wheat. Awned.,2017-08-09 09:25:11.710,3.0,,,,,,,,,
Erl,131,CROPS.WHEAT,Llewellyn wheat 1,Midge susceptible wheat. Awned.,2017-08-09 09:25:11.710,4.0,,,,,,,,,


# explore

In [307]:
# Preview the columns whose 'group' level is the empty string (i.e. neither the
# index columns nor the natural_enemy pair), keeping all column levels.
ungrouped_view = df.xs('', level='group', axis='columns', drop_level=False)
ungrouped_view.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,scope,field,field,set,set,set,set,set,observation,observation,observation,observation,observation,observation,observation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,group,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,variable,client,date,description,growth_stage,observer,complete_sets,total_sets,complete,disabled,natural_enemy_sub,aphid_subtotal,a1,a2,a3
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,source_field,fields__client__displayText,fields__date,fields__oSets__desc,fields__oSets__growthStage,fields__oSets__obsName,fields__oSets__completeSets,fields__oSets__totalSets,fields__oSets__oPoints__observations__complete,fields__oSets__oPoints__observations__disabled,fields__oSets__oPoints__observations__enum,fields__oSets__oPoints__observations__anum,fields__oSets__oPoints__observations__a1__number,fields__oSets__oPoints__observations__a2__number,fields__oSets__oPoints__observations__a3__number
worksheet_name,index,"(field, index, crop, fields__crop)","(field, index, name, fields__name)","(field, index, desc, fields__desc)","(set, index, date, fields__oSets__date)","(point, index, id, fields__oSets__oPoints__id)","(observation, index, id, fields__oSets__oPoints__observations__id)",Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
Erl,0,,,,2017-08-02 13:12:09.542,0.0,0.0,,NaT,,7.0,Tyler,0.0,1.0,,,,0.0,,,
Erl,1,,,,2017-08-02 13:12:09.542,0.0,1.0,,NaT,,,,,,,,,0.0,,,
Erl,2,,,,2017-08-02 13:12:09.542,0.0,2.0,,NaT,,,,,,,,,0.0,,,
Erl,3,,,,2017-08-02 13:12:09.542,0.0,3.0,,NaT,,,,,,,,,0.0,,,
Erl,4,,,,2017-08-02 13:12:09.542,0.0,4.0,,NaT,,,,,,,,,0.0,,,
Erl,5,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,1.0,,0.0,,,,
Erl,6,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,
Erl,7,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,
Erl,8,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,
Erl,9,,,,2017-08-02 13:12:09.542,0.0,5.0,,NaT,,,,,,,,,,,,


# natural enemies

#### rename natural enemy columns

In [99]:
# ne2 = ne2.rename(columns=natural_enemy_name, level=1)
# ne2.head()

# save file

In [None]:
# sum_by_point.columns = sum_by_point.columns.map('_'.join)
# joined_df.columns = joined_df.columns.map('_'.join)
# with pandas.ExcelWriter('out_df.xlsx') as file_writer:
#     sites_df.to_excel(
#         file_writer,
#         sheet_name='2017 - CAM - sites',
#         freeze_panes=(1, 1),
#         index_label='row'
#     )
#     sum_by_site.to_excel(
#         file_writer,
#         sheet_name='2017 - CAM - sum by site',
#         freeze_panes=(1, 1),
#         index_label='row'
#     )
#     sum_by_set.to_excel(
#         file_writer,
#         sheet_name='2017 - CAM - sum by time',
#         freeze_panes=(1, 1),
#         index_label='row'
#     )
#     sum_by_point.to_excel(
#         file_writer,
#         sheet_name='2017 - CAM - sum by point',
#         freeze_panes=(1, 1),
#         index_label='row'
#     )
#     joined_df.to_excel(
#         file_writer,
#         sheet_name='2017 - CAM - all observations',
#         freeze_panes=(1, 1),
#         index_label='row'
#     )
#     file_writer.save()