# init

## install openpyxl

pandas uses openpyxl to read and write `.xlsx` files. pandas imports it automatically when needed, so it only has to be installed, not imported explicitly.

In [2]:
!pip install openpyxl



## import Python modules

In [3]:
import pandas, IPython.core.display

## open file

In [4]:
# Open the source workbook once; the individual worksheets are parsed from
# this handle in a later cell.
src = pandas.ExcelFile('2017 CAM data from iPads.xlsx')
# Display the worksheet names available in the workbook.
src.sheet_names

['2017 CAM data Erl',
 'schema (WIP reverse engineer)',
 '2017 CAM iPad data Tyler',
 'Combined iPad 2017 CAM data']

## select sheets

In [83]:
# Substrings identifying the worksheets that hold raw iPad exports.
search_strings = ['Erl', 'Tyler']

# Parse each matching worksheet, keyed by the last word of its sheet name.
sheets_to_process = {}
for sheet_name in src.sheet_names:
    if any(pattern in sheet_name for pattern in search_strings):
        sheets_to_process[sheet_name.split(' ')[-1]] = src.parse(sheet_name)
sheets_to_process.keys()

dict_keys(['Erl', 'Tyler'])

## Define Aphids and Natural Enemies

In [88]:
# Display names of the aphid species.
EGA = 'English grain'
BCO = 'bird cherry oat'
greenbug = 'green bug'
aphid_name_list = (EGA, BCO, greenbug)

# Map generated codes 'a1', 'a2', ... (1-based list position) to the names.
aphid_name = {
    f'a{position}': species
    for position, species in enumerate(aphid_name_list, start=1)
}
aphid_name

{'a1': 'English grain', 'a2': 'bird cherry oat', 'a3': 'green bug'}


In [84]:
# @todo: input natural enemy names
natural_enemy_name_list = [
    'aphid_mummies_brown',
    'aphid_mummies_blk',
    'ladybeetle_larvae',
    'ladybeetle_adult',
    'lacewing_larvae',
    'lacewing_adult',
]

# Map generated codes 'e1', 'e2', ... (1-based list position) to the names.
natural_enemy_name = {
    f'e{position}': enemy
    for position, enemy in enumerate(natural_enemy_name_list, start=1)
}
natural_enemy_name

{'e1': 'aphid_mummies_brown',
 'e2': 'aphid_mummies_blk',
 'e3': 'ladybeetle_larvae',
 'e4': 'ladybeetle_adult',
 'e5': 'lacewing_larvae',
 'e6': 'lacewing_adult'}

# cleanup

## normalize column names

In [90]:
# Sheets with at least one column label that contains a space. Kept as a list
# for inspection only; it no longer decides which sheets survive this cell.
# The membership test replaces `str.find`, whose -1 "not found" result is
# truthy and whose 0 "found at the start" result is falsy.
sheets_needing_column_rename = [
    (sheet_name, sheet)
    for sheet_name, sheet in sheets_to_process.items()
    if any(' ' in str(column) for column in sheet.columns)
]

# Truncate every column label at its first space. The mapper leaves labels
# without a space unchanged (apart from the str() conversion), so it is applied
# to every sheet: no sheet is silently dropped, and re-running the cell gives
# the same result.
sheets_to_process = {
    sheet_name: sheet.rename(
        mapper=lambda label: str(label).split(' ')[0],
        axis='columns'
    )
    for sheet_name, sheet in sheets_to_process.items()
}

## concatenate sheets

In [92]:
# Stack the selected sheets into one frame. The dict keys become the outer
# 'worksheet_name' index level; each sheet's own row labels form the inner
# 'index' level.
concat_df = pandas.concat(
    sheets_to_process,
    names=['worksheet_name', 'index']
)

## reindex and rename columns

In [225]:
# Names of the four column levels; each tuple in `columns` below supplies one
# value per level, in this order.
names = ['scope', 'group', 'variable', 'source_field']
# NOTE(review): `aphid_name` is a dict, so this unpacks its keys
# ('a1', 'a2', 'a3'), not the species names; the three variables are not used
# anywhere else in this cell.
a1, a2, a3 = aphid_name
# Target column layout: (scope, group, variable, source_field). The last
# element is the flat column label expected in the concatenated sheets.
columns = (
    ('field',          '','client',                 'fields__client__displayText'),
    ('field',          '','crop',                   'fields__crop'),
    ('field',          '','date',                   'fields__date'),
    ('field',          '','desc',                   'fields__desc'),
    ('field',          '','name',                   'fields__name'),
    ('set',            '','date',                   'fields__oSets__date'),
    ('set',            '','description',            'fields__oSets__desc'),
    ('set',            '','growth_stage',           'fields__oSets__growthStage'),
    ('set',            '','observer',               'fields__oSets__obsName'),
    ('set',            '','complete_sets',          'fields__oSets__completeSets'),
    ('set',            '','total_sets',             'fields__oSets__totalSets'),
    ('point',          '','id',                     'fields__oSets__oPoints__id'),
    ('observation',    '','id',                     'fields__oSets__oPoints__observations__id'),
    ('observation',    '','complete',               'fields__oSets__oPoints__observations__complete'),
    ('observation',    '','disabled',               'fields__oSets__oPoints__observations__disabled'),
    ('observation',    '','natural_enemy_subtotal', 'fields__oSets__oPoints__observations__enum'),
    ('observation',    '','aphid_subtotal',         'fields__oSets__oPoints__observations__anum'),
    ('observation',    '','a1',                     'fields__oSets__oPoints__observations__a1__number'),
    ('observation',    '','a2',                     'fields__oSets__oPoints__observations__a2__number'),
    ('observation',    '','a3',                     'fields__oSets__oPoints__observations__a3__number'),
    # NOTE(review): the '|' in the next two source labels looks like a
    # placeholder for the per-enemy field names — confirm against the sheets.
    ('observation',    'natural_enemy', 'name',  'fields__oSets__oPoints__observations__|'),
    ('observation',    'natural_enemy', 'number','fields__oSets__oPoints__observations__|__number'),
)
# Replace the tuple-of-tuples with the equivalent named MultiIndex.
columns = pandas.MultiIndex.from_tuples(columns, names=names)
# Re-label the flat source columns with the 4-level index, matching the
# existing labels against the 'source_field' level.
df = concat_df.reindex(
    columns=columns,
    level='source_field')
df.head()

Unnamed: 0_level_0,scope,field,field,field,field,field,set,set,set,set,set,...,observation,observation,observation,observation,observation,observation,observation,observation,observation,observation
Unnamed: 0_level_1,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,...,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,natural_enemy,natural_enemy
Unnamed: 0_level_2,variable,client,crop,date,desc,name,date,description,growth_stage,observer,complete_sets,...,id,complete,disabled,natural_enemy_subtotal,aphid_subtotal,a1,a2,a3,name,number
Unnamed: 0_level_3,source_field,fields__client__displayText,fields__crop,fields__date,fields__desc,fields__name,fields__oSets__date,fields__oSets__desc,fields__oSets__growthStage,fields__oSets__obsName,fields__oSets__completeSets,...,fields__oSets__oPoints__observations__id,fields__oSets__oPoints__observations__complete,fields__oSets__oPoints__observations__disabled,fields__oSets__oPoints__observations__enum,fields__oSets__oPoints__observations__anum,fields__oSets__oPoints__observations__a1__number,fields__oSets__oPoints__observations__a2__number,fields__oSets__oPoints__observations__a3__number,fields__oSets__oPoints__observations__|,fields__oSets__oPoints__observations__|__number
worksheet_name,index,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
Erl,0,,,,,,2017-08-02T13:12:09.542,,7.0,Tyler,0.0,...,0.0,,,,0.0,,,,,
Erl,1,,,,,,,,,,,...,1.0,,,,0.0,,,,,
Erl,2,,,,,,,,,,,...,2.0,,,,0.0,,,,,
Erl,3,,,,,,,,,,,...,3.0,,,,0.0,,,,,
Erl,4,,,,,,,,,,,...,4.0,,,,0.0,,,,,


In [174]:
# Remove the 'source_field' level now that the columns carry their new names.
# The guard makes the cell safe to re-run: once the level is gone, calling
# droplevel again would raise a KeyError.
if 'source_field' in df.columns.names:
    df.columns = df.columns.droplevel(level='source_field')
df.head()

Unnamed: 0_level_0,scope,field,field,field,field,field,set,set,set,set,set,...,observation,observation,observation,observation,observation,observation,observation,observation,observation,observation
Unnamed: 0_level_1,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,...,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,natural_enemy,natural_enemy
Unnamed: 0_level_2,variable,client,crop,date,desc,name,date,description,growth_stage,observer,complete_sets,...,id,complete,disabled,natural_enemy_subtotal,aphid_subtotal,a1,a2,a3,name,number
worksheet_name,index,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
Erl,0,,,,,,2017-08-02T13:12:09.542,,7.0,Tyler,0.0,...,0.0,,,,0.0,,,,,
Erl,1,,,,,,,,,,,...,1.0,,,,0.0,,,,,
Erl,2,,,,,,,,,,,...,2.0,,,,0.0,,,,,
Erl,3,,,,,,,,,,,...,3.0,,,,0.0,,,,,
Erl,4,,,,,,,,,,,...,4.0,,,,0.0,,,,,


## convert_datetime

In [234]:
# Locate every column whose 'variable' level equals 'date'. The first returned
# value is the column indexer used to select those columns.
# NOTE(review): per the pandas docs the second returned value is the matching
# labels with the 'variable' level removed, not a slice — the name
# `date_column_slice` may mislead; confirm.
date_column_mask, date_column_slice = df.columns.get_loc_level('date', level='variable')

In [235]:
# Parse each date column and assign it back as a whole column. Whole-column
# assignment replaces the column, so it takes the datetime64 dtype; assigning
# through `df.loc[:, mask] = ...` writes into the existing object-dtype columns
# in place on pandas >= 2.0 and leaves the dtype as object.
for date_column in df.columns[date_column_mask]:
    df[date_column] = pandas.to_datetime(df[date_column])
df.head()

Unnamed: 0_level_0,scope,field,field,field,field,field,set,set,set,set,set,...,observation,observation,observation,observation,observation,observation,observation,observation,observation,observation
Unnamed: 0_level_1,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,...,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,natural_enemy,natural_enemy
Unnamed: 0_level_2,variable,client,crop,date,desc,name,date,description,growth_stage,observer,complete_sets,...,id,complete,disabled,natural_enemy_subtotal,aphid_subtotal,a1,a2,a3,name,number
Unnamed: 0_level_3,source_field,fields__client__displayText,fields__crop,fields__date,fields__desc,fields__name,fields__oSets__date,fields__oSets__desc,fields__oSets__growthStage,fields__oSets__obsName,fields__oSets__completeSets,...,fields__oSets__oPoints__observations__id,fields__oSets__oPoints__observations__complete,fields__oSets__oPoints__observations__disabled,fields__oSets__oPoints__observations__enum,fields__oSets__oPoints__observations__anum,fields__oSets__oPoints__observations__a1__number,fields__oSets__oPoints__observations__a2__number,fields__oSets__oPoints__observations__a3__number,fields__oSets__oPoints__observations__|,fields__oSets__oPoints__observations__|__number
worksheet_name,index,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
Erl,0,,,NaT,,,2017-08-02 13:12:09.542,,7.0,Tyler,0.0,...,0.0,,,,0.0,,,,,
Erl,1,,,NaT,,,NaT,,,,,...,1.0,,,,0.0,,,,,
Erl,2,,,NaT,,,NaT,,,,,...,2.0,,,,0.0,,,,,
Erl,3,,,NaT,,,NaT,,,,,...,3.0,,,,0.0,,,,,
Erl,4,,,NaT,,,NaT,,,,,...,4.0,,,,0.0,,,,,


## forward fill

In [60]:
# Columns to forward-fill: the point and observation ids, plus every column
# under the 'field' and 'set' scopes.
id_columns = [('point', 'id'), ('observation', 'id')]
scope_columns = list(df[['field', 'set']].columns.values)
fill_columns = id_columns + scope_columns

# Propagate the last non-missing value down each of those columns.
df.loc[:, fill_columns] = df.loc[:, fill_columns].ffill()

In [70]:
# df = (
#     df
#     .set_index([('set', 'date'), ('point', 'id'), ('observation', 'id'), ('natural_enemy', 'name')])
#     .sort_index()
#     .reset_index()
# )
# df.head(15)

## index

This index skips rows not containing an observation total from the source app (which should be the rows not needed after unstacking natural enemies).

In [71]:
# Column keys identifying one observation: set date, point id, observation id.
index_columns = [
    ('set', 'date'),
    ('point', 'id'),
    ('observation', 'id'),
]

In [77]:
# Rows where at least one of the two source-app subtotals is present.
subtotal_columns = [
    ('observation', 'aphid_subtotal'),
    ('point', 'natural_enemy_subtotal'),
]
has_subtotal = df[subtotal_columns].notna().any(axis='columns')
index_df = df.loc[has_subtotal]

index_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,scope,point,natural_enemy,observation,observation,observation,aphid_number,aphid_number,aphid_number,set,set,set,set,set,set,field,field,field,field,field,field,field
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,variable,natural_enemy_subtotal,number,aphid_subtotal,complete,disabled,English grain,bird cherry oat,green bug,obsName,desc,...,totalA4,totalSets,completeSets,client_displayText,client_company,client_name,crop,date,desc,name
"(set, date)","(point, id)","(observation, id)","(natural_enemy, name)",Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
2017-07-14 12:31:24.194,0.0,0.0,,,,0.0,,,7.0,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,0.0,1.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,0.0,2.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,0.0,3.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,0.0,4.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,0.0,5.0,e1,2.0,1.0,,1.0,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,1.0,0.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,1.0,1.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,1.0,2.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat
2017-07-14 12:31:24.194,1.0,3.0,,,,0.0,,,,,,Tyler,,...,0.0,1.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,Sef wheat


## split frame into parts

### aphid

In [79]:
# Observation status flags plus every column under the 'aphid_number' scope.
status_columns = [
    ('observation', 'complete'),
    ('observation', 'disabled'),
]
aphid_columns = list(df[status_columns]) + list(df[['aphid_number']].columns)

# Move the observation id out of the index so rows can be filtered on it.
observation_id = ('observation', 'id')
aphid_rows = df[aphid_columns].reset_index(level=[observation_id])

# Keep observation ids below 5, zero-fill missing counts, and restore the id
# as the innermost index level.
# NOTE(review): presumably ids 0-4 are the aphid observations and id 5 holds
# the natural-enemy rows — confirm against the source app.
is_aphid_row = aphid_rows[observation_id] < 5
aphid_df = (
    aphid_rows.loc[is_aphid_row]
    .fillna(0)
    .set_index([observation_id], append=True)
)
aphid_df.head(17)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,scope,observation,observation,aphid_number,aphid_number,aphid_number
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,variable,complete,disabled,English grain,bird cherry oat,green bug
"(set, date)","(point, id)","(natural_enemy, name)","(observation, id)",Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2017-07-14 12:31:24.194,0.0,,0.0,0.0,0.0,7.0,0.0,0.0
2017-07-14 12:31:24.194,0.0,,1.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,0.0,,2.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,0.0,,3.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,0.0,,4.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,,1.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,,2.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,,3.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,,4.0,0.0,0.0,0.0,0.0,0.0


### natural enemies

In [80]:
# Observation status flags plus every column under the 'natural_enemy' scope.
ne_columns = list(df[[('observation', 'complete'), ('observation', 'disabled')]].columns) + list(df[['natural_enemy']].columns)
ne = df.loc[:, ne_columns]
# Keep only the rows that carry a natural-enemy name.
# NOTE(review): the recorded output of this cell is
# KeyError "[('natural_enemy', 'name')] not in index", i.e. `df` had no such
# column when the cell last ran. Presumably it had already been moved into the
# index by an earlier run of another cell — confirm on a fresh kernel.
ne = ne.dropna(subset=df[[('natural_enemy', 'name')]].columns)
# Rebuild the index with the natural-enemy name as the innermost level and
# zero-fill the remaining missing values.
ne = ne.reset_index()
ne = ne.set_index([('set', 'date'), ('point', 'id'), ('observation', 'id'), ('natural_enemy', 'name')]).fillna(0)
ne.head(11)

KeyError: "[('natural_enemy', 'name')] not in index"

#### natural enemy unstack (transpose)

In [18]:
# Pivot the innermost index level into an extra column level.
ne2 = ne.unstack()

# Collapse each status flag to one column: take its 'e1' copy and re-add it
# under an empty third-level label (this also moves the column to the end).
for status_flag in ('complete', 'disabled'):
    ne2[('observation', status_flag, '')] = ne2.pop(('observation', status_flag, 'e1'))
ne2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,scope,observation,observation,observation,observation,observation,observation,observation,observation,observation,observation,...,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,observation,observation
Unnamed: 0_level_1,Unnamed: 1_level_1,variable,complete,complete,complete,complete,complete,complete,complete,complete,disabled,disabled,...,number,number,number,number,number,number,number,number,complete,disabled
Unnamed: 0_level_2,Unnamed: 1_level_2,"(natural_enemy, name)",e2,e3,e4,e5,e6,e7,e8,e9,e2,e3,...,e2,e3,e4,e5,e6,e7,e8,e9,Unnamed: 22_level_2,Unnamed: 23_level_2
"(set, date)","(point, id)","(observation, id)",Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
2017-07-14 12:31:24.194,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# ne2 = ne2.set_index([('set', 'date', ''), ('point', 'id', ''), ('observation', 'id', '')])
# Keep the two collapsed status flags followed by all natural-enemy columns.
natural_enemy_columns = list(ne2[['natural_enemy']].columns.values)
columns = [
    *[('observation', status_flag, '') for status_flag in ('complete', 'disabled')],
    *natural_enemy_columns,
]
ne2 = ne2[columns]
ne2.head(11)

Unnamed: 0_level_0,Unnamed: 1_level_0,scope,observation,observation,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy
Unnamed: 0_level_1,Unnamed: 1_level_1,variable,complete,disabled,number,number,number,number,number,number,number,number,number
Unnamed: 0_level_2,Unnamed: 1_level_2,"(natural_enemy, name)",Unnamed: 3_level_2,Unnamed: 4_level_2,e1,e2,e3,e4,e5,e6,e7,e8,e9
"(set, date)","(point, id)","(observation, id)",Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
2017-07-14 12:31:24.194,0.0,5.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,3.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,4.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-18 10:31:22.263,0.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-18 10:31:22.263,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-18 10:31:22.263,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-18 10:31:22.263,3.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-18 10:31:22.263,4.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### flatten unstacked column names

In [20]:
# Build two-part labels: the first level is kept, and the second becomes the
# third-level value when it is non-empty, otherwise the second-level value.
# A None third element is appended and that level is dropped again.
new_names2 = [
    (scope, enemy_code or variable, None)
    for scope, variable, enemy_code in ne2.columns.values
]
ne2.columns = pandas.MultiIndex.from_tuples(new_names2).droplevel(2)
ne2.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,observation,observation,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,complete,disabled,e1,e2,e3,e4,e5,e6,e7,e8,e9
"(set, date)","(point, id)","(observation, id)",Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2017-07-14 12:31:24.194,0.0,5.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,3.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,4.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-18 10:31:22.263,0.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-18 10:31:22.263,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### rename natural enemy columns

In [21]:
# Translate the second column level through the code-to-name mapping; labels
# with no entry in the mapping are left as they are.
ne2 = ne2.rename(mapper=natural_enemy_name, axis='columns', level=1)
ne2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,observation,observation,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy,natural_enemy
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,complete,disabled,aphid_mummies_brown,aphid_mummies_blk,ladybeetle_larvae,ladybeetle_adult,lacewing_larvae,lacewing_adult,e7,e8,e9
"(set, date)","(point, id)","(observation, id)",Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2017-07-14 12:31:24.194,0.0,5.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,3.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-07-14 12:31:24.194,4.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### totals from source app

In [None]:
# Column keys holding the totals reported by the source app.
totals_columns = (
    [
        ('observation', 'aphid_subtotal'),
        ('point', 'natural_enemy_subtotal'),
        ('set', 'completeSets'),
    ]
    + [('set', f'totalA{number}') for number in range(1, 5)]
    + [('set', 'totalSets')]
)

In [None]:
# Rows where at least one of the two source-app subtotals is present.
subtotal_present = (
    df[
        [
            ('observation', 'aphid_subtotal'),
            ('point', 'natural_enemy_subtotal'),
        ]
    ]
    .notna()
    .any(axis='columns')
)
# Select the totals columns for those rows and order the columns by label.
totals_df = df.loc[subtotal_present, totals_columns].sort_index(axis='columns')
totals_df.head(15)

### crop conditions

In [None]:
# Growth stage plus every 'field' column, with the point and observation id
# levels removed from the row index.
field_columns = df[['field']].columns.values
field_key_columns = [
    ('field', 'crop'),
    ('field', 'name'),
    ('field', 'desc'),
]
conditions_per_row = df[[('set', 'growthStage')] + list(field_columns)].reset_index(
    level=[('point', 'id'), ('observation', 'id')],
    drop=True
)

# Keep the first row for each remaining index value.
crop_conditions = conditions_per_row.loc[~conditions_per_row.index.duplicated()]

# Add the field descriptors to the index, place them ahead of the set date,
# and sort.
crop_conditions = (
    crop_conditions
    .set_index(list(field_key_columns), append=True)
    .reorder_levels(field_key_columns + [('set', 'date')], axis='index')
    .sort_index()
)
crop_conditions.head()

## Merge Parts

### aphid & natural enemy observations

In [None]:
# Column order for the merged frame: the aphid columns first, then the
# natural-enemy columns. Both frames carry the ('observation', 'complete') and
# ('observation', 'disabled') flags, so the combined list is de-duplicated
# (keeping first occurrence); otherwise `.loc` would return those columns twice.
observation_columns = list(
    dict.fromkeys(list(aphid_df.columns.values) + list(ne2.columns.values))
)

# Stack the aphid rows and the natural-enemy rows, then order columns and rows.
observation_base = (
    pandas.concat([aphid_df, ne2])
    .loc[:, observation_columns]
    .sort_index()
)
observation_base.head()

## Prepare DataFrames

### sum by site

In [23]:
# Select every column under the 'field' and 'set' scopes.
sum_by_site = df.loc[:, ['field', 'set']]
sum_by_site

Unnamed: 0_level_0,Unnamed: 1_level_0,scope,field,field,field,field,field,field,field,set,set,set,set,set,set,set,set,set
Unnamed: 0_level_1,Unnamed: 1_level_1,variable,client_displayText,client_company,client_name,crop,date,desc,name,obsName,desc,growthStage,totalA1,totalA2,totalA3,totalA4,totalSets,completeSets
"(set, date)","(point, id)","(observation, id)",Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
2017-07-14 12:31:24.194,0.0,0.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,1.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,2.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,3.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,4.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,5.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,5.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,5.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,5.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0
2017-07-14 12:31:24.194,0.0,5.0,"AAFC SRDC, Tyler Wist",AAFC SRDC,Tyler Wist,CROPS.WHEAT,2017-07-14T12:30:31.587,Next to soybeans and canola,Sef wheat,Tyler,,6.0,187.0,0.0,0.0,0.0,1.0,0.0


### sum by set

In [None]:
# Total every observation column per value of the outermost index level.
# Uses `observation_base` (the name the merge cell defines; the previous
# `observations_base` was undefined) and groupby, because
# `DataFrame.sum(level=...)` was removed in pandas 2.0. `sort=False` keeps the
# group order of the removed `level=` form.
sum_by_set = (
    observation_base
    .groupby(level=0, sort=False)
    .sum()
)
sum_by_set.head(5)

### sum by point

In [None]:
# Total every observation column per (set date, point id) pair.
# Uses `observation_base` (the name the merge cell defines; the previous
# `observations_base` was undefined) and groupby, because
# `DataFrame.sum(level=...)` was removed in pandas 2.0. `sort=False` keeps the
# group order of the removed `level=` form.
sum_by_point = (
    observation_base
    .groupby(
        level=[
            ('set', 'date'),
            ('point', 'id'),
        ],
        sort=False
    )
    .sum()
)
sum_by_point.head(7)

### all cleaned data

In [None]:
# columns = 
# list(df[[
#     ('set', 'date'),
#     ('observation', 'complete'),
#     ('observation', 'disabled'),
#     ('point', 'id'),
#     ('observation', 'id'),
# ]].columns) +
# list(df[].columns)
# Rows where at least one of the two source-app subtotals is present.
has_aphid_subtotal = df[('observation', 'aphid_subtotal')].notna()
has_enemy_subtotal = df[('point', 'natural_enemy_subtotal')].notna()

# For those rows, keep every column under the 'field' and 'set' scopes.
crop_conditions_indexed = df.loc[
    has_aphid_subtotal | has_enemy_subtotal,
    ['field', 'set']
]
crop_conditions_indexed.head(7)

In [None]:
# Attach the field and set columns to each observation row, matching on the
# row index.
joined_df = observation_base.join(other=crop_conditions_indexed)
joined_df

### crop conditions

In [None]:
# Remove the observation and point id levels from the row index.
field_index_levels = [
    ('field', 'crop'),
    ('field', 'name'),
    ('field', 'desc'),
]
conditions_without_ids = crop_conditions_indexed.reset_index(
    level=[('observation', 'id'), ('point', 'id')],
    drop=True
)

# Keep the first row for each remaining index value, add the field
# descriptors to the index ahead of the set date, and sort.
is_first_occurrence = ~conditions_without_ids.index.duplicated()
crop_conditions_reindexed = (
    conditions_without_ids[is_first_occurrence]
    .set_index(list(field_index_levels), append=True)
    .reorder_levels(field_index_levels + [('set', 'date')], axis='index')
    .sort_index()
)
crop_conditions_reindexed.head()

# save file

In [None]:
# Flatten the multi-level column labels of the two frames into single strings
# joined by '_'. The isinstance guard keeps the cell re-runnable: once the
# labels are plain strings, joining again would splice '_' between characters.
for frame in (sum_by_point, joined_df):
    if isinstance(frame.columns, pandas.MultiIndex):
        frame.columns = frame.columns.map('_'.join)

# Worksheet name -> frame, in the order the sheets are written.
# NOTE(review): `sites_df` is not defined in any cell visible in this
# notebook; define it (or drop this entry) before running on a fresh kernel.
sheets_to_write = {
    '2017 - CAM - sites': sites_df,
    '2017 - CAM - sum by site': sum_by_site,
    '2017 - CAM - sum by time': sum_by_set,
    '2017 - CAM - sum by point': sum_by_point,
    '2017 - CAM - all observations': joined_df,
}

# The context manager writes and closes the workbook on exit, so no explicit
# save call is made (`ExcelWriter.save()` was removed in pandas 2.0).
with pandas.ExcelWriter('out_df.xlsx') as file_writer:
    for sheet_name, frame in sheets_to_write.items():
        frame.to_excel(
            file_writer,
            sheet_name=sheet_name,
            freeze_panes=(1, 1),
            index_label='row'
        )

In [None]:
# @todo: insert enemy and aphid names