# init

## import Python modules

In [1]:
import pandas, IPython.core.display

## open file

In [2]:
src = pandas.ExcelFile('data/real/2017/CAM data from iPads/2017 CAM data from iPads.xlsx')

In [3]:
src.sheet_names

['2017 CAM data Erl',
 'schema (WIP reverse engineer)',
 '2017 CAM iPad data Tyler',
 'Combined iPad 2017 CAM data']

## select sheets

In [4]:
sheets_to_process = {
    sheet_name.split(' ')[-1]: src.parse(sheet_name)
    for sheet_name in src.sheet_names
    if any(
        (
            person_name in sheet_name
            for person_name
            in ['Erl', 'Tyler']
        )
    )
}
sheets_to_process.keys()

dict_keys(['Erl', 'Tyler'])

# cleanup

## normalize column names

In [5]:
sheets_needing_column_rename = (
    (sheet_name, sheet)
    for sheet_name, sheet
    in sheets_to_process.items()
    if any(
        (str(column).find(' ')
         for column in sheet.columns)
    )
)
sheets_to_process = {
    sheet_name: sheet.rename(
        mapper=lambda x: str(x).split(' ')[0],
        axis='columns')
    for sheet_name, sheet in sheets_needing_column_rename
}

## concatenate sheets

In [6]:
df = pandas.concat(sheets_to_process,
                   names=['worksheet_name', 'index'])

In [7]:
# correct worksheet_row index for offset
df = df.reset_index()
df['worksheet_row'] = df.pop('index') + 2

## reindex columns

In [8]:
names = ['field', 'scope', 'variable']
columns = (
    ('fields__oSets__date',                              'set', 'date'),
    ('fields__oSets__oPoints__id',                       'point', 'id'),
#     ('fields__oSets__oPoints__name',                     'point', 'name'), 
    ('fields__oSets__oPoints__observations__id',         'observation', 'id'),
#     ('fields__oSets__oPoints__observations__name',       'observation', 'name'),

    ('fields__oSets__oPoints__observations__enum',       'point', 'natural_enemy_subtotal'),
    ('fields__oSets__oPoints__observations__|',          'natural_enemy_count', 'name'),
    ('fields__oSets__oPoints__observations__|__number',  'natural_enemy_count', 'count'),
    ('fields__oSets__oPoints__observations__complete',   'observation', 'complete'),
    ('fields__oSets__oPoints__observations__disabled',   'observation', 'disabled'),
    ('fields__oSets__oPoints__observations__a1__number', 'aphid_count', 'a1'),
    ('fields__oSets__oPoints__observations__a2__number', 'aphid_count', 'a2'),
    ('fields__oSets__oPoints__observations__a3__number', 'aphid_count', 'a3'),
    ('fields__oSets__oPoints__observations__anum',       'observation', 'aphid_subtotal'),
    ('fields__oSets__obsName',                           'set', 'obsName'),
    ('fields__oSets__desc',                              'set', 'desc'),
    ('fields__oSets__growthStage',                       'set', 'growthStage'),
    ('fields__oSets__results',                           'set_results', 'results'),
    ('fields__oSets__totalA1',                           'set_results', 'totalA1'),
    ('fields__oSets__totalA2',                           'set_results', 'totalA2'),
    ('fields__oSets__totalA3',                           'set_results', 'totalA3'),
    ('fields__oSets__totalA4',                           'set_results', 'totalA4'),
    ('fields__oSets__totalSets',                         'set_results', 'totalSets'),
    ('fields__oSets__completeSets',                      'set_results', 'completeSets'),
    ('fields__client__company',                          'field', 'client_company'),
    ('fields__client__displayText',                      'field', 'client_displayText'),
    ('fields__client__fname',                            'field', 'client_fname'),
    ('fields__client__lname',                            'field', 'client_lname'),
    ('fields__client__name',                             'field', 'client_name'),
    ('fields__crop',                                     'field', 'crop'),
    ('fields__date',                                     'field', 'date'),
    ('fields__desc',                                     'field', 'desc'),
    ('fields__image',                                    'field', 'image'),
    ('fields__name',                                     'field', 'name'),
)
columns = pandas.MultiIndex.from_tuples(columns, names=names)
df = df.reindex(columns=columns, level='field')
df.columns = df.columns.droplevel(level='field')

## convert_datetime

In [9]:
for column in [('set','date'), ('field','date')]:
    df[column] = pandas.to_datetime(df[column], infer_datetime_format=True)

## forward fill

In [10]:
fill_columns = [
    ('point', 'id'),
    ('observation', 'id'),
] + list(df[['field', 'set']].columns.values)
df.loc[:, fill_columns] = df.loc[:, fill_columns].ffill()

In [11]:
df = df.set_index(([('set', 'date'), ('point', 'id')])).sort_index().reset_index()
df

scope,set,point,observation,point,natural_enemy_count,natural_enemy_count,observation,observation,aphid_count,aphid_count,...,field,field,field,field,field,field,field,field,field,field
variable,date,id,id,natural_enemy_subtotal,name,count,complete,disabled,a1,a2,...,client_company,client_displayText,client_fname,client_lname,client_name,crop,date,desc,image,name
0,2017-07-14 12:31:24.194,0.0,0.0,,,,,,7.0,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
1,2017-07-14 12:31:24.194,0.0,1.0,,,,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
2,2017-07-14 12:31:24.194,0.0,2.0,,,,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
3,2017-07-14 12:31:24.194,0.0,3.0,,,,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
4,2017-07-14 12:31:24.194,0.0,4.0,,,,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
5,2017-07-14 12:31:24.194,0.0,5.0,2.0,e1,1.0,1.0,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
6,2017-07-14 12:31:24.194,0.0,5.0,,e2,1.0,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
7,2017-07-14 12:31:24.194,0.0,5.0,,e3,,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
8,2017-07-14 12:31:24.194,0.0,5.0,,e4,,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
9,2017-07-14 12:31:24.194,0.0,5.0,,e5,,,,,,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat


## split frame into natural enemies, observations

In [12]:
base_df = df.loc[df[('observation', 'aphid_subtotal')].notna() | df[('point', 'natural_enemy_subtotal')].notna(), ['field', 'set', 'point', 'observation', 'set_results']]
base_df

scope,set,point,observation,point,observation,observation,observation,set,set,set,...,field,field,field,field,field,field,field,field,field,field
variable,date,id,id,natural_enemy_subtotal,complete,disabled,aphid_subtotal,obsName,desc,growthStage,...,client_company,client_displayText,client_fname,client_lname,client_name,crop,date,desc,image,name
0,2017-07-14 12:31:24.194,0.0,0.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
1,2017-07-14 12:31:24.194,0.0,1.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
2,2017-07-14 12:31:24.194,0.0,2.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
3,2017-07-14 12:31:24.194,0.0,3.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
4,2017-07-14 12:31:24.194,0.0,4.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
5,2017-07-14 12:31:24.194,0.0,5.0,2.0,1.0,,,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
14,2017-07-14 12:31:24.194,1.0,0.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
15,2017-07-14 12:31:24.194,1.0,1.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
16,2017-07-14 12:31:24.194,1.0,2.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
17,2017-07-14 12:31:24.194,1.0,3.0,,,,0.0,Tyler,,6.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat


In [13]:
aphid_df = df.loc[: ,['aphid_count']].fillna(0)
aphid_df

scope,aphid_count,aphid_count,aphid_count
variable,a1,a2,a3
0,7.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [14]:
ne_columns = list(df[[('set', 'date'), ('point', 'id'), ('observation', 'complete')]].columns) + list(df[['natural_enemy_count']].columns)
ne = df.loc[:, ne_columns]
ne = ne.dropna(subset=df[[('natural_enemy_count', 'name')]].columns)
ne = ne.reset_index()
ne = ne.set_index([('set', 'date'), ('point', 'id'), ('natural_enemy_count', 'name')]).fillna(0)

## natural enemy transpose

In [15]:
ne2 = ne.unstack().reset_index()
ne2['index_base'] = ne2.pop(('index','','e1'))

In [16]:
oc = ne2.pop(('observation', 'complete','e1'))
ne2[('observation', 'complete', '')] = oc

In [17]:
ne3 = ne2.set_index('index_base').loc[df[('observation', 'complete')] > 0, ['set','point','natural_enemy_count']]
ne3.index.name = None
ne3

scope,set,point,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count
variable,date,id,count,count,count,count,count,count,count,count,count
"(natural_enemy_count, name)",Unnamed: 1_level_2,Unnamed: 2_level_2,e1,e2,e3,e4,e5,e6,e7,e8,e9
5,2017-07-14 12:31:24.194,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,2017-07-14 12:31:24.194,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,2017-07-14 12:31:24.194,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,2017-07-14 12:31:24.194,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,2017-07-14 12:31:24.194,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,2017-07-18 10:31:22.263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89,2017-07-18 10:31:22.263,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,2017-07-18 10:31:22.263,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117,2017-07-18 10:31:22.263,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131,2017-07-18 10:31:22.263,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## flatten column names

In [18]:
new_names2 = [tuple([c[0], c[2] or c[1], None]) for c in ne2.columns.values]
ne2.columns = pandas.MultiIndex.from_tuples(new_names2).droplevel(2)
ne2

Unnamed: 0_level_0,set,point,index,index,index,index,index,index,index,index,...,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,natural_enemy_count,index_base,observation
Unnamed: 0_level_1,date,id,e2,e3,e4,e5,e6,e7,e8,e9,...,e2,e3,e4,e5,e6,e7,e8,e9,Unnamed: 20_level_1,complete
0,2017-07-14 12:31:24.194,0.0,6,7,8,9,10,11,12,13,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1.0
1,2017-07-14 12:31:24.194,1.0,20,21,22,23,24,25,26,27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,1.0
2,2017-07-14 12:31:24.194,2.0,34,35,36,37,38,39,40,41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33,1.0
3,2017-07-14 12:31:24.194,3.0,48,49,50,51,52,53,54,55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47,1.0
4,2017-07-14 12:31:24.194,4.0,62,63,64,65,66,67,68,69,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61,1.0
5,2017-07-18 10:31:22.263,0.0,76,77,78,79,80,81,82,83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75,1.0
6,2017-07-18 10:31:22.263,1.0,90,91,92,93,94,95,96,97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89,1.0
7,2017-07-18 10:31:22.263,2.0,104,105,106,107,108,109,110,111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,103,1.0
8,2017-07-18 10:31:22.263,3.0,118,119,120,121,122,123,124,125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117,1.0
9,2017-07-18 10:31:22.263,4.0,132,133,134,135,136,137,138,139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131,1.0


## merge parts

In [19]:
joined_df = base_df.join(ne2[['natural_enemy_count']]).join(aphid_df)
joined_df.columns = joined_df.columns.map('_'.join)
joined_df = joined_df.reindex(columns=[
    'set_date',
    'point_id',
    'observation_id',
    'observation_complete',
    'observation_disabled',
    'observation_aphid_subtotal',
    'point_natural_enemy_subtotal',
    'natural_enemy_count_e1',
    'natural_enemy_count_e2',
    'natural_enemy_count_e3',
    'natural_enemy_count_e4',
    'natural_enemy_count_e5',
    'natural_enemy_count_e6',
    'natural_enemy_count_e7',
    'natural_enemy_count_e8',
    'natural_enemy_count_e9',
    'aphid_count_a1',
    'aphid_count_a2',
    'aphid_count_a3',
    'set_obsName',
    'set_desc',
    'set_growthStage',
    'field_client_company',
    'field_client_displayText',
    'field_client_fname',
    'field_client_lname',
    'field_client_name',
    'field_crop',
    'field_date',
    'field_desc',
    'field_image',
    'field_name',
])
joined_df.sort_index()

Unnamed: 0,set_date,point_id,observation_id,observation_complete,observation_disabled,observation_aphid_subtotal,point_natural_enemy_subtotal,natural_enemy_count_e1,natural_enemy_count_e2,natural_enemy_count_e3,...,field_client_company,field_client_displayText,field_client_fname,field_client_lname,field_client_name,field_crop,field_date,field_desc,field_image,field_name
0,2017-07-14 12:31:24.194,0.0,0.0,,,0.0,,1.0,1.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
1,2017-07-14 12:31:24.194,0.0,1.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
2,2017-07-14 12:31:24.194,0.0,2.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
3,2017-07-14 12:31:24.194,0.0,3.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
4,2017-07-14 12:31:24.194,0.0,4.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
5,2017-07-14 12:31:24.194,0.0,5.0,1.0,,,2.0,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
14,2017-07-14 12:31:24.194,1.0,0.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
15,2017-07-14 12:31:24.194,1.0,1.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
16,2017-07-14 12:31:24.194,1.0,2.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat
17,2017-07-14 12:31:24.194,1.0,3.0,,,0.0,,0.0,0.0,0.0,...,AAFC SRDC,"AAFC SRDC, Tyler Wist",Tyler,Wist,Tyler Wist,CROPS.WHEAT,2017-07-14 12:30:31.587,Next to soybeans and canola,img/wheat.png,Sef wheat


# save file

In [20]:
with pandas.ExcelWriter('data/real/2017/CAM data from iPads/out_df.xlsx') as file_writer:
    joined_df.to_excel(file_writer, sheet_name='2017 - CAM', freeze_panes=(1,0))
    file_writer.save()