# init

In [1]:
import pandas, IPython.core.display

## open file

In [2]:
# Open the Excel workbook once; individual sheets are parsed from this handle.
src = pandas.ExcelFile('data/real/2017/CAM data from iPads/2017 CAM data from iPads.xlsx')

In [3]:
# List the sheet names available in the workbook.
src.sheet_names

['2017 CAM data Erl',
 'schema (WIP reverse engineer)',
 '2017 CAM iPad data Tyler',
 'Combined iPad 2017 CAM data']

## select sheets

In [4]:
# Parse only the sheets whose name mentions 'Erl' or 'Tyler'; each parsed
# frame is keyed by the last space-separated word of its sheet name.
sheets_to_process = {
    name.rsplit(' ', 1)[-1]: src.parse(name)
    for name in src.sheet_names
    if any(person in name for person in ('Erl', 'Tyler'))
}

# rename columns in sheets with multi-word column names

In [5]:
# Select the sheets that have at least one column label containing a space.
# `str.find` must not be used as a truth test here: it returns -1 (truthy)
# when the substring is absent and 0 (falsy) when it is at position 0, so the
# membership operator is used instead.
sheets_to_rename = [sheet for sheet in sheets_to_process.values()
                    if any(' ' in str(column) for column in sheet.columns)]
# Keep only the first word of every column label. The rename is done in place
# so the frames stored in `sheets_to_process` are the ones that change.
for sheet_to_rename in sheets_to_rename:
    sheet_to_rename.rename(mapper=lambda x: str(x).split(' ')[0], axis='columns', inplace=True)

In [6]:
# Same first-word truncation of column labels as the cell above; labels that
# no longer contain a space come out unchanged.
for sheet_to_rename in sheets_to_rename:
    sheet_to_rename.rename(
        columns=lambda label: str(label).split(' ')[0],
        inplace=True,
    )

# concatenate sheets

In [7]:
# Stack the selected sheets into a single frame; reset_index(drop=True)
# discards the concatenated index and numbers the rows 0..n-1.
df = pandas.concat(sheets_to_process).reset_index(drop=True)

# cleanup

In [8]:
# Mapping from source column name to the short name used in the rest of the
# notebook. Only the keys present here are selected from `df`; the
# commented-out entries are source columns that are currently not selected.
observations_columns = {
#     'Sheet', 
#     'Row',
#     'clients__company',
#     'clients__displayText',
#     'clients__fname',
#     'clients__lname',
#     'clients__name',
    'fields__client__company': 'company',
#     'fields__client__displayText',
    'fields__client__fname': 'client_name_first',
    'fields__client__lname': 'client_name_last',
#     'fields__client__name', 
    'fields__crop': 'crop',
#     'fields__date',
    'fields__desc': 'field_description',
#     'fields__image',
    'fields__name': 'field_name',
#     'fields__oSets__completeSets',
    'fields__oSets__date': 'observation_time', 
#     'fields__oSets__dateCompare',
    'fields__oSets__desc': 'observation_memo',
    'fields__oSets__growthStage': 'field_growth_stage_zadoks',
    'fields__oSets__oPoints__id': 'observation_point_id',
#     'fields__oSets__oPoints__location__coords__accuracy',
#     'fields__oSets__oPoints__location__coords__altitude',
#     'fields__oSets__oPoints__location__coords__altitudeAccuracy',
#     'fields__oSets__oPoints__location__coords__heading',
#     'fields__oSets__oPoints__location__coords__latitude',
#     'fields__oSets__oPoints__location__coords__longitude',
#     'fields__oSets__oPoints__location__coords__speed',
#     'fields__oSets__oPoints__location__timestamp',
#     'fields__oSets__oPoints__name',
    'fields__oSets__oPoints__observations__a1__number': 'count_aphid_1',
    'fields__oSets__oPoints__observations__a2__number': 'count_aphid_2',
    'fields__oSets__oPoints__observations__a3__number': 'count_aphid_3',
#     'fields__oSets__oPoints__observations__anum': 'total_aphid',
#     'fields__oSets__oPoints__observations__complete',
#     'fields__oSets__oPoints__observations__disabled',
#     'fields__oSets__oPoints__observations__eVnum',
    'fields__oSets__oPoints__observations__enum': 'total_ne',
#     'fields__oSets__oPoints__observations__id',
    'fields__oSets__oPoints__observations__name': 'observation_name',
    'fields__oSets__oPoints__observations__|': 'count_natural_enemy_name',
    'fields__oSets__oPoints__observations__|__number': 'count_natural_enemy_count',
    'fields__oSets__obsName': 'observer_name',
#     'fields__oSets__results',
    'fields__oSets__totalA1': 'total_aphid_1', 
    'fields__oSets__totalA2': 'total_aphid_2',
    'fields__oSets__totalA3': 'total_aphid_3', 
#     'fields__oSets__totalA4',
#     'fields__oSets__totalSets',
#     'observers'
}

In [9]:
# Keep only the mapped source columns, then switch them to their short names.
df2 = df.loc[:, list(observations_columns)].rename(columns=observations_columns)

## cleanup

### convert_datetime

In [10]:
# Convert the observation timestamps to pandas datetimes.
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# the default there), so it is no longer passed. Bracket
# assignment is used so the value is always written to the column.
df2['observation_time'] = pandas.to_datetime(df2['observation_time'])

### split string to columns

The subject encoded in the `observation_name` column can be used as an index level, to separate the data we still need to unstack (natural enemy counts) from the data that is already unstacked (aphid counts).

In [11]:
# Replace `observation_name` with an `observation_subject` column that holds
# the captured 'Aphid' / 'Natural Enemy' text of each name (NaN where the
# pattern does not match). `pop` removes the column from df2 in place, so this
# cell cannot be re-run without rebuilding df2 first.
df2 = df2.join(df2.pop('observation_name').str.extract(
    '(?P<observation_subject>Aphid|Natural Enemy) Observation.*', expand=True))

In [12]:
# Display the frame after the subject column has been extracted.
df2

Unnamed: 0,company,client_name_first,client_name_last,crop,field_description,field_name,observation_time,observation_memo,field_growth_stage_zadoks,observation_point_id,...,count_aphid_2,count_aphid_3,total_ne,count_natural_enemy_name,count_natural_enemy_count,observer_name,total_aphid_1,total_aphid_2,total_aphid_3,observation_subject
0,,,,,,,2017-08-02 13:12:09.542,,7.0,0.0,...,,,,,,Tyler,,,,Aphid
1,,,,,,,NaT,,,,...,,,,,,,,,,Aphid
2,,,,,,,NaT,,,,...,,,,,,,,,,Aphid
3,,,,,,,NaT,,,,...,,,,,,,,,,Aphid
4,,,,,,,NaT,,,,...,,,,,,,,,,Aphid
5,,,,,,,NaT,,,,...,,,0.0,e1,,,,,,Natural Enemy
6,,,,,,,NaT,,,,...,,,,e2,,,,,,
7,,,,,,,NaT,,,,...,,,,e3,,,,,,
8,,,,,,,NaT,,,,...,,,,e4,,,,,,
9,,,,,,,NaT,,,,...,,,,e5,,,,,,


### ffill columns

In [13]:
# Columns that identify a row: subject, timestamp and point id
# ('observation_id' is deliberately left out for now).
index_columns = ['observation_subject', 'observation_time', 'observation_point_id']

# Columns carried on the per-set "points" rows.
set_column_names = (
    ['company', 'client_name_first', 'client_name_last']
    + ['crop', 'field_description', 'field_name']
    + ['field_growth_stage_zadoks']
    + ['total_aphid_{}'.format(number) for number in (1, 2, 3)]
    + ['total_ne']
)

# Free-text / observer columns kept alongside the set columns.
observation_addendum_columns = ['observation_memo', 'observer_name']

In [14]:
# Forward-fill the identifying columns so every row carries the subject,
# timestamp and point id of the last row that had them.
# Note: ffill_columns is the same list object as index_columns, not a copy.
ffill_columns = index_columns
df2[ffill_columns] = df2[ffill_columns].ffill()

### set indices

In [15]:
# Move the identifying columns into a MultiIndex. Not re-runnable as-is: once
# the columns are in the index, running this again raises a KeyError.
df2 = df2.set_index(index_columns)

In [16]:
# Spot-check: all rows recorded under one observation timestamp.
df2.xs('2017-08-02 11:43:20.708', level='observation_time')

Unnamed: 0_level_0,Unnamed: 1_level_0,company,client_name_first,client_name_last,crop,field_description,field_name,observation_memo,field_growth_stage_zadoks,count_aphid_1,count_aphid_2,count_aphid_3,total_ne,count_natural_enemy_name,count_natural_enemy_count,observer_name,total_aphid_1,total_aphid_2,total_aphid_3
observation_subject,observation_point_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Aphid,0.0,,,,,,,,8.0,1.0,,,,,,Mikki,25.0,0.0,0.0
Aphid,0.0,,,,,,,,,,,,,,,,,,
Aphid,0.0,,,,,,,,,,,,,,,,,,
Aphid,0.0,,,,,,,,,,,,,,,,,,
Aphid,0.0,,,,,,,,,17.0,,,,,,,,,
Natural Enemy,0.0,,,,,,,,,,,,0.0,e1,,,,,
Natural Enemy,0.0,,,,,,,,,,,,,e2,,,,,
Natural Enemy,0.0,,,,,,,,,,,,,e3,,,,,
Natural Enemy,0.0,,,,,,,,,,,,,e4,,,,,
Natural Enemy,0.0,,,,,,,,,,,,,e5,,,,,


### split frame into points, aphids, and natural enemies

In [17]:
# Per-observation aphid count columns: count_aphid_1 .. count_aphid_3.
count_aphid_columns = ['count_aphid_{}'.format(number) for number in range(1, 4)]

# Natural-enemy columns in two layouts: first the stacked (name, count) pair,
# then the nine numbered columns count_natural_enemy_1 .. count_natural_enemy_9.
count_natural_enemy_columns = [
    ['count_natural_enemy_name', 'count_natural_enemy_count'],
    ['count_natural_enemy_{}'.format(number) for number in range(1, 10)],
]

In [18]:
# One row per observation_time, holding the first non-null value of each
# set-level column found under that timestamp.
# NOTE(review): the grouping key is observation_time only, so a single
# observation_point_id survives per timestamp -- confirm this is intended.
points_df = (
    df2.reset_index()
    .groupby('observation_time')
    .first()
    .loc[:, [*set_column_names, *observation_addendum_columns, 'observation_point_id']]
)

In [19]:
# Add the point id as a second index level next to observation_time.
points_df = points_df.set_index('observation_point_id', append=True)

In [20]:
# Aphid rows: keep just the three aphid count columns.
aphid_df = df2.loc['Aphid'].loc[:, count_aphid_columns]
# Natural-enemy rows: add the enemy name as an extra index level so the
# counts can be unstacked by name afterwards.
natural_enemy_df = df2.loc['Natural Enemy'].set_index(
    keys='count_natural_enemy_name', append=True)

### unstack natural enemies

In [21]:
# Pivot the enemy-name index level into columns: one column of counts per name.
natural_enemy_df = (
    natural_enemy_df
    .loc[:, 'count_natural_enemy_count']
    .unstack(level='count_natural_enemy_name')
)

### zero fill natural enemies and aphids

In [22]:
# Treat missing natural-enemy counts as zero.
natural_enemy_df = natural_enemy_df.fillna(value=0)

In [23]:
# Treat missing aphid counts as zero.
aphid_df = aphid_df.fillna(value=0)

### @todo label natural enemies

### @todo label aphids

### @todo apply total aphids per subject to "natural enemies" row, because that row has natural enemy totals already
@todo name totals row

In [24]:
# Stack the per-set rows ('key') on top of the individual aphid count rows
# ('discrete'); the new outer index level is named row_type.
all_df = pandas.concat(
    objs=[points_df, aphid_df],
    keys=['key', 'discrete'],
    names=['row_type'],
).sort_index()

In [25]:
# Total the aphid counts for every (timestamp, point id) pair.
sums = aphid_df.groupby(level=['observation_time', 'observation_point_id']).sum()

In [26]:
# Prefix the sums with a row_type='key' level so their index has the same
# levels as the 'key' rows of all_df.
sums = pandas.concat(objs=[sums], keys=['key'], names=['row_type'])

In [27]:
# Write the per-point aphid sums into the count columns of the 'key' rows.
# The assigned frame is matched to the target rows by index label, so only
# 'key' rows whose (timestamp, point id) also appears in `sums` get a value.
all_df.loc['key', count_aphid_columns] = sums

In [28]:
# Spot-check the per-set row for one observation timestamp.
points_df.sort_index().loc['2017-08-18 13:33:03.584']

Unnamed: 0_level_0,Unnamed: 1_level_0,company,client_name_first,client_name_last,crop,field_description,field_name,field_growth_stage_zadoks,total_aphid_1,total_aphid_2,total_aphid_3,total_ne,observation_memo,observer_name
observation_time,observation_point_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-08-18 13:33:03.584,0.0,,,,,,,9.0,68.0,4.0,0.0,0.0,,Stean


In [29]:
# Spot-check the individual aphid counts for the same timestamp.
aphid_df.sort_index().loc['2017-08-18 13:33:03.584']

Unnamed: 0_level_0,Unnamed: 1_level_0,count_aphid_1,count_aphid_2,count_aphid_3
observation_time,observation_point_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-08-18 13:33:03.584,0.0,0.0,0.0,0.0
2017-08-18 13:33:03.584,0.0,0.0,27.0,0.0
2017-08-18 13:33:03.584,0.0,0.0,2.0,0.0
2017-08-18 13:33:03.584,0.0,0.0,13.0,0.0
2017-08-18 13:33:03.584,0.0,0.0,21.0,0.0
2017-08-18 13:33:03.584,1.0,0.0,0.0,0.0
2017-08-18 13:33:03.584,1.0,0.0,0.0,0.0
2017-08-18 13:33:03.584,1.0,0.0,0.0,0.0
2017-08-18 13:33:03.584,1.0,0.0,0.0,0.0
2017-08-18 13:33:03.584,1.0,0.0,0.0,0.0


In [33]:
# Spot-check the combined frame for the same timestamp.
all_df.sort_index().xs('2017-08-18 13:33:03.584', level='observation_time')

Unnamed: 0_level_0,Unnamed: 1_level_0,client_name_first,client_name_last,company,count_aphid_1,count_aphid_2,count_aphid_3,crop,field_description,field_growth_stage_zadoks,field_name,observation_memo,observer_name,total_aphid_1,total_aphid_2,total_aphid_3,total_ne
row_type,observation_point_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
discrete,0.0,,,,0.0,0.0,0.0,,,,,,,,,,
discrete,0.0,,,,0.0,27.0,0.0,,,,,,,,,,
discrete,0.0,,,,0.0,2.0,0.0,,,,,,,,,,
discrete,0.0,,,,0.0,13.0,0.0,,,,,,,,,,
discrete,0.0,,,,0.0,21.0,0.0,,,,,,,,,,
discrete,1.0,,,,0.0,0.0,0.0,,,,,,,,,,
discrete,1.0,,,,0.0,0.0,0.0,,,,,,,,,,
discrete,1.0,,,,0.0,0.0,0.0,,,,,,,,,,
discrete,1.0,,,,0.0,0.0,0.0,,,,,,,,,,
discrete,1.0,,,,0.0,0.0,0.0,,,,,,,,,,


In [34]:
# Put timestamp and point id ahead of row_type and sort, so the 'discrete'
# rows of a point are listed next to that point's 'key' row.
all_df = all_df.reorder_levels(
    order=['observation_time', 'observation_point_id', 'row_type'],
).sort_index()
# Spot-check one observation timestamp in the reordered frame.
all_df.loc['2017-08-18 13:33:03.584']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,client_name_first,client_name_last,company,count_aphid_1,count_aphid_2,count_aphid_3,crop,field_description,field_growth_stage_zadoks,field_name,observation_memo,observer_name,total_aphid_1,total_aphid_2,total_aphid_3,total_ne
observation_time,observation_point_id,row_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-08-18 13:33:03.584,0.0,discrete,,,,0.0,0.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,0.0,discrete,,,,0.0,27.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,0.0,discrete,,,,0.0,2.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,0.0,discrete,,,,0.0,13.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,0.0,discrete,,,,0.0,21.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,0.0,key,,,,0.0,63.0,0.0,,,9.0,,,Stean,68.0,4.0,0.0,0.0
2017-08-18 13:33:03.584,1.0,discrete,,,,0.0,0.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,1.0,discrete,,,,0.0,0.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,1.0,discrete,,,,0.0,0.0,0.0,,,,,,,,,,
2017-08-18 13:33:03.584,1.0,discrete,,,,0.0,0.0,0.0,,,,,,,,,,


In [None]:
# Attach the unstacked natural-enemy counts as row_type='key' rows, keeping
# rows that exist on either side (outer join on the index).
# NOTE(review): the frame built here has row_type as its first index level,
# while all_df may already have been reordered to put it last -- confirm the
# index levels line up as expected before relying on the result.
all_df = all_df.merge(
    right=pandas.concat(objs=[natural_enemy_df], keys=['key'], names=['row_type']),
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# Show up to 55 columns, then display all_df with row_type as the last level.
pandas.set_option('display.max_columns', 55)
all_df.reorder_levels(order=['observation_time', 'observation_point_id', 'row_type'])

### join points, aphids, natural enemies

In [None]:
# Index-on-index joins (default join type) of points, aphid counts and
# natural-enemy counts; the result is only displayed, not stored.
(
    points_df
    .merge(aphid_df, left_index=True, right_index=True)
    .merge(natural_enemy_df, left_index=True, right_index=True)
)

In [None]:
# Show the stacked natural-enemy name/count columns.
# NOTE(review): in the visible notebook `idf` is only assigned in a later
# cell, so this cell fails on a fresh top-to-bottom run.
idf[['natural_enemy_name', 'natural_enemy_count']]

In [None]:
# Select the natural-enemy rows; drop_level=False keeps observation_subject
# as an index level in the result.
ne = idf.xs('Natural Enemy', level='observation_subject', drop_level=False)

In [None]:
# Add the enemy name as an extra index level.
ne = ne.set_index(keys='natural_enemy_name', append=True)

In [None]:
# Show only the natural-enemy counts that were actually recorded (non-null).
ne['natural_enemy_count'].dropna()

### fill NaN with 0, unstack enemy counts

In [None]:
# Zero-fill the counts, then pivot the innermost index level into columns.
ne2 = ne.loc[:, 'natural_enemy_count'].fillna(0).unstack(level=-1)

In [None]:
# NOTE(review): `convert_datetime` is not defined in the visible notebook;
# confirm where it comes from before running this cell.
df = convert_datetime(df)

### Remove old column from "left" frame

In [None]:
# Names of all idf columns that contain 'natural_enemy_'.
natural_enemy_column_names = list(
    filter(lambda label: 'natural_enemy_' in label, idf.columns))

In [None]:
# Print each natural-enemy column name and remove that column from idf in place.
for column_name in natural_enemy_column_names:
    print(column_name)
    idf.pop(column_name)

### remove duplicates created by old index

These won't be needed anymore, and won't be in the "right" frame we're about to merge.

In [None]:
# Keep the first row for every combination of the identifying columns and
# use those columns as the index.
# Note: this rebinds `index_columns`, which an earlier cell defines with
# different entries.
index_columns = ['datetime', 'point_id', 'observation_id', 'observation_subject']
idf2 = (
    idf.reset_index()
    .drop_duplicates(subset=index_columns)
    .set_index(keys=index_columns)
)

### merge unstacked natural enemy counts into main frame

In [None]:
# Outer join of the de-duplicated frame with the unstacked enemy counts,
# matched on the index of both sides.
idf3 = idf2.merge(right=ne2, how='outer', left_index=True, right_index=True)

In [None]:
# NOTE(review): `get_descendant_column_names`, `get_column_tree` and
# `enemy_num_cols` are not defined in the visible notebook -- confirm their
# source before running this cell.
idf3[get_descendant_column_names(get_column_tree(idf2)['fields']['oSets']['oPoints'])+enemy_num_cols]

# observations

In [None]:
# NOTE(review): `get_child_column_names`, `get_column_tree` and
# `enemy_num_cols` are not defined in the visible notebook -- confirm their
# source before running this cell.
sets_columns = get_child_column_names(get_column_tree(idf2)['fields']['oSets']['oPoints']['observations'])
idf3[sets_columns + enemy_num_cols]#.dropna(how='all')

In [None]:
# Work on an independent copy so idf3 stays untouched.
idf4 = idf3.copy(deep=True)

In [None]:
# Display the copied frame.
idf4#.sort_index()

In [None]:
# Make observation_subject the outermost index level.
idf5 = idf4.reorder_levels(
    order=['observation_subject', 'datetime', 'point_id', 'observation_id'])

In [None]:
# Forward-fill every column except the aphid / natural-enemy count columns.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling and
# matches the forward-fill calls used elsewhere in this notebook.
columns = list(set(idf5.columns) - set(aphid_num_cols) - set(enemy_num_cols))
idf5[columns] = idf5[columns].ffill()

In [None]:
# Preview the aphid count columns of the 'Aphid' rows with NaN shown as 0
# (nothing is stored).
idf5.loc['Aphid',aphid_num_cols].fillna(0)

In [None]:
# Zero-fill the aphid count columns using the 'Aphid' rows, then display.
# NOTE(review): `.loc['Aphid', ...]` selects on the outer index level and
# returns rows without that level, while the assignment target still has it;
# confirm the values line up with the intended rows after assignment.
idf5[aphid_num_cols] = idf5.loc['Aphid',aphid_num_cols].fillna(0)
idf5

In [None]:
# Remove the observation_id level from the index.
idf5.index = idf5.index.droplevel(level='observation_id')

In [None]:
# Display the frame after dropping the observation_id level.
idf5

In [None]:
# Rename the identifying / natural-enemy columns and forward-fill the three
# identifying ones.
# NOTE(review): the mapping keys are the raw `fields__...` names, but df2 as
# built earlier in the visible notebook already uses short names
# ('observation_time', ...). If that is the df2 in scope, the rename matches
# nothing and idf['datetime'] below raises KeyError -- confirm which df2 this
# cell expects. Cells above that use `idf` also depend on this cell having
# been run first.
idxcols = {'fields__oSets__date':                               'datetime',
           'fields__oSets__oPoints__id':                        'point_id',
           'fields__oSets__oPoints__observations__name':        'observations_name',
           'fields__oSets__oPoints__observations__|':           'natural_enemy_name',
           'fields__oSets__oPoints__observations__|__number':   'natural_enemy_count',
           }
idf = df2.rename(columns=idxcols)
for column in ['datetime', 'point_id', 'observations_name']:
    idf[column] = idf[column].ffill()