# init

In [290]:
import pandas, IPython.core.display

## open file

In [291]:
src = pandas.ExcelFile('data/real/2017/CAM data from iPads/2017 CAM data from iPads.xlsx')

In [292]:
src.sheet_names

['2017 CAM data Erl',
 'schema (WIP reverse engineer)',
 '2017 CAM iPad data Tyler',
 'Combined iPad 2017 CAM data']

## select sheets

In [293]:
sheets_to_process = {sheet_name.split(' ')[-1]: src.parse(sheet_name) for sheet_name in src.sheet_names
    if any([person_name in sheet_name for person_name in ['Erl', 'Tyler']])}

# rename columns in sheets with multi-word column names

In [294]:
sheets_to_rename = [sheet for sheet in sheets_to_process.values()
                    if any([str(column).find(' ') for column in sheet.columns])]
for sheet_to_rename in sheets_to_rename:
    sheet_to_rename.rename(mapper=lambda x: str(x).split(' ')[0], axis='columns', inplace=True)

In [295]:
for sheet_to_rename in sheets_to_rename:
    sheet_to_rename.rename(mapper=lambda x: str(x).split(' ')[0], axis='columns', inplace=True)

# concatenate sheets

In [329]:
df = pandas.concat(sheets_to_process, names=['worksheet_name', 'worksheet_row'])#.reset_index(drop=True)

# cleanup

In [297]:
observations_columns = {
#     'Sheet', 
#     'Row',
#     'clients__company',
#     'clients__displayText',
#     'clients__fname',
#     'clients__lname',
#     'clients__name',
    'fields__client__company': 'company',
#     'fields__client__displayText',
    'fields__client__fname': 'client_name_first',
    'fields__client__lname': 'client_name_last',
#     'fields__client__name', 
    'fields__crop': 'crop',
#     'fields__date',
    'fields__desc': 'field_description',
#     'fields__image',
    'fields__name': 'field_name',
#     'fields__oSets__completeSets',
    'fields__oSets__date': 'observation_time', 
#     'fields__oSets__dateCompare',
    'fields__oSets__desc': 'observation_memo',
    'fields__oSets__growthStage': 'field_growth_stage_zadoks',
    'fields__oSets__oPoints__id': 'observation_point_id',
#     'fields__oSets__oPoints__location__coords__accuracy',
#     'fields__oSets__oPoints__location__coords__altitude',
#     'fields__oSets__oPoints__location__coords__altitudeAccuracy',
#     'fields__oSets__oPoints__location__coords__heading',
#     'fields__oSets__oPoints__location__coords__latitude',
#     'fields__oSets__oPoints__location__coords__longitude',
#     'fields__oSets__oPoints__location__coords__speed',
#     'fields__oSets__oPoints__location__timestamp',
#     'fields__oSets__oPoints__name',
    'fields__oSets__oPoints__observations__a1__number': 'count_aphid_1',
    'fields__oSets__oPoints__observations__a2__number': 'count_aphid_2',
    'fields__oSets__oPoints__observations__a3__number': 'count_aphid_3',
    'fields__oSets__oPoints__observations__anum': 'total_aphid',
#     'fields__oSets__oPoints__observations__complete',
#     'fields__oSets__oPoints__observations__disabled',
#     'fields__oSets__oPoints__observations__eVnum',
    'fields__oSets__oPoints__observations__enum': 'total_ne',
#     'fields__oSets__oPoints__observations__id',
    'fields__oSets__oPoints__observations__name': 'observation_name',
    'fields__oSets__oPoints__observations__|': 'count_natural_enemy_name',
    'fields__oSets__oPoints__observations__|__number': 'count_natural_enemy_count',
    'fields__oSets__obsName': 'observer_name',
#     'fields__oSets__results',
    'fields__oSets__totalA1': 'set_total_aphid_1', 
    'fields__oSets__totalA2': 'set_total_aphid_2',
    'fields__oSets__totalA3': 'set_total_aphid_3', 
#     'fields__oSets__totalA4',
#     'fields__oSets__totalSets',
#     'observers'
}

In [330]:
df2 = df[list(observations_columns)].rename(columns=observations_columns)
df2.observation_name = df2.observation_name.astype('category')

## convert_datetime

In [331]:
df2.observation_time = pandas.to_datetime(df2.observation_time, infer_datetime_format=True)

## split string to columns

Information in the column could be used as an index, to separate the data we need to unstack (natural enemy counts) from the data that's already unstacked (aphid counts).

In [332]:
df2 = df2.join(df2.pop('observation_name').str.extract(
    '(?P<observation_subject>.+) Observation.*', expand=True))

## ffill columns

In [333]:
index_columns = [
    'observation_subject',
    'observation_time',
    'observation_point_id',
#     'observation_id',
]
set_column_names = [
    'company', 'client_name_first', 'client_name_last',
    'crop', 'field_description', 'field_name',
    'field_growth_stage_zadoks',
    'set_total_aphid_1',
    'set_total_aphid_2',
    'set_total_aphid_3', 
]
point_column_names = [
    'total_aphid',
    'total_ne',
]
observation_addendum_columns = [
    'observation_memo',
    'observer_name', 
]

In [334]:
ffill_columns = index_columns + ['company', 'client_name_first', 'client_name_last',
                                 'crop', 'field_description', 'field_name',
                                 'field_growth_stage_zadoks',
                                ]
df2[ffill_columns] = df2[ffill_columns].ffill()

## set indices

In [335]:
df2 = df2.set_index(index_columns, append=True)

In [339]:
pandas.options.display.max_rows = (3+9)*5*10
df2 = df2.reset_index(level=['worksheet_name', 'worksheet_row'])

In [340]:
df2.reorder_levels(['observation_time','observation_point_id','observation_subject']).sort_index().loc[:, [column for column in df2.columns if 'aphid' in column]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count_aphid_1,count_aphid_2,count_aphid_3,total_aphid,set_total_aphid_1,set_total_aphid_2,set_total_aphid_3
observation_time,observation_point_id,observation_subject,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-07-14 12:31:24.194,0.0,Aphid,7.0,,,0.0,,,
2017-07-14 12:31:24.194,0.0,Aphid,,,,0.0,,,
2017-07-14 12:31:24.194,0.0,Aphid,,,,0.0,,,
2017-07-14 12:31:24.194,0.0,Aphid,,,,0.0,,,
2017-07-14 12:31:24.194,0.0,Aphid,,,,0.0,,,
2017-07-14 12:31:24.194,0.0,Natural Enemy,,,,,,,
2017-07-14 12:31:24.194,0.0,Natural Enemy,,,,,,,
2017-07-14 12:31:24.194,0.0,Natural Enemy,,,,,,,
2017-07-14 12:31:24.194,0.0,Natural Enemy,,,,,,,
2017-07-14 12:31:24.194,0.0,Natural Enemy,,,,,,,


## split frame into points, aphids, and natural enemies

In [306]:
count_aphid_columns = [
    'count_aphid_1',
    'count_aphid_2',
    'count_aphid_3',
]
count_natural_enemy_columns = [
    [
        'count_natural_enemy_name',
        'count_natural_enemy_count',
    ],
    [
        'count_natural_enemy_1',
        'count_natural_enemy_2',
        'count_natural_enemy_3',
        'count_natural_enemy_4',
        'count_natural_enemy_5',
        'count_natural_enemy_6',
        'count_natural_enemy_7',
        'count_natural_enemy_8',
        'count_natural_enemy_9',
    ],
]

In [341]:
df2.columns

Index(['worksheet_name', 'worksheet_row', 'company', 'client_name_first',
       'client_name_last', 'crop', 'field_description', 'field_name',
       'observation_memo', 'field_growth_stage_zadoks', 'count_aphid_1',
       'count_aphid_2', 'count_aphid_3', 'total_aphid', 'total_ne',
       'count_natural_enemy_name', 'count_natural_enemy_count',
       'observer_name', 'set_total_aphid_1', 'set_total_aphid_2',
       'set_total_aphid_3'],
      dtype='object')

In [365]:
points_df = df2.reset_index().groupby(by=('observation_time','observation_point_id')).first()[
    set_column_names + point_column_names + observation_addendum_columns + ['worksheet_name', 'worksheet_row']]

In [354]:
aphid_df = df2.loc['Aphid'][count_aphid_columns]

In [355]:
natural_enemy_df = df2.loc['Natural Enemy'].set_index('count_natural_enemy_name', append=True)

## unstack natural enemies

In [356]:
natural_enemy_df = natural_enemy_df['count_natural_enemy_count'].unstack(level='count_natural_enemy_name')

## zero fill natural enemies and aphids

In [357]:
natural_enemy_df = natural_enemy_df.fillna(0)

In [358]:
aphid_df = aphid_df.fillna(0)

## @todo label natural enemies

## @todo label aphids

## apply total aphids per subject to "natural enemies" row, because that row has natural enemy totals already

In [348]:
# all_df = all_df.reset_index()

In [359]:
sums = aphid_df.groupby(['observation_time', 'observation_point_id']).sum()

In [360]:
sums['row_type'] = 'point'
sums = sums.set_index('row_type', append=True)
sums.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count_aphid_1,count_aphid_2,count_aphid_3
observation_time,observation_point_id,row_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-07-14 12:31:24.194,0.0,point,7.0,0.0,0.0
2017-07-14 12:31:24.194,1.0,point,0.0,0.0,0.0
2017-07-14 12:31:24.194,2.0,point,0.0,0.0,0.0
2017-07-14 12:31:24.194,3.0,point,0.0,0.0,0.0
2017-07-14 12:31:24.194,4.0,point,0.0,0.0,0.0
2017-07-18 10:31:22.263,0.0,point,0.0,0.0,2.0
2017-07-18 10:31:22.263,1.0,point,0.0,0.0,0.0
2017-07-18 10:31:22.263,2.0,point,6.0,0.0,0.0
2017-07-18 10:31:22.263,3.0,point,0.0,0.0,0.0
2017-07-18 10:31:22.263,4.0,point,0.0,0.0,0.0


In [366]:
points_df['row_type'] = 'point'
points_df = points_df.set_index('row_type', append=True)#.reorder_levels(('observation_time', 'row_type', 'observation_point_id'))
points_df.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,company,client_name_first,client_name_last,crop,field_description,field_name,field_growth_stage_zadoks,set_total_aphid_1,set_total_aphid_2,set_total_aphid_3,total_aphid,total_ne,observation_memo,observer_name,worksheet_name,worksheet_row
observation_time,observation_point_id,row_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-07-14 12:31:24.194,0.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,2.0,,Tyler,Tyler,0
2017-07-14 12:31:24.194,1.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,14
2017-07-14 12:31:24.194,2.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,28
2017-07-14 12:31:24.194,3.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,42
2017-07-14 12:31:24.194,4.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,56
2017-07-18 10:31:22.263,0.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,8.0,0.0,0.0,0.0,0.0,,Tyler,Tyler,70
2017-07-18 10:31:22.263,1.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,84
2017-07-18 10:31:22.263,2.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,98
2017-07-18 10:31:22.263,3.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,112
2017-07-18 10:31:22.263,4.0,point,AAFC SRDC,Tyler,Wist,CROPS.WHEAT,Next to soybeans and canola,Sef wheat,6.0,,,,0.0,0.0,,,Tyler,126


In [367]:
all_df = points_df.sort_index().merge(sums.sort_index(), left_index=True, right_index=True).sort_index()
all_df[[column for column in all_df.columns if any([word in column for word in ['total','aphid']])] + ['worksheet_name', 'worksheet_row']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,set_total_aphid_1,set_total_aphid_2,set_total_aphid_3,total_aphid,total_ne,count_aphid_1,count_aphid_2,count_aphid_3,worksheet_name,worksheet_row
observation_time,observation_point_id,row_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-07-14 12:31:24.194,0.0,point,,,,0.0,2.0,7.0,0.0,0.0,Tyler,0
2017-07-14 12:31:24.194,1.0,point,,,,0.0,0.0,0.0,0.0,0.0,Tyler,14
2017-07-14 12:31:24.194,2.0,point,,,,0.0,0.0,0.0,0.0,0.0,Tyler,28
2017-07-14 12:31:24.194,3.0,point,,,,0.0,0.0,0.0,0.0,0.0,Tyler,42
2017-07-14 12:31:24.194,4.0,point,,,,0.0,0.0,0.0,0.0,0.0,Tyler,56
2017-07-18 10:31:22.263,0.0,point,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,Tyler,70
2017-07-18 10:31:22.263,1.0,point,,,,0.0,0.0,0.0,0.0,0.0,Tyler,84
2017-07-18 10:31:22.263,2.0,point,,,,0.0,0.0,6.0,0.0,0.0,Tyler,98
2017-07-18 10:31:22.263,3.0,point,,,,0.0,0.0,0.0,0.0,0.0,Tyler,112
2017-07-18 10:31:22.263,4.0,point,,,,0.0,0.0,0.0,0.0,0.0,Tyler,126


### join points, aphids, natural enemies

In [289]:
pandas.merge(points_df, aphid_df, left_index=True, right_index=True).merge(natural_enemy_df, left_index=True, right_index=True)

NotImplementedError: merging with more than one level overlap on a multi-index is not implemented

In [None]:
idf[['natural_enemy_name', 'natural_enemy_count']]

In [None]:
ne = idf.xs('Natural Enemy', level='observation_subject', drop_level=False)

In [None]:
ne.set_index('natural_enemy_name', append=True, inplace=True)

In [None]:
ne['natural_enemy_count'].dropna()

### fill NaN with 0, unstack enemy counts

In [None]:
ne2 = ne['natural_enemy_count'].fillna(value=0).unstack()

In [None]:
df = convert_datetime(df)

### Remove old column from "left" frame

In [None]:
natural_enemy_column_names = [
    column_name for column_name in idf.columns if 'natural_enemy_' in column_name]

In [None]:
for column_name in natural_enemy_column_names:
    print(column_name)
    del idf[column_name]

### remove duplicates created by old index

These won't be needed anymore, and won't be in the "right" frame we're about to merge.

In [None]:
index_columns = ['datetime', 'point_id', 'observation_id','observation_subject']
idf2 = idf.reset_index().drop_duplicates(subset=index_columns).set_index(index_columns)

### merge unstacked natural enemy counts into main frame

In [None]:
idf3 = pandas.merge(idf2, ne2, right_index=True, left_index=True, how='outer')

In [None]:
idf3[get_descendant_column_names(get_column_tree(idf2)['fields']['oSets']['oPoints'])+enemy_num_cols]