In [1]:
from typing import Tuple, Mapping, ValuesView

import pandas
from pandas import DataFrame, Series

from helper import get_sheets, normalize_str, alphanumeric_lower

# # Load frames

# Workbook path and the names of the two worksheets being compared.
filename: str = '../../data/2016-sweep-vs-tiller/2016 combination.xlsx'
HEAD_COUNTS: str = 'Head Counts'
SHEET2: str = 'Sheet2'
sheet_names: Tuple[str, str] = (HEAD_COUNTS, SHEET2)

# get_sheets is a project-local helper; its result is indexed by sheet name.
sheets_to_compare: Mapping[str, DataFrame] = get_sheets(filename, sheet_names)

# Direct handles on each sheet (same objects as held by the mapping).
head_counts: DataFrame = sheets_to_compare[HEAD_COUNTS]
sheet2: DataFrame = sheets_to_compare[SHEET2]

# Values view over the mapping.  Code that pairs this with sheet_names
# assumes the mapping yields its values in the same order as sheet_names.
frames: ValuesView[DataFrame] = sheets_to_compare.values()


# clean

## sheet2

In [None]:
# Normalise headers: strip surrounding whitespace, then lowercase.
sheet2.columns = sheet2.columns.str.strip().str.lower()

# Replace the bad 'date' column with the data held in 'collection_date'.
# The swap is guarded so this cell can safely be executed more than once:
# after the first pass 'collection_date' no longer exists, and an
# unconditional drop would throw away the already-converted 'date' column.
# The operations stay in-place because sheet2 is the same object that
# sheets_to_compare / frames refer to.
if 'collection_date' in sheet2.columns:
    sheet2.drop(columns='date', inplace=True)
    sheet2.rename(
        columns={'collection_date': 'date'},
        inplace=True,
    )

# Source strings are formatted day_month_year.  On a re-run the column is
# already datetime64 and pandas.to_datetime hands it back unchanged.
sheet2['date'] = pandas.to_datetime(
    sheet2['date'],
    format='%d_%m_%Y',
)

## head_counts

In [None]:
# Normalise headers: surrounding whitespace removed, everything lowercase.
head_counts.columns = (
    head_counts.columns
    .str.strip()
    .str.lower()
)

# Parse the day/month/year strings in 'date' into datetimes.
head_counts['date'] = pandas.to_datetime(head_counts['date'], format='%d/%m/%Y')

## all

In [None]:
# Give every sheet the same 'field' column name and normalised crop labels.
for frame in frames:
    frame.rename(
        columns={'field_name': 'field'},  # @todo: check if loop is necessary
        inplace=True,
    )
    # Item access instead of attribute access: attribute assignment only
    # targets a column when it already exists and is easy to get wrong.
    frame['crop'] = frame['crop'].apply(normalize_str)

# Distinct 'site' values across the sheets (first occurrence kept), with the
# originating sheet name as the outer index level, sorted by site.
# Frames are looked up by name so each one is guaranteed to be labelled with
# its own sheet name, independent of the iteration order of the mapping.
site_values: DataFrame = (
    pandas.concat(
        (
            sheets_to_compare[sheet_name][['site']]
            for sheet_name in sheet_names
        ),
        keys=sheet_names,
        names=['Sheet Name', 'index', ],
    ).drop_duplicates().sort_values('site')
)

# Preferred spelling of every site, keyed by its alphanumeric_lower form.
preferred_site_id: Series = pandas.Series(
    name='site',
    data={
        alphanumeric_lower(item): item
        for item in [
            'Alvena',
            'Clavet',
            'Indian Head',
            'Kernan',
            'Llewellyn',
            'Meadow Lake',
            'Melfort',
            'Outlook',
            'SEF',
            'Wakaw',
            'Yellow Creek',
        ]
    },
)
preferred_site_id.index.set_names(['site_index'], inplace=True)

# Append a 'site_index' level (alphanumeric_lower of 'site') to the index of
# each sheet and of site_values, matching the index name of preferred_site_id.
frames_site_values_ = tuple(frames) + (site_values,)
for frame in frames_site_values_:
    frame['site_index'] = frame['site'].apply(alphanumeric_lower)
    frame.set_index('site_index', append=True, inplace=True)

for frame in frames:
    # Preferred spellings take priority; combine_first fills the gaps from
    # the frame's own 'site' values.
    # NOTE(review): relies on pandas aligning the single-level
    # preferred_site_id index with the frame's 'site_index' level - confirm.
    frame.loc[:, 'site'] = (
        preferred_site_id.to_frame().combine_first(frame).loc[:, 'site']
    )
    # The helper level is only needed on the sheets for the lookup above;
    # site_values keeps it.
    frame.reset_index(
        level='site_index',
        drop=True,
        inplace=True,
    )
    # Reduce the field label to the run of digits that follows its leading
    # non-digit text.
    # NOTE(review): to_numeric is applied per element, so downcast acts on
    # each scalar and the column dtype is whatever pandas infers afterwards -
    # confirm whether a single vectorised call was intended.
    frame['field'] = (
        frame['field'].str.extract(
            pat=r'(?P<text>\D*)(?P<number>\d*)',
        ).loc[:, 'number'].apply(
            pandas.to_numeric, downcast='integer',
        )
    )
