# Process Data from 2016 into a consistent format.

> This notebook brings the 2016 data into alignment with the desired format with respect to field name, type, and grouping.

In [None]:
# Imports ----
import re
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', None)
import pickle

In [None]:
from g2fd.internal import *

In [None]:
#| default_exp internal

In [None]:
# 2016 planting season: locations of the raw G2F tables
year_string = '2016'

meta_path = './data/raw/G2F_Planting_Season_2016_v2/z._2016_supplemental_info/g2f_2016_field_metadata.csv'
phno_path = './data/raw/G2F_Planting_Season_2016_v2/a._2016_hybrid_phenotypic_data/g2f_2016_hybrid_data_clean.csv'
# geno_path = None,
wthr_path = './data/raw/G2F_Planting_Season_2016_v2/b._2016_weather_data/g2f_2016_weather_data.csv'
soil_path = './data/raw/G2F_Planting_Season_2016_v2/c._2016_soil_data/g2f_2016_soil_data_clean.csv'
mgmt_path = './data/raw/G2F_Planting_Season_2016_v2/z._2016_supplemental_info/g2f_2016_agronomic_information.csv'

# Every raw table is read with the same options (ISO-8859-1 text, low_memory
# disabled), in the order meta, phno, wthr, soil, mgmt.
meta, phno, wthr, soil, mgmt = (
    pd.read_csv(csv_path, encoding = "ISO-8859-1", low_memory=False)
    for csv_path in (meta_path, phno_path, wthr_path, soil_path, mgmt_path)
)

In [None]:
# One column-renaming dict per input table, built by the shared helper.
(meta_name_dict,
 phno_name_dict,
 soil_name_dict,
 wthr_name_dict,
 mgmt_name_dict) = (
    mk_name_dict(name = table_name)
    for table_name in ('meta', 'phno', 'soil', 'wthr', 'mgmt')
)

# Rename
**Naming rules:**
- One dict for each input df
- Comment out anything that shouldn't be changed
- Upper_Upper_Unit_\$unit
- Upper_$number
- No special characters


In [None]:
# For each table, list the columns its rename dict does not recognize.
# The tuple is the cell's last expression, so it is displayed.
tuple(
    find_unrecognized_columns(df = table, dct = name_dict)
    for table, name_dict in (
        (meta, meta_name_dict),
        (phno, phno_name_dict),
        (soil, soil_name_dict),
        (wthr, wthr_name_dict),
        (mgmt, mgmt_name_dict),
    )
)

([], [], [], [], [])

In [None]:
# prlst2dct([])

# NOTE(review): the lists below look like per-table column names noted while
# building the rename dicts -- TODO confirm and move them into a markdown cell.
# soil
# Sample_Type

# wthr
# Drop_Record_Index
# Drop_Record_Index2

# Bare last expression: displays the raw field-metadata table for inspection.
meta

Unnamed: 0,Experiment_Code,Treatment,City,Farm,Field,Trial_ID (Assigned by collaborator for internal reference),"Soil_Taxonomic_ID and horizon description, if known","Weather_Station_Serial_Number (Last four digits, e.g. m2700s#####)",Weather_Station_Latitude (in decimal numbers NOT DMS),Weather_Station_Longitude (in decimal numbers NOT DMS),Date_weather_station_placed,Date_weather_station_removed,Previous_Crop,Pre-plant_tillage_method(s),In-season_tillage_method(s),Plot_length (center-alley to center-alley in feet),Alley_length (in inches),Row_spacing (in inches),Type_of_planter (fluted cone; belt cone; air planter),Number_kernels_planted_per_plot (>200 seed/pack for cone planters),System_Determining_Moisture,Pounds_Needed_Soil_Moisture,Latitude_of_Field_Corner_#1 (lower left),Longitude_of_Field_Corner_#1 (lower left),Latitude_of_Field_Corner_#2 (lower right),Longitude_of_Field_Corner_#2 (lower right),Latitude_of_Field_Corner_#3 (upper right),Longitude_of_Field_Corner_#3 (upper right),Latitude_of_Field_Corner_#4 (upper left),Longitude_of_Field_Corner_#4 (upper left),Cardinal_Heading_Pass_1,Local_Check_#1_Pedigree,Local_Check_#1_Source,Local_Check_#2_Pedigree,Local_Check_#2_Source,Local_Check_#3_Pedigree,Local_Check_#3_Source,Local_Check_#4_Pedigree,Local_Check_#4_Source,Local_Check_#5_Pedigree,Local_Check_#5_Source,Issue/comment_#1,Issue/comment_#2,Issue/comment_#3,Issue/comment_#4,Issue/comment_#5,Issue/comment_#6,Issue/comment_#7,Issue/comment_#8,Issue/comment_#9,Issue/comment_#10
0,ARH1,,Mariana,,,Mariana,,8658,34.728333,-90.760278,4/22/16,9/1/16,corn,"Cultivate, hip and row",none,25.0,60.0,38.0,fluted cone,98.0,combine,,34.729884,-90.759742,34.729894,-90.76094,34.729151,-90.759824,34.729151,-90.760916,,Pioneer1319YHR,UA extension agent,Pioneer1637YHR,UA extension agent,Pioneer2160YHR,UA extension agent,DKC66-87,UA extension agent,DKC67-72,UA extension agent,Large pigweed in field at harvest August 30 an...,,,,,,,,,
1,ARH2,Standard,Jonesboro,Arkansas State University,PSSC1,ARH2,,10794,35.838876,-90.664864,5/2/16,11/11/16,soybeans,"cultivate, hip, row",none,14.5,42.0,30.0,fluted cone,44.0,hand held,0.2,35.838816,-90.665006,35.838818,-90.66559,35.838411,-90.66559,35.838411,-90.66502,,Pioneer1319YHR,UA extension agent,Pioneer1637YHR,UA extension agent,Pioneer2160YHR,UA extension agent,DKC66-87,UA extension agent,DKC67-72,UA extension agent,goose feeding has impacted the stand of most p...,flash flooding caused stem lodging across th...,2 passes with hipper roller (4/18/16)\n,2 passes with the field cultivator (4/15/16),4 passes with disk (4/15/16),,,,,
2,DEH1,,Georgetown,Davis Farms,,DEH1,,9079,38.648453,-75.45161,4/28/16,9/14/16,Soybean,,"Disked, chisel plow & final disking",17.4,28.0,30.0,air planter,75.0,Almaco seed specter II,3.5,38.64865,-75.45159,38.64878,-75.45188,38.64781,-75.45277,38.64757,-75.428,243.0,DKC54-40RIB GENVT2P,DKC54-40RIB GENVT2P,DKC62-08RIB GENSS,DKC62-08RIB GENSS,DKC63-87RIB GENVT2P,DKC63-87RIB GENVT2P,DKC64-69RIB GENVT3P,DKC64-69RIB GENVT3P,DKC65-19RIB GENVT3P,DKC65-19RIB GENVT3P,On 6-14-2016 the battery for the weather stati...,On 8-26-2016 the battery for the weather stati...,,,,,,,,
3,GAH1,Standard,Tifton,Bellflower,11,,Tifton loamy sand/Dothan loamy sand,8427,31.50743,-83.558661,4/12/16,8/24/16,peanut,conventional,plow,20.0,72.0,36.0,fluted cone,72.0,Almaco,2,31.50822,-83.559182,31.507852,-83.559413,31.507663,-83.557656,31.507935,-83.55794,121.0,DKC69-72,DeKalb,DKC69-29,DeKalb,P2023R,Pioneer,P1498R,Pioneer,P31G65,Pioneer,Field is not rectangular; it is curved to cont...,,,,,,,,,
4,GAH2,Standard,"Watkinsville, GA",Iron Horse Plant Sciences Farm,A Field / Between Creeks Field,,Augusta A / Cartecey A,ms2700s08631,33.71722,-83.31133,5/25/16,10/27/16,Soybean,Conventional disc tillage,none,15.0,48.0,30.0,Fluted cone,52.0,John Deere Moisture Chek-Plus SW08120,<1,33.71736,-83.31141,33.71722,-83.31133,33.71722,-83.31068,33.71738,-83.31069,90.0,DKC69-72 (DeKalb),Joseph Knoll,DKC69-29 (DeKalb),Joseph Knoll,P2023R (Pioneer),Joseph Knoll,P1498R (Pioneer),Joseph Knoll,P31G65 (Pioneer),Joseph Knoll,07-07-2016 Field had major weed infestation. W...,05-25-2016 Planter jammed and failed to plant ...,06-01-2016 Runoff gulley washed away fertilize...,Stalk lodging and root lodging were not taken ...,Test weight was not taken due to a miscommunic...,,,,,
5,IAH1,Standard,Crawfordsville,SERF,Field 14,CRW14,,9080,41.198693,-91.486205,4/22/16,10/1/16,soybean,field cultivator,none,20.0,30.0,30.0,air planter,83.0,Almaco,3.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,IAH2,Standard,Carroll,Thielen,Field 6,CRL06,,9085,42.06591,-94.727448,4/25/16,10/11/16,soybean,field cultivator,none,20.0,30.0,30.0,air planter,83.0,Almaco,3.5,,,,,,,,,,,,,,,,,,,,weather station rain bucket plugged when removed,wind storm causing root lodging and greensnap ...,,,,,,,,
7,IAH3,Standard,Keystone,Polhman,Field 2,KYS02,,9082,41.987383,-92.260156,4/23/16,10/6/16,soybean,none,none,20.0,30.0,30.0,air planter,83.0,Almaco,3.5,,,,,,,,,,,,,,,,,,,,field is continuous no-till,,,,,,,,,
8,IAH4,Standard,Ames,Kitchen,KTC01,KTC01,,11859,41.997502,-93.699899,4/26/16,10/17/16,soybean,field cultivator,none,20.0,30.0,30.0,air planter,83.0,Almaco,3.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,ILH1,Standard,"Champaign, IL",Maxwell Farm,MF-500,,Flanagan Silt Loam (154A),8653,40.061188,-88.234042,5/21/16,10/10/16,Soybeans,Chisel,,20.8,40.0,30.0,air planter,33.0,Harvest Master,,40.061188,-88.233272,40.061188,-88.234491,40.060261,-88.234491,40.060261,-88.233272,,P1417AMX,,P1197AMXT,,P0339AMT,,P1311AMXT,,DKC62-77,,,,,,,,,,,


In [None]:
# Apply each table's rename dict, then add a constant True column named after
# the source table; these indicator columns help with debugging the merges.
meta = meta.rename(columns=meta_name_dict).assign(meta=True)
phno = phno.rename(columns=phno_name_dict).assign(phno=True)
soil = soil.rename(columns=soil_name_dict).assign(soil=True)
wthr = wthr.rename(columns=wthr_name_dict).assign(wthr=True)
mgmt = mgmt.rename(columns=mgmt_name_dict).assign(mgmt=True)

In [None]:
# (rows, columns) of each table after renaming, in the order meta, phno, soil, wthr, mgmt
[table.shape for table in (meta, phno, soil, wthr, mgmt)]

[(29, 52), (16377, 39), (37, 34), (250480, 27), (141, 9)]

# Sanitize ID columns as needed


In [None]:
# soil = sanitize_Experiment_Codes(
#     df = soil, 
#     simple_renames = {
#         'W1H1': 'WIH1', 
#         'W1H2': 'WIH2', 
#         'W1H3': 'WIH3'
#     }, 
#     split_renames = {
#         'NEH2_NEH3': ['NEH2', 'NEH3']
#     })

# wthr = sanitize_Experiment_Codes(
#     df = wthr, 
#     simple_renames = {
#     }, 
#     split_renames = {
#         'NEH2_NEH3': ['NEH2', 'NEH3'],
#         'NYH3_NYS1': ['NYS1', 'NYH3'],
#         'TXH1_TXH3': ['TXH1', 'TXH3']
#     })

In [None]:
# confirm everything's okay
print(
  'meta', find_unrecognized_experiments(meta.Experiment_Code, return_all_exps=False), 
'\nphno', find_unrecognized_experiments(phno.Experiment_Code, return_all_exps=False),
'\nsoil', find_unrecognized_experiments(soil.Experiment_Code, return_all_exps=False),
'\nwthr', find_unrecognized_experiments(wthr.Experiment_Code, return_all_exps=False),
'\nmgmt', find_unrecognized_experiments(mgmt.Experiment_Code, return_all_exps=False),
'\nall ', find_unrecognized_experiments([], return_all_exps=True)
)  

meta ['NEH4'] 
phno ['NEH4'] 
soil ['AZH1', 'ILH2', 'NYH4'] 
wthr [] 
mgmt [] 
all  ['ARH1', 'ARH2', 'COH1', 'DEH1', 'GAH1', 'GAH2', 'GEH1', 'GEH2', 'IAH1', 'IAH2', 'IAH2 ', 'IAH3', 'IAH3 ', 'IAH4', 'IAH4 ', 'ILH1', 'INH1', 'KSH1', 'KSH2', 'KSH3', 'MIH1', 'MNH1', 'MOH1', 'MOH1 ', 'MOH1-Rep1', 'MOH1-Rep2', 'NCH1', 'NEH1', 'NEH2', 'NEH3', 'NYH1', 'NYH1', 'NYH2', 'NYH3', 'NYS1', 'OHH1', 'ONH1', 'ONH2', 'SCH1', 'TXH1', 'TXH1-Dry', 'TXH1-Early', 'TXH1-Late', 'TXH2', 'TXH3', 'TXH4', 'W1H1', 'W1H2', 'WIH1', 'WIH2', 'WIH3']


In [None]:
# # Find minimum cols needed to index all rows
# df = phno
# id_cols = ['Year', 'Experiment_Code', 'Range', 'Pass', 'Plot',]
# candidate_cols = ['State', 'City',
#                  'Experiment', 'Source', 'Pedigree', 'Family', 'Tester', 'Replicate',
#                   'Block',  'Plot_ID']
# target = df.shape[0]

# output = pd.DataFrame(zip(
#     candidate_cols,
#     [df.loc[:, id_cols+[e]].drop_duplicates().shape[0] for e in candidate_cols]
#    ), columns=['Additional_ID', 'Uniq_Vals'])

# output.assign(At_Target=lambda x:x.Uniq_Vals == target)


# Rearrange columns

In [None]:
# separate static and dynamic values
sval = phno.merge(soil, how = 'outer')
sval = sval.merge(meta, how = 'outer') # This introduces 3 sites that have no data
# sval.shape # used to confirm nrow = #20574 + 3

# these tables are different enough that we'll keep them separate
# mgmt
# unfortunately we need multiples because at least one field treats different passes differently
mgmt = phno.loc[:, ['Year', 'Experiment_Code', 'Range', 'Pass', 'Plot', 'phno']
               ].drop_duplicates().merge(mgmt, how = 'outer')
# confirm there are no rows in mgmt that are not in phno
# After the outer merge the indicator columns hold True where a row came from
# that table and NaN where it did not. Test for missingness explicitly:
# `~mgmt.phno` raises a TypeError as soon as the column contains a NaN, i.e. in
# exactly the situation this check is meant to report.
temp = mgmt.loc[(mgmt.phno.isna() & mgmt.mgmt.notna()), :]
if 0 != temp.shape[0]:
    # Orphaned management rows: show them and leave mgmt untouched for inspection.
    print(temp)
else:
    # Keep only rows that carry management data; the phno indicator is no longer needed.
    mgmt = mgmt.loc[mgmt.mgmt.notna(), :].drop(columns = 'phno')


# wthr
# There's only ever one weather station so we have to worry about imputation but not duplicates

In [None]:
# Set each id col to a string
for i in ['Year', 'Experiment_Code', 'Range', 'Pass', 'Plot']:
    sval[i] = sval[i].astype('string')
    mgmt[i]  =  mgmt[i].astype('string')
    
    if i not in ['Range', 'Pass', 'Plot']:
        wthr[i]  =  wthr[i].astype('string')

# Sanitize Non-ID columns



## Sanitization functions

The pattern to use is:
 1. Alter the dataframe
 1. Test the dataframe against expectations
 
The main tasks that need to be completed are:
 1. Identify values that can't be converted to the expected data type. The "find_unconvertable_" family of functions should be used. 
     1. `find_unconvertable_datetimes`
     
 1. For simple renaming (e.g. misspellings) or splitting non-tidy data into two rows ("entry1-entry2" -> "entry1", "entry2") use `sanitize_col` 
 1. Move values that are ambiguous but pertain to data imputation to "Imputation_Notes" using `relocate_to_Imputation_Notes`
 1. If new columns need to be added (e.g. mgmt.Ingredient for parsed components of Product (e.g. elements) ) this should be accomplished with `safe_create_col`.
 1. Any one off changes should be accomplished manually. 
 1. Confirm columns match the expected types with `check_df_dtype_expectations`, and report mismatches. 


These steps should be completed for each dataframe in turn to minimize the cognitive load of the reader. 

## Sanitization: Column data type expectations
Note: to handle missing values some columns that would otherwise be ints are floats

In [None]:
# Expected column dtypes for each output table, built by the shared helper.
sval_col_dtypes, wthr_col_dtypes, mgmt_col_dtypes = (
    mk_dtype_dict(name = table_name)
    for table_name in ('sval', 'wthr', 'mgmt')
)

# Sanitization: Alter entries

## Static values (within season)

### Datetime containing columns

In [None]:
# # convert the date cols into datetime. Lean on pd.to_datetime() to infer the format, assume that each site uses the same format.

# for e in ['Planted_Unit_Datetime', 
#     'Harvested_Unit_Datetime', 
#     'Anthesis_Unit_Datetime', 
#     'Silking_Unit_Datetime', 
#     'Recieved_Date_Unit_Datetime', 
#     'Processed_Date_Unit_Datetime', 
#     'Weather_Station_Placed_Unit_Datetime', 
#     'Weather_Station_Removed_Unit_Datetime'
#     ]:
# # find_unconvertable_datetimes(df_col=sval[e], pattern='%Y-%m-%d %H:%M', index=False)

#     sval['Datetime_Temp'] = pd.to_datetime(np.nan)

#     for code in list(sval.Experiment_Code.drop_duplicates()):
#     # code = list(sval.Experiment_Code.drop_duplicates())[0]
#         sval.loc[sval.Experiment_Code == code, 'Datetime_Temp'
#                  ] = pd.to_datetime(sval.loc[sval.Experiment_Code == code, e])

#     sval.loc[:, e] = sval.loc[:, 'Datetime_Temp'] 

# sval = sval.drop(columns = 'Datetime_Temp')

In [None]:
# # -> floats

# # [find_unconvertable_numerics(df_col = sval[e], index = False) for e in [
# #     'Alley_Length_Unit_Inches',
# # 'Row_Spacing_Unit_Inches',
# # 'Pounds_Needed_Soil_Moisture'
# # ]]

# Replace the range entry '3 to 4' with the single value '3.5'.
# NOTE(review): the conversion cell that follows still reports '<1', 'Unknown',
# '~5 lbs' and '2.5 lbs.' as unconvertable for this column -- TODO decide how
# those entries should be handled.
moisture_renames = {'3 to 4':'3.5'}
sval = sanitize_col(
    df = sval,
    col = 'Pounds_Needed_Soil_Moisture',
    simple_renames = moisture_renames,
    split_renames = {})

In [None]:
# convert types
# A column is cast to float only when every entry is convertible; otherwise the
# column name and its offending values are printed and the column is left as is.
for e in ['Alley_Length_Unit_Inches', 'Row_Spacing_Unit_Inches', 'Pounds_Needed_Soil_Moisture',
         'Anthesis_Unit_Days', 'Silking_Unit_Days', 'Kernels_Per_Plot']:
    err_list = find_unconvertable_numerics(df_col = sval[e], index = False)
    if err_list == []:
        sval[e] = sval[e].astype('float')
    else:
        print(e)
        print(err_list)

Pounds_Needed_Soil_Moisture
['<1', 'Unknown', '~5 lbs', '2.5 lbs.']


In [None]:
# to bool
# Normalize both spellings of "yes" to the string 'True'.
sval = sanitize_col(
    df = sval,
    col = 'Discarded',
    simple_renames = dict.fromkeys(['Yes', 'yes'], 'True'),
    split_renames = {})

# set missing to false
not_recorded = sval.Discarded.isna()
sval.loc[not_recorded, 'Discarded'] = 'False'

# Translate the two string flags into real booleans; any other entry has no
# key in the mapping and therefore becomes NaN.
flag_to_bool = {'True': True, 'False': False}
sval.Discarded = sval.Discarded.map(flag_to_bool)

### Simple Columns

In [None]:
# to float
# sval.Pounds_Needed_Soil_Moisture.astype(float)

In [None]:
# to bool
# The indicator columns hold True for rows contributed by that table and NaN
# for rows introduced by the outer merges. A plain astype('bool') would turn
# every NaN into True (bool(nan) is True) and erase that distinction, so
# compare against True instead: NaN becomes False.
for indicator in ['phno', 'soil', 'meta']:
    sval[indicator] = sval[indicator].eq(True)

# to string
sval = cols_astype_string(
    df = sval, 
    col_list = [key for key in sval_col_dtypes.keys() if sval_col_dtypes[key] == 'string'])

# Every row of this notebook belongs to the same season.
sval.Year = year_string
sval.Year = sval.Year.astype('string')

### Check Success

In [None]:
# Compare sval's column dtypes against the expectations and, unless every
# column passes, print the rows of the report that failed.
checkpoint = check_df_dtype_expectations(df = sval, dtype_dct = sval_col_dtypes)

pass_fraction = sum(checkpoint.Pass)/checkpoint.shape[0]
if pass_fraction != 1:
    print(checkpoint.loc[~checkpoint.Pass, ])
    print()

98/123 Columns pass.
                                   Column    dtype  Expected_dtype   Pass
2                       Drop_Record_Index  float64             NaN  False
10                            Local_Check   object             NaN  False
11                      Plot_Length_Unit_  float64             NaN  False
12                     Alley_Length_Unit_  float64             NaN  False
13                      Row_Spacing_Unit_  float64             NaN  False
14                        Plot_Area_Unit_  float64             NaN  False
16                       Packets_Per_Plot  float64             NaN  False
19                  Planted_Unit_Datetime   object  datetime64[ns]  False
20                Harvested_Unit_Datetime   object  datetime64[ns]  False
21                 Anthesis_Unit_Datetime   object  datetime64[ns]  False
22                  Silking_Unit_Datetime   object  datetime64[ns]  False
37                      Additional_Metics  float64             NaN  False
39               

## Weather

### Datetime

In [None]:
# # instead of writing regexes to figure out the most likely format for each datetime, we assume each experiment will be consistent within that experiment
# # and let pd figure it out.
# # wthr['Datetime_Temp'] = pd.to_datetime(np.nan)

# # for code in list(wthr.loc[:, 'Experiment_Code'].drop_duplicates()):
# #     wthr.loc[wthr.Experiment_Code == code, 'Datetime_Temp'] = pd.to_datetime(wthr.loc[wthr.Experiment_Code == code, 'Datetime'], errors='coerce')


# # ... or we use the fields in the df to make a consistent format
# wthr = cols_astype_string(
#     df = wthr, 
#     col_list = ['Year', 'Month', 'Day', 'Time'])

# wthr = sanitize_col(
#     df = wthr,
#     col = 'Time', 
#     simple_renames= {'24:00:00': '00:00:00'}, # this could be day + 24 h instead of a miscoded day + 0 h
#     split_renames= {})

# wthr['Datetime_Temp'] = wthr['Year']+'-'+wthr['Month']+'-'+wthr['Day']+' '+wthr['Time']

# # convert types
# err_list = find_unconvertable_datetimes(df_col=wthr['Datetime_Temp'], pattern='%Y-%m-%d %H:%M', index=False)
# if err_list != []:
#     print(err_list)
# else:
#     wthr.Datetime_Temp = pd.to_datetime(pd.Series(wthr.Datetime_Temp), errors='coerce')
#     wthr.Datetime = wthr.Datetime_Temp
#     wthr = wthr.drop(columns= 'Datetime_Temp')

### Simple Columns 

In [None]:
# to string
# Cast every column whose expected dtype is 'string'.
wthr_string_cols = [col for col, expected in wthr_col_dtypes.items() if expected == 'string']
wthr = cols_astype_string(df = wthr, col_list = wthr_string_cols)

# Every row of this notebook belongs to the same season.
wthr.Year = year_string
wthr.Year = wthr.Year.astype('string')

### Check Success

In [None]:
# Compare wthr's column dtypes against the expectations and, unless every
# column passes, print the rows of the report that failed.
checkpoint = check_df_dtype_expectations(df = wthr, dtype_dct = wthr_col_dtypes)

pass_fraction = sum(checkpoint.Pass)/checkpoint.shape[0]
if pass_fraction != 1:
    print(checkpoint.loc[~checkpoint.Pass, ])

22/27 Columns pass.
                    Column    dtype Expected_dtype   Pass
21  Photoperiod_Unit_Hours  float64            NaN  False
22            Data_Cleaned   object            NaN  False
23          Fields_Cleaned   object            NaN  False
24         Cleaning_Method   object            NaN  False
25        Weather_Comments   object            NaN  False


## Management

### Date_Datetime

In [None]:
# 'Before Planting' is not a date; hand it to the Imputation_Notes helper.
mgmt = relocate_to_Imputation_Notes(
    df = mgmt,
    col = 'Date_Datetime',
    val_list = ['Before Planting'])

In [None]:
# mgmt = sanitize_col(
#     df = mgmt, 
#     col = 'Date_Datetime', 
#     simple_renames= {}, 
#     split_renames= {'6/24/21 for all but plots in pass 2; 7/5/21 for pass 2' : [
#                         '6/24/21 for all but plots in pass 2', '7/5/21 for pass 2']})

In [None]:
# # make corrections too one-off to fix with a function. 
# mask = ((mgmt.Date_Datetime == '6/24/21 for all but plots in pass 2') & (mgmt.Pass != 2.))
# mgmt.loc[mask, 'Date_Datetime'] = '6/24/21'
# # since we split without specifiying pass we need to remove any rows that still have the search string.
# # and overwrite the df
# mask = (mgmt.Date_Datetime == '6/24/21 for all but plots in pass 2')
# mgmt = mgmt.loc[~mask, :].copy()

# mask = ((mgmt.Date_Datetime == '7/5/21 for pass 2') & (mgmt.Pass == 2.))
# mgmt.loc[mask, 'Date_Datetime'] = '7/5/21'
# mask = (mgmt.Date_Datetime == '7/5/21 for pass 2')
# mgmt = mgmt.loc[~mask, :].copy()

In [None]:
# convert types
# The 2016 agronomic file writes dates in long form ('Friday, July 8, 2016'),
# which the '%m/%d/%y' pattern used below rejects. Rewrite every entry that
# parses as a long-form date into '%m/%d/%y' text first; entries in any other
# form are left untouched and still surface in the error list.
long_form_dates = pd.to_datetime(mgmt.Date_Datetime, format = '%A, %B %d, %Y', errors='coerce')
is_long_form = long_form_dates.notna()
mgmt.loc[is_long_form, 'Date_Datetime'] = long_form_dates[is_long_form].dt.strftime('%m/%d/%y')

# Convert only when every entry matches the pattern; otherwise print the
# offenders and leave the column as text.
err_list = find_unconvertable_datetimes(df_col=mgmt.Date_Datetime, pattern='%m/%d/%y', index=False)
if err_list != []:
    print(err_list)
else:
    mgmt.Date_Datetime = pd.to_datetime(pd.Series(mgmt.Date_Datetime), format = '%m/%d/%y', errors='coerce')

['Friday, July 8, 2016', 'Friday, March 4, 2016', 'Thursday, May 19, 2016', 'Wednesday, June 1, 2016', 'Friday, April 22, 2016', 'Sunday, June 12, 2016', 'Tuesday, July 12, 2016', 'Tuesday, May 10, 2016', 'Friday, April 15, 2016', 'Monday, May 9, 2016', 'Friday, May 13, 2016', 'Monday, April 11, 2016', 'Monday, May 23, 2016', 'Thursday, June 30, 2016', 'Friday, May 20, 2016', 'Tuesday, June 14, 2016', 'Friday, June 24, 2016', 'Tuesday, April 19, 2016', 'Tuesday, May 31, 2016', 'Thursday, June 23, 2016', 'Wednesday, May 4, 2016', 'Thursday, May 5, 2016', 'Friday, October 21, 2016', 'Saturday, July 2, 2016', 'Tuesday, April 5, 2016', 'Tuesday, July 5, 2016', 'Wednesday, April 6, 2016', 'Friday, June 17, 2016', 'Tuesday, February 16, 2016', 'Thursday, August 4, 2016', 'Tuesday, June 28, 2016', 'Thursday, July 28, 2016', 'Monday, March 21, 2016', 'Tuesday, May 17, 2016', 'Wednesday, March 30, 2016', 'Thursday, April 28, 2016', 'Friday, April 29, 2016', 'Wednesday, May 25, 2016', 'Wednesday

### Amount_Per_Acre

In [None]:
# mgmt.loc[find_unconvertable_numerics(df_col = mgmt['Amount_Per_Acre'], index = True), ]

In [None]:
# mgmt = sanitize_col(
#     df = mgmt, 
#     col = 'Amount_Per_Acre', 
#     simple_renames= {'170 lb (actual N)': '170 (N)'}, 
#     split_renames= {'51.75, 40.7, 111.7 (N,P,K)': ['51.75 (N)', '40.7 (P)', '111.7 (K)'],
#                     '31-150-138': ['31 (N)', '150 (P)', '138 (K)'],
#                     '16 (N), 41 (P)': ['16 (N)', '41 (P)']})

In [None]:
# mgmt = safe_create_col(mgmt, "Ingredient")
# mask = mgmt.Ingredient.isna()
# mgmt.loc[mask, 'Ingredient'] = mgmt.loc[mask, 'Product']

# # assume each string is formated as 'val (key)'. `sanitize_col` should be used to enforce this.
# for e in ['150 (P)', '36.6 (N)', '138 (K)', '111.7 (K)', '41 (P)', '16 (N)', '170 (N)', '35.7 (N)', '51.75 (N)', '31 (N)', '40.7 (P)']:
#     val = re.findall('^\d+[.]*\d*', e)[0]
#     key = re.findall('\(.+\)',      e)[0].replace('(', '').replace(')', '')
    
#     mask = (mgmt['Amount_Per_Acre'] == e)
#     mgmt.loc[mask, 'Ingredient'] = key
#     mgmt.loc[mask, 'Amount_Per_Acre'] = val

In [None]:
# convert types
# Amount_Per_Acre becomes numeric only when every entry is convertible;
# otherwise the offending values are printed and the column is left as is.
err_list = find_unconvertable_numerics(df_col = mgmt['Amount_Per_Acre'], index = False)
if err_list == []:
    mgmt.Amount_Per_Acre = pd.to_numeric(mgmt.Amount_Per_Acre, errors='coerce')
else:
    print(err_list)

['30 Actual P', '50 GPA', '150+5 Zn', '0.50%', '20-9-3', '1/2 ton/A', '7.5 N, 30 P, 30 K', '60 Actual N', '91 lb', '4.5 pt./A', '375 lbs./A', '1.00%', '2 lbs./A', '1.5 qt./A', 'NPK 27,26,0\n', '134-0-0', '100 Actual N', '90 Actual K', '6.5 lbs./A', '60 gallons/acre', '.75 inch', '180 N']


### Ingredient
This is to be the cleaned up version of the "Product" column

In [None]:
# list(mgmt.loc[:, 'Ingredient'].drop_duplicates())

### Simple Columns

In [None]:
# to bool
# Compare against True rather than casting: astype('bool') would turn a NaN
# indicator (a row that did not come from the management table) into True.
mgmt['mgmt'] = mgmt['mgmt'].eq(True)

# to string
# Only cast the text columns that are actually present in this year's table.
text_cols = ['Application', 'Product', 'Ingredient', 'Unit', 'Imputation_Notes']
for e in text_cols:
    if e in mgmt.columns:
        mgmt[e] = mgmt[e].astype('string')


# Every row of this notebook belongs to the same season.
mgmt.Year = year_string
mgmt.Year = mgmt.Year.astype('string')

### Check Success

In [None]:
# Compare mgmt's column dtypes against the expectations. The call is the cell's
# last expression, so the returned report is displayed in full.
check_df_dtype_expectations(df = mgmt, dtype_dct = mgmt_col_dtypes)

10/14 Columns pass.


Unnamed: 0,Column,dtype,Expected_dtype,Pass
0,Year,string,string,True
1,Experiment_Code,string,string,True
2,Range,string,string,True
3,Pass,string,string,True
4,Plot,string,string,True
5,Drop_Record_Index,float64,,False
6,Drop_Record_Index2,float64,,False
7,Application,string,string,True
8,Product,string,string,True
9,Date_Datetime,object,datetime64[ns],False


# Publish



In [None]:
# write_out_pkl(obj = sval, path = './data/interim/'+year_string+'sval.pickle')
# write_out_pkl(obj = wthr, path = './data/interim/'+year_string+'wthr.pickle')
# write_out_pkl(obj = mgmt, path = './data/interim/'+year_string+'mgmt.pickle')