# Wrangling Soil Test data from University of Kentucky's Soil Lab

Use Microsoft Access to export data into CSV text file with FIPS code add and quary to select just County by County name. Export as soildata_fips.txt.

#### import python libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings

#### set file path to get data to work on

In [2]:
filePath = Path('data')
fileOut = Path('project-data')
file_soil = filePath.joinpath('soildata_fips.txt')

#### Read data into pandas

In [3]:
soil = pd.read_csv(file_soil, dtype='str')

#### Check that file is read into memory

In [4]:
soil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190126 entries, 0 to 1190125
Data columns (total 14 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   FIPS_NO  1190126 non-null  object
 1   YEAR     1190126 non-null  object
 2   FM       1190052 non-null  object
 3   COUNTY   1190126 non-null  object
 4   AREA     1190126 non-null  object
 5   PH       1187607 non-null  object
 6   BUPH     1056246 non-null  object
 7   P        1187473 non-null  object
 8   K        1187494 non-null  object
 9   CA       969266 non-null   object
 10  MG       969725 non-null   object
 11  ZN       967041 non-null   object
 12  ACRES    525128 non-null   object
 13  CROP     1183431 non-null  object
dtypes: object(14)
memory usage: 127.1+ MB


In [5]:
soil.tail()

Unnamed: 0,FIPS_NO,YEAR,FM,COUNTY,AREA,PH,BUPH,P,K,CA,MG,ZN,ACRES,CROP
1190121,239.0,2019.0,A,WOODFORD,Bluegrass,5.0,6.3,62.0,319.0,1489.0,223.0,3.5,1.0,Wildlife Food Plot
1190122,239.0,2019.0,A,WOODFORD,Bluegrass,5.9,6.7,46.0,257.0,5247.0,268.0,2.1,2.0,Wildlife Food Plot
1190123,239.0,2019.0,A,WOODFORD,Bluegrass,6.8,7.0,75.0,243.0,12047.0,281.0,1.2,2.0,Wildlife Food Plot
1190124,239.0,2019.0,A,WOODFORD,Bluegrass,5.3,6.6,60.0,407.0,3304.0,396.0,2.8,,Wildlife Food Plot
1190125,239.0,2019.0,A,WOODFORD,Bluegrass,5.0,6.3,59.0,377.0,4341.0,349.0,2.0,1.5,Wildlife Food Plot


#### Remove decimal from FIPS_NO and Year, can't convert to an integer because of pivot table columns later in processing. Convert PH, BUPH, P, K, and Acres into Float type.

In [6]:
df = soil.copy()

In [7]:
df.FIPS_NO = df.FIPS_NO.astype('str').replace('\.00','',regex=True)
df.YEAR = df.YEAR.astype('str').replace('\.00','',regex=True)
df.PH = df.PH.astype('float')
df.BUPH = df.BUPH.astype('float')
df.P = df.P.astype('float')
df.K = df.K.astype('float')
df.ACRES = df.ACRES.astype('float')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190126 entries, 0 to 1190125
Data columns (total 14 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   FIPS_NO  1190126 non-null  object 
 1   YEAR     1190126 non-null  object 
 2   FM       1190052 non-null  object 
 3   COUNTY   1190126 non-null  object 
 4   AREA     1190126 non-null  object 
 5   PH       1187607 non-null  float64
 6   BUPH     1056246 non-null  float64
 7   P        1187473 non-null  float64
 8   K        1187494 non-null  float64
 9   CA       969266 non-null   object 
 10  MG       969725 non-null   object 
 11  ZN       967041 non-null   object 
 12  ACRES    525128 non-null   float64
 13  CROP     1183431 non-null  object 
dtypes: float64(5), object(9)
memory usage: 127.1+ MB


In [8]:
df.info()
print(df.head())
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190126 entries, 0 to 1190125
Data columns (total 14 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   FIPS_NO  1190126 non-null  object 
 1   YEAR     1190126 non-null  object 
 2   FM       1190052 non-null  object 
 3   COUNTY   1190126 non-null  object 
 4   AREA     1190126 non-null  object 
 5   PH       1187607 non-null  float64
 6   BUPH     1056246 non-null  float64
 7   P        1187473 non-null  float64
 8   K        1187494 non-null  float64
 9   CA       969266 non-null   object 
 10  MG       969725 non-null   object 
 11  ZN       967041 non-null   object 
 12  ACRES    525128 non-null   float64
 13  CROP     1183431 non-null  object 
dtypes: float64(5), object(9)
memory usage: 127.1+ MB
  FIPS_NO  YEAR FM COUNTY                AREA    PH  BUPH      P      K  \
0       1  1990  A  ADAIR  Eastern Pennyroyal  7.15  7.23   28.0  158.0   
1       1  1990  A  ADAIR  Eastern Pennyroy

Unnamed: 0,FIPS_NO,YEAR,FM,COUNTY,AREA,PH,BUPH,P,K,CA,MG,ZN,ACRES,CROP
1190121,239,2019,A,WOODFORD,Bluegrass,5.0,6.3,62.0,319.0,1489.0,223.0,3.5,1.0,Wildlife Food Plot
1190122,239,2019,A,WOODFORD,Bluegrass,5.9,6.7,46.0,257.0,5247.0,268.0,2.1,2.0,Wildlife Food Plot
1190123,239,2019,A,WOODFORD,Bluegrass,6.8,7.0,75.0,243.0,12047.0,281.0,1.2,2.0,Wildlife Food Plot
1190124,239,2019,A,WOODFORD,Bluegrass,5.3,6.6,60.0,407.0,3304.0,396.0,2.8,,Wildlife Food Plot
1190125,239,2019,A,WOODFORD,Bluegrass,5.0,6.3,59.0,377.0,4341.0,349.0,2.0,1.5,Wildlife Food Plot


#### Drop CA, MG, ZN

In [9]:
df = df.drop(['CA','MG','ZN'], axis=1)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190126 entries, 0 to 1190125
Data columns (total 11 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   FIPS_NO  1190126 non-null  object 
 1   YEAR     1190126 non-null  object 
 2   FM       1190052 non-null  object 
 3   COUNTY   1190126 non-null  object 
 4   AREA     1190126 non-null  object 
 5   PH       1187607 non-null  float64
 6   BUPH     1056246 non-null  float64
 7   P        1187473 non-null  float64
 8   K        1187494 non-null  float64
 9   ACRES    525128 non-null   float64
 10  CROP     1183431 non-null  object 
dtypes: float64(5), object(6)
memory usage: 99.9+ MB


#### Check the maximum and minimum values for P and K 

In [11]:
print("max P =", df.P.max(), "min P =",df.P.min())
print("max K" , df.K.max(), "min K =", df.K.min())

max P = 21658.0 min P = -9.0
max K 60452.0 min K = -26.0


#### Remove values less than zero and above 9999

In [12]:
df = df[~(df['P'] < 0)]
df = df[~(df['K'] < 0)]
df = df[~(df['P'] >= 9999)]
df = df[~(df['K'] >= 9999)]


In [13]:
print("max P =", df.P.max(), "min P =",df.P.min())
print("max K" , df.K.max(), "min K =", df.K.min())

max P = 9778.0 min P = 0.0
max K 9964.0 min K = 1.0


#### Select agricultural "A" and commercial "C" types from FM column. Append df together.

In [14]:
df1 = df.loc[(df['FM'] == 'A')]
df2 = df.loc[(df['FM'] == 'C')]
df3 = df1.append(df2, ignore_index=True)

In [15]:
print(df1.info())
print(df2.info())
print(df3.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 941637 entries, 0 to 1190125
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   FIPS_NO  941637 non-null  object 
 1   YEAR     941637 non-null  object 
 2   FM       941637 non-null  object 
 3   COUNTY   941637 non-null  object 
 4   AREA     941637 non-null  object 
 5   PH       940288 non-null  float64
 6   BUPH     836405 non-null  float64
 7   P        940284 non-null  float64
 8   K        940295 non-null  float64
 9   ACRES    511570 non-null  float64
 10  CROP     938347 non-null  object 
dtypes: float64(5), object(6)
memory usage: 86.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21910 entries, 153 to 1190012
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   FIPS_NO  21910 non-null  object 
 1   YEAR     21910 non-null  object 
 2   FM       21910 non-null  object 
 3   COUNTY   21910 

#### Drop null values from CROP, P, K.

In [16]:
df3.drop(df3[df3['CROP'].isnull()].index, inplace=True)
df3.drop(df3[df3['P'].isnull()].index, inplace=True)
df3.drop(df3[df3['K'].isnull()].index, inplace=True)
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 958826 entries, 0 to 963546
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   FIPS_NO  958826 non-null  object 
 1   YEAR     958826 non-null  object 
 2   FM       958826 non-null  object 
 3   COUNTY   958826 non-null  object 
 4   AREA     958826 non-null  object 
 5   PH       958813 non-null  float64
 6   BUPH     854104 non-null  float64
 7   P        958826 non-null  float64
 8   K        958826 non-null  float64
 9   ACRES    517097 non-null  float64
 10  CROP     958826 non-null  object 
dtypes: float64(5), object(6)
memory usage: 87.8+ MB


#### Resort and index dataframe.

In [17]:
df = df3[['FIPS_NO','COUNTY','AREA','YEAR','CROP','ACRES', 'PH', 'BUPH', 'P', 'K', ]]
order_by_cols = ['FIPS_NO','YEAR','CROP']
df = df.sort_values(by=order_by_cols, ascending=[True,True,True]).copy()
df.reset_index(drop=True,inplace=True)
df.head()

Unnamed: 0,FIPS_NO,COUNTY,AREA,YEAR,CROP,ACRES,PH,BUPH,P,K
0,1,ADAIR,Eastern Pennyroyal,1990,Alfalfa,18.0,7.15,7.23,28.0,158.0
1,1,ADAIR,Eastern Pennyroyal,1990,Alfalfa,15.0,6.95,7.22,88.0,134.0
2,1,ADAIR,Eastern Pennyroyal,1990,Alfalfa,16.0,6.26,6.94,70.0,256.0
3,1,ADAIR,Eastern Pennyroyal,1990,Alfalfa,6.0,5.67,6.69,161.0,611.0
4,1,ADAIR,Eastern Pennyroyal,1990,Alfalfa,25.0,7.26,7.47,105.0,315.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958826 entries, 0 to 958825
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   FIPS_NO  958826 non-null  object 
 1   COUNTY   958826 non-null  object 
 2   AREA     958826 non-null  object 
 3   YEAR     958826 non-null  object 
 4   CROP     958826 non-null  object 
 5   ACRES    517097 non-null  float64
 6   PH       958813 non-null  float64
 7   BUPH     854104 non-null  float64
 8   P        958826 non-null  float64
 9   K        958826 non-null  float64
dtypes: float64(5), object(5)
memory usage: 73.2+ MB


#### Find unique CROP types. 

In [19]:
croptypes = df.CROP.unique()
croptypes

array(['Alfalfa', 'Alfalfa/Cool Season', 'Burley Tobacco', 'Clover/Grass',
       'Cole Crops (broccoli, etc.)', 'Corn', 'Corn, Sweet', 'Cucumbers',
       'Fescue', 'No Info Given', 'Orchardgrass', 'Other Vegetables',
       'Peppers (bell & pimento)', 'Red Clover', 'Timothy', 'Tomatoes',
       'White Clover', 'White Clover/Grass', 'Rye', 'Soybeans',
       'Tobacco Beds', 'Wheat', 'Oats', 'Red Clover/Grass',
       'Warm Season Grass', 'Blueberries', 'Fescue/Lespedeza (multiple)',
       'Forage Sorghum', 'Strawberries', 'Cool Season Grass',
       'Evergreen Shrubs, Broadleaved', 'Sudangrass',
       'Timothy/Red Clover', 'Lespedeza', 'Other Fruit & Nuts',
       'Small Grains/Corn', 'Small Grains/Soybeans', 'Squash & Pumpkins',
       'Birdsfoot Trefoil', 'Grain Sorghum', 'Lespedeza/Grass', 'Annuals',
       'Fescue/Lespedeza', 'Forage Crops', 'Millet',
       'Orchardgrass/Red Clover', 'Apples', 'Grapes', 'Peaches',
       'Small Grains', 'Bermudagrass, common', 'Sweet Potatoes',

## Select CROP based on AGR-1 crop types.

## Corn

In [20]:
df_corn = df.loc[(df['CROP'] == 'Corn')]
print(df_corn.info())
df_corn.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173155 entries, 155 to 958668
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   FIPS_NO  173155 non-null  object 
 1   COUNTY   173155 non-null  object 
 2   AREA     173155 non-null  object 
 3   YEAR     173155 non-null  object 
 4   CROP     173155 non-null  object 
 5   ACRES    89557 non-null   float64
 6   PH       173155 non-null  float64
 7   BUPH     148991 non-null  float64
 8   P        173155 non-null  float64
 9   K        173155 non-null  float64
dtypes: float64(5), object(5)
memory usage: 14.5+ MB
None


Unnamed: 0,FIPS_NO,COUNTY,AREA,YEAR,CROP,ACRES,PH,BUPH,P,K
155,1,ADAIR,Eastern Pennyroyal,1990,Corn,15.0,7.13,7.29,37.0,146.0
156,1,ADAIR,Eastern Pennyroyal,1990,Corn,12.0,7.24,7.29,93.0,105.0
157,1,ADAIR,Eastern Pennyroyal,1990,Corn,27.0,5.91,6.85,25.0,252.0
158,1,ADAIR,Eastern Pennyroyal,1990,Corn,14.0,5.81,6.74,24.0,121.0
159,1,ADAIR,Eastern Pennyroyal,1990,Corn,7.0,5.39,6.67,92.0,283.0


#### Create dataframe for nutrients phosphorus (P) and potassium (K).

In [21]:
df_corn_nu = df_corn[['FIPS_NO','COUNTY','YEAR','P','K']].copy()
print(df_corn_nu.head())


    FIPS_NO COUNTY  YEAR     P      K
155       1  ADAIR  1990  37.0  146.0
156       1  ADAIR  1990  93.0  105.0
157       1  ADAIR  1990  25.0  252.0
158       1  ADAIR  1990  24.0  121.0
159       1  ADAIR  1990  92.0  283.0


#### Corn,  Set categories for P and K values to very low, low, medium, high, very high. Base values from AGR-1.

#### Categories for P
        Cat      Title       Break
        -------------------------------------
        VL       very low    P<= 5
        L        low         P>5 & P<=27
        M        medium      P>27 & P<=60
        H        high        P>60

#### Categories for K
        Cat      Title      Break
       --------------------------------------
        VL       very low   K< 100
        L        low        K>=100 & K <=190
        M        medium     K>=191 & K <=300
        H        high       K>=301 & K <=420
        VH       very high  K>420

In [22]:
df_corn_nu['CAT_P'] = ''
df_corn_nu['CAT_P'] = np.where(df_corn_nu.P <= 5, 'VL', df_corn_nu.CAT_P)
df_corn_nu['CAT_P'] = np.where(((df_corn_nu.P > 5) & (df_corn_nu.P <= 27)), 'L', df_corn_nu.CAT_P)
df_corn_nu['CAT_P'] = np.where(((df_corn_nu.P > 27) & (df_corn_nu.P <= 60)), 'M', df_corn_nu.CAT_P)
df_corn_nu['CAT_P'] = np.where((df_corn_nu.P > 60), 'H', df_corn_nu.CAT_P)
df_corn_nu.head()

Unnamed: 0,FIPS_NO,COUNTY,YEAR,P,K,CAT_P
155,1,ADAIR,1990,37.0,146.0,M
156,1,ADAIR,1990,93.0,105.0,H
157,1,ADAIR,1990,25.0,252.0,L
158,1,ADAIR,1990,24.0,121.0,L
159,1,ADAIR,1990,92.0,283.0,H


In [23]:
df_corn_nu['CAT_K'] = ''
df_corn_nu['CAT_K'] = np.where(df_corn_nu.K <= 100, 'VL', df_corn_nu.CAT_K)
df_corn_nu['CAT_K'] = np.where(((df_corn_nu.K > 100) & (df_corn_nu.K <= 190)), 'L', df_corn_nu.CAT_K)
df_corn_nu['CAT_K'] = np.where(((df_corn_nu.K > 190) & (df_corn_nu.K <= 300)), 'M', df_corn_nu.CAT_K)
df_corn_nu['CAT_K'] = np.where(((df_corn_nu.K > 300) & (df_corn_nu.K <= 420)), 'H', df_corn_nu.CAT_K)
df_corn_nu['CAT_K'] = np.where((df_corn_nu.K > 420), 'VH', df_corn_nu.CAT_K)
df_corn_nu.head()

Unnamed: 0,FIPS_NO,COUNTY,YEAR,P,K,CAT_P,CAT_K
155,1,ADAIR,1990,37.0,146.0,M,L
156,1,ADAIR,1990,93.0,105.0,H,L
157,1,ADAIR,1990,25.0,252.0,L,M
158,1,ADAIR,1990,24.0,121.0,L,L
159,1,ADAIR,1990,92.0,283.0,H,M


#### Create pivot table to sort categories by year and County for each nutrient.

In [24]:
warnings.filterwarnings("ignore")
df_corn_p = np.round( df_corn_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_P'], values=['P'],aggfunc=(np.average,len),fill_value=0),2)
df_corn_k = np.round( df_corn_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_K'], values=['K'],aggfunc=(np.average,len),fill_value=0),2)
print(df_corn_p.head())
print(df_corn_k.head())

               P                                                               \
         average                                                                
YEAR        1990                       1991                       1992          
CAT_P          H      L      M   VL       H      L      M   VL       H      L   
COUNTY                                                                          
ADAIR     119.47  17.50  39.91  0.0  171.62  21.00  45.75  0.0  138.56  23.25   
ALLEN     115.67  17.00  42.67  0.0  130.78  18.25  44.50  0.0  147.89  19.20   
ANDERSON  244.80  10.67  55.00  5.0  333.94  22.00  44.78  0.0  290.56  20.00   
BALLARD   109.08   0.00  48.25  0.0  108.33  18.50  42.79  0.0  107.22  20.50   
BARREN    121.90  18.11  42.50  0.0  120.20  14.22  42.66  0.0  130.43  17.67   

          ...                                         
          ...  len                                    
YEAR      ... 2017    2018           2019             
CAT_P     ...    M VL   

## Unpivot table and save to CSV file

#### Create column names from pivot table data

In [25]:
df_corn_p.columns
df_corn_k.columns

MultiIndex([('K', 'average', '1990',  'H'),
            ('K', 'average', '1990',  'L'),
            ('K', 'average', '1990',  'M'),
            ('K', 'average', '1990', 'VH'),
            ('K', 'average', '1990', 'VL'),
            ('K', 'average', '1991',  'H'),
            ('K', 'average', '1991',  'L'),
            ('K', 'average', '1991',  'M'),
            ('K', 'average', '1991', 'VH'),
            ('K', 'average', '1991', 'VL'),
            ...
            ('K',     'len', '2018',  'H'),
            ('K',     'len', '2018',  'L'),
            ('K',     'len', '2018',  'M'),
            ('K',     'len', '2018', 'VH'),
            ('K',     'len', '2018', 'VL'),
            ('K',     'len', '2019',  'H'),
            ('K',     'len', '2019',  'L'),
            ('K',     'len', '2019',  'M'),
            ('K',     'len', '2019', 'VH'),
            ('K',     'len', '2019', 'VL')],
           names=[None, None, 'YEAR', 'CAT_K'], length=300)

In [26]:
df_corn_p.columns = list(map("_".join,df_corn_p.columns))
df_corn_k.columns = list(map("_".join,df_corn_k.columns))

In [27]:
print(df_corn_p.columns)
print(df_corn_k.columns)

Index(['P_average_1990_H', 'P_average_1990_L', 'P_average_1990_M',
       'P_average_1990_VL', 'P_average_1991_H', 'P_average_1991_L',
       'P_average_1991_M', 'P_average_1991_VL', 'P_average_1992_H',
       'P_average_1992_L',
       ...
       'P_len_2017_M', 'P_len_2017_VL', 'P_len_2018_H', 'P_len_2018_L',
       'P_len_2018_M', 'P_len_2018_VL', 'P_len_2019_H', 'P_len_2019_L',
       'P_len_2019_M', 'P_len_2019_VL'],
      dtype='object', length=240)
Index(['K_average_1990_H', 'K_average_1990_L', 'K_average_1990_M',
       'K_average_1990_VH', 'K_average_1990_VL', 'K_average_1991_H',
       'K_average_1991_L', 'K_average_1991_M', 'K_average_1991_VH',
       'K_average_1991_VL',
       ...
       'K_len_2018_H', 'K_len_2018_L', 'K_len_2018_M', 'K_len_2018_VH',
       'K_len_2018_VL', 'K_len_2019_H', 'K_len_2019_L', 'K_len_2019_M',
       'K_len_2019_VH', 'K_len_2019_VL'],
      dtype='object', length=300)


In [28]:
df_corn_p.columns = df_corn_p.columns.str.replace("P_average_", "")
df_corn_p.columns = df_corn_p.columns.str.replace("P_len", "count")
df_corn_k.columns = df_corn_k.columns.str.replace("K_average_","")
df_corn_k.columns = df_corn_k.columns.str.replace("K_len","count")
print(df_corn_p.columns)
print(df_corn_k.columns)

Index(['1990_H', '1990_L', '1990_M', '1990_VL', '1991_H', '1991_L', '1991_M',
       '1991_VL', '1992_H', '1992_L',
       ...
       'count_2017_M', 'count_2017_VL', 'count_2018_H', 'count_2018_L',
       'count_2018_M', 'count_2018_VL', 'count_2019_H', 'count_2019_L',
       'count_2019_M', 'count_2019_VL'],
      dtype='object', length=240)
Index(['1990_H', '1990_L', '1990_M', '1990_VH', '1990_VL', '1991_H', '1991_L',
       '1991_M', '1991_VH', '1991_VL',
       ...
       'count_2018_H', 'count_2018_L', 'count_2018_M', 'count_2018_VH',
       'count_2018_VL', 'count_2019_H', 'count_2019_L', 'count_2019_M',
       'count_2019_VH', 'count_2019_VL'],
      dtype='object', length=300)


#### Reindex unpivot table 

In [29]:
df_corn_p = df_corn_p.reset_index()
df_corn_k = df_corn_k.reset_index()
print(df_corn_p.head())
print(df_corn_k.head())

     COUNTY  1990_H  1990_L  1990_M  1990_VL  1991_H  1991_L  1991_M  1991_VL  \
0     ADAIR  119.47   17.50   39.91      0.0  171.62   21.00   45.75      0.0   
1     ALLEN  115.67   17.00   42.67      0.0  130.78   18.25   44.50      0.0   
2  ANDERSON  244.80   10.67   55.00      5.0  333.94   22.00   44.78      0.0   
3   BALLARD  109.08    0.00   48.25      0.0  108.33   18.50   42.79      0.0   
4    BARREN  121.90   18.11   42.50      0.0  120.20   14.22   42.66      0.0   

   1992_H  ...  count_2017_M  count_2017_VL  count_2018_H  count_2018_L  \
0  138.56  ...            20              0            31             7   
1  147.89  ...             4              0             1             0   
2  290.56  ...             0              0             2             2   
3  107.22  ...            10              0             7             0   
4  130.43  ...             4              0            10             3   

   count_2018_M  count_2018_VL  count_2019_H  count_2019_L  co

#### Save categorized data to file. Separate by crop and nutrient type (P and K) with count by category.

In [30]:
file_out_p = fileOut.joinpath('corn_p_levels.csv')  # path and filename
df_corn_p.to_csv(file_out_p, index=False)  # output to csv
file_out_k = fileOut.joinpath('corn_k_levels.csv')  # path and filename
df_corn_k.to_csv(file_out_k, index=False)  # output to csv
print ('total number of records written to CSV:','{:,}'.format(len(df_corn_p)),'\n')
print ('total number of records written to CSV:','{:,}'.format(len(df_corn_k)),'\n')

total number of records written to CSV: 120 

total number of records written to CSV: 120 



## Soybeans

#### Create list to select Soybeans from database.

In [31]:
soy_sel = ['Soybeans', 'Small Grains/Soybeans', 'Wheat/Soybeans', 'Canola/Soybeans', 'Rye/Soybeans', 'Oats/Soybeans', 'Barley/Soybeans', 'Triticale/Soybeans']
soy_sel.sort()
print(soy_sel)

['Barley/Soybeans', 'Canola/Soybeans', 'Oats/Soybeans', 'Rye/Soybeans', 'Small Grains/Soybeans', 'Soybeans', 'Triticale/Soybeans', 'Wheat/Soybeans']


#### Select soybeans from dataset.

In [32]:
df_soy = df[df.CROP.isin(soy_sel)]
df_soy_nu = df_soy[['FIPS_NO','COUNTY','YEAR','P','K']].copy()
print(df_soy_nu.head())

     FIPS_NO COUNTY  YEAR      P      K
628        1  ADAIR  1991  238.0  318.0
1879       1  ADAIR  1995   83.0  173.0
1880       1  ADAIR  1995   59.0  150.0
1881       1  ADAIR  1995   65.0  152.0
1882       1  ADAIR  1995  148.0  317.0


#### Soybeans, Set categories for P and K values to very low, low, medium, high, very high. Base values from AGR-1.

#### Categories for P
        Cat      Title       Break
        -------------------------------------
        VL       very low    P<= 5
        L        low         P>5 & P<=27
        M        medium      P>27 & P<=60
        H        high        P>60

#### Categories for K
        Cat      Title      Break
       --------------------------------------
        VL       very low   K< 100
        L        low        K>=100 & K <=190
        M        medium     K>=191 & K <=300
        H        high       K>300

In [35]:
df_soy_nu['CAT_P'] = ''
df_soy_nu['CAT_P'] = np.where(df_soy_nu.P <= 5, 'VL', df_soy_nu.CAT_P)
df_soy_nu['CAT_P'] = np.where(((df_soy_nu.P > 5) & (df_soy_nu.P <= 27)), 'L', df_soy_nu.CAT_P)
df_soy_nu['CAT_P'] = np.where(((df_soy_nu.P > 27) & (df_soy_nu.P <= 60)), 'M', df_soy_nu.CAT_P)
df_soy_nu['CAT_P'] = np.where((df_soy_nu.P > 60), 'H', df_soy_nu.CAT_P)

df_soy_nu['CAT_K'] = ''
df_soy_nu['CAT_K'] = np.where(df_soy_nu.K <= 99, 'VL', df_soy_nu.CAT_K)
df_soy_nu['CAT_K'] = np.where(((df_soy_nu.K > 99) & (df_soy_nu.K <= 190)), 'L', df_soy_nu.CAT_K)
df_soy_nu['CAT_K'] = np.where(((df_soy_nu.K > 190) & (df_soy_nu.K <= 300)), 'M', df_soy_nu.CAT_K)
df_soy_nu['CAT_K'] = np.where(((df_soy_nu.K > 300) , 'H', df_soy_nu.CAT_K)


SyntaxError: unexpected EOF while parsing (<ipython-input-35-99642818431a>, line 10)

In [None]:
warnings.filterwarnings("ignore")
df_soy_p = np.round( df_soy_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_P'], values=['P'],aggfunc=(np.average,len),fill_value=0),2)
df_soy_k = np.round( df_soy_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_K'], values=['K'],aggfunc=(np.average,len),fill_value=0),2)

In [None]:
df_soy_p.columns
df_soy_k.columns
df_soy_p.columns = list(map("_".join,df_soy_p.columns))
df_soy_k.columns = list(map("_".join,df_soy_k.columns))
df_soy_p.columns = df_soy_p.columns.str.replace("P_average_", "")
df_soy_p.columns = df_soy_p.columns.str.replace("P_len", "count")
df_soy_k.columns = df_soy_k.columns.str.replace("K_average_","")
df_soy_k.columns = df_soy_k.columns.str.replace("K_len","count")
df_soy_p = df_soy_p.reset_index()
df_soy_k = df_soy_k.reset_index()
file_out_p = fileOut.joinpath('soy_p_levels.csv')  # path and filename
df_soy_p.to_csv(file_out_p, index=False)  # output to csv
file_out_k = fileOut.joinpath('soy_k_levels.csv')  # path and filename
df_soy_k.to_csv(file_out_k, index=False)  # output to csv
print ('total number of records written to CSV:','{:,}'.format(len(df_soy_p)),'\n')
print ('total number of records written to CSV:','{:,}'.format(len(df_soy_k)),'\n')