# Wrangling Soil Test data from University of Kentucky's Soil Lab

Use Microsoft Access to export the data to a CSV text file with the FIPS code added, using a query to select just the county by county name. Export as soildata_fips.txt.

#### import python libraries

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings

#### set file path to get data to work on

In [None]:
# Input folder with the exported soil-test data and output folder for the
# processed project files.
filePath = Path('data')
fileOut = Path('project-data')
# Full path of the soil-test export that is read below.
file_soil = filePath / 'soildata_fips.txt'

#### Read data into pandas

In [None]:
# Load every column as text; the numeric columns are converted explicitly in a
# later step.
soil = pd.read_csv(file_soil, dtype='str')

#### Check that file is read into memory

In [None]:
# Print column names, dtypes and non-null counts to confirm the load worked.
soil.info()

In [None]:
# Look at the last rows of the loaded table.
soil.tail()

#### Remove the decimal part from FIPS_NO and YEAR. They are not converted to integers because of the pivot table columns built later in processing. Convert PH, BUPH, P, K, and ACRES to float type.

In [None]:
# Work on a copy so the freshly loaded `soil` frame is left unmodified.
df = soil.copy()

In [None]:
# Strip a trailing ".00" from the identifier-like columns while keeping them
# as strings. The pattern is a raw string (so "\." is not an invalid escape
# sequence) and is anchored with "$" so only a trailing ".00" is removed.
for text_col in ('FIPS_NO', 'YEAR'):
    df[text_col] = df[text_col].astype('str').replace(r'\.00$', '', regex=True)

# Measurement columns were read as text; convert them to floats.
for float_col in ('PH', 'BUPH', 'P', 'K', 'ACRES'):
    df[float_col] = df[float_col].astype('float')

df.info()

In [None]:
# Re-check the dtypes, then look at the first and last rows.
df.info()
print(df.head())
df.tail()

#### Drop CA, MG, ZN

In [None]:
# CA, MG and ZN are not needed for the P/K analysis.
df = df.drop(columns=['CA', 'MG', 'ZN'])

In [None]:
# Confirm that CA, MG and ZN are no longer listed among the columns.
df.info()

#### Check the maximum and minimum values for P and K 

In [None]:
# Report the extremes of P and K before any filtering.
p_vals, k_vals = df['P'], df['K']
print("max P =", p_vals.max(), "min P =", p_vals.min())
print("max K", k_vals.max(), "min K =", k_vals.min())

#### Remove values less than zero and values of 9999 or above

In [None]:
# Flag rows whose P or K is negative or at/above 9999. Missing values compare
# False against every bound, so rows with missing P or K are kept here.
out_of_range = (
    (df['P'] < 0)
    | (df['K'] < 0)
    | (df['P'] >= 9999)
    | (df['K'] >= 9999)
)
df = df[~out_of_range]


In [None]:
# Report the extremes of P and K again to verify the range filter.
p_after, k_after = df['P'], df['K']
print("max P =", p_after.max(), "min P =", p_after.min())
print("max K", k_after.max(), "min K =", k_after.min())

#### Select agricultural "A" and commercial "C" types from FM column. Append df together.

In [None]:
# Agricultural ("A") and commercial ("C") samples, selected separately so each
# subset can be inspected on its own.
df1 = df.loc[df['FM'] == 'A']
df2 = df.loc[df['FM'] == 'C']
# Stack the two subsets (A rows first, then C rows) under a fresh 0..n-1
# index. pd.concat is used because DataFrame.append was removed in pandas 2.0.
df3 = pd.concat([df1, df2], ignore_index=True)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each
# report.
df1.info()
df2.info()
df3.info()


#### Drop null values from CROP, P, K.

In [None]:
# Keep only rows that have a value in CROP, P and K.
df3.dropna(subset=['CROP', 'P', 'K'], inplace=True)
df3.info()

#### Re-sort and re-index the dataframe.

In [None]:
# Columns carried forward, in their final order.
keep_cols = ['FIPS_NO', 'COUNTY', 'AREA', 'YEAR', 'CROP',
             'ACRES', 'PH', 'BUPH', 'P', 'K']
order_by_cols = ['FIPS_NO', 'YEAR', 'CROP']
# Select the columns, sort ascending by county code, year and crop, and
# renumber the rows from zero.
df = (
    df3.loc[:, keep_cols]
    .sort_values(by=order_by_cols)
    .reset_index(drop=True)
)
df.head()

In [None]:
# Check the columns, dtypes and row count of the sorted, re-indexed frame.
df.info()

#### Find unique CROP types. 

In [None]:
# Distinct crop names present in the data, in order of first appearance.
croptypes = df['CROP'].unique()
croptypes

## Select CROP based on AGR-1 crop types.

## Corn

In [None]:
# Rows whose crop is exactly 'Corn'.
df_corn = df.loc[df['CROP'] == 'Corn']
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
df_corn.info()
df_corn.head()

#### Create dataframe for nutrients phosphorus (P) and potassium (K).

In [None]:
# Independent copy holding only the county/year keys and the two nutrients,
# so category columns can be added without touching df_corn.
nutrient_cols = ['FIPS_NO', 'COUNTY', 'YEAR', 'P', 'K']
df_corn_nu = df_corn.loc[:, nutrient_cols].copy()
print(df_corn_nu.head())


#### Corn,  Set categories for P and K values to very low, low, medium, high, very high. Base values from AGR-1.

#### Categories for P
        Cat      Title       Break
        -------------------------------------
        VL       very low    P<= 5
        L        low         P>5 & P<=27
        M        medium      P>27 & P<=60
        H        high        P>60

#### Categories for K
        Cat      Title      Break
       --------------------------------------
        VL       very low   K< 100
        L        low        K>=100 & K <=190
        M        medium     K>=191 & K <=300
        H        high       K>=301 & K <=420
        VH       very high  K>420

In [None]:
# Classify corn P readings: VL (P <= 5), L (5 < P <= 27), M (27 < P <= 60),
# H (P > 60). np.select uses the first condition that matches, so each upper
# bound is tested only once; rows matching no condition get ''.
corn_p = df_corn_nu['P']
df_corn_nu['CAT_P'] = np.select(
    [corn_p <= 5, corn_p <= 27, corn_p <= 60, corn_p > 60],
    ['VL', 'L', 'M', 'H'],
    default='',
)
df_corn_nu.head()

In [None]:
# Classify corn K readings following the category table for K:
#   VL: K < 100        L: 100 <= K <= 190     M: 190 < K <= 300
#   H:  300 < K <= 420 VH: K > 420
# A reading of exactly 100 belongs to 'L' (the table defines VL as K < 100),
# so the first bound is a strict "<".
# np.select uses the first condition that matches, so each upper bound is
# tested only once; rows matching no condition get ''.
corn_k = df_corn_nu['K']
df_corn_nu['CAT_K'] = np.select(
    [corn_k < 100, corn_k <= 190, corn_k <= 300, corn_k <= 420, corn_k > 420],
    ['VL', 'L', 'M', 'H', 'VH'],
    default='',
)
df_corn_nu.head()

#### Create pivot table to sort categories by year and County for each nutrient.

In [None]:
# NOTE(review): this installs a global "ignore" filter, so every warning is
# silenced from here on, not only those raised by the pivot below — consider
# a narrower filter.
warnings.filterwarnings("ignore")
# One row per COUNTY. For every YEAR / category combination compute the
# average nutrient value and the number of samples (len). Combinations with
# no samples are filled with 0 and the results are rounded to 2 decimals.
df_corn_p = np.round( df_corn_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_P'], values=['P'],aggfunc=(np.average,len),fill_value=0),2)
df_corn_k = np.round( df_corn_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_K'], values=['K'],aggfunc=(np.average,len),fill_value=0),2)
print(df_corn_p.head())
print(df_corn_k.head())

## Unpivot table and save to CSV file

#### Create column names from pivot table data

In [None]:
# Inspect the multi-level column labels produced by the pivot tables.
df_corn_p.columns
df_corn_k.columns

In [None]:
# Collapse each multi-level column label into one underscore-joined string.
# Every level must be a string for the join to work.
df_corn_p.columns = ["_".join(levels) for levels in df_corn_p.columns]
df_corn_k.columns = ["_".join(levels) for levels in df_corn_k.columns]

In [None]:
# Show the flattened column names.
print(df_corn_p.columns)
print(df_corn_k.columns)

In [None]:
# Shorten the flattened names: drop the "<nutrient>_average_" prefix and
# replace the "<nutrient>_len" prefix with "count".
p_names = df_corn_p.columns.str.replace("P_average_", "")
df_corn_p.columns = p_names.str.replace("P_len", "count")
k_names = df_corn_k.columns.str.replace("K_average_", "")
df_corn_k.columns = k_names.str.replace("K_len", "count")
for renamed in (df_corn_p, df_corn_k):
    print(renamed.columns)

#### Reindex unpivot table 

In [None]:
# Turn the COUNTY index back into an ordinary column so it is written out
# with the rest of the data.
df_corn_p, df_corn_k = df_corn_p.reset_index(), df_corn_k.reset_index()
for flat_table in (df_corn_p, df_corn_k):
    print(flat_table.head())

#### Save categorized data to file. Separate by crop and nutrient type (P and K) with count by category.

In [None]:
# Make sure the output folder exists; to_csv raises if the directory is
# missing.
fileOut.mkdir(parents=True, exist_ok=True)

# One CSV per nutrient for corn, written without the row index.
file_out_p = fileOut.joinpath('corn_p_levels.csv')
df_corn_p.to_csv(file_out_p, index=False)
file_out_k = fileOut.joinpath('corn_k_levels.csv')
df_corn_k.to_csv(file_out_k, index=False)

# Report the number of rows in each written table.
print('total number of records written to CSV:', '{:,}'.format(len(df_corn_p)), '\n')
print('total number of records written to CSV:', '{:,}'.format(len(df_corn_k)), '\n')

## Soybeans

#### Create list to select Soybeans from database.

In [None]:
# CROP values treated as soybeans (sole crop or double-cropped), kept in
# alphabetical order.
soy_sel = sorted([
    'Soybeans',
    'Small Grains/Soybeans',
    'Wheat/Soybeans',
    'Canola/Soybeans',
    'Rye/Soybeans',
    'Oats/Soybeans',
    'Barley/Soybeans',
    'Triticale/Soybeans',
])
print(soy_sel)

#### Select soybeans from dataset.

In [None]:
# Rows whose crop is one of the soybean entries in soy_sel.
is_soy = df['CROP'].isin(soy_sel)
df_soy = df[is_soy]
# Independent copy with only the county/year keys and the two nutrients.
df_soy_nu = df_soy.loc[:, ['FIPS_NO', 'COUNTY', 'YEAR', 'P', 'K']].copy()
print(df_soy_nu.head())

#### Soybeans, Set categories for P and K values to very low, low, medium, high, very high. Base values from AGR-1.

#### Categories for P
        Cat      Title       Break
        -------------------------------------
        VL       very low    P<= 5
        L        low         P>5 & P<=27
        M        medium      P>27 & P<=60
        H        high        P>60

#### Categories for K
        Cat      Title      Break
       --------------------------------------
        VL       very low   K< 100
        L        low        K>=100 & K <=190
        M        medium     K>=191 & K <=300
        H        high       K>300

In [None]:
# Classify soybean P readings: VL (P <= 5), L (5 < P <= 27), M (27 < P <= 60),
# H (P > 60). np.select uses the first condition that matches, so each upper
# bound is tested only once; rows matching no condition get ''.
soy_p = df_soy_nu['P']
df_soy_nu['CAT_P'] = np.select(
    [soy_p <= 5, soy_p <= 27, soy_p <= 60, soy_p > 60],
    ['VL', 'L', 'M', 'H'],
    default='',
)

# Classify soybean K readings following the category table for K:
#   VL: K < 100   L: 100 <= K <= 190   M: 190 < K <= 300   H: K > 300
# The first bound is a strict "< 100" so fractional readings between 99 and
# 100 fall in 'VL', as the table specifies.
soy_k = df_soy_nu['K']
df_soy_nu['CAT_K'] = np.select(
    [soy_k < 100, soy_k <= 190, soy_k <= 300, soy_k > 300],
    ['VL', 'L', 'M', 'H'],
    default='',
)
df_soy_nu.head()

In [None]:
# NOTE(review): this installs a global "ignore" filter, so every warning is
# silenced from here on, not only those raised by the pivot below — consider
# a narrower filter.
warnings.filterwarnings("ignore")
# One row per COUNTY. For every YEAR / category combination compute the
# average nutrient value and the number of samples (len). Combinations with
# no samples are filled with 0 and the results are rounded to 2 decimals.
df_soy_p = np.round( df_soy_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_P'], values=['P'],aggfunc=(np.average,len),fill_value=0),2)
df_soy_k = np.round( df_soy_nu.pivot_table(index='COUNTY', columns=['YEAR', 'CAT_K'], values=['K'],aggfunc=(np.average,len),fill_value=0),2)

In [None]:
# Collapse each multi-level column label into one underscore-joined string.
# Every level must be a string for the join to work.
df_soy_p.columns = ["_".join(levels) for levels in df_soy_p.columns]
df_soy_k.columns = ["_".join(levels) for levels in df_soy_k.columns]

# Shorten the flattened names: drop the "<nutrient>_average_" prefix and
# replace the "<nutrient>_len" prefix with "count".
df_soy_p.columns = df_soy_p.columns.str.replace("P_average_", "")
df_soy_p.columns = df_soy_p.columns.str.replace("P_len", "count")
df_soy_k.columns = df_soy_k.columns.str.replace("K_average_", "")
df_soy_k.columns = df_soy_k.columns.str.replace("K_len", "count")

# Turn the COUNTY index back into an ordinary column so it is written out.
df_soy_p = df_soy_p.reset_index()
df_soy_k = df_soy_k.reset_index()

# Make sure the output folder exists; to_csv raises if the directory is
# missing.
fileOut.mkdir(parents=True, exist_ok=True)

# One CSV per nutrient for soybeans, written without the row index.
file_out_p = fileOut.joinpath('soy_p_levels.csv')
df_soy_p.to_csv(file_out_p, index=False)
file_out_k = fileOut.joinpath('soy_k_levels.csv')
df_soy_k.to_csv(file_out_k, index=False)

# Report the number of rows in each written table.
print('total number of records written to CSV:', '{:,}'.format(len(df_soy_p)), '\n')
print('total number of records written to CSV:', '{:,}'.format(len(df_soy_k)), '\n')